Example #1
def genRecipFeatures(year, weightF, graphFiles=None, bigraph=None):
    timing = Timer('Generating recip features for %d %s' % (year, weightF))

    if not graphFiles: graphFiles = getGraphFiles(year, weightF)
    if not bigraph: bigraph = graph_funcs.loadGraph('Data/Bipartite-Graphs/%d.graph' % year)

    receiptsFromDonor, totalReceipts, totalDonations = \
            recip_feature_extractor.getDonationAmounts(bigraph)
    partialFeatures, fullFeatures = \
            recip_feature_extractor.getCategoricalGraphFeatures(bigraph)

    timing.markEvent('Loaded bigraph, donor amounts, and categorical feature funcs')

    for gf in graphFiles:
        donorFeatures = pickler.load('Data/Features/%s.features' % gf)
        timing.markEvent('Loaded donor features for graph %s' % gf)

        recipFeatures = recip_feature_extractor.getRecipFeatures(
                bigraph, donorFeatures, receiptsFromDonor, totalReceipts,
                totalDonations, partialFeatures, fullFeatures)
        timing.markEvent('Calculated recip features')

        recip_feature_extractor.saveFeatures(bigraph, recipFeatures, 'Data/Recip-Features/%s' % gf)
        timing.markEvent('Saved recip features')

    timing.finish()
Example #2
def genDonorFeatures(year, weightF, graphFiles=None, bigraph=None, adjMat=None, newToOldIDs=None):
    timing = Timer('Generating donor features for %d %s' % (year, weightF))

    if not graphFiles:
        graphFiles = getGraphFiles(year, weightF)
    if not bigraph:
        bigraph = graph_funcs.loadGraph('Data/Bipartite-Graphs/%d.graph' % year)
    if adjMat is None:
        adjMat = pickler.load('Data/Unipartite-Matrix/%d.%s' % (year, weightF))
        adjMat = adjMat.tocsc()
    if newToOldIDs is None:
        newToOldIDs = pickler.load('Data/Unipartite-NodeMappings/%d.newToOld' % year)
    timing.markEvent('Loaded bigraph, adj matrix, and newToOld mapping')

    for gf in graphFiles:
        unigraph = graph_funcs.loadGraph('Data/Unipartite-Graphs/%s' % gf, snap.TUNGraph)
        timing.markEvent('Loaded graph %s' % gf)

        features = feature_extractor.generateFeatures(year, bigraph, unigraph, newToOldIDs, adjMat)
        timing.markEvent('Generated features')

        pickler.save(features, 'Data/Features/%s.features' % gf)
        timing.markEvent('Saved features')

    timing.finish()
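Both functions above, like nearly every snippet on this page, drive the same small profiling interface: construct a Timer with a description, call markEvent() at each checkpoint, and finish() at the end (the Python 3 snippets further down instead use a bare Timer() plus get_duration()). The Timer class itself never appears on this page, so the following is only a minimal sketch consistent with that usage, not the actual implementation:

import time

class Timer(object):
    # Minimal stand-in for the Timer used in these examples (assumed interface).
    def __init__(self, task=''):
        self.task = task
        self.begin = self.last = time.time()
        if task:
            print('Starting: %s' % task)

    def markEvent(self, event):
        # Report time elapsed since the previous checkpoint.
        now = time.time()
        print('  %s (+%.2fs)' % (event, now - self.last))
        self.last = now

    def get_duration(self):
        return time.time() - self.begin

    def finish(self):
        print('Finished %s in %.2fs' % (self.task, self.get_duration()))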
Example #3
def getDonationAmounts(graph):
    timing = Timer('Getting candidate, donor, and cand-donor donation amounts')
    # A dictionary from rnodeids to dictionaries from cnodeids to ints indicating
    # the total donations from that donor to that candidate.
    receiptsFromDonor = defaultdict(lambda: defaultdict(int))

    # A dictionary from rnodeids to ints indicating the total amount donated to
    # that candidate.
    totalReceipts = defaultdict(int)

    # A dictionary from cnodeids to ints indicating the total amount donated by
    # that donor.
    totalDonations = defaultdict(int)

    # For each donation, note it in the relevant dictionaries
    for edge in graph.Edges():
        donor = edge.GetSrcNId()
        recip = edge.GetDstNId()
        amount = graph.GetIntAttrDatE(edge.GetId(), 'amount')

        receiptsFromDonor[recip][donor] += amount
        totalReceipts[recip] += amount
        totalDonations[donor] += amount

    timing.finish()
    return receiptsFromDonor, totalReceipts, totalDonations
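Note the key orientation of the first return value: receiptsFromDonor is keyed by recipient first, then donor. A quick sketch of how the three returned dictionaries are read (the node ids 7 and 42 are made up):

receiptsFromDonor, totalReceipts, totalDonations = getDonationAmounts(graph)
print(totalDonations[42])        # total given by donor 42
print(totalReceipts[7])          # total raised by recipient 7
print(receiptsFromDonor[7][42])  # amount donor 42 gave to recipient 7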
Example #4
    def set_binary_model(self, model_file_path: str):
        timer: Timer = Timer()
        Logger().start_analyzing("Loading binary Word2VecModel")
        self.model = KeyedVectors.load_word2vec_format(model_file_path,
                                                       binary=True)
        Logger().finish_analyzing(timer.get_duration(),
                                  "Loading binary Word2VecModel")
def generateFeatures(year, bipartite, unipartite, newToOldIDs, adjMatrix):
    timing = Timer('generating features for %d' % year)

    bipartiteFeatures = extractBipartiteFeatures(bipartite)
    timing.markEvent('Extracted bipartite features.')

    # rawUnifeatures, componentFeatureFunc, communityFeatureFuncn = extractUnipartiteFeatures(unipartite, adjMatrix)
    rawUnifeatures, componentFeatureFunc, CNMFeatureFunc = extractUnipartiteFeatures(
        unipartite, adjMatrix)
    unipartiteFeatures = convertNewToOldIDs(rawUnifeatures, newToOldIDs)
    timing.markEvent('Extracted unipartite features.')

    # append unipartite features to bipartite features for each node, returning combined feature dictionary.
    # If the donor is not in the unipartite feature graph then we just take the default values (since the
    # node falls below the unipartite threshold from sqlToGraphs):
    features = {}
    for donorNode in graph_funcs.getDonors(bipartite):
        oldNID = donorNode.GetId()
        if oldNID in unipartiteFeatures:
            features[oldNID] = bipartiteFeatures[oldNID] + unipartiteFeatures[
                oldNID]
        else:
            features[oldNID] = bipartiteFeatures[
                oldNID] + defaultUnipartiteFeatures(
                    componentFeatureFunc,
                    CNMFeatureFunc)  #, communityFeatureFuncn)
    timing.finish()

    return features
Example #6
def loadRecipients(dbNames, filepath):
    timing = Timer('loading Recipients table')
    extractors = [
        0, 7, 8, 10, 12, 13, 14, 15, 16, 22, 23, 39, 46, 47, 61, 62, 63, 64, 65
    ]
    transforms = [
        int, str, safeInt, party, str, str, incumb, float, float, int, gender,
        safeInt, winner, safeFloat, safeFloat, safeFloat, candStatus, int,
        candOrComm
    ]
    observedKeys = set()

    for db in dbNames:
        initRecipientTable(db)

    with open(filepath, 'r') as f:
        reader = csv.reader(f)
        reader.next()  # skip column headers
        for i, block in enumerate(generateChunk(reader, extractors,
                                                transforms)):
            newBlock = filterRecipients(block, observedKeys)
            for db in dbNames:
                commitRecipBlock(db, newBlock)

    timing.finish()
Example #7
def trainAndTestModels(year,
                       extension,
                       X=None,
                       Y=None,
                       k=10,
                       clf=linear_model.LinearRegression(),
                       transF=None,
                       decomp_func=None):
    timing = Timer('Running regression for %d.%s' % (year, extension))
    if X is None or Y is None:
        X, Y = pickler.load('Data/Recip-Features/%d.%s' % (year, extension))
    if transF: Y = transF(Y)
    timing.markEvent('Loaded X and Y')
    rsquareds = []

    # Train and test the regression model on each k-fold set
    kf = KFold(len(Y), k)
    for train, test in kf:
        X_train, X_test = X[train], X[test]
        Y_train, Y_test = Y[train], Y[test]

        if decomp_func:
            decomp_func.fit(X_train)
            X_train = decomp_func.transform(X_train)
            X_test = decomp_func.transform(X_test)

        clf.fit(X_train, Y_train)
        rsquareds.append(clf.score(X_test, Y_test))
    timing.markEvent('Ran regression')

    timing.finish()
    return rsquareds
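For reference, a hypothetical invocation (the year, extension, and hyperparameters are illustrative, and the pickled file under Data/Recip-Features/ must already exist): the defaults run 10-fold OLS, while a different regressor, a target transform, or a decomposition are all keyword arguments:

import numpy as np
from sklearn import ensemble
from sklearn.decomposition import PCA

# Hypothetical run: random forest on PCA-reduced features with log targets.
rsquareds = trainAndTestModels(2000, 'jaccard2',
                               clf=ensemble.RandomForestRegressor(),
                               transF=np.log1p,
                               decomp_func=PCA(n_components=10))
print('mean R^2: %f' % (sum(rsquareds) / len(rsquareds)))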
Example #8
    def __init__(self, path: str, supported_extensions: List[str]):
        self.timer = Timer()
        self.path = path
        self.relative_path = PathExtractor().get_relative_path(path)
        self.supported_extensions = supported_extensions
        self.file_name = PathExtractor().get_file_name(path)
        self.extension = PathExtractor().get_file_extension(self.file_name)
Example #9
def loadContributors(dbNames, filepath):
    timing = Timer('loading Contributors table')
    extractors = [0, 1, 2, 3]
    transforms = [int, indiv, str, safeFloat]

    for db in dbNames:
        initContributorsTable(db)
    reader = csv.reader(open(filepath, 'rb'))
    reader.next()  # skip column headers
    for i, block in enumerate(generateChunk(reader, extractors, transforms)):
        for db in dbNames:
            commitContribBlock(db, block)

    timing.finish()
Example #10
def main():
    script_name: str = PathExtractor().get_file_name(sys.argv[0])

    if len(sys.argv) != 2:
        Logger().usage(f'python {script_name} <wiki.en.raw.txt>')
        return

    file_path = sys.argv[1]

    if PathValidator().is_valid_files([file_path]):
        Logger().info(f'Input file: "{file_path}"')
        Logger().info("Starting to remove stopwords")
        timer = Timer()
        remove_stopwords(file_path)
        Logger().finish_script(timer.get_duration(), script_name)
Example #11
def main():
    script_name: str = PathExtractor().get_file_name(sys.argv[0])

    if len(sys.argv) != 2:
        Logger().usage(f"python {script_name} <wiki.en.filtered.txt>")
        return

    file_path = sys.argv[1]

    if PathValidator().is_valid_files([file_path]):
        Logger().info(f'Input file: "{file_path}"')
        Logger().info("Starting to lemmatize text")
        timer = Timer()
        lemmatize_text(file_path, timer)
        Logger().finish_script(timer.get_duration(), script_name)
Example #12
def getResults(year, weightF, graphFiles=None):
    timing = Timer('Running regressions for %d %s' % (year, weightF))

    results = []

    if not graphFiles: graphFiles = getGraphFiles(year, weightF)

    for gf in graphFiles:
        X, Y = pickler.load('Data/Recip-Features/%s' % gf)
        rsquareds = cfscore_predictions.trainAndTestModels(year, weightF, X=X, Y=Y)
        results.append([weightF, gf, rsquareds])

    timing.finish()

    return results
Example #13
def main():
    script_name: str = PathExtractor().get_file_name(sys.argv[0])

    if len(sys.argv) != 2:
        Logger().usage(
            f'python {script_name} <en.wiki-latest-pages-articles.xml.bz2>')
        return

    file_path: str = sys.argv[1]

    if PathValidator().is_valid_files([file_path]):
        Logger().info(f'Input file: "{file_path}"')
        Logger().info(f'Starting to create wiki corpus from "{file_path}"')
        timer = Timer()
        get_corpus(file_path)
        Logger().finish_script(timer.get_duration(), script_name)
Example #14
def loadTransactionFile(dbName, csvName, year):
    timing = Timer('loading Transactions_%d into table' % year)
    extractors = [0, 1, 2, 3, 4, 5, 13, 27, 28, 29, 33, 34, 36, 37]
    transforms = [
        int, str, str, strToFltToInt, str, strToFltToInt, indiv, str, party,
        candOrComm, str, str, safeFloat, safeFloat
    ]
    initTransactionsTable(dbName)

    with open(csvName, 'r') as f:
        reader = csv.reader(f)
        reader.next()  # skip column headers
        for i, block in enumerate(generateChunk(reader, extractors,
                                                transforms)):
            newBlock = filterTransactions(block)
            commitTransBlock(dbName, newBlock)

    timing.finish()
Example #15
def getSortedMatrixVals(filename):
    timing = Timer('Getting sorted matrix vals')
    adjMat = pickler.load(filename)
    timing.markEvent('Loaded adjacency matrix')
    N = adjMat.shape[0]
    xIndices, yIndices = adjMat.nonzero()
    timing.markEvent('Loaded nonzero indices')
    data = adjMat[xIndices, yIndices]
    timing.markEvent('Loaded nonzero vals')
    flat = np.ravel(data)
    timing.markEvent('Flattened data')

    vals = zip(xIndices, yIndices, flat)
    timing.markEvent('Zipped values')
    vals.sort(key=lambda v: v[2], reverse=True)
    timing.markEvent('Sorted values')
    return vals, N
Example #16
def main():
    script_name: str = PathExtractor().get_file_name(sys.argv[0])
    timer = Timer()

    if len(sys.argv) < 2 or len(sys.argv) > 3:
        Logger().usage(
            f"python {script_name} <file_or_directory_path> [<word2vec.model>]"
        )
        return

    if len(sys.argv) == 3:
        model_path = PathExtractor().get_absolute_path(sys.argv[2])
        if not PathValidator().is_valid_files([model_path]):
            return
        Word2VecModel.instance.set_model(model_path)

    project_path = PathExtractor().get_absolute_path(sys.argv[1])

    if PathValidator().is_valid_paths([project_path]):
        parse(project_path)
        Logger().finish_script(timer.get_duration(), script_name)
Example #17
def calcAverageWeights(graph, adjMat):
    neighbors = defaultdict(list)
    timing = Timer('Calculating average weights')
    # Get all the nodes that a node borders in the graph
    for edge in graph.Edges():
        nodeid1 = edge.GetSrcNId()
        nodeid2 = edge.GetDstNId()
        neighbors[nodeid1].append(nodeid2)
        neighbors[nodeid2].append(nodeid1)
    timing.markEvent('Got all neighbors')

    # Get the average weight per node connected to
    weights = {}
    i = 0
    for nodeid in neighbors:
        rows = neighbors[nodeid]
        weights[nodeid] = adjMat[rows, nodeid].sum() / float(len(rows))
        i += 1
        if i % 1000 == 0:
            timing.markEvent('Done with %d out of %d' % (i, len(neighbors)))

    timing.finish()
    return weights
Example #18
def extractUnipartiteFeatures(unipartiteGraph, adjMat):
    timing = Timer('extracting unipartite features')

    features = defaultdict(list)
    #componentFeatureFunc, communityFeatureFuncn, idToCommunity = getUnipartiteSurfaceFeatures(unipartiteGraph, adjMat, features)
    componentFeatureFunc, CNMFeatureFunc, idToCNM = getUnipartiteSurfaceFeatures(
        unipartiteGraph, adjMat, features)

    timing.markEvent('1. Extracted surface features')

    # Average weight of edges:
    avgWeights = calcAverageWeights(unipartiteGraph, adjMat)
    timing.markEvent('2. Computed average weights.')

    # Size of connected component (computation currently disabled):
    #cnctComponents = calcCnctComponents(unipartiteGraph)
    timing.markEvent('3. Skipped connected components (disabled).')

    # Size of CNM community:
    communities = calcCommunities(idToCNM)
    timing.markEvent('4. Computed CNM communities.')

    # Pagerank:
    pageRanks = snap.TIntFltH()
    snap.GetPageRank(unipartiteGraph, pageRanks)
    timing.markEvent('5. Computed PageRank.')

    # combine the graph wide features with the existing surface features:
    for nid in features:
        features[nid].append(avgWeights[nid])
        #features[nid].append(cnctComponents[nid])
        features[nid].append(communities[nid])
        features[nid].append(pageRanks[nid])

    timing.finish()

    return features, componentFeatureFunc, CNMFeatureFunc
Example #19
def processYearAndWeight(year, weighting, percents=None, thresholds=None):
    timing = Timer('Running for year %d and weight %s' % (year, weighting))
    adjMatFile = 'Data/Unipartite-Matrix/%d.%s' % (year, weighting)
    sortedVals, N = getSortedMatrixVals(adjMatFile)
    timing.markEvent('Got sorted vals')

    if percents:
        for p in percents:
            outfile = 'Data/Unipartite-Graphs/%d.%s_percent_%f.graph' \
                    % (year, weighting, p)
            graph = pruneGraphByPercent(sortedVals, N, p)
            graph_funcs.saveGraph(graph, outfile)
            timing.markEvent('Finished for %f percent' % p)

    if thresholds:
        for t in thresholds:
            outfile = 'Data/Unipartite-Graphs/%d.%s_threshold_%f.graph' \
                    % (year, weighting, t)
            graph = pruneGraphByThreshold(sortedVals, N, t)
            graph_funcs.saveGraph(graph, outfile)
            timing.markEvent('Finished for threshold %f' % t)

    timing.finish()
Example #20
def getRecipFeatures(graph,
                     donorFeatures,
                     receiptsFromDonor,
                     totalReceipts,
                     totalDonations,
                     partialFeatures,
                     fullFeatures,
                     includeDonorFeatures=False):
    timing = Timer('Getting recipient features')
    recipFeatures = {}

    for recipNode in graph_funcs.getRecipients(graph, cfs=True):
        rnodeid = recipNode.GetId()

        # Add a donor feature indicating what percent of this donor's donations
        # went to this candidate.
        for donor in receiptsFromDonor[rnodeid]:
            pct = receiptsFromDonor[rnodeid][donor] / float(
                totalDonations[donor])
            donorFeatures[donor].append(pct)

        if includeDonorFeatures:
            recipFeatures[rnodeid] = np.append(
                getPartialNodeRecipSpecificFeatures(graph, rnodeid),
                processDonorFeaturesForRecip(donorFeatures,
                                             receiptsFromDonor[rnodeid]))
        else:
            recipFeatures[rnodeid] = \
                processDonorFeaturesForRecip(donorFeatures, receiptsFromDonor[rnodeid])

        # Remove the temporarily added feature for what percent of this donor's
        # donations went to this candidate.
        for donor in receiptsFromDonor[rnodeid]:
            donorFeatures[donor].pop()

    timing.finish()
    return recipFeatures
Example #21
def runFullPipeline(year):
    timing = Timer('Running pipeline for %d' % year)

    weightings = ('adamic', 'cosine', 'jaccard', 'jaccard2', 'weighted_adamic')
    bigraph = graph_funcs.loadGraph('Data/Bipartite-Graphs/%d.graph' % year)
    newToOldIDs = pickler.load('Data/Unipartite-NodeMappings/%d.newToOld' % year)

    for weightF in weightings:

        graphFiles = getGraphFiles(year, weightF)

        adjMat = pickler.load('Data/Unipartite-Matrix/%d.%s' % (year, weightF))
        timing.markEvent('Loaded everything for donor features')
        genDonorFeatures(year, weightF, graphFiles=graphFiles, bigraph=bigraph,
                adjMat=adjMat, newToOldIDs=newToOldIDs)
        del adjMat # free the incredible amount of memory for the adjacency matrix


        genRecipFeatures(year, weightF, graphFiles=graphFiles, bigraph=bigraph)
        results = getResults(year, weightF, graphFiles=graphFiles)
        pickler.save(results, 'Data/pruning_optimizations.%d.%s' % (year, weightF))
        timing.markEvent('Finished with %s' % weightF)

    timing.finish()
Example #22
    def init(self):
        self.timer = Timer(3000000).start()
Example #23
        rsquareds.append(clf.score(X_test, Y_test))
    timing.markEvent('Ran regression')

    timing.finish()
    return rsquareds


################################################################################
# Module command-line behavior #
################################################################################

if __name__ == '__main__':
    #extensions = ('jaccard', 'jaccard2', 'affinity', 'cosine', 'adamic', 'weighted_adamic', 'baseline')
    #extensions = ('jaccard2',)
    extensions = (
        'jaccard2',
        'baseline',
    )
    for year in sys.argv[1:]:
        year = int(year)
        timing = Timer('Running regressions for %d' % year)
        for extension in extensions:
            rsquareds = trainAndTestModels(year, extension)
            avgRSq = sum(rsquareds) / len(rsquareds)
            print '%d %s: %f' % (year, extension, avgRSq)
            with open('Data/Results/%d.%s' % (year, extension), 'w') as f:
                f.write('K-fold validation results:\n')
                f.write('Average: %f\n\n' % avgRSq)
                for i, r in enumerate(rsquareds):
                    f.write('%d: %f\n' % (i, r))
Example #24
    #'Factor Analysis': FactorAnalysis(n_components='mle'),
    #'ICA': FastICA(n_components='mle'),
}

clfs = {
    'OLS': linear_model.LinearRegression(),
    'Random Forest': ensemble.RandomForestRegressor(),
}

extensions = ('jaccard', 'jaccard2', 'cosine', 'adamic', 'weighted_adamic')

results = {}
resultsList = []

years = [int(arg) for arg in sys.argv[1:]]
timing = Timer('Running everything')
for year in years:
    timing.markEvent('Running for year %d' % year)
    results[year] = {}
    for extension in extensions:
        timing.markEvent('Running for extension %s' % extension)
        results[year][extension] = {}
        for clfname, clf in clfs.iteritems():
            timing.markEvent('Running for classifier %s' % clfname)
            results[year][extension][clfname] = {}
            for decompname, decompFunction in decompFunctions.iteritems():
                timing.markEvent('Running for decomp function %s' % decompname)
                rsquareds = cfscore_predictions.trainAndTestModels(
                    year, extension, clf=clf, decomp_func=decompFunction)
                resultsList.append(
                    (year, extension, clfname, decompname, tuple(rsquareds)))
Example #25
                communities[nid] = 0.0
            else:
                communities[nid] = communityIndex
        communityIndex += 1

    return communities


################################################################################
# Module command-line behavior #
################################################################################

if __name__ == '__main__':
    for arg in sys.argv[1:]:
        year = int(arg)
        timing = Timer('creating unipartite graph for %d' % year)

        bipartiteGraph = graph_funcs.loadGraph(
            'Data/Bipartite-Graphs/%d.graph' % year)
        unipartiteGraph = graph_funcs.loadGraph(
            'Data/Unipartite-Graphs/%d.graph' % year, snap.TUNGraph)
        newToOldIDs = pickler.load('Data/Unipartite-NodeMappings/%d.newToOld' %
                                   year)
        timing.markEvent('Loaded input graphs/matrices.')

        #for weightF in ['jaccard', 'affinity', 'jaccard2', 'cosine', 'adamic', 'weighted_adamic']:
        for weightF in ['jaccard2']:
            print '******* %s *******' % weightF
            adjMatrix = pickler.load('Data/Unipartite-Matrix/%d.%s' %
                                     (year, weightF))
            adjMatrix = adjMatrix.tocsc()
Example #26
    fullFeatures['winner'] = getIntAttrFeatureVec(graph, 'winner', full=True)

    return partialFeatures, fullFeatures


################################################################################
# Module command-line behavior #
################################################################################

if __name__ == '__main__':
    #weightings = ('jaccard', 'jaccard2', 'affinity', 'cosine', 'adamic', 'weighted_adamic')
    #weightings = ('adamic', 'weighted_adamic')
    weightings = ('jaccard2', )
    for year in sys.argv[1:]:
        year = int(year)
        timing = Timer('Generating features for %d' % year)
        graph = graph_funcs.loadGraph('Data/Bipartite-Graphs/%d.graph' % year)
        receiptsFromDonor, totalReceipts, totalDonations = getDonationAmounts(
            graph)
        partialFeatures, fullFeatures = getCategoricalGraphFeatures(graph)

        baselineFeatures = \
            getBaselineFeatures(graph, receiptsFromDonor, totalReceipts, totalDonations, partialFeatures, fullFeatures)
        saveFeatures(graph, baselineFeatures,
                     'Data/Recip-Features/%d.baseline' % year)
        timing.markEvent('Generated baseline features')

        for weighting in weightings:
            donorFeatures = pickler.load('Data/Features/%d%s.features' \
                    % (year, weighting))
            recipFeatures = getRecipFeatures(graph, donorFeatures,
Example #27
def getNonzeroElems(year, weightF):
    timing = Timer('Loading nonzero elems for year %d and weightF %s' % (year, weightF))
    adjMat = pickler.load('Data/Unipartite-Matrix/%d.%s' % (year, weightF))
    timing.finish()
    return adjMat[adjMat.nonzero()]
Example #28
def createDonorDonorGraph(year, weightF):
    timing = Timer('creating donor-donor graph for %d' % year)

    # Load the old bipartite graph
    bipartiteGraph = graph_funcs.loadGraph('Data/Bipartite-Graphs/%d.graph' %
                                           year)

    # Load the info about each donor and their recipients
    numDonations, totalAmount, cands, transactions, amounts, totalReceipts = getDonorInfos(
        bipartiteGraph)
    timing.markEvent('Got info about donor nodes')

    # Create initial unipartite graph with just nodes and node attributes
    unipartiteGraph, oldToNew, newToOld = cloneBipartiteNodes(
        bipartiteGraph, cands)
    timing.markEvent('Finished cloning nodes')

    jaccardData = []
    jaccard2Data = []
    affinityData = []
    cosineData = []
    adamicData = []
    weightedAdamicData = []
    r = []
    c = []

    # Add the weighted edges for every relevant pair of donor nodes
    nodesDone = 0

    for i, newID1 in enumerate(newToOld.keys()):
        oldID1 = newToOld[newID1]
        for newID2 in newToOld.keys()[i + 1:]:
            oldID2 = newToOld[newID2]

            sharedCands = cands[oldID1].intersection(cands[oldID2])
            if not sharedCands: continue

            # Calculate the weight
            weights = weightF(oldID1, oldID2, sharedCands, numDonations,
                              totalAmount, cands, transactions, amounts,
                              totalReceipts)

            r.append(newID1)
            r.append(newID2)
            c.append(newID2)
            c.append(newID1)
            jaccardData.append(weights['jaccard'])
            jaccardData.append(weights['jaccard'])
            jaccard2Data.append(weights['jaccard2'])
            jaccard2Data.append(weights['jaccard2'])
            affinityData.append(weights['affinity'])
            affinityData.append(weights['affinity'])
            cosineData.append(weights['cosine'])
            cosineData.append(weights['cosine'])
            adamicData.append(weights['adamic'])
            adamicData.append(weights['adamic'])
            weightedAdamicData.append(weights['weighted_adamic'])
            weightedAdamicData.append(weights['weighted_adamic'])

            # Add the edges between the two nodes and their weights
            unipartiteGraph.AddEdge(newID1, newID2)

        nodesDone += 1
        if nodesDone % 100 == 0:
            timing.markEvent('Finished %d outer loops out of %d' % \
                    (nodesDone, unipartiteGraph.GetNodes()))

    N = len(newToOld)
    jaccardAdjMat = sp.csr_matrix((jaccardData, (r, c)), shape=(N, N))
    jaccard2AdjMat = sp.csr_matrix((jaccard2Data, (r, c)), shape=(N, N))
    affinityAdjMat = sp.csr_matrix((affinityData, (r, c)), shape=(N, N))
    cosineAdjMat = sp.csr_matrix((cosineData, (r, c)), shape=(N, N))
    adamicAdjMat = sp.csr_matrix((adamicData, (r, c)), shape=(N, N))
    weightedAdamicAdjMat = sp.csr_matrix((weightedAdamicData, (r, c)),
                                         shape=(N, N))

    timing.finish()
    return unipartiteGraph, jaccardAdjMat, jaccard2AdjMat, affinityAdjMat, cosineAdjMat, adamicAdjMat, weightedAdamicAdjMat, newToOld, oldToNew
Example #29
def getCorrel(year, weightFs):
    timing = Timer('Getting correlation matrix for year %d' % year)
    append = lambda x, y: np.append(x, y, axis=0)
    data = reduce(append, [getNonzeroElems(year, weightF) for weightF in weightFs])
    timing.finish()
    return np.corrcoef(data)
# Weighted Adamic Adar Similarity Index: (http://www.slideshare.net/hajimesasaki1/picmet15sasaki20150805ppt)
# <On slide 8>
def weightedAdamic(id1, id2, sharedCands, numDonations, totalAmount, cands,
                   transactions, amounts, totalReceipts):
    score = sum([(amounts[id1][cand] + amounts[id2][cand]) /
                 (1.0 + math.log(totalReceipts[cand], 10))
                 for cand in sharedCands])
    return 'weighted_adamic', score
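As a toy check of the formula with made-up numbers: two donors who share one candidate give 100 and 300 to a candidate that raised 10,000 in total, so the score is (100 + 300) / (1 + log10(10000)) = 400 / 5 = 80:

import math

amounts = {1: {'c': 100.0}, 2: {'c': 300.0}}   # donor -> candidate -> amount
totalReceipts = {'c': 10000.0}                 # candidate -> total raised
score = sum((amounts[1][cand] + amounts[2][cand]) /
            (1.0 + math.log(totalReceipts[cand], 10))
            for cand in ['c'])
print(score)  # 80.0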


################################################################################
# Module command-line behavior #
################################################################################

if __name__ == '__main__':
    overallTiming = Timer('all unipartite graphs')
    for arg in sys.argv[1:]:
        year = int(arg)
        timing = Timer('Creating unipartite graph for %d' % year)

        graph, wmat1, wmat2, wmat3, wmat4, wmat5, wmat6, newToOld, oldToNew = createDonorDonorGraph(
            year, getWeightScores)

        # Save the SNAP graph:
        outfile = 'Data/Unipartite-Graphs/%d.graph' % year
        graph_funcs.saveGraph(graph, outfile)

        # Save the weight matrices:
        matrixPrefix = 'Data/Unipartite-Matrix/%d' % year
        pickler.save(wmat1, matrixPrefix + '.jaccard')
        pickler.save(wmat2, matrixPrefix + '.jaccard2')