def genRecipFeatures(year, weightF, graphFiles=None, bigraph=None):
    timing = Timer('Generating recip features for %d %s' % (year, weightF))
    if not graphFiles:
        graphFiles = getGraphFiles(year, weightF)
    if not bigraph:
        bigraph = graph_funcs.loadGraph('Data/Bipartite-Graphs/%d.graph' % year)

    receiptsFromDonor, totalReceipts, totalDonations = \
        recip_feature_extractor.getDonationAmounts(bigraph)
    partialFeatures, fullFeatures = \
        recip_feature_extractor.getCategoricalGraphFeatures(bigraph)
    timing.markEvent('Loaded bigraph, donor amounts, and categorical feature funcs')

    for gf in graphFiles:
        donorFeatures = pickler.load('Data/Features/%s.features' % gf)
        timing.markEvent('Loaded donor features for graph %s' % gf)

        recipFeatures = recip_feature_extractor.getRecipFeatures(
            bigraph, donorFeatures, receiptsFromDonor, totalReceipts,
            totalDonations, partialFeatures, fullFeatures)
        timing.markEvent('Calculated recip features')

        recip_feature_extractor.saveFeatures(bigraph, recipFeatures,
                                             'Data/Recip-Features/%s' % gf)
        timing.markEvent('Saved recip features')

    timing.finish()
def loadRecipients(dbNames, filepath):
    timing = Timer('loading Recipients table')
    extractors = [0, 7, 8, 10, 12, 13, 14, 15, 16, 22, 23, 39, 46, 47, 61,
                  62, 63, 64, 65]
    transforms = [int, str, safeInt, party, str, str, incumb, float, float,
                  int, gender, safeInt, winner, safeFloat, safeFloat,
                  safeFloat, candStatus, int, candOrComm]
    observedKeys = set()

    for db in dbNames:
        initRecipientTable(db)

    with open(filepath, 'r') as f:
        reader = csv.reader(f)
        reader.next()  # skip column headers
        for i, block in enumerate(generateChunk(reader, extractors, transforms)):
            newBlock = filterRecipients(block, observedKeys)
            for db in dbNames:
                commitRecipBlock(db, newBlock)

    timing.finish()
def trainAndTestModels(year, extension, X=None, Y=None, k=10, clf=None,
                       transF=None, decomp_func=None):
    timing = Timer('Running regression for %d.%s' % (year, extension))
    # Instantiate the default regressor per call rather than in the signature,
    # so a shared mutable default isn't reused across invocations.
    if clf is None:
        clf = linear_model.LinearRegression()
    if X is None or Y is None:
        X, Y = pickler.load('Data/Recip-Features/%d.%s' % (year, extension))
    if transF:
        Y = transF(Y)
    timing.markEvent('Loaded X and Y')

    rsquareds = []

    # Train and test the regression model on each k-fold split
    kf = KFold(len(Y), k)
    for train, test in kf:
        X_train, X_test = X[train], X[test]
        Y_train, Y_test = Y[train], Y[test]
        if decomp_func:
            decomp_func.fit(X_train)
            X_train = decomp_func.transform(X_train)
            X_test = decomp_func.transform(X_test)
        clf.fit(X_train, Y_train)
        rsquareds.append(clf.score(X_test, Y_test))
    timing.markEvent('Ran regression')

    timing.finish()
    return rsquareds
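# Illustrative usage of trainAndTestModels (a minimal sketch, not from the
# original pipeline): the year, extension, and hyperparameters below are
# hypothetical, and assume 'Data/Recip-Features/1998.jaccard' exists on disk.
# Any scikit-learn regressor with fit/score, and any decomposition with
# fit/transform, can be swapped in.
from sklearn import linear_model, decomposition
import numpy as np

rsquareds = trainAndTestModels(1998, 'jaccard', k=5,
                               clf=linear_model.Ridge(alpha=1.0),
                               transF=np.log1p,  # compress heavy-tailed targets
                               decomp_func=decomposition.PCA(n_components=20))
print 'Mean out-of-fold R^2: %f' % (sum(rsquareds) / len(rsquareds))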
def getDonationAmounts(graph):
    timing = Timer('Getting candidate, donor, and cand-donor donation amounts')

    # A dictionary from rnodeids to dictionaries from cnodeids to ints
    # indicating the total donations from that donor to that candidate.
    receiptsFromDonor = defaultdict(lambda: defaultdict(int))

    # A dictionary from rnodeids to ints indicating the total amount donated
    # to that candidate.
    totalReceipts = defaultdict(int)

    # A dictionary from cnodeids to ints indicating the total amount donated
    # by that donor.
    totalDonations = defaultdict(int)

    # For each donation edge, note the amount in the relevant dictionaries
    for edge in graph.Edges():
        donor = edge.GetSrcNId()
        recip = edge.GetDstNId()
        amount = graph.GetIntAttrDatE(edge.GetId(), 'amount')

        receiptsFromDonor[recip][donor] += amount
        totalReceipts[recip] += amount
        totalDonations[donor] += amount

    timing.finish()
    return receiptsFromDonor, totalReceipts, totalDonations
def generateFeatures(year, bipartite, unipartite, newToOldIDs, adjMatrix):
    timing = Timer('generating features for %d' % year)

    bipartiteFeatures = extractBipartiteFeatures(bipartite)
    timing.markEvent('Extracted bipartite features.')

    rawUnifeatures, componentFeatureFunc, CNMFeatureFunc = \
        extractUnipartiteFeatures(unipartite, adjMatrix)
    unipartiteFeatures = convertNewToOldIDs(rawUnifeatures, newToOldIDs)
    timing.markEvent('Extracted unipartite features.')

    # Append the unipartite features to the bipartite features for each node,
    # returning the combined feature dictionary. If a donor is not in the
    # unipartite feature graph, fall back on default values (the node fell
    # below the unipartite threshold from sqlToGraphs):
    features = {}
    for donorNode in graph_funcs.getDonors(bipartite):
        oldNID = donorNode.GetId()
        if oldNID in unipartiteFeatures:
            features[oldNID] = bipartiteFeatures[oldNID] + unipartiteFeatures[oldNID]
        else:
            features[oldNID] = bipartiteFeatures[oldNID] + \
                defaultUnipartiteFeatures(componentFeatureFunc, CNMFeatureFunc)

    timing.finish()
    return features
def genDonorFeatures(year, weightF, graphFiles=None, bigraph=None,
                     adjMat=None, newToOldIDs=None):
    timing = Timer('Generating donor features for %d %s' % (year, weightF))
    if not graphFiles:
        graphFiles = getGraphFiles(year, weightF)
    if not bigraph:
        bigraph = graph_funcs.loadGraph('Data/Bipartite-Graphs/%d.graph' % year)
    if adjMat is None:
        adjMat = pickler.load('Data/Unipartite-Matrix/%d.%s' % (year, weightF))
        adjMat = adjMat.tocsc()
    if newToOldIDs is None:
        newToOldIDs = pickler.load('Data/Unipartite-NodeMappings/%d.newToOld' % year)
    timing.markEvent('Loaded bigraph, adj matrix, and newToOld mapping')

    for gf in graphFiles:
        unigraph = graph_funcs.loadGraph('Data/Unipartite-Graphs/%s' % gf,
                                         snap.TUNGraph)
        timing.markEvent('Loaded graph %s' % gf)

        features = feature_extractor.generateFeatures(year, bigraph, unigraph,
                                                      newToOldIDs, adjMat)
        timing.markEvent('Generated features')

        pickler.save(features, 'Data/Features/%s.features' % gf)
        timing.markEvent('Saved features')

    timing.finish()
def loadContributors(dbNames, filepath):
    timing = Timer('loading Contributors table')
    extractors = [0, 1, 2, 3]
    transforms = [int, indiv, str, safeFloat]

    for db in dbNames:
        initContributorsTable(db)

    # Use a with-block so the file is closed, matching loadRecipients and
    # loadTransactionFile above.
    with open(filepath, 'rb') as f:
        reader = csv.reader(f)
        reader.next()  # skip column headers
        for i, block in enumerate(generateChunk(reader, extractors, transforms)):
            for db in dbNames:
                commitContribBlock(db, block)

    timing.finish()
def getResults(year, weightF, graphFiles=None):
    timing = Timer('Running regressions for %d %s' % (year, weightF))
    results = []
    if not graphFiles:
        graphFiles = getGraphFiles(year, weightF)

    for gf in graphFiles:
        X, Y = pickler.load('Data/Recip-Features/%s' % gf)
        rsquareds = cfscore_predictions.trainAndTestModels(year, weightF,
                                                           X=X, Y=Y)
        results.append([weightF, gf, rsquareds])

    timing.finish()
    return results
def loadTransactionFile(dbName, csvName, year):
    timing = Timer('loading Transactions_%d into table' % year)
    extractors = [0, 1, 2, 3, 4, 5, 13, 27, 28, 29, 33, 34, 36, 37]
    transforms = [int, str, str, strToFltToInt, str, strToFltToInt, indiv,
                  str, party, candOrComm, str, str, safeFloat, safeFloat]

    initTransactionsTable(dbName)

    with open(csvName, 'r') as f:
        reader = csv.reader(f)
        reader.next()  # skip column headers
        for i, block in enumerate(generateChunk(reader, extractors, transforms)):
            newBlock = filterTransactions(block)
            commitTransBlock(dbName, newBlock)

    timing.finish()
def extractUnipartiteFeatures(unipartiteGraph, adjMat):
    timing = Timer('extracting unipartite features')
    features = defaultdict(list)

    componentFeatureFunc, CNMFeatureFunc, idToCNM = \
        getUnipartiteSurfaceFeatures(unipartiteGraph, adjMat, features)
    timing.markEvent('1. Extracted surface features')

    # Average weight of incident edges:
    avgWeights = calcAverageWeights(unipartiteGraph, adjMat)
    timing.markEvent('2. Computed average weights.')

    # Size of connected component (currently disabled):
    # cnctComponents = calcCnctComponents(unipartiteGraph)
    timing.markEvent('3. Skipped connected components.')

    # Size of CNM community:
    communities = calcCommunities(idToCNM)
    timing.markEvent('4. Computed CNM communities.')

    # PageRank:
    pageRanks = snap.TIntFltH()
    snap.GetPageRank(unipartiteGraph, pageRanks)
    timing.markEvent('5. Computed PageRank.')

    # Combine the graph-wide features with the existing surface features:
    for nid in features:
        features[nid].append(avgWeights[nid])
        # features[nid].append(cnctComponents[nid])
        features[nid].append(communities[nid])
        features[nid].append(pageRanks[nid])

    timing.finish()
    return features, componentFeatureFunc, CNMFeatureFunc
def processYearAndWeight(year, weighting, percents=None, thresholds=None):
    timing = Timer('Running for year %d and weight %s' % (year, weighting))
    adjMatFile = 'Data/Unipartite-Matrix/%d.%s' % (year, weighting)
    sortedVals, N = getSortedMatrixVals(adjMatFile)
    timing.markEvent('Got sorted vals')

    if percents:
        for p in percents:
            outfile = 'Data/Unipartite-Graphs/%d.%s_percent_%f.graph' \
                % (year, weighting, p)
            graph = pruneGraphByPercent(sortedVals, N, p)
            graph_funcs.saveGraph(graph, outfile)
            timing.markEvent('Finished for %f percent' % p)

    if thresholds:
        for t in thresholds:
            outfile = 'Data/Unipartite-Graphs/%d.%s_threshold_%f.graph' \
                % (year, weighting, t)
            graph = pruneGraphByThreshold(sortedVals, N, t)
            graph_funcs.saveGraph(graph, outfile)
            timing.markEvent('Finished for threshold %f' % t)

    timing.finish()
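# Illustrative invocation of processYearAndWeight (a minimal sketch; the year,
# weighting, and cutoff values below are hypothetical). Percent cutoffs
# presumably keep the top fraction of the sorted edge weights, while
# thresholds keep weights above an absolute value; either list may be omitted.
processYearAndWeight(1998, 'jaccard',
                     percents=[0.05, 0.10, 0.25],
                     thresholds=[0.01, 0.05])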
def getRecipFeatures(graph, donorFeatures, receiptsFromDonor, totalReceipts,
                     totalDonations, partialFeatures, fullFeatures,
                     includeDonorFeatures=False):
    timing = Timer('Getting recipient features')

    recipFeatures = {}
    for recipNode in graph_funcs.getRecipients(graph, cfs=True):
        rnodeid = recipNode.GetId()

        # Temporarily add a donor feature indicating what percent of this
        # donor's donations went to this candidate.
        for donor in receiptsFromDonor[rnodeid]:
            pct = receiptsFromDonor[rnodeid][donor] / float(totalDonations[donor])
            donorFeatures[donor].append(pct)

        if includeDonorFeatures:
            recipFeatures[rnodeid] = np.append(
                getPartialNodeRecipSpecificFeatures(graph, rnodeid),
                processDonorFeaturesForRecip(donorFeatures,
                                             receiptsFromDonor[rnodeid]))
        else:
            recipFeatures[rnodeid] = \
                processDonorFeaturesForRecip(donorFeatures,
                                             receiptsFromDonor[rnodeid])

        # Remove the temporarily added percent-of-donations feature.
        for donor in receiptsFromDonor[rnodeid]:
            donorFeatures[donor].pop()

    timing.finish()
    return recipFeatures
def runFullPipeline(year):
    timing = Timer('Running pipeline for %d' % year)
    weightings = ('adamic', 'cosine', 'jaccard', 'jaccard2', 'weighted_adamic')

    bigraph = graph_funcs.loadGraph('Data/Bipartite-Graphs/%d.graph' % year)
    newToOldIDs = pickler.load('Data/Unipartite-NodeMappings/%d.newToOld' % year)

    for weightF in weightings:
        graphFiles = getGraphFiles(year, weightF)
        adjMat = pickler.load('Data/Unipartite-Matrix/%d.%s' % (year, weightF))
        timing.markEvent('Loaded everything for donor features')

        genDonorFeatures(year, weightF, graphFiles=graphFiles, bigraph=bigraph,
                         adjMat=adjMat, newToOldIDs=newToOldIDs)
        del adjMat  # free the very large adjacency matrix before the next stage

        genRecipFeatures(year, weightF, graphFiles=graphFiles, bigraph=bigraph)

        results = getResults(year, weightF, graphFiles=graphFiles)
        pickler.save(results, 'Data/pruning_optimizations.%d.%s' % (year, weightF))
        timing.markEvent('Finished with %s' % weightF)

    timing.finish()
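# A minimal driver sketch (not from the original source): run the full
# pipeline for each year given on the command line, mirroring how the
# experiment script below parses sys.argv.
import sys

if __name__ == '__main__':
    for year in [int(arg) for arg in sys.argv[1:]]:
        runFullPipeline(year)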
def getCorrel(year, weightFs):
    timing = Timer('Getting correlation matrix for year %d' % year)
    append = lambda x, y: np.append(x, y, axis=0)
    data = reduce(append,
                  [getNonzeroElems(year, weightF) for weightF in weightFs])
    timing.finish()
    return np.corrcoef(data)
def getNonzeroElems(year, weightF):
    timing = Timer('Loading nonzero elems for year %d and weightF %s' %
                   (year, weightF))
    adjMat = pickler.load('Data/Unipartite-Matrix/%d.%s' % (year, weightF))
    timing.finish()
    return adjMat[adjMat.nonzero()]
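# Illustrative usage of getCorrel (a sketch; the year is hypothetical, and the
# pickled weight matrices must already exist on disk). Because every weighting
# scheme is built over the same (row, col) sparsity pattern in
# createDonorDonorGraph, the flattened nonzero entries align, so np.corrcoef
# yields the pairwise correlations between the schemes.
weightFs = ('jaccard', 'jaccard2', 'cosine', 'adamic', 'weighted_adamic')
print getCorrel(1998, weightFs)  # 5x5 correlation matrix across schemes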
extensions = ('jaccard', 'jaccard2', 'cosine', 'adamic', 'weighted_adamic')
results = {}
resultsList = []
years = [int(arg) for arg in sys.argv[1:]]

timing = Timer('Running everything')
for year in years:
    timing.markEvent('Running for year %d' % year)
    results[year] = {}
    for extension in extensions:
        timing.markEvent('Running for extension %s' % extension)
        results[year][extension] = {}
        for clfname, clf in clfs.iteritems():
            timing.markEvent('Running for classifier %s' % clfname)
            results[year][extension][clfname] = {}
            for decompname, decompFunction in decompFunctions.iteritems():
                timing.markEvent('Running for decomp function %s' % decompname)
                rsquareds = cfscore_predictions.trainAndTestModels(
                    year, extension, clf=clf, decomp_func=decompFunction)
                resultsList.append((year, extension, clfname, decompname,
                                    tuple(rsquareds)))
                results[year][extension][clfname][decompname] = tuple(rsquareds)
timing.markEvent('Done')
timing.finish()

print results
pickler.save(results, 'results')
pickler.save(resultsList, 'resultsList')
# Save the weight matrices:
matrixPrefix = 'Data/Unipartite-Matrix/%d' % year
pickler.save(wmat1, matrixPrefix + '.jaccard')
pickler.save(wmat2, matrixPrefix + '.jaccard2')
pickler.save(wmat3, matrixPrefix + '.affinity')
pickler.save(wmat4, matrixPrefix + '.cosine')
pickler.save(wmat5, matrixPrefix + '.adamic')
pickler.save(wmat6, matrixPrefix + '.weighted_adamic')

# Save the bipartite-unipartite corresponding node ID dictionaries:
mappingPrefix = 'Data/Unipartite-NodeMappings/%d' % year
pickler.save(newToOld, mappingPrefix + '.newToOld')
pickler.save(oldToNew, mappingPrefix + '.oldToNew')

timing.finish()
overallTiming.finish()
def createDonorDonorGraph(year, weightF):
    timing = Timer('creating donor-donor graph for %d' % year)

    # Load the old bipartite graph
    bipartiteGraph = graph_funcs.loadGraph('Data/Bipartite-Graphs/%d.graph' % year)

    # Load the info about each donor and their recipients
    numDonations, totalAmount, cands, transactions, amounts, totalReceipts = \
        getDonorInfos(bipartiteGraph)
    timing.markEvent('Got info about donor nodes')

    # Create the initial unipartite graph with just nodes and node attributes
    unipartiteGraph, oldToNew, newToOld = cloneBipartiteNodes(bipartiteGraph, cands)
    timing.markEvent('Finished cloning nodes')

    jaccardData = []
    jaccard2Data = []
    affinityData = []
    cosineData = []
    adamicData = []
    weightedAdamicData = []
    r = []
    c = []

    # Add the weighted edges for every relevant pair of donor nodes
    nodesDone = 0
    for i, newID1 in enumerate(newToOld.keys()):
        oldID1 = newToOld[newID1]
        for newID2 in newToOld.keys()[i + 1:]:
            oldID2 = newToOld[newID2]
            sharedCands = cands[oldID1].intersection(cands[oldID2])
            if not sharedCands:
                continue

            # Calculate the weights under every scheme at once
            weights = weightF(oldID1, oldID2, sharedCands, numDonations,
                              totalAmount, cands, transactions, amounts,
                              totalReceipts)

            # Record each weight twice so the sparse matrices come out symmetric
            r.append(newID1)
            r.append(newID2)
            c.append(newID2)
            c.append(newID1)
            jaccardData.extend([weights['jaccard']] * 2)
            jaccard2Data.extend([weights['jaccard2']] * 2)
            affinityData.extend([weights['affinity']] * 2)
            cosineData.extend([weights['cosine']] * 2)
            adamicData.extend([weights['adamic']] * 2)
            weightedAdamicData.extend([weights['weighted_adamic']] * 2)

            # Add the edge between the two nodes
            unipartiteGraph.AddEdge(newID1, newID2)

        nodesDone += 1
        if nodesDone % 100 == 0:
            timing.markEvent('Finished %d outer loops out of %d' %
                             (nodesDone, unipartiteGraph.GetNodes()))

    N = len(newToOld)
    jaccardAdjMat = sp.csr_matrix((jaccardData, (r, c)), shape=(N, N))
    jaccard2AdjMat = sp.csr_matrix((jaccard2Data, (r, c)), shape=(N, N))
    affinityAdjMat = sp.csr_matrix((affinityData, (r, c)), shape=(N, N))
    cosineAdjMat = sp.csr_matrix((cosineData, (r, c)), shape=(N, N))
    adamicAdjMat = sp.csr_matrix((adamicData, (r, c)), shape=(N, N))
    weightedAdamicAdjMat = sp.csr_matrix((weightedAdamicData, (r, c)),
                                         shape=(N, N))

    timing.finish()
    return (unipartiteGraph, jaccardAdjMat, jaccard2AdjMat, affinityAdjMat,
            cosineAdjMat, adamicAdjMat, weightedAdamicAdjMat, newToOld, oldToNew)
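# A sketch of the contract createDonorDonorGraph expects from weightF. This
# exampleWeightF is hypothetical, not the project's actual weight function:
# given two donor IDs and the precomputed donor info, it must return a dict
# with one float per weighting scheme. Only the Jaccard and cosine measures
# are implemented below; the remaining keys are placeholders that reuse the
# Jaccard value and are labeled as assumptions.
import math

def exampleWeightF(id1, id2, sharedCands, numDonations, totalAmount,
                   cands, transactions, amounts, totalReceipts):
    shared = float(len(sharedCands))
    union = len(cands[id1].union(cands[id2]))
    jaccard = shared / union
    cosine = shared / math.sqrt(len(cands[id1]) * len(cands[id2]))
    return {
        'jaccard': jaccard,
        'jaccard2': jaccard,          # placeholder (assumption)
        'affinity': jaccard,          # placeholder (assumption)
        'cosine': cosine,
        'adamic': jaccard,            # placeholder (assumption)
        'weighted_adamic': jaccard,   # placeholder (assumption)
    }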