def genDonorFeatures(year, weightF, graphFiles=None, bigraph=None,
                     adjMat=None, newToOldIDs=None):
    """Generate and pickle donor features for each unipartite graph file.

    Any optional input that is not supplied is loaded from its default
    location under ``Data/``.

    Args:
        year: Integer year; interpolated into the data paths with ``%d``.
        weightF: Name of the weighting function; used as the extension of
            the adjacency-matrix file and passed to ``getGraphFiles``.
        graphFiles: Graph file names relative to ``Data/Unipartite-Graphs/``.
            When falsy, ``getGraphFiles(year, weightF)`` is used.
        bigraph: Bipartite graph. When falsy, it is loaded from
            ``Data/Bipartite-Graphs/<year>.graph``.
        adjMat: Adjacency matrix. When ``None``, it is unpickled from
            ``Data/Unipartite-Matrix/<year>.<weightF>``.
        newToOldIDs: Node-ID mapping. When ``None``, it is unpickled from
            ``Data/Unipartite-NodeMappings/<year>.newToOld``.

    Returns:
        None. One feature pickle is written per graph file to
        ``Data/Features/<graph file>.features``.
    """
    timing = Timer('Generating donor features for %d %s' % (year, weightF))

    # Fill in whichever inputs the caller left out.
    if not graphFiles:
        graphFiles = getGraphFiles(year, weightF)
    if not bigraph:
        bigraphPath = 'Data/Bipartite-Graphs/%d.graph' % year
        bigraph = graph_funcs.loadGraph(bigraphPath)
    if adjMat is None:
        matrixPath = 'Data/Unipartite-Matrix/%d.%s' % (year, weightF)
        adjMat = pickler.load(matrixPath)
    # tocsc() is applied whether the matrix was passed in or just loaded.
    adjMat = adjMat.tocsc()
    if newToOldIDs is None:
        mappingPath = 'Data/Unipartite-NodeMappings/%d.newToOld' % year
        newToOldIDs = pickler.load(mappingPath)
    timing.markEvent('Loaded bigraph, adj matrix, and newToOld mapping')

    for graphFile in graphFiles:
        unigraph = graph_funcs.loadGraph(
            'Data/Unipartite-Graphs/%s' % graphFile, snap.TUNGraph)
        timing.markEvent('Loaded graph %s' % graphFile)

        donorFeatures = feature_extractor.generateFeatures(
            year, bigraph, unigraph, newToOldIDs, adjMat)
        timing.markEvent('Generated features')

        pickler.save(donorFeatures, 'Data/Features/%s.features' % graphFile)
        timing.markEvent('Saved features')

    timing.finish()
def trainAndTestModels(year, extension, X=None, Y=None, k=10, clf=None,
                       transF=None, decomp_func=None):
    """Run k-fold training/testing of a model and return the per-fold scores.

    Args:
        year: Integer year; used in the timer label and the default data path.
        extension: File extension of the pickled ``(X, Y)`` pair under
            ``Data/Recip-Features/``; also used in the timer label.
        X: Feature rows, indexable by the fold index arrays. When either
            ``X`` or ``Y`` is ``None``, both are unpickled from
            ``Data/Recip-Features/<year>.<extension>``.
        Y: Target values, indexable by the fold index arrays.
        k: Number of folds handed to ``KFold``.
        clf: Model exposing ``fit(X, Y)`` and ``score(X, Y)``. When ``None``
            (the default), a fresh ``linear_model.LinearRegression()`` is
            created for this call.
        transF: Optional callable applied to ``Y`` before training.
        decomp_func: Optional transformer exposing ``fit`` and ``transform``;
            it is fitted on each training fold and applied to both the
            training and the test fold.

    Returns:
        A list with one ``clf.score`` value per fold.
    """
    # Build the default estimator per call. A default created in the
    # signature is evaluated once at definition time and would be shared
    # (and re-fitted) by every call that omits ``clf``.
    if clf is None:
        clf = linear_model.LinearRegression()

    timing = Timer('Running regression for %d.%s' % (year, extension))

    if X is None or Y is None:
        X, Y = pickler.load('Data/Recip-Features/%d.%s' % (year, extension))
    if transF:
        Y = transF(Y)
    timing.markEvent('Loaded X and Y')

    rsquareds = []

    # Train and test the regression model on each k-fold set
    kf = KFold(len(Y), k)
    for train, test in kf:
        X_train, X_test = X[train], X[test]
        Y_train, Y_test = Y[train], Y[test]

        if decomp_func:
            # Fit on the training fold only, then project both folds.
            decomp_func.fit(X_train)
            X_train = decomp_func.transform(X_train)
            X_test = decomp_func.transform(X_test)

        clf.fit(X_train, Y_train)
        rsquareds.append(clf.score(X_test, Y_test))
    timing.markEvent('Ran regression')

    timing.finish()
    return rsquareds
def genRecipFeatures(year, weightF, graphFiles=None, bigraph=None):
    """Build and save recipient features from previously saved donor features.

    Args:
        year: Integer year; interpolated into the data paths with ``%d``.
        weightF: Name of the weighting function; passed to ``getGraphFiles``
            and used in the timer label.
        graphFiles: Graph file names. When falsy,
            ``getGraphFiles(year, weightF)`` is used.
        bigraph: Bipartite graph. When falsy, it is loaded from
            ``Data/Bipartite-Graphs/<year>.graph``.

    Returns:
        None. For each graph file, donor features are read from
        ``Data/Features/<graph file>.features`` and the recipient features
        are saved via ``recip_feature_extractor.saveFeatures`` under
        ``Data/Recip-Features/<graph file>``.
    """
    timing = Timer('Generating recip features for %d %s' % (year, weightF))

    if not graphFiles:
        graphFiles = getGraphFiles(year, weightF)
    if not bigraph:
        bigraphPath = 'Data/Bipartite-Graphs/%d.graph' % year
        bigraph = graph_funcs.loadGraph(bigraphPath)

    # These depend only on the bipartite graph, so compute them once and
    # reuse them for every graph file.
    donationAmounts = recip_feature_extractor.getDonationAmounts(bigraph)
    receiptsFromDonor, totalReceipts, totalDonations = donationAmounts
    categoricalFeatures = \
        recip_feature_extractor.getCategoricalGraphFeatures(bigraph)
    partialFeatures, fullFeatures = categoricalFeatures
    timing.markEvent(
        'Loaded bigraph, donor amounts, and categorical feature funcs')

    for graphFile in graphFiles:
        donorFeatures = pickler.load('Data/Features/%s.features' % graphFile)
        timing.markEvent('Loaded donor features for graph %s' % graphFile)

        recipFeatures = recip_feature_extractor.getRecipFeatures(
            bigraph,
            donorFeatures,
            receiptsFromDonor,
            totalReceipts,
            totalDonations,
            partialFeatures,
            fullFeatures,
        )
        timing.markEvent('Calculated recip features')

        outputPath = 'Data/Recip-Features/%s' % graphFile
        recip_feature_extractor.saveFeatures(bigraph, recipFeatures, outputPath)
        timing.markEvent('Saved recip features')

    timing.finish()
def runFullPipeline(year):
    """Run donor features, recipient features and regressions for one year.

    For each weighting function this loads the adjacency matrix, generates
    donor features, generates recipient features, runs the regressions, and
    pickles the results to ``Data/pruning_optimizations.<year>.<weightF>``.

    Args:
        year: Integer year; interpolated into the data paths with ``%d``.

    Returns:
        None.
    """
    timing = Timer('Running pipeline for %d' % year)

    weightFuncs = ('adamic', 'cosine', 'jaccard', 'jaccard2', 'weighted_adamic')

    # Loaded once and shared across all weighting functions.
    bigraph = graph_funcs.loadGraph('Data/Bipartite-Graphs/%d.graph' % year)
    newToOldIDs = pickler.load(
        'Data/Unipartite-NodeMappings/%d.newToOld' % year)

    for weightF in weightFuncs:
        graphFiles = getGraphFiles(year, weightF)
        matrixPath = 'Data/Unipartite-Matrix/%d.%s' % (year, weightF)
        adjMat = pickler.load(matrixPath)
        timing.markEvent('Loaded everything for donor features')

        genDonorFeatures(
            year,
            weightF,
            graphFiles=graphFiles,
            bigraph=bigraph,
            adjMat=adjMat,
            newToOldIDs=newToOldIDs,
        )
        # Drop this function's reference to the adjacency matrix before the
        # remaining stages run, so its memory can be reclaimed.
        del adjMat

        genRecipFeatures(year, weightF, graphFiles=graphFiles, bigraph=bigraph)

        results = getResults(year, weightF, graphFiles=graphFiles)
        resultsPath = 'Data/pruning_optimizations.%d.%s' % (year, weightF)
        pickler.save(results, resultsPath)
        timing.markEvent('Finished with %s' % weightF)

    timing.finish()
def getResults(year, weightF, graphFiles=None):
    """Run the regression for every graph file and collect the scores.

    Args:
        year: Integer year; forwarded to the regression routine.
        weightF: Name of the weighting function; forwarded to the regression
            routine and recorded in each result row.
        graphFiles: Graph file names under ``Data/Recip-Features/``. When
            falsy, ``getGraphFiles(year, weightF)`` is used.

    Returns:
        A list of ``[weightF, graph file, scores]`` rows, where ``scores`` is
        whatever ``cfscore_predictions.trainAndTestModels`` returned.
    """
    timing = Timer('Running regressions for %d %s' % (year, weightF))

    if not graphFiles:
        graphFiles = getGraphFiles(year, weightF)

    rows = []
    for graphFile in graphFiles:
        features, targets = pickler.load('Data/Recip-Features/%s' % graphFile)
        scores = cfscore_predictions.trainAndTestModels(
            year, weightF, X=features, Y=targets)
        rows.append([weightF, graphFile, scores])

    timing.finish()
    return rows
def getSortedMatrixVals(filename):
    """Load an adjacency matrix and return its nonzero entries, largest first.

    Args:
        filename: Path of the pickled adjacency matrix, passed to
            ``pickler.load``.

    Returns:
        A ``(vals, N)`` tuple. ``vals`` is a list of ``(row, col, value)``
        tuples for the nonzero entries, sorted by value in descending order
        (ties keep their original relative order). ``N`` is
        ``adjMat.shape[0]``.
    """
    timing = Timer('Gettin sorted matrix vals')

    adjMat = pickler.load(filename)
    timing.markEvent('Loaded adjacency matrix')

    N = adjMat.shape[0]
    xIndices, yIndices = adjMat.nonzero()
    timing.markEvent('Loaded nonzero indices')

    data = adjMat[xIndices, yIndices]
    timing.markEvent('Loaded nonzero vals')

    # Flatten so each value lines up with its (row, col) pair in the zip.
    flat = np.ravel(data)
    timing.markEvent('Flattened data')

    vals = zip(xIndices, yIndices, flat)
    timing.markEvent('Zipped values')

    # sorted() accepts any iterable and returns a list, so this does not
    # depend on zip() returning a list.
    vals = sorted(vals, key=lambda v: v[2], reverse=True)
    timing.markEvent('Sorted values')

    # The leftover debug prints and unconditional raise that used to sit
    # here made the return below unreachable; they have been removed.
    timing.finish()
    return vals, N
def getNonzeroElems(year, weightF):
    """Return the nonzero entries of a year's pickled adjacency matrix.

    Args:
        year: Integer year; interpolated into the matrix path with ``%d``.
        weightF: Name of the weighting function; used as the extension of
            the matrix file.

    Returns:
        The result of indexing the matrix with its own ``nonzero()`` indices.
    """
    timing = Timer('Loading nonzero elems for year %d and weightf %s ' % (year, weightF))
    matrixPath = 'Data/Unipartite-Matrix/%d.%s' % (year, weightF)
    adjMat = pickler.load(matrixPath)
    # The timer is stopped after loading, before the entries are extracted.
    timing.finish()

    nonzeroIndices = adjMat.nonzero()
    return adjMat[nonzeroIndices]
# Weighting functions whose donor features are turned into recipient
# features. Previously used selections:
#   ('jaccard', 'jaccard2', 'affinity', 'cosine', 'adamic', 'weighted_adamic')
#   ('adamic', 'weighted_adamic')
weightings = ('jaccard2', )

# NOTE(review): no `if __name__ == '__main__':` guard is visible around this
# loop, so it would also run when the module is imported -- confirm whether
# a guard exists outside this excerpt.
for year in sys.argv[1:]:
    year = int(year)
    timing = Timer('Generating features for %d' % year)

    graph = graph_funcs.loadGraph('Data/Bipartite-Graphs/%d.graph' % year)

    # Inputs shared by the baseline features and every weighting's features.
    receiptsFromDonor, totalReceipts, totalDonations = \
        getDonationAmounts(graph)
    partialFeatures, fullFeatures = getCategoricalGraphFeatures(graph)
    sharedArgs = (receiptsFromDonor, totalReceipts, totalDonations,
                  partialFeatures, fullFeatures)

    baselineFeatures = getBaselineFeatures(graph, *sharedArgs)
    baselinePath = 'Data/Recip-Features/%d.baseline' % year
    saveFeatures(graph, baselineFeatures, baselinePath)
    timing.markEvent('Generated baseline features')

    for weighting in weightings:
        donorPath = 'Data/Features/%d%s.features' % (year, weighting)
        donorFeatures = pickler.load(donorPath)

        recipFeatures = getRecipFeatures(graph, donorFeatures, *sharedArgs)

        recipPath = 'Data/Recip-Features/%d.%s' % (year, weighting)
        saveFeatures(graph, recipFeatures, recipPath)
        timing.markEvent(
            'Calculated main recipient features for %s' % weighting)

    timing.finish()
################################################################################
# Module command-line behavior                                                 #
################################################################################
# Each command-line argument is a year. For every year, donor features are
# generated for each selected weighting function and pickled to
# Data/Features/<year><weightF>.features.
if __name__ == '__main__':
    # Full set previously used:
    #   ['jaccard', 'affinity', 'jaccard2', 'cosine', 'adamic', 'weighted_adamic']
    weightFuncs = ['jaccard2']

    for arg in sys.argv[1:]:
        year = int(arg)
        timing = Timer('creating unipartite graph for %d' % year)

        bipartitePath = 'Data/Bipartite-Graphs/%d.graph' % year
        bipartiteGraph = graph_funcs.loadGraph(bipartitePath)
        unipartitePath = 'Data/Unipartite-Graphs/%d.graph' % year
        unipartiteGraph = graph_funcs.loadGraph(unipartitePath, snap.TUNGraph)
        newToOldIDs = pickler.load(
            'Data/Unipartite-NodeMappings/%d.newToOld' % year)
        timing.markEvent('Loaded input graphs/matrices.')

        for weightF in weightFuncs:
            print('******* %s *******' % weightF)

            matrixPath = 'Data/Unipartite-Matrix/%d.%s' % (year, weightF)
            adjMatrix = pickler.load(matrixPath).tocsc()

            features = generateFeatures(
                year, bipartiteGraph, unipartiteGraph, newToOldIDs, adjMatrix)

            featuresPath = 'Data/Features/%d%s.features' % (year, weightF)
            pickler.save(features, featuresPath)
            timing.markEvent('Processed %s weight function' % weightF)