Ejemplo n.º 1
0
def genDonorFeatures(year, weightF, graphFiles=None, bigraph=None, adjMat=None, newToOldIDs=None):
    """Generate and pickle donor features for each unipartite graph file.

    Inputs that are not supplied are loaded from the ``Data/`` directory
    using ``year`` and ``weightF`` to build the file names.

    Args:
        year: Integer year; formatted with ``%d`` into the data paths.
        weightF: Name of the weighting function; formatted with ``%s``
            into the adjacency-matrix path and passed to getGraphFiles.
        graphFiles: Iterable of file names under Data/Unipartite-Graphs/.
            When falsy, the result of getGraphFiles(year, weightF) is used.
        bigraph: Bipartite graph. When falsy, it is loaded from
            Data/Bipartite-Graphs/<year>.graph.
        adjMat: Adjacency matrix. When None, it is unpickled and converted
            with ``tocsc()``; a matrix supplied by the caller is used as is.
        newToOldIDs: Node ID mapping. When None, it is unpickled from
            Data/Unipartite-NodeMappings/<year>.newToOld.

    Side effects:
        Writes one Data/Features/<graph file>.features pickle per graph
        file and reports progress through a Timer.
    """
    timing = Timer('Generating donor features for %d %s' % (year, weightF))

    # Fall back to the on-disk inputs for anything the caller left out.
    graphFiles = graphFiles or getGraphFiles(year, weightF)
    bigraph = bigraph or graph_funcs.loadGraph('Data/Bipartite-Graphs/%d.graph' % year)
    # Identity checks here: only a missing (None) value triggers a load.
    if adjMat is None:
        matrixPath = 'Data/Unipartite-Matrix/%d.%s' % (year, weightF)
        adjMat = pickler.load(matrixPath).tocsc()
    if newToOldIDs is None:
        mappingPath = 'Data/Unipartite-NodeMappings/%d.newToOld' % year
        newToOldIDs = pickler.load(mappingPath)
    timing.markEvent('Loaded bigraph, adj matrix, and newToOld mapping')

    for graphName in graphFiles:
        graphPath = 'Data/Unipartite-Graphs/%s' % graphName
        unigraph = graph_funcs.loadGraph(graphPath, snap.TUNGraph)
        timing.markEvent('Loaded graph %s' % graphName)

        donorFeatures = feature_extractor.generateFeatures(
            year, bigraph, unigraph, newToOldIDs, adjMat)
        timing.markEvent('Generated features')

        outputPath = 'Data/Features/%s.features' % graphName
        pickler.save(donorFeatures, outputPath)
        timing.markEvent('Saved features')

    timing.finish()
Ejemplo n.º 2
0
def trainAndTestModels(year,
                       extension,
                       X=None,
                       Y=None,
                       k=10,
                       clf=linear_model.LinearRegression(),
                       transF=None,
                       decomp_func=None):
    """Fit and score a model on each of ``k`` cross-validation folds.

    Args:
        year: Integer year; used in the Timer label and the fallback path.
        extension: String suffix; used in the Timer label and the fallback
            path Data/Recip-Features/<year>.<extension>.
        X: Feature array indexable by the fold index sets. Loaded from the
            fallback path together with Y when either X or Y is None.
        Y: Target array indexable by the fold index sets.
        k: Second argument handed to KFold.
        clf: Estimator exposing ``fit`` and ``score``. NOTE: the default
            instance is created once, when the function is defined, and is
            therefore shared (and re-fitted) across calls.
        transF: Optional callable applied to Y before splitting.
        decomp_func: Optional object exposing ``fit`` and ``transform``; it
            is fitted on each fold's training features and then applied to
            both the training and the test features.

    Returns:
        List with one ``clf.score`` value per fold (presumably R^2 values,
        going by the variable name -- confirm against the estimator used).
    """
    timing = Timer('Running regression for %d.%s' % (year, extension))

    if X is None or Y is None:
        X, Y = pickler.load('Data/Recip-Features/%d.%s' % (year, extension))
    if transF:
        Y = transF(Y)
    timing.markEvent('Loaded X and Y')

    rsquareds = []
    # Each iteration yields a (train, test) pair used to index X and Y.
    for trainIdx, testIdx in KFold(len(Y), k):
        trainFeatures = X[trainIdx]
        testFeatures = X[testIdx]
        trainTargets = Y[trainIdx]
        testTargets = Y[testIdx]

        if decomp_func:
            # Fit on the training split only, then apply to both splits.
            decomp_func.fit(trainFeatures)
            trainFeatures = decomp_func.transform(trainFeatures)
            testFeatures = decomp_func.transform(testFeatures)

        clf.fit(trainFeatures, trainTargets)
        foldScore = clf.score(testFeatures, testTargets)
        rsquareds.append(foldScore)
    timing.markEvent('Ran regression')

    timing.finish()
    return rsquareds
Ejemplo n.º 3
0
def genRecipFeatures(year, weightF, graphFiles=None, bigraph=None):
    """Derive and save recipient features from previously saved donor features.

    Args:
        year: Integer year; formatted with ``%d`` into the bipartite graph
            path and passed to getGraphFiles.
        weightF: Name of the weighting function; used in the Timer label
            and passed to getGraphFiles.
        graphFiles: Iterable of graph file names. When falsy, the result of
            getGraphFiles(year, weightF) is used.
        bigraph: Bipartite graph. When falsy, it is loaded from
            Data/Bipartite-Graphs/<year>.graph.

    Side effects:
        For each graph file, reads Data/Features/<graph file>.features and
        hands the computed recipient features to
        recip_feature_extractor.saveFeatures with the target
        Data/Recip-Features/<graph file>.
    """
    timing = Timer('Generating recip features for %d %s' % (year, weightF))

    if not graphFiles:
        graphFiles = getGraphFiles(year, weightF)
    if not bigraph:
        bigraph = graph_funcs.loadGraph('Data/Bipartite-Graphs/%d.graph' % year)

    # These depend only on the bipartite graph, so compute them once up front.
    (receiptsFromDonor,
     totalReceipts,
     totalDonations) = recip_feature_extractor.getDonationAmounts(bigraph)
    (partialFeatures,
     fullFeatures) = recip_feature_extractor.getCategoricalGraphFeatures(bigraph)

    timing.markEvent('Loaded bigraph, donor amounts, and categorical feature funcs')

    for graphName in graphFiles:
        donorFeatures = pickler.load('Data/Features/%s.features' % graphName)
        timing.markEvent('Loaded donor features for graph %s' % graphName)

        recipFeatures = recip_feature_extractor.getRecipFeatures(
            bigraph,
            donorFeatures,
            receiptsFromDonor,
            totalReceipts,
            totalDonations,
            partialFeatures,
            fullFeatures)
        timing.markEvent('Calculated recip features')

        outputPath = 'Data/Recip-Features/%s' % graphName
        recip_feature_extractor.saveFeatures(bigraph, recipFeatures, outputPath)
        timing.markEvent('Saved recip features')

    timing.finish()
Ejemplo n.º 4
0
def runFullPipeline(year):
    """Run donor features, recipient features and regressions for one year.

    The bipartite graph and the node ID mapping are loaded once. Then, for
    each weighting function, the adjacency matrix is loaded, donor and
    recipient features are generated, and the regression results are
    pickled to Data/pruning_optimizations.<year>.<weighting>.

    Args:
        year: Integer year; formatted with ``%d`` into every data path.
    """
    timing = Timer('Running pipeline for %d' % year)

    weightings = ('adamic', 'cosine', 'jaccard', 'jaccard2', 'weighted_adamic')
    bigraph = graph_funcs.loadGraph('Data/Bipartite-Graphs/%d.graph' % year)
    newToOldIDs = pickler.load('Data/Unipartite-NodeMappings/%d.newToOld' % year)

    for weightF in weightings:
        graphFiles = getGraphFiles(year, weightF)

        adjMat = pickler.load('Data/Unipartite-Matrix/%d.%s' % (year, weightF))
        # genDonorFeatures converts to CSC only when it loads the matrix
        # itself, so convert here to hand feature generation the same format
        # it receives on that path.
        adjMat = adjMat.tocsc()
        timing.markEvent('Loaded everything for donor features')
        genDonorFeatures(year, weightF, graphFiles=graphFiles, bigraph=bigraph,
                         adjMat=adjMat, newToOldIDs=newToOldIDs)
        # Drop this frame's reference so the (very large) adjacency matrix
        # can be reclaimed before the remaining steps run.
        del adjMat

        genRecipFeatures(year, weightF, graphFiles=graphFiles, bigraph=bigraph)
        results = getResults(year, weightF, graphFiles=graphFiles)
        pickler.save(results, 'Data/pruning_optimizations.%d.%s' % (year, weightF))
        timing.markEvent('Finished with %s' % weightF)

    timing.finish()
Ejemplo n.º 5
0
def getResults(year, weightF, graphFiles=None):
    """Run the cross-validated regressions for every graph file.

    Args:
        year: Integer year; used in the Timer label and forwarded to
            getGraphFiles and trainAndTestModels.
        weightF: Name of the weighting function; forwarded likewise and
            recorded in each result row.
        graphFiles: Iterable of graph file names. When falsy, the result of
            getGraphFiles(year, weightF) is used.

    Returns:
        List of ``[weightF, graph file, scores]`` rows, one per graph file,
        where ``scores`` is whatever
        cfscore_predictions.trainAndTestModels returned for the (X, Y) pair
        unpickled from Data/Recip-Features/<graph file>.
    """
    timing = Timer('Running regressions for %d %s' % (year, weightF))

    if not graphFiles:
        graphFiles = getGraphFiles(year, weightF)

    results = []
    for graphName in graphFiles:
        features, targets = pickler.load('Data/Recip-Features/%s' % graphName)
        scores = cfscore_predictions.trainAndTestModels(
            year, weightF, X=features, Y=targets)
        results.append([weightF, graphName, scores])

    timing.finish()

    return results
Ejemplo n.º 6
0
def getSortedMatrixVals(filename):
    """Return the nonzero entries of a pickled adjacency matrix, largest first.

    Args:
        filename: Path handed to ``pickler.load``. The loaded object must
            provide ``shape``, ``nonzero()`` and indexing by a pair of index
            arrays (presumably a scipy sparse matrix -- confirm).

    Returns:
        A ``(vals, N)`` tuple. ``vals`` is a list of
        ``(row index, column index, value)`` tuples sorted by value in
        descending order; ``N`` is ``adjMat.shape[0]``.
    """
    timing = Timer('Gettin sorted matrix vals')
    adjMat = pickler.load(filename)
    timing.markEvent('Loaded adjacency matrix')
    N = adjMat.shape[0]
    xIndices, yIndices = adjMat.nonzero()
    timing.markEvent('Loaded nonzero indices')
    data = adjMat[xIndices, yIndices]
    timing.markEvent('Loaded nonzero vals')
    # Indexing may hand back a 2-D (1 x nnz) result; flatten it to 1-D so it
    # lines up element-for-element with the index arrays.
    flat = np.ravel(data)
    timing.markEvent('Flattened data')

    # Materialise explicitly: zip() is not guaranteed to return a list, and
    # the in-place sort below needs one.
    vals = list(zip(xIndices, yIndices, flat))
    timing.markEvent('Zipped values')
    vals.sort(key=lambda v: v[2], reverse=True)
    timing.markEvent('Sorted values')

    # The leftover debugging output and unconditional raise that used to sit
    # here made the return unreachable (and failed on matrices with fewer
    # than two nonzero entries); they have been removed.
    timing.finish()
    return vals, N
Ejemplo n.º 7
0
def getNonzeroElems(year, weightF):
    """Return the nonzero entries of the adjacency matrix for a year/weighting.

    Args:
        year: Integer year; formatted with ``%d`` into the matrix path.
        weightF: Name of the weighting function; formatted with ``%s`` into
            the matrix path.

    Returns:
        The result of indexing the matrix unpickled from
        Data/Unipartite-Matrix/<year>.<weightF> with its own ``nonzero()``
        index arrays.
    """
    timing = Timer('Loading nonzero elems for year %d and weightf %s ' % (year, weightF))
    matrixPath = 'Data/Unipartite-Matrix/%d.%s' % (year, weightF)
    matrix = pickler.load(matrixPath)
    # The timer covers only the load, not the indexing below.
    timing.finish()
    nonzeroRows, nonzeroCols = matrix.nonzero()
    return matrix[nonzeroRows, nonzeroCols]
Ejemplo n.º 8
0
    # NOTE(review): the statement that encloses this indented fragment is not
    # visible here (presumably a script entry point -- confirm).
    # Alternative weighting selections kept from earlier runs:
    #weightings = ('jaccard', 'jaccard2', 'affinity', 'cosine', 'adamic', 'weighted_adamic')
    #weightings = ('adamic', 'weighted_adamic')
    weightings = ('jaccard2', )
    # Every command-line argument is parsed as an integer year.
    for year in sys.argv[1:]:
        year = int(year)
        timing = Timer('Generating features for %d' % year)
        graph = graph_funcs.loadGraph('Data/Bipartite-Graphs/%d.graph' % year)
        # Both of these take only the bipartite graph, so they are computed
        # once per year and reused for the baseline and every weighting.
        receiptsFromDonor, totalReceipts, totalDonations = getDonationAmounts(
            graph)
        partialFeatures, fullFeatures = getCategoricalGraphFeatures(graph)

        # Baseline features use no donor features; they are saved to
        # Data/Recip-Features/<year>.baseline.
        baselineFeatures = \
            getBaselineFeatures(graph, receiptsFromDonor, totalReceipts, totalDonations, partialFeatures, fullFeatures)
        saveFeatures(graph, baselineFeatures,
                     'Data/Recip-Features/%d.baseline' % year)
        timing.markEvent('Generated baseline features')

        for weighting in weightings:
            # The donor-feature file name joins year and weighting with no
            # separator: Data/Features/<year><weighting>.features.
            donorFeatures = pickler.load('Data/Features/%d%s.features' \
                    % (year, weighting))
            recipFeatures = getRecipFeatures(graph, donorFeatures,
                                             receiptsFromDonor, totalReceipts,
                                             totalDonations, partialFeatures,
                                             fullFeatures)
            # The output name does use a dot:
            # Data/Recip-Features/<year>.<weighting>.
            saveFeatures(graph, recipFeatures, 'Data/Recip-Features/%d.%s' \
                    % (year, weighting))
            timing.markEvent('Calculated main recipient features for %s' \
                    % weighting)

        timing.finish()
Ejemplo n.º 9
0

################################################################################
# Module command-line behavior #
################################################################################

if __name__ == '__main__':
    # Weight functions to process. Other names used with this script:
    # 'jaccard', 'affinity', 'cosine', 'adamic', 'weighted_adamic'.
    weightFunctions = ['jaccard2']

    # Every command-line argument is parsed as an integer year.
    for arg in sys.argv[1:]:
        year = int(arg)
        timing = Timer('creating unipartite graph for %d' % year)

        bipartitePath = 'Data/Bipartite-Graphs/%d.graph' % year
        bipartiteGraph = graph_funcs.loadGraph(bipartitePath)
        unipartitePath = 'Data/Unipartite-Graphs/%d.graph' % year
        unipartiteGraph = graph_funcs.loadGraph(unipartitePath, snap.TUNGraph)
        mappingPath = 'Data/Unipartite-NodeMappings/%d.newToOld' % year
        newToOldIDs = pickler.load(mappingPath)
        timing.markEvent('Loaded input graphs/matrices.')

        for weightF in weightFunctions:
            print('******* %s *******' % weightF)
            matrixPath = 'Data/Unipartite-Matrix/%d.%s' % (year, weightF)
            # Convert to CSC before handing the matrix to feature generation.
            adjMatrix = pickler.load(matrixPath).tocsc()

            features = generateFeatures(
                year, bipartiteGraph, unipartiteGraph, newToOldIDs, adjMatrix)
            # Output name joins year and weight function with no separator.
            featuresPath = 'Data/Features/%d%s.features' % (year, weightF)
            pickler.save(features, featuresPath)

            timing.markEvent('Processed %s weight function' % weightF)