Example no. 1
0
def loadRecipients(dbNames, filepath):
    timing = Timer('loading Recipients table')
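    # Column indices to pull from each CSV row; transforms[i] parses the
    # value found at column extractors[i].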
    extractors = [
        0, 7, 8, 10, 12, 13, 14, 15, 16, 22, 23, 39, 46, 47, 61, 62, 63, 64, 65
    ]
    transforms = [
        int, str, safeInt, party, str, str, incumb, float, float, int, gender,
        safeInt, winner, safeFloat, safeFloat, safeFloat, candStatus, int,
        candOrComm
    ]
    observedKeys = set()

    for db in dbNames:
        initRecipientTable(db)

    with open(filepath, 'r') as f:
        reader = csv.reader(f)
        next(reader)  # skip column headers
        for i, block in enumerate(generateChunk(reader, extractors,
                                                transforms)):
            newBlock = filterRecipients(block, observedKeys)
            for db in dbNames:
                commitRecipBlock(db, newBlock)

    timing.finish()
Example no. 2
0
def __init__(self, path: str, supported_extensions: [str]):
    self.timer = Timer()
    self.path = path
    self.relative_path = PathExtractor().get_relative_path(path)
    self.supported_extensions = supported_extensions
    self.file_name = PathExtractor().get_file_name(path)
    self.extension = PathExtractor().get_file_extension(self.file_name)
Example no. 3
0
def getDonationAmounts(graph):
    timing = Timer('Getting candidate, donor, and cand-donor donation amounts')
    # A dictionary from rnodeids to dictionaries from cnodeids to ints
    # indicating the total donations from that donor to that candidate
    receiptsFromDonor = defaultdict(lambda: defaultdict(int))

    # A dictionary from rnodeids to ints indicating the total amount donated to
    # that candidate.
    totalReceipts = defaultdict(int)

    # A dictionary from cnodeids to ints indicating the total amount donated by
    # that donor.
    totalDonations = defaultdict(int)

    # For each donation, note it in the relevant dictionaries
    for edge in graph.Edges():
        donor = edge.GetSrcNId()
        recip = edge.GetDstNId()
        amount = graph.GetIntAttrDatE(edge.GetId(), 'amount')

        receiptsFromDonor[recip][donor] += amount
        totalReceipts[recip] += amount
        totalDonations[donor] += amount

    timing.finish()
    return receiptsFromDonor, totalReceipts, totalDonations
Example no. 4
0
def loadContributors(dbNames, filepath):
    timing = Timer('loading Contributors table')
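    # Columns 0-3 of the contributors CSV, parsed by the matching transform.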
    extractors = [0, 1, 2, 3]
    transforms = [int, indiv, str, safeFloat]

    for db in dbNames:
        initContributorsTable(db)
    with open(filepath, 'r') as f:
        reader = csv.reader(f)
        next(reader)  # skip column headers
        for i, block in enumerate(generateChunk(reader, extractors,
                                                transforms)):
            for db in dbNames:
                commitContribBlock(db, block)

    timing.finish()
Example no. 5
0
def trainAndTestModels(year,
                       extension,
                       X=None,
                       Y=None,
                       k=10,
                       clf=linear_model.LinearRegression(),
                       transF=None,
                       decomp_func=None):
    timing = Timer('Running regression for %d.%s' % (year, extension))
    if X is None or Y is None:
        X, Y = pickler.load('Data/Recip-Features/%d.%s' % (year, extension))
    if transF: Y = transF(Y)
    timing.markEvent('Loaded X and Y')
    rsquareds = []

    # Train and test the regression model on each k-fold set
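    # KFold(n, n_folds) here is the old sklearn cross_validation signature;
    # each iteration yields train/test index arrays.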
    kf = KFold(len(Y), k)
    for train, test in kf:
        X_train, X_test = X[train], X[test]
        Y_train, Y_test = Y[train], Y[test]

        if decomp_func:
            decomp_func.fit(X_train)
            X_train = decomp_func.transform(X_train)
            X_test = decomp_func.transform(X_test)

        clf.fit(X_train, Y_train)
        rsquareds.append(clf.score(X_test, Y_test))
    timing.markEvent('Ran regression')

    timing.finish()
    return rsquareds
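# A minimal usage sketch (not from the original source): hypothetical year and
# extension values, assuming the pickled feature files exist and the sklearn
# imports used above.
from sklearn import linear_model, decomposition

rsquareds = trainAndTestModels(2010, 'jaccard',
                               clf=linear_model.Ridge(),
                               decomp_func=decomposition.PCA(n_components=10))
print('mean R^2: %f' % (sum(rsquareds) / len(rsquareds)))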
Example no. 6
0
def generateFeatures(year, bipartite, unipartite, newToOldIDs, adjMatrix):
    timing = Timer('generating features for %d' % year)

    bipartiteFeatures = extractBipartiteFeatures(bipartite)
    timing.markEvent('Extracted bipartite features.')

    rawUnifeatures, componentFeatureFunc, CNMFeatureFunc = extractUnipartiteFeatures(
        unipartite, adjMatrix)
    unipartiteFeatures = convertNewToOldIDs(rawUnifeatures, newToOldIDs)
    timing.markEvent('Extracted unipartite features.')

    # append unipartite features to bipartite features for each node, returning combined feature dictionary.
    # If the donor is not in the unipartite feature graph then we just take the default values (since the
    # node falls below the unipartite threshold from sqlToGraphs):
    features = {}
    for donorNode in graph_funcs.getDonors(bipartite):
        oldNID = donorNode.GetId()
        if oldNID in unipartiteFeatures:
            features[oldNID] = bipartiteFeatures[oldNID] + unipartiteFeatures[oldNID]
        else:
            features[oldNID] = bipartiteFeatures[oldNID] + \
                defaultUnipartiteFeatures(componentFeatureFunc, CNMFeatureFunc)
    timing.finish()

    return features
Example no. 7
0
def main():
    script_name: str = PathExtractor().get_file_name(sys.argv[0])

    if len(sys.argv) != 2:
        Logger().usage(f"python {script_name} <wiki.en.filtered.txt>")
        return

    file_path = sys.argv[1]

    if PathValidator().is_valid_files([file_path]):
        Logger().info(f'Input file: "{file_path}"')
        Logger().info("Starting to lemmatize text")
        timer = Timer()
        lemmatize_text(file_path, timer)
        Logger().finish_script(timer.get_duration(), script_name)
Example no. 8
0
def getResults(year, weightF, graphFiles=None):
    timing = Timer('Running regressions for %d %s' % (year, weightF))

    results = []

    if not graphFiles: graphFiles = getGraphFiles(year, weightF)

    for gf in graphFiles:
        X, Y = pickler.load('Data/Recip-Features/%s' % gf)
        rsquareds = cfscore_predictions.trainAndTestModels(year, weightF, X=X, Y=Y)
        results.append([weightF, gf, rsquareds])

    timing.finish()

    return results
Example no. 9
0
def main():
    script_name: str = PathExtractor().get_file_name(sys.argv[0])

    if len(sys.argv) != 2:
        Logger().usage(f'python {script_name} <wiki.en.raw.txt>')
        return

    file_path = sys.argv[1]

    if PathValidator().is_valid_files([file_path]):
        Logger().info(f'Input file: "{file_path}"')
        Logger().info("Starting to remove stopwords")
        timer = Timer()
        remove_stopwords(file_path)
        Logger().finish_script(timer.get_duration(), script_name)
Example no. 10
0
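# Loads pre-trained vectors in the binary word2vec C format (assumption:
# KeyedVectors here is gensim's KeyedVectors).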
def set_binary_model(self, model_file_path: str):
    timer: Timer = Timer()
    Logger().start_analyzing("Loading binary Word2VecModel")
    self.model = KeyedVectors.load_word2vec_format(model_file_path,
                                                   binary=True)
    Logger().finish_analyzing(timer.get_duration(),
                              "Loading binary Word2VecModel")
Example no. 11
0
def main():
    script_name: str = PathExtractor().get_file_name(sys.argv[0])

    if len(sys.argv) != 2:
        Logger().usage(
            f'python {script_name} <en.wiki-latest-pages-articles.xml.bz2>')
        return

    file_path: str = sys.argv[1]

    if PathValidator().is_valid_files([file_path]):
        Logger().info(f'Input file: "{file_path}"')
        Logger().info(f'Starting to create wiki corpus from "{file_path}"')
        timer = Timer()
        get_corpus(file_path)
        Logger().finish_script(timer.get_duration(), script_name)
Example no. 12
0
def loadTransactionFile(dbName, csvName, year):
    timing = Timer('loading Transactions_%d into table' % year)
    extractors = [0, 1, 2, 3, 4, 5, 13, 27, 28, 29, 33, 34, 36, 37]
    transforms = [
        int, str, str, strToFltToInt, str, strToFltToInt, indiv, str, party,
        candOrComm, str, str, safeFloat, safeFloat
    ]
    initTransactionsTable(dbName)

    with open(csvName, 'r') as f:
        reader = csv.reader(f)
        next(reader)  # skip column headers
        for i, block in enumerate(generateChunk(reader, extractors,
                                                transforms)):
            newBlock = filterTransactions(block)
            commitTransBlock(dbName, newBlock)

    timing.finish()
Example no. 13
0
class TurnState(TaskState):
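    # Turn in place until the timer expires, then hand off to WalkForwardState.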
    def init(self):
        self.timer = Timer(3000000).start()

    def transition(self):
        if self.timer.finished():
            return WalkForwardState(self.parent)

        return self

    def tick(self):
        self.world.b_request.actions.body = actioncommand.walk(turn=1)
Example no. 14
0
def main():
    script_name: str = PathExtractor().get_file_name(sys.argv[0])
    timer = Timer()

    if len(sys.argv) == 3:
        model_path = PathExtractor().get_absolute_path(sys.argv[2])
        if not PathValidator().is_valid_files([model_path]):
            return
        Word2VecModel.instance.set_model(model_path)

    if len(sys.argv) < 2 or len(sys.argv) > 3:
        Logger().usage(
            f"python {script_name} <file_or_directory_path> [<word2vec.model>]"
        )
        return

    project_path = PathExtractor().get_absolute_path(sys.argv[1])

    if PathValidator().is_valid_paths([project_path]):
        parse(project_path)
        Logger().finish_script(timer.get_duration(), script_name)
Example no. 15
0
def lemmatize_text(file_path: str, timer: Timer):
    logger = Logger()
    output_file = FileOpener().get_new_file("wiki.en.lemmatized.txt", "a")

    with open(file_path, "r") as file:
        for line in file:
            lemmatized_list = [
                word.lemma_
                for word in SpacyModel.instance.get_en_spacy_line(line)
            ]
            lemmatized_line = " ".join(lemmatized_list)
            output_file.write(lemmatized_line)
            logger.every_n_wiki_status(10, timer.get_duration())
    logger.every_n_wiki_status(1)
Example no. 16
0
class TurnState(TaskState):
    def init(self):
        self.timer = Timer(3000000).start()

    def transition(self):
        if self.timer.finished():
            return WalkForwardState(self.parent)

        return self

    def tick(self):
        self.world.b_request.actions.body = actioncommand.walk(turn=1)
Example no. 17
0
def processYearAndWeight(year, weighting, percents=None, thresholds=None):
    timing = Timer('Running for year %d and weight %s' % (year, weighting))
    adjMatFile = 'Data/Unipartite-Matrix/%d.%s' % (year, weighting)
    sortedVals, N = getSortedMatrixVals(adjMatFile)
    timing.markEvent('Got sorted vals')

    if percents:
        for p in percents:
            outfile = 'Data/Unipartite-Graphs/%d.%s_percent_%f.graph' \
                    % (year, weighting, p)
            graph = pruneGraphByPercent(sortedVals, N, p)
            graph_funcs.saveGraph(graph, outfile)
            timing.markEvent('Finished for %f percent' % p)

    if thresholds:
        for t in thresholds:
            outfile = 'Data/Unipartite-Graphs/%d.%s_threshold_%f.graph' \
                    % (year, weighting, t)
            graph = pruneGraphByThreshold(sortedVals, N, t)
            graph_funcs.saveGraph(graph, outfile)
            timing.markEvent('Finished for threshold %f' % t)

    timing.finish()
Example no. 18
0
def getRecipFeatures(graph,
                     donorFeatures,
                     receiptsFromDonor,
                     totalReceipts,
                     totalDonations,
                     partialFeatures,
                     fullFeatures,
                     includeDonorFeatures=False):
    timing = Timer('Getting recipient features')
    recipFeatures = {}

    for recipNode in graph_funcs.getRecipients(graph, cfs=True):
        rnodeid = recipNode.GetId()

        # Add a donor feature indicating what percent of this donor's donations
        # went to this candidate.
        for donor in receiptsFromDonor[rnodeid]:
            pct = receiptsFromDonor[rnodeid][donor] / float(
                totalDonations[donor])
            donorFeatures[donor].append(pct)

        if includeDonorFeatures:
            recipFeatures[rnodeid] = np.append(
                getPartialNodeRecipSpecificFeatures(graph, rnodeid),
                processDonorFeaturesForRecip(donorFeatures,
                                             receiptsFromDonor[rnodeid]))
        else:
            recipFeatures[rnodeid] = \
                processDonorFeaturesForRecip(donorFeatures, receiptsFromDonor[rnodeid])

        # Remove the temporarily added feature for what percent of this donor's
        # donations went to this candidate.
        for donor in receiptsFromDonor[rnodeid]:
            donorFeatures[donor].pop()

    timing.finish()
    return recipFeatures
Example no. 19
0
def runFullPipeline(year):
    timing = Timer('Running pipeline for %d' % year)

    weightings = ('adamic', 'cosine', 'jaccard', 'jaccard2', 'weighted_adamic')
    bigraph = graph_funcs.loadGraph('Data/Bipartite-Graphs/%d.graph' % year)
    newToOldIDs = pickler.load('Data/Unipartite-NodeMappings/%d.newToOld' % year)

    for weightF in weightings:

        graphFiles = getGraphFiles(year, weightF)

        adjMat = pickler.load('Data/Unipartite-Matrix/%d.%s' % (year, weightF))
        timing.markEvent('Loaded everything for donor features')
        genDonorFeatures(year, weightF, graphFiles=graphFiles, bigraph=bigraph,\
                adjMat=adjMat, newToOldIDs=newToOldIDs)
        del adjMat  # free the incredible amount of memory for the adjacency matrix

        genRecipFeatures(year, weightF, graphFiles=graphFiles, bigraph=bigraph)
        results = getResults(year, weightF, graphFiles=graphFiles)
        pickler.save(results, 'Data/pruning_optimizations.%d.%s' % (year, weightF))
        timing.markEvent('Finished with %s' % weightF)

    timing.finish()
Example no. 20
0
def calcAverageWeights(graph, adjMat):
    neighbors = defaultdict(list)
    timing = Timer('Calculating average weights')
    # Get all the nodes that a node borders in the graph
    for edge in graph.Edges():
        nodeid1 = edge.GetSrcNId()
        nodeid2 = edge.GetDstNId()
        neighbors[nodeid1].append(nodeid2)
        neighbors[nodeid2].append(nodeid1)
    timing.markEvent('Got all neighbors')

    # Get the average weight per node connected to
    weights = {}
    i = 0
    for nodeid in neighbors:
        rows = neighbors[nodeid]
        weights[nodeid] = adjMat[rows, nodeid].sum() / float(len(rows))
        i += 1
        if i % 1000 == 0:
            timing.markEvent('Done with %d out of %d' % (i, len(neighbors)))

    timing.finish()
    return weights
Example no. 21
0
def init(self):
    self.timer = Timer(3000000).start()
Example no. 22
0
def getSortedMatrixVals(filename):
    timing = Timer('Getting sorted matrix vals')
    adjMat = pickler.load(filename)
    timing.markEvent('Loaded adjacency matrix')
    N = adjMat.shape[0]
    xIndices, yIndices = adjMat.nonzero()
    timing.markEvent('Loaded nonzero indices')
    data = adjMat[xIndices, yIndices]
    timing.markEvent('Loaded nonzero vals')
    flat = np.ravel(data)
    timing.markEvent('Flattened data')

    vals = list(zip(xIndices, yIndices, flat))
    timing.markEvent('Zipped values')
    vals.sort(key=lambda v: v[2], reverse=True)
    timing.markEvent('Sorted values')
    timing.finish()
    return vals, N
Example no. 23
0
class FileModel:
    relative_path: str = None
    identifier_list_model: IdentifierListModel = None
    identifier_dictionary_model: IdentifierDictionaryModel = None
    word_dictionary_model: WordDictionaryModel = None

    path: str = None
    supported_extensions: [str] = None
    file_name: str = None
    extension: str = None
    timer: Timer = None
    content: str = None

    def __init__(self, path: str, supported_extensions: [str]):
        self.timer = Timer()
        self.path = path
        self.relative_path = PathExtractor().get_relative_path(path)
        self.supported_extensions = supported_extensions
        self.file_name = PathExtractor().get_file_name(path)
        self.extension = PathExtractor().get_file_extension(self.file_name)

    def to_print(self):
        return {
            "relative_path": self.relative_path,
            "identifier_list": self.identifier_list_model.to_print(),
            "identifier_dictionary":
            self.identifier_dictionary_model.to_print(),
            "word_dictionary": self.word_dictionary_model.to_print()
        }

    def to_csv(self):
        content = [
            identifier.to_csv(self.relative_path, name) for (name, identifier)
            in self.identifier_dictionary_model.get_dictionary().items()
        ]
        return "".join(content)

    def is_valid(self):
        if self.extension not in self.supported_extensions:
            return False
        self.content = FileOpener().get_file_content(self.path)
        return bool(self.content)

    def parse(self):
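        # Parse the file content into identifiers, build identifier- and
        # word-level dictionaries, then attach word metrics to the identifiers.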
        Logger().start_analyzing(self.relative_path)
        self.identifier_list_model = LanguageParser().parse_file(
            self.extension, self.content)
        self.identifier_dictionary_model = IdentifierDictionaryModel(
            self.identifier_list_model)
        self.word_dictionary_model = WordDictionaryModel(
            self.identifier_dictionary_model)
        if Word2VecModel.instance.exists():
            self.calculate_semantic_metrics()
        self.identifier_dictionary_model.set_word_metrics(
            self.word_dictionary_model.get_dictionary())
        Logger().finish_analyzing(self.timer.get_duration(),
                                  self.relative_path)

    def calculate_semantic_metrics(self):
        self.set_word2vec_class_name()
        self.set_word2vec_file_context_name()
        self.word_dictionary_model.calculate_semantic_metrics()

    def set_word2vec_class_name(self):
        class_identifiers: [str] = self.identifier_list_model.get_filtered_identfier_names(
            IdentifierType.Class)
        class_identifier_words: [str] = self.identifier_dictionary_model.get_filtered_words(
            class_identifiers)
        Word2VecModel.instance.set_class_name(class_identifier_words)

    def set_word2vec_file_context_name(self):
        file_context_words: [str] = self.word_dictionary_model.get_dictionary_keys()
        Word2VecModel.instance.set_file_context_name(file_context_words)
Example no. 24
0
    #'Factor Analysis': FactorAnalysis(n_components='mle'),
    #'ICA': FastICA(n_components='mle'),
}

clfs = {
    'OLS': linear_model.LinearRegression(),
    'Random Forest': ensemble.RandomForestRegressor(),
}

extensions = ('jaccard', 'jaccard2', 'cosine', 'adamic', 'weighted_adamic')

results = {}
resultsList = []

years = [int(arg) for arg in sys.argv[1:]]
timing = Timer('Running everything')
for year in years:
    timing.markEvent('Running for year %d' % year)
    results[year] = {}
    for extension in extensions:
        timing.markEvent('Running for extension %s' % extension)
        results[year][extension] = {}
        for clfname, clf in clfs.items():
            timing.markEvent('Running for classifier %s' % clfname)
            results[year][extension][clfname] = {}
            for decompname, decompFunction in decompFunctions.items():
                timing.markEvent('Running for decomp function %s' % decompname)
                rsquareds = cfscore_predictions.trainAndTestModels(
                    year, extension, clf=clf, decomp_func=decompFunction)
                resultsList.append(
                    (year, extension, clfname, decompname, tuple(rsquareds)))
Example no. 25
0
                communities[nid] = 0.0
            else:
                communities[nid] = communityIndex
        communityIndex += 1

    return communities


################################################################################
# Module command-line behavior #
################################################################################

if __name__ == '__main__':
    for arg in sys.argv[1:]:
        year = int(arg)
        timing = Timer('creating unipartite graph for %d' % year)

        bipartiteGraph = graph_funcs.loadGraph(
            'Data/Bipartite-Graphs/%d.graph' % year)
        unipartiteGraph = graph_funcs.loadGraph(
            'Data/Unipartite-Graphs/%d.graph' % year, snap.TUNGraph)
        newToOldIDs = pickler.load('Data/Unipartite-NodeMappings/%d.newToOld' %
                                   year)
        timing.markEvent('Loaded input graphs/matrices.')

        #for weightF in ['jaccard', 'affinity', 'jaccard2', 'cosine', 'adamic', 'weighted_adamic']:
        for weightF in ['jaccard2']:
            print('******* %s *******' % weightF)
            adjMatrix = pickler.load('Data/Unipartite-Matrix/%d.%s' %
                                     (year, weightF))
            adjMatrix = adjMatrix.tocsc()
Example no. 26
0
def extractUnipartiteFeatures(unipartiteGraph, adjMat):
    timing = Timer('extracting unipartite features')

    features = defaultdict(list)
    #componentFeatureFunc, communityFeatureFuncn, idToCommunity = getUnipartiteSurfaceFeatures(unipartiteGraph, adjMat, features)
    componentFeatureFunc, CNMFeatureFunc, idToCNM = getUnipartiteSurfaceFeatures(
        unipartiteGraph, adjMat, features)

    timing.markEvent('1. Extracted surface features')

    # Average weight of edges:
    avgWeights = calcAverageWeights(unipartiteGraph, adjMat)
    timing.markEvent('2. Computed average weights.')

    # Size of connected component:
    #cnctComponents = calcCnctComponents(unipartiteGraph)
    timing.markEvent('3. Computed connected components.')

    # Size of CNM community:
    communities = calcCommunities(idToCNM)
    timing.markEvent('4. Computed CNM communities.')

    # Pagerank:
    pageRanks = snap.TIntFltH()
    snap.GetPageRank(unipartiteGraph, pageRanks)
    timing.markEvent('5. Computed PageRank.')

    # combine the graph wide features with the existing surface features:
    for nid in features:
        features[nid].append(avgWeights[nid])
        #features[nid].append(cnctComponents[nid])
        features[nid].append(communities[nid])
        features[nid].append(pageRanks[nid])

    timing.finish()

    return features, componentFeatureFunc, CNMFeatureFunc
Example no. 27
0
    fullFeatures['winner'] = getIntAttrFeatureVec(graph, 'winner', full=True)

    return partialFeatures, fullFeatures


################################################################################
# Module command-line behavior #
################################################################################

if __name__ == '__main__':
    #weightings = ('jaccard', 'jaccard2', 'affinity', 'cosine', 'adamic', 'weighted_adamic')
    #weightings = ('adamic', 'weighted_adamic')
    weightings = ('jaccard2', )
    for year in sys.argv[1:]:
        year = int(year)
        timing = Timer('Generating features for %d' % year)
        graph = graph_funcs.loadGraph('Data/Bipartite-Graphs/%d.graph' % year)
        receiptsFromDonor, totalReceipts, totalDonations = getDonationAmounts(
            graph)
        partialFeatures, fullFeatures = getCategoricalGraphFeatures(graph)

        baselineFeatures = \
            getBaselineFeatures(graph, receiptsFromDonor, totalReceipts, totalDonations, partialFeatures, fullFeatures)
        saveFeatures(graph, baselineFeatures,
                     'Data/Recip-Features/%d.baseline' % year)
        timing.markEvent('Generated baseline features')

        for weighting in weightings:
            donorFeatures = pickler.load('Data/Features/%d%s.features' \
                    % (year, weighting))
            recipFeatures = getRecipFeatures(graph, donorFeatures,
Example no. 28
0
def getNonzeroElems(year, weightF):
    timing = Timer('Loading nonzero elems for year %d and weightF %s' % (year, weightF))
    adjMat = pickler.load('Data/Unipartite-Matrix/%d.%s' % (year, weightF))
    timing.finish()
    return adjMat[adjMat.nonzero()]
Example no. 29
0
def createDonorDonorGraph(year, weightF):
    timing = Timer('creating donor-donor graph for %d' % year)

    # Load the old bipartite graph graph
    bipartiteGraph = graph_funcs.loadGraph('Data/Bipartite-Graphs/%d.graph' %
                                           year)

    # Load the info about each donor and their recipients
    numDonations, totalAmount, cands, transactions, amounts, totalReceipts = getDonorInfos(
        bipartiteGraph)
    timing.markEvent('Got info about donor nodes')

    # Create initial unipartite graph with just nodes and node attributes
    unipartiteGraph, oldToNew, newToOld = cloneBipartiteNodes(
        bipartiteGraph, cands)
    timing.markEvent('Finished cloning nodes')

    jaccardData = []
    jaccard2Data = []
    affinityData = []
    cosineData = []
    adamicData = []
    weightedAdamicData = []
    r = []
    c = []

    # Add the weighted edges for every relevant pair of donor nodes
    nodesDone = 0

    donorIDs = list(newToOld.keys())
    for i, newID1 in enumerate(donorIDs):
        oldID1 = newToOld[newID1]
        for newID2 in donorIDs[i + 1:]:
            oldID2 = newToOld[newID2]

            sharedCands = cands[oldID1].intersection(cands[oldID2])
            if not sharedCands: continue

            # Calculate the weight
            weights = weightF(oldID1, oldID2, sharedCands, numDonations,
                              totalAmount, cands, transactions, amounts,
                              totalReceipts)

            r.append(newID1)
            r.append(newID2)
            c.append(newID2)
            c.append(newID1)
            jaccardData.append(weights['jaccard'])
            jaccardData.append(weights['jaccard'])
            jaccard2Data.append(weights['jaccard2'])
            jaccard2Data.append(weights['jaccard2'])
            affinityData.append(weights['affinity'])
            affinityData.append(weights['affinity'])
            cosineData.append(weights['cosine'])
            cosineData.append(weights['cosine'])
            adamicData.append(weights['adamic'])
            adamicData.append(weights['adamic'])
            weightedAdamicData.append(weights['weighted_adamic'])
            weightedAdamicData.append(weights['weighted_adamic'])

            # Add the edges between the two nodes and their weights
            unipartiteGraph.AddEdge(newID1, newID2)

        nodesDone += 1
        if nodesDone % 100 == 0:
            timing.markEvent('Finished %d outer loops out of %d' % \
                    (nodesDone, unipartiteGraph.GetNodes()))

    N = len(newToOld)
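    # Each pair was appended in both (row, col) orders above, so the resulting
    # adjacency matrices are symmetric.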
    jaccardAdjMat = sp.csr_matrix((jaccardData, (r, c)), shape=(N, N))
    jaccard2AdjMat = sp.csr_matrix((jaccard2Data, (r, c)), shape=(N, N))
    affinityAdjMat = sp.csr_matrix((affinityData, (r, c)), shape=(N, N))
    cosineAdjMat = sp.csr_matrix((cosineData, (r, c)), shape=(N, N))
    adamicAdjMat = sp.csr_matrix((adamicData, (r, c)), shape=(N, N))
    weightedAdamicAdjMat = sp.csr_matrix((weightedAdamicData, (r, c)),
                                         shape=(N, N))

    timing.finish()
    return unipartiteGraph, jaccardAdjMat, jaccard2AdjMat, affinityAdjMat, cosineAdjMat, adamicAdjMat, weightedAdamicAdjMat, newToOld, oldToNew
Example no. 30
0
def getCorrel(year, weightFs):
    timing = Timer('Getting correlation matrix for year %d' % year)
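    # Stack each weighting's nonzero values as one row so np.corrcoef
    # correlates the weightings against each other (on Python 3, reduce is
    # functools.reduce).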
    append = lambda x, y: np.append(x, y, axis=0)
    data = reduce(append, [getNonzeroElems(year, weightF) for weightF in weightFs])
    timing.finish()
    return np.corrcoef(data)
Example no. 31
0
# Weighted Adamic Adar Similarity Index: (http://www.slideshare.net/hajimesasaki1/picmet15sasaki20150805ppt)
# <On slide 8>
def weightedAdamic(id1, id2, sharedCands, numDonations, totalAmount, cands,
                   transactions, amounts, totalReceipts):
    score = sum([(amounts[id1][cand] + amounts[id2][cand]) /
                 (1.0 + math.log(totalReceipts[cand], 10))
                 for cand in sharedCands])
    return 'weighted_adamic', score


################################################################################
# Module command-line behavior #
################################################################################

if __name__ == '__main__':
    overallTiming = Timer('all unipartite graphs')
    for arg in sys.argv[1:]:
        year = int(arg)
        timing = Timer('Creating unipartite graph for %d' % year)

        graph, wmat1, wmat2, wmat3, wmat4, wmat5, wmat6, newToOld, oldToNew = createDonorDonorGraph(
            year, getWeightScores)

        # Save the SNAP graph:
        outfile = 'Data/Unipartite-Graphs/%d.graph' % year
        graph_funcs.saveGraph(graph, outfile)

        # Save the weight matrices:
        matrixPrefix = 'Data/Unipartite-Matrix/%d' % year
        pickler.save(wmat1, matrixPrefix + '.jaccard')
        pickler.save(wmat2, matrixPrefix + '.jaccard2')