Code example #1
0
def assign_clusters():
    '''
    Does the actual donor assignment: writes a unique donor ID onto each row of the
    contribution table, based on clusters of contributions that appear to share a
    donor. Works much like the mark_matches function above, but instead of scoring
    pairs it builds a graph per last-name group and treats each connected component
    as one donor.
    '''
    # Again, instantiate and train our classifier.
    # NOTE(review): eval() on stored feature strings executes arbitrary code if the
    # table can ever be written by an untrusted source -- consider ast.literal_eval.
    clf = RandomForestClassifier(n_estimators=10, random_state=0)
    clf = clf.fit([eval(t.features) for t in TRAINING_DATA],
                  [int(t.same) for t in TRAINING_DATA])

    # Loop through the last name groups
    print('Processing groups ...')
    for g in Contribution.objects.all().values('group_id').distinct():
        if not g['group_id']:
            continue
        toupdate = []
        # Create an empty network graph for each last name group
        G = nx.Graph()
        # We're using a simple hash of the group ID as the suffix of each donor ID
        nameid = hashlib.sha224(str(g['group_id'])).hexdigest()
        # For each candidate pair in the group, ask the classifier whether the two
        # contributions have the same donor (same prediction logic as above).
        for m in Match.objects.filter(c1__group_id=g['group_id']):
            edge = clf.predict_proba(eval(m.features))
            if edge[0][1] > edge[0][0]:
                # If they match, connect the two contributions in the graph. Outlined here:
                # https://github.com/cjdd3b/fec-standardizer/wiki/Matching-donors
                G.add_edge(m.c1, m.c2)

        # Contributions that ended up connected are probably from the same donor, so
        # a donor is defined by a small network of connected contributions. See:
        # https://github.com/cjdd3b/fec-standardizer/wiki/Defining-donor-clusters
        ccs = connected_components(G)

        # Each cluster's index, combined with the group hash, makes the donor ID.
        for donor_id, cluster in enumerate(ccs):
            # The ID is identical for every contribution in a cluster, so build
            # (and truncate) it once here instead of inside the inner loop.
            classifier_id = ('%s%s' % (donor_id, nameid))[:12]
            for contribution in cluster:
                contribution.classifier_id = classifier_id
                toupdate.append(contribution)
        # Bulk save the donor IDs to the contribution table
        commit_saves(toupdate)

    # Contributions that never got a cluster ID fall back to a hash of their own
    # match representation, prefixed '99' so they can't collide with cluster IDs.
    print('Cleaning up the leftovers ...')
    tocleanup = []
    for record in Contribution.objects.filter(classifier_id__isnull=True):
        if not record.match_repr:
            continue
        classifier_id = '99%s' % hashlib.sha224(record.match_repr).hexdigest()
        record.classifier_id = classifier_id[:12]
        tocleanup.append(record)
    commit_saves(tocleanup)
    return
Code example #2
0
def assign_clusters():
    '''
    Assigns a unique donor ID to every contribution row, using clusters of
    contributions that look like they came from the same donor. The flow mirrors
    mark_matches above: train the classifier, score candidate pairs within each
    last-name group, then treat connected groups of matches as a single donor.
    '''
    # Train the Random Forest on the hand-labeled training set
    training_features = [eval(row.features) for row in TRAINING_DATA]
    training_labels = [int(row.same) for row in TRAINING_DATA]
    clf = RandomForestClassifier(n_estimators=10, random_state=0)
    clf = clf.fit(training_features, training_labels)

    # Process one last-name group at a time
    print('Processing groups ...')
    for row in Contribution.objects.all().values('group_id').distinct():
        group_id = row['group_id']
        if not group_id:
            continue
        pending = []
        graph = nx.Graph()  # one empty graph per last-name group
        # A simple hash of the group ID helps keep donor IDs unique across groups
        group_hash = hashlib.sha224(str(group_id)).hexdigest()
        # Score every candidate pair in the group; when the classifier says the two
        # contributions share a donor, link them in the graph. Process described at:
        # https://github.com/cjdd3b/fec-standardizer/wiki/Matching-donors
        for match in Match.objects.filter(c1__group_id=group_id):
            probabilities = clf.predict_proba(eval(match.features))
            if probabilities[0][1] > probabilities[0][0]:
                graph.add_edge(match.c1, match.c2)

        # Each connected component of the graph is treated as one donor; details:
        # https://github.com/cjdd3b/fec-standardizer/wiki/Defining-donor-clusters
        for cluster_number, cluster in enumerate(connected_components(graph)):
            for contribution in cluster:
                # Donor ID = enumerated cluster number + group hash, truncated
                contribution.classifier_id = ('%s%s' % (cluster_number, group_hash))[:12]
                pending.append(contribution)
        # Bulk save this group's donor IDs
        commit_saves(pending)

    # Anything still without an ID gets a fallback based on its match representation
    print('Cleaning up the leftovers ...')
    leftovers = []
    for record in Contribution.objects.filter(classifier_id__isnull=True):
        if record.match_repr:
            fallback = '99%s' % hashlib.sha224(record.match_repr).hexdigest()
            record.classifier_id = fallback[:12]
            leftovers.append(record)
    commit_saves(leftovers)
    return
Code example #3
0
def mark_matches():
    ''' 
    This function is mostly just here for testing whether the classifier and CRP agree at the match
    level. It doesn't do anything in terms of grouping donors, unlike the assign_clusters
    function below. You can run it or not -- it doesn't matter.
    '''
    # Prepare our Random Forest classifier and train using the training set above.
    # Training takes a list of feature vectors and a list of correct classifications:
    # http://scikit-learn.org/dev/modules/generated/sklearn.ensemble.RandomForestClassifier.html
    # NOTE(review): eval() on stored feature strings executes arbitrary code if the
    # table can ever be written by an untrusted source -- consider ast.literal_eval.
    clf = RandomForestClassifier(n_estimators=10, random_state=0)
    clf = clf.fit([eval(t.features) for t in TRAINING_DATA],
                  [int(t.same) for t in TRAINING_DATA])

    # Loop through each of the initial groups we created
    for g in Contribution.objects.all().values('group_id').distinct():
        toupdate = []
        # Now go through all the contribution pairs within a given group
        for m in Match.objects.filter(c1__group_id=g['group_id']):
            # predict_proba returns one row of class probabilities for the pair;
            # with 0/1 training labels that's [P(different), P(same)].
            proba = clf.predict_proba(eval(m.features))[0]
            # Collapse the duplicated if/else: the verdict is "same" when P(same)
            # wins, and the recorded score is the winning probability either way.
            # bool() keeps the stored value a plain Python bool, as before.
            m.classifier_same = bool(proba[1] > proba[0])
            m.score = proba[1] if m.classifier_same else proba[0]
            # Now compare our classifier's judgment vs. the ground truth CRP data
            m.match_status = get_match_status(m)
            toupdate.append(m)
        # Using manual transaction management to speed things up. See utils/db.py
        commit_saves(toupdate)
    return
Code example #4
0
def mark_matches():
    '''
    Sanity-check pass: records, for every candidate pair, whether the trained
    classifier agrees with the CRP ground truth at the match level. It does no
    donor grouping (that happens in assign_clusters below) and is optional to run.
    '''
    # Train the Random Forest. fit() wants a list of feature vectors plus the
    # matching list of correct labels; see:
    # http://scikit-learn.org/dev/modules/generated/sklearn.ensemble.RandomForestClassifier.html
    feature_vectors = [eval(record.features) for record in TRAINING_DATA]
    labels = [int(record.same) for record in TRAINING_DATA]
    forest = RandomForestClassifier(n_estimators=10, random_state=0)
    forest = forest.fit(feature_vectors, labels)

    # Walk each of the initial last-name groups
    for group in Contribution.objects.all().values('group_id').distinct():
        batch = []
        # Score every contribution pair within the group
        for pair in Match.objects.filter(c1__group_id=group['group_id']):
            # The classifier returns yes/no probabilities for the pair's features
            scores = forest.predict_proba(eval(pair.features))
            if scores[0][1] > scores[0][0]:
                # "Yes" outweighs "no": mark as a match and keep the confidence
                pair.classifier_same = True
                pair.score = scores[0][1]
            else:
                # Otherwise mark as a non-match, keeping that score instead
                pair.classifier_same = False
                pair.score = scores[0][0]
            # Record whether the classifier agrees with the CRP ground truth
            pair.match_status = get_match_status(pair)
            batch.append(pair)
        # Manual transaction management speeds up the saves. See utils/db.py
        commit_saves(batch)
    return