def assign_clusters(): ''' This function does the actual donor assignment, assigning unique donor IDs in the contribution table based on clusters of contributions that appear to have the same donor. It works pretty much the same as the mark_matches function above. ''' # Again, instantiate and train our classifier clf = RandomForestClassifier(n_estimators=10, random_state=0) clf = clf.fit([eval(t.features) for t in TRAINING_DATA], [int(t.same) for t in TRAINING_DATA]) # Loop through the last name groups print 'Processing groups ...' for g in Contribution.objects.all().values('group_id').distinct(): if not g['group_id']: continue toupdate = [] G = nx.Graph( ) # Create an empty network graph for each last name group # We're using a simple hash function to help generate unique donor IDs nameid = hashlib.sha224(str(g['group_id'])).hexdigest() # For each match in a last name group for m in Match.objects.filter(c1__group_id=g['group_id']): # Do the two contributions have the same donor? Same as above. edge = clf.predict_proba(eval(m.features)) if edge[0][1] > edge[0][0]: # If they do, add an edge between those contributions in the network graph we created # a few steps ago. This process is outlined in the steps here: # https://github.com/cjdd3b/fec-standardizer/wiki/Matching-donors G.add_edge(m.c1, m.c2) # Now we want to go through the graph we created and basically find all the contributions that are # connected. If the contributions were connected in the step above, that means they're probably from # the same donor. So a donor is basically defined by small networks of connected contributions. 
This # is described further here: https://github.com/cjdd3b/fec-standardizer/wiki/Defining-donor-clusters ccs = connected_components(G) # Now loop through each of the donor clusters generated by the connected_components function for c in enumerate(ccs): donor_id = c[0] for i in c[1]: # Create a donor ID based on our group hash above and the enumerated cluster number classifier_id = '%s%s' % (donor_id, nameid) i.classifier_id = classifier_id[:12] toupdate.append(i) # Bulk save the donor IDs to the contribution table commit_saves(toupdate) print 'Cleaning up the leftovers ...' tocleanup = [] for record in Contribution.objects.filter(classifier_id__isnull=True): if not record.match_repr: continue classifier_id = '99%s' % hashlib.sha224(record.match_repr).hexdigest() record.classifier_id = classifier_id[:12] tocleanup.append(record) commit_saves(tocleanup) return
def assign_clusters(): ''' This function does the actual donor assignment, assigning unique donor IDs in the contribution table based on clusters of contributions that appear to have the same donor. It works pretty much the same as the mark_matches function above. ''' # Again, instantiate and train our classifier clf = RandomForestClassifier(n_estimators=10, random_state=0) clf = clf.fit([eval(t.features) for t in TRAINING_DATA], [int(t.same) for t in TRAINING_DATA]) # Loop through the last name groups print 'Processing groups ...' for g in Contribution.objects.all().values('group_id').distinct(): if not g['group_id']: continue toupdate = [] G = nx.Graph() # Create an empty network graph for each last name group # We're using a simple hash function to help generate unique donor IDs nameid = hashlib.sha224(str(g['group_id'])).hexdigest() # For each match in a last name group for m in Match.objects.filter(c1__group_id=g['group_id']): # Do the two contributions have the same donor? Same as above. edge = clf.predict_proba(eval(m.features)) if edge[0][1] > edge[0][0]: # If they do, add an edge between those contributions in the network graph we created # a few steps ago. This process is outlined in the steps here: # https://github.com/cjdd3b/fec-standardizer/wiki/Matching-donors G.add_edge(m.c1, m.c2) # Now we want to go through the graph we created and basically find all the contributions that are # connected. If the contributions were connected in the step above, that means they're probably from # the same donor. So a donor is basically defined by small networks of connected contributions. 
This # is described further here: https://github.com/cjdd3b/fec-standardizer/wiki/Defining-donor-clusters ccs = connected_components(G) # Now loop through each of the donor clusters generated by the connected_components function for c in enumerate(ccs): donor_id = c[0] for i in c[1]: # Create a donor ID based on our group hash above and the enumerated cluster number classifier_id = '%s%s' % (donor_id, nameid) i.classifier_id = classifier_id[:12] toupdate.append(i) # Bulk save the donor IDs to the contribution table commit_saves(toupdate) print 'Cleaning up the leftovers ...' tocleanup = [] for record in Contribution.objects.filter(classifier_id__isnull=True): if not record.match_repr: continue classifier_id = '99%s' % hashlib.sha224(record.match_repr).hexdigest() record.classifier_id = classifier_id[:12] tocleanup.append(record) commit_saves(tocleanup) return
def mark_matches():
    '''
    Test whether the classifier and CRP agree at the match level.

    For every contribution pair, record the classifier's same-donor verdict
    and its confidence score, then compare that verdict against the CRP
    ground truth via get_match_status. Unlike assign_clusters below, this
    does nothing to group donors -- running it is optional.
    '''
    # Prepare our Random Forest classifier and train using the training set.
    clf = RandomForestClassifier(n_estimators=10, random_state=0)
    # Training the Random Forest takes two inputs: a list of feature vectors
    # and a list of correct classifications for those vectors. More info here:
    # http://scikit-learn.org/dev/modules/generated/sklearn.ensemble.RandomForestClassifier.html
    # SECURITY: eval() of stored feature strings executes arbitrary code if
    # the features column is ever attacker-controlled. Prefer
    # ast.literal_eval or json.loads if the stored format allows it.
    clf = clf.fit([eval(t.features) for t in TRAINING_DATA],
                  [int(t.same) for t in TRAINING_DATA])

    # Loop through each of the initial groups we created.
    # NOTE(review): unlike assign_clusters, this does not skip null
    # group_ids -- confirm whether that is intentional.
    for g in Contribution.objects.all().values('group_id').distinct():
        toupdate = []
        # Now go through all the contribution pairs within a given group.
        for m in Match.objects.filter(c1__group_id=g['group_id']):
            # Predict, from the pair's feature vector, whether the donors are
            # the same. predict_proba expects a 2-D array of samples, so wrap
            # the single vector in a list (a bare 1-D vector is deprecated
            # and rejected by modern scikit-learn). Row is [p_no, p_yes].
            edge = clf.predict_proba([eval(m.features)])
            if edge[0][1] > edge[0][0]:
                # The "yes" probability outweighs "no": mark them as the same
                # donor and record the confidence score.
                m.classifier_same = True
                m.score = edge[0][1]
            else:
                # Otherwise mark them as a non-match and record the score.
                m.classifier_same = False
                m.score = edge[0][0]
            # Compare our classifier's judgment vs. the ground-truth CRP data
            # and mark accordingly.
            m.match_status = get_match_status(m)
            toupdate.append(m)
        # Using manual transaction management to speed things up.
        # See utils/db.py.
        commit_saves(toupdate)
    return