def contributor_name_similarity(c1, c2):
    '''
    How similar are the two contribution donors' full unprocessed contributor names?
    Based on jaccard similarity of 3-shingles of the names.

    Returns a float between 0 and 1, with 1 being identical.
    '''
    e1_shingles = shingle(clean_str(c1.contributor_name), 3)
    e2_shingles = shingle(clean_str(c2.contributor_name), 3)
    return jaccard_sim(e1_shingles, e2_shingles)
def employer_similarity(c1, c2):
    '''
    How similar are the two contribution donors' employer strings? Based on jaccard similarity
    of 3-shingles of those strings.

    Returns a float between 0 and 1, with 1 being identical.
    '''
    e1_shingles = shingle(clean_str(c1.employer), 3)
    e2_shingles = shingle(clean_str(c2.employer), 3)
    return jaccard_sim(e1_shingles, e2_shingles)
def occupation_similarity(c1, c2):
    '''
    How similar are the two contribution donors' occupation strings? Based on jaccard similarity
    of 3-shingles of those strings.

    Returns a float between 0 and 1, with 1 being identical.
    '''
    o1_shingles = shingle(clean_str(c1.occupation), 3)
    o2_shingles = shingle(clean_str(c2.occupation), 3)
    return jaccard_sim(o1_shingles, o2_shingles)
def first_name_similarity(c1, c2):
    '''
    How similar are the two contribution donors' first names? Based on jaccard similarity
    of 3-shingles of the names.

    Returns a float between 0 and 1, with 1 being identical.
    '''
    name1_shingles = shingle(clean_str(c1.first_name), 3)
    name2_shingles = shingle(clean_str(c2.first_name), 3)
    return jaccard_sim(name1_shingles, name2_shingles)
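# The helpers clean_str(), shingle() and jaccard_sim() used above are assumed to be
# defined elsewhere in the project and are not shown in these examples. A minimal,
# purely illustrative sketch of what they presumably do (not the project's actual
# implementation) might look like this:
import re

def clean_str(s):
    # Hypothetical normalizer: lowercase, drop punctuation and surrounding whitespace.
    return re.sub(r'[^a-z0-9 ]', '', (s or '').lower()).strip()

def shingle(s, k):
    # The set of all contiguous length-k substrings ("k-shingles") of s,
    # e.g. shingle('smith', 3) == {'smi', 'mit', 'ith'}.
    if len(s) < k:
        return {s}
    return set(s[i:i + k] for i in range(len(s) - k + 1))

def jaccard_sim(a, b):
    # Size of the intersection over size of the union: 1.0 for identical
    # shingle sets, 0.0 for disjoint ones.
    # e.g. jaccard_sim(shingle('jon smith', 3), shingle('john smith', 3)) == 0.5
    if not a and not b:
        return 1.0
    return len(a & b) / float(len(a | b))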
Example #5
def group_by_lsh():
    '''
    Groups all of our contribution data by the output of a locality sensitive hashing function. The
    LSH implementation is stored in utils/lsh. You can read more about it here:

    http://en.wikipedia.org/wiki/Locality-sensitive_hashing
    '''
    # First step is to create the actual LSH clusters, based on 1-shingles of the names
    cluster = Cluster(threshold=1.0)
    for ln in Contribution.objects.values('last_name').distinct():
        name = ln['last_name']
        if not name: continue # If last name isn't filled out for some reason
        cluster.add_set(shingle(name, 1), name)

    # Next step is to iterate through those clusters and attach the contributions
    # associated with each set of last names to a Group representing that cluster.
    for i, name_set in enumerate(cluster.get_sets()):
        g, created = Group.objects.get_or_create(name='LSH: %s' % i)
        for name in name_set:
            Contribution.objects.filter(last_name=name).update(group=g)
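# The Cluster class lives in utils/lsh and is not reproduced in this excerpt. As a
# rough, brute-force stand-in for illustration only (O(n^2) pairwise comparisons
# rather than actual locality sensitive hashing), a clusterer exposing the same
# interface might look roughly like this:
class NaiveCluster(object):
    def __init__(self, threshold=1.0):
        self.threshold = threshold
        self.clusters = []  # list of (shingle_set, labels) pairs

    def add_set(self, shingles, label):
        # Attach the label to the first cluster whose combined shingle set is
        # similar enough; otherwise start a new cluster of its own.
        shingles = set(shingles)
        for members, labels in self.clusters:
            if jaccard_sim(shingles, members) >= self.threshold:
                members.update(shingles)
                labels.append(label)
                return
        self.clusters.append((shingles, [label]))

    def get_sets(self):
        # Each element is the list of labels (here, last names) in one cluster.
        return [labels for _, labels in self.clusters]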
Example #6
if __name__ == '__main__':
    # First do the initial groupings
    print('Forming initial groups ...')
    group_by_last_name() # In this case, we're using the last name function from above

    # Now loop through the groups we just created and start putting together potential matches
    print('Preprocessing matches ...')
    for g in Group.objects.all():
        tocreate = []
        # For any given last name, split up the contributions into every possible combination of pairs
        for c1, c2 in itertools.combinations(g.contribution_set.all(), 2):
            compstring1 = '%s %s %s' % (c1.first_name, c1.city, c1.state)
            compstring2 = '%s %s %s' % (c2.first_name, c2.city, c2.state)
            # Check to see if the two donors in a given pair are even remotely similar. If they're not, ignore the pair.
            if jaccard_sim(shingle(compstring1.lower(), 2), shingle(compstring2.lower(), 2)) >= INITIAL_SIM:
                # But if they are, create a feature vector describing the dimensions of their similarity for the
                # machine learning algorithm to use later.
                featurevector = str(create_featurevector(c1, c2))
                # Save that feature vector and other information into a match object
                match = Match(c1=c1, c2=c2, features=featurevector)
                match.same = False
                # If the two contributions in the pair are regarded as coming from the same donor by the ground-truth
                # CRP data, mark them as a match so we can use them for testing and training the classifier.
                if (c1.donor_id and c2.donor_id) and (c1.donor_id == c2.donor_id):
                    match.same = True
                tocreate.append(match)
        # Again, we're bulk creating to cut down on database transactions.
        Match.objects.bulk_create(tocreate)
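# create_featurevector() and the INITIAL_SIM constant used in the block above are
# defined elsewhere in the project and are not shown in this excerpt. Assuming the
# feature vector simply stacks the pairwise similarity scores defined earlier (an
# illustrative guess, not necessarily the project's exact feature set), a sketch
# might look like this:
def create_featurevector(c1, c2):
    # One number per dimension of similarity; a classifier is later trained on
    # these vectors, with Match.same serving as the ground-truth label.
    return [
        first_name_similarity(c1, c2),
        contributor_name_similarity(c1, c2),
        employer_similarity(c1, c2),
        occupation_similarity(c1, c2),
    ]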