Example #1
0
 def test_different_initials(self):
     # self.name is set up outside this method; every variant below is
     # asserted to be an imperfect (< 100) match for it.
     for variant in ('E. R. Liebowitz', 'E. Liebowitz',
                     'R. V. Liebowitz', 'O. E. Liebowitz'):
         assert_true(who.ratio(self.name, variant) < 100)

     # 'E. R. Liebowitz' is asserted to score strictly lower than each of
     # the following alternatives.
     for stronger in ('E. E. Liebowitz', 'R. R. Liebowitz', 'E. Liebowitz'):
         weaker_score = who.ratio(self.name, 'E. R. Liebowitz')
         stronger_score = who.ratio(self.name, stronger)
         assert_true(weaker_score < stronger_score)
Example #2
0
 def test_nickname(self):
     # A quoted nickname is asserted to match as a first name, in either
     # argument order.
     full = 'Robert "Evan" Liebowitz'
     by_nickname = 'Evan Liebowitz'
     assert_equal(who.ratio(full, by_nickname), 100)
     assert_equal(who.ratio(by_nickname, full), 100)

     # Neither of these is asserted to be a perfect match.
     for mismatch in ('Wrongbert Lieobwitz', 'Robert Evan'):
         assert_true(who.ratio(full, mismatch) < 100)

     # With nickname checking switched off the nickname-only form must no
     # longer score 100.
     unchecked = who.ratio(full, by_nickname,
                           options={'check_nickname': False})
     assert_true(unchecked < 100)

     # Default options: 'xvax' must outscore 'xxxx'.
     low = who.ratio(full, 'xxxx Liebowitz')
     high = who.ratio(full, 'xvax Liebowitz')
     assert_true(low < high)

     # With 'strict' as the third argument the two must score the same.
     default_score = who.ratio(full, 'xxxx Liebowitz')
     strict_score = who.ratio(full, 'xvax Liebowitz', 'strict')
     assert_equal(default_score, strict_score)
Example #3
0
def string_disambiguation(value, candidates, name=False):
    """Return the candidate whose label is the closest match for ``value``.

    A candidate's label is whatever follows the first occurrence of
    ``'resource'`` in the candidate string, minus one leading character,
    with underscores turned into spaces.

    Args:
        value: The string to disambiguate.
        candidates: Iterable of candidate strings; each must contain
            ``'resource'`` (otherwise label extraction raises IndexError).
        name: When true, rank candidates by ``who.ratio`` first and use the
            negated Levenshtein distance as a tie-breaker. When false, rank
            by Levenshtein distance alone (smaller is better).

    Returns:
        The best-matching candidate, or ``None`` when ``candidates`` is
        empty or no candidate qualifies.
    """
    def get_label(x):
        return x.split('resource')[1][1:].replace('_', ' ')

    best_match = None

    if name:
        # Tuples compare lexicographically: similarity first, then the
        # negated edit distance (so a smaller distance wins a tie).
        best_dist = (-1, -float('inf'))
        for entity in candidates:
            label = get_label(entity)
            name_sim = who.ratio(label, value)
            neg_lev_dist = -levenshtein(value.lower(), label.lower())
            dist = (name_sim, neg_lev_dist)
            # A positive second component means levenshtein returned a
            # negative value; such candidates are skipped.
            if dist[1] <= 0 and dist > best_dist:
                best_dist = dist
                best_match = entity
    else:
        best_dist = 999999
        for entity in candidates:
            label = get_label(entity)
            # NOTE(review): the third argument looks like a distance cutoff
            # and a negative return presumably signals "cutoff exceeded" -
            # confirm against the levenshtein implementation in use.
            dist = levenshtein(value.lower(), label.lower(), best_dist)
            if 0 <= dist < best_dist:
                best_dist = dist
                best_match = entity

    return best_match
0
 def test_suffixes(self):
     junior = 'Robert Liebowitz Jr'

     # A missing, identical or additional academic suffix must still give
     # a perfect score.
     for compatible in ('Robert Liebowitz',
                        'Robert Liebowitz Jr',
                        'Robert Liebowitz, PhD'):
         assert_equal(who.ratio(junior, compatible), 100)

     # 'Sr' contradicts 'Jr': the ratio must be falsy.
     for conflicting in ('Robert Liebowitz, Sr',
                         'Robert Liebowitz, Sr, PhD'):
         assert_false(who.ratio(junior, conflicting))

     assert_equal(who.ratio(junior, 'Robert Liebowitz, Jr, PhD'), 100)

     # Suffix doesn't change a match
     with_suffix = who.ratio(junior, 'Zachary Liebowitz, Jr')
     without_suffix = who.ratio(junior, 'Zachary Liebowitz')
     assert_equal(with_suffix, without_suffix)
Example #5
0
 def test_titles(self):
     titled = 'Mr. Robert Liebowitz'

     # No title, or any of these other titles, must still give 100.
     for compatible in ('Robert Liebowitz',
                        'Sir Robert Liebowitz',
                        'Dr. Robert Liebowitz'):
         assert_equal(who.ratio(titled, compatible), 100)

     # 'Mrs.' against 'Mr.' must give a falsy ratio.
     assert_false(who.ratio(titled, 'Mrs. Robert Liebowitz'))

     # Title doesn't change a match
     with_title = who.ratio(titled, 'Dr. Zachary Liebowitz')
     without_title = who.ratio(titled, 'Zachary Liebowitz')
     assert_equal(with_title, without_title)
Example #6
0
# Score every entry of `choices` against each of the two reference names.
# Each result item is used below as (choice, score): i[0] is passed on as a
# name and i[1] is compared numerically.
# NOTE(review): `p` is presumably fuzzywuzzy's `process` module and `w` is
# whoswho's `who` - confirm against the imports.
s = p.extract(s1, choices, limit=len(choices))
q = p.extract(s2, choices, limit=len(choices))

# Flag: set to 1 as soon as one name is rejected ("blocked").
error = 0

# Running total of the scores of all names compared with the 1st reference name.
d1 = 0
print(s, q)
for i in s:
    d1 += i[1]

    # If the score is below 70, fall back to the whoswho library, which
    # gives better results when a name contains initials/acronyms.
    if i[1] < 70:
        q1 = (w.ratio(s1, i[0]))
        q2 = (w.ratio(s2, i[0]))
        print(i[0])
        # If both whoswho scores are still below 60, block the user and stop.
        if q1 < 60 and q2 < 60:
            print('blocked')
            error = 1
            break
        else:
            # At least one whoswho score is >= 60: replace the fuzzy score
            # already added to the total with the better whoswho score.
            d1 -= i[1]
            d1 += max(q1, q2)
            error = 0

# If all the names passed for the 1st reference name, repeat the same process for the second reference name.
Example #7
0
 def test_name_and_initials(self):
     # Any mix of spelled-out names and initials listed here must be a
     # perfect match for self.name (defined outside this method).
     for mixed in ('R. Evan Liebowitz',
                   'Robert E. Liebowitz',
                   'R. E. Liebowitz'):
         assert_equal(who.ratio(self.name, mixed), 100)
Example #8
0
 def test_unicode(self):
     reference = self.name

     # Accented spellings (and an accented leading word) must still give
     # a perfect score.
     for accented in ('attaché Robert Evan Liebowitz',
                      'Rōbért Èvān Lîęböwitz'):
         assert_equal(who.ratio(reference, accented), 100)

     # Two letters of the surname are swapped here, so the match must be
     # imperfect.
     transposed = who.ratio(reference, 'Rōbért Èvān Lęîböwitz')
     assert_true(transposed < 100)
Example #9
0
 def test_string(self):
     # Only relevant for python 2.X
     # The second name is passed through str() explicitly before comparison.
     score = who.ratio(self.name, str('Robert Liebowitz'))
     assert_equal(score, 100)
Example #10
0
 def test_equivalent_suffixes(self):
     junior = 'Robert Liebowitz Jr'

     # 'Jnr' must be treated like 'Jr' (perfect score) ...
     jnr_score = who.ratio(junior, 'Robert Liebowitz Jnr')
     assert_equal(jnr_score, 100)

     # ... while 'Snr' must produce a falsy ratio.
     snr_score = who.ratio(junior, 'Robert Liebowitz Snr')
     assert_false(snr_score)
Example #11
0
 def test_short_names(self):
     # 'Rob' only has to produce a truthy ratio against self.name.
     assert_true(who.ratio(self.name, 'Rob Liebowitz'))

     # These first names must not count as a perfect match.
     for short_form in ('Bert Liebowitz', 'Robbie Liebowitz'):
         assert_true(who.ratio(self.name, short_form) < 100)

     # 'Bobby' must outscore the placeholder first name.
     placeholder_score = who.ratio(self.name, 'xxxxx Liebowitz')
     bobby_score = who.ratio(self.name, 'Bobby Liebowitz')
     assert_true(placeholder_score < bobby_score)
Example #12
0
 def test_different_number_initials(self):
     # These shorter forms must score 100 against self.name.
     for shorter in ('Robert Liebowitz', 'R. Liebowitz'):
         assert_equal(who.ratio(self.name, shorter), 100)

     # These forms with two 'E.' initials must not be a perfect match.
     for longer in ('Robert E. E. Liebowitz', 'R. E. E. Liebowitz'):
         assert_true(who.ratio(self.name, longer) < 100)

     # Spacing between initials must not matter.
     compact, spaced = 'R.E.E. Liebowitz', 'R. E. E. Liebowitz'
     assert_equal(who.ratio(compact, spaced), 100)
Example #13
0
def index():
    """Return the ``who.ratio`` score of query parameters name1 and name2.

    Both values are read with ``request.args.get`` and the score is
    returned as a string.
    """
    query = request.args
    first = query.get('name1')
    second = query.get('name2')
    score = who.ratio(first, second)
    return str(score)
Example #14
0
from queue import PriorityQueue
import pandas as pd
from whoswho import who
from tqdm import tqdm
from itertools import count

# Candidate pool and the list of names that need a match.
repec_authors = pd.read_csv('csv/authors.csv')
top50_authors = pd.read_csv('csv/top50.csv')

# Monotonic counter used as the second tuple element in the queue, so ties
# on score are broken by insertion order and the dicts are never compared.
tiebreaker = count()
match_results = []
for _, row in tqdm(top50_authors.iterrows(), total=top50_authors.shape[0]):
    name = row['Name']
    sorted_matches = PriorityQueue()
    for _, r in tqdm(repec_authors.iterrows(), total=repec_authors.shape[0]):
        score = who.ratio(name, r['author_name'], 'strict')
        # Later keys win, so a 'score' column in the CSV would override the
        # computed score (same as the previous dict.update behaviour).
        candidate = {'score': score, **r.to_dict()}
        # Negate the score: PriorityQueue pops the smallest item first.
        sorted_matches.put((-score, next(tiebreaker), candidate))

    match_result = row.to_dict()
    # PriorityQueue.get() blocks forever on an empty queue, so never ask for
    # more entries than were queued (matters when there are < 3 candidates).
    available = min(3, sorted_matches.qsize())
    for rank in range(1, available + 1):
        match = sorted_matches.get_nowait()[-1]
        for key, val in match.items():
            match_result[f'match_{rank}_{key}'] = val
    match_results.append(match_result)

# One output row per input name, with match_<rank>_<column> fields appended.
df = pd.DataFrame(match_results)
df.to_csv('csv/top50_matched.csv', index=False)