def test_different_initials(self):
    """Check scores for candidates whose initials differ from the reference.

    ``self.name`` is the reference name provided by the test fixture, which
    is defined outside this block.
    """
    reference = self.name

    # None of these initial-based variants may reach a perfect score.
    imperfect_variants = (
        'E. R. Liebowitz',
        'E. Liebowitz',
        'R. V. Liebowitz',
        'O. E. Liebowitz',
    )
    for variant in imperfect_variants:
        assert_true(who.ratio(reference, variant) < 100)

    # 'E. R. Liebowitz' has to score strictly lower than each of these.
    higher_scoring = ('E. E. Liebowitz', 'R. R. Liebowitz', 'E. Liebowitz')
    for better in higher_scoring:
        assert_true(
            who.ratio(reference, 'E. R. Liebowitz')
            < who.ratio(reference, better)
        )
def test_nickname(self):
    """Check scoring of a full name that carries a quoted nickname."""
    quoted = 'Robert "Evan" Liebowitz'
    by_nickname = 'Evan Liebowitz'

    # The nickname form scores 100 whichever side it is passed on.
    for left, right in ((quoted, by_nickname), (by_nickname, quoted)):
        assert_equal(who.ratio(left, right), 100)

    for imperfect in ('Wrongbert Lieobwitz', 'Robert Evan'):
        assert_true(who.ratio(quoted, imperfect) < 100)

    # With the 'check_nickname' option off, the nickname form drops below 100.
    score_without_nickname_check = who.ratio(
        quoted, by_nickname, options={'check_nickname': False})
    assert_true(score_without_nickname_check < 100)

    # 'xvax' beats 'xxxx' under default settings, but 'xvax' scored with the
    # 'strict' setting equals the default score for 'xxxx'.
    assert_true(
        who.ratio(quoted, 'xxxx Liebowitz')
        < who.ratio(quoted, 'xvax Liebowitz')
    )
    assert_equal(
        who.ratio(quoted, 'xxxx Liebowitz'),
        who.ratio(quoted, 'xvax Liebowitz', 'strict'),
    )
def string_disambiguation(value, candidates, name=False):
    """Return the candidate entity whose label is closest to ``value``.

    A candidate's label is the text following the first ``'resource'`` in
    the entity string, with the next character dropped and underscores
    replaced by spaces.

    Args:
        value: The string to match against the candidate labels.
        candidates: Iterable of entity strings that contain ``'resource'``.
        name: When true, candidates are ranked by the tuple
            ``(who.ratio(label, value), -levenshtein(...))``, so the
            ``who.ratio`` score decides first and the smaller
            case-insensitive edit distance breaks ties. When false, only
            the case-insensitive edit distance is used and the smallest
            one wins.

    Returns:
        The winning entity string, or ``None`` when ``candidates`` is empty
        or no candidate qualifies. On ties the earliest candidate is kept.

    Raises:
        IndexError: If a candidate does not contain ``'resource'``.
    """
    def get_label(x):
        return x.split('resource')[1][1:].replace('_', ' ')

    best_match = None

    if name:
        # Tuples compare element by element: similarity first, then the
        # negated edit distance.
        best_dist = (-1, -float('inf'))
        for entity in candidates:
            label = get_label(entity)
            name_sim = who.ratio(label, value)
            neg_lev_dist = -levenshtein(value.lower(), label.lower())
            dist = (name_sim, neg_lev_dist)
            # The first test only rejects a candidate if levenshtein()
            # returned a negative number.
            if dist[1] <= 0 and dist > best_dist:
                best_dist = dist
                best_match = entity
    else:
        best_dist = 999999
        for entity in candidates:
            label = get_label(entity)
            # The current best distance is passed as a third argument --
            # presumably a cut-off; confirm against the levenshtein
            # implementation in use.
            dist = levenshtein(value.lower(), label.lower(), best_dist)
            # Negative results are ignored (presumably "cut-off exceeded").
            if 0 <= dist < best_dist:
                best_dist = dist
                best_match = entity

    return best_match
def test_suffixes(self):
    """Check how name suffixes influence the score against a 'Jr' name."""
    name = 'Robert Liebowitz Jr'

    # A missing, identical, or 'PhD' suffix still gives a perfect score.
    perfect_variants = (
        'Robert Liebowitz',
        'Robert Liebowitz Jr',
        'Robert Liebowitz, PhD',
    )
    for variant in perfect_variants:
        assert_equal(who.ratio(name, variant), 100)

    # A 'Sr' suffix yields a falsy score, with or without 'PhD'.
    for conflicting in ('Robert Liebowitz, Sr', 'Robert Liebowitz, Sr, PhD'):
        assert_false(who.ratio(name, conflicting))

    assert_equal(who.ratio(name, 'Robert Liebowitz, Jr, PhD'), 100)

    # Suffix doesn't change a match
    score_with_suffix = who.ratio(name, 'Zachary Liebowitz, Jr')
    score_without_suffix = who.ratio(name, 'Zachary Liebowitz')
    assert_equal(score_with_suffix, score_without_suffix)
def test_titles(self):
    """Check how leading titles influence the score against a 'Mr.' name."""
    name = 'Mr. Robert Liebowitz'

    # No title, 'Sir' and 'Dr.' all give a perfect score.
    perfect_variants = (
        'Robert Liebowitz',
        'Sir Robert Liebowitz',
        'Dr. Robert Liebowitz',
    )
    for variant in perfect_variants:
        assert_equal(who.ratio(name, variant), 100)

    # 'Mrs.' yields a falsy score.
    assert_false(who.ratio(name, 'Mrs. Robert Liebowitz'))

    # Title doesn't change a match
    score_with_title = who.ratio(name, 'Dr. Zachary Liebowitz')
    score_without_title = who.ratio(name, 'Zachary Liebowitz')
    assert_equal(score_with_title, score_without_title)
# Score every entry of `choices` against each of the two reference names.
# NOTE(review): `p` is presumably fuzzywuzzy's `process` module and `w` is
# whoswho's `who` (both are named in the comments below) -- confirm against
# the imports. Each item of `s` is read as item[0] = compared name,
# item[1] = score.
s = p.extract(s1, choices, limit=len(choices))
q = p.extract(s2, choices, limit=len(choices))
# Flag: set to 1 as soon as one name is rejected.
error = 0
# Running total of the scores of all names compared with the 1st reference name.
d1 = 0
print(s, q)
for i in s:
    d1 += i[1]
    # If the score is below 70, fall back to the whoswho library, since it
    # gives a better result when a name contains an acronym.
    if i[1] < 70:
        q1 = (w.ratio(s1, i[0]))
        q2 = (w.ratio(s2, i[0]))
        print(i[0])
        # If both whoswho scores are still below 60, block the user and stop.
        if q1 < 60 and q2 < 60:
            print('blocked')
            error = 1
            break
        else:
            # At least one whoswho score is >= 60: take the fuzzywuzzy score
            # back out of the total and add the better whoswho score instead.
            d1 -= i[1]
            d1 += max(q1, q2)
            error = 0
# If all the names passed for the 1st reference name, the same process is
# repeated for the second reference name.
def test_name_and_initials(self):
    """Mixes of full names and initials score 100 against ``self.name``.

    ``self.name`` is the reference name provided by the test fixture, which
    is defined outside this block.
    """
    variants = ('R. Evan Liebowitz', 'Robert E. Liebowitz', 'R. E. Liebowitz')
    for variant in variants:
        assert_equal(who.ratio(self.name, variant), 100)
def test_unicode(self):
    """Check scoring of candidates that contain non-ASCII characters.

    ``self.name`` is the reference name provided by the test fixture, which
    is defined outside this block.
    """
    name = self.name

    # An accented leading word and an accented spelling both score 100.
    for accented in ('attaché Robert Evan Liebowitz', 'Rōbért Èvān Lîęböwitz'):
        assert_equal(who.ratio(name, accented), 100)

    # Same accented spelling with 'îę' swapped to 'ęî' must fall below 100.
    transposed = 'Rōbért Èvān Lęîböwitz'
    assert_true(who.ratio(name, transposed) < 100)
def test_string(self):
    """A candidate wrapped in ``str()`` scores 100 against ``self.name``."""
    # Only relevant for python 2.X
    native_string = str('Robert Liebowitz')
    score = who.ratio(self.name, native_string)
    assert_equal(score, 100)
def test_equivalent_suffixes(self):
    """'Jnr' scores 100 against a 'Jr' name; 'Snr' yields a falsy score."""
    name = 'Robert Liebowitz Jr'

    same_suffix_score = who.ratio(name, 'Robert Liebowitz Jnr')
    assert_equal(same_suffix_score, 100)

    other_suffix_score = who.ratio(name, 'Robert Liebowitz Snr')
    assert_false(other_suffix_score)
def test_short_names(self):
    """Check scoring of shortened or familiar forms of the first name.

    ``self.name`` is the reference name provided by the test fixture, which
    is defined outside this block.
    """
    # 'Rob' only needs to produce a truthy score.
    assert_true(who.ratio(self.name, 'Rob Liebowitz'))

    # These forms must stay below a perfect score.
    for short_form in ('Bert Liebowitz', 'Robbie Liebowitz'):
        assert_true(who.ratio(self.name, short_form) < 100)

    # 'Bobby' must still outrank a placeholder first name.
    assert_true(
        who.ratio(self.name, 'xxxxx Liebowitz')
        < who.ratio(self.name, 'Bobby Liebowitz')
    )
def test_different_number_initials(self):
    """Check scoring when candidates carry fewer or more name parts.

    ``self.name`` is the reference name provided by the test fixture, which
    is defined outside this block.
    """
    # Candidates without any middle part still score 100.
    for shorter in ('Robert Liebowitz', 'R. Liebowitz'):
        assert_equal(who.ratio(self.name, shorter), 100)

    # Candidates with two middle initials fall below 100.
    for longer in ('Robert E. E. Liebowitz', 'R. E. E. Liebowitz'):
        assert_true(who.ratio(self.name, longer) < 100)

    # Spacing between initials makes no difference.
    assert_equal(who.ratio('R.E.E. Liebowitz', 'R. E. E. Liebowitz'), 100)
def index():
    """Return the ``who.ratio`` score for the ``name1`` and ``name2`` query args.

    Returns:
        The score converted to a string. When either query parameter is
        absent, a ``(message, 400)`` tuple is returned instead.
        NOTE(review): the tuple form assumes a Flask-style framework that
        accepts ``(body, status)`` from a view -- confirm.
    """
    name1 = request.args.get('name1')
    name2 = request.args.get('name2')

    # args.get() gives None for an absent parameter. Reject the request
    # rather than scoring None as if it were a name.
    missing = [
        key for key, supplied in (('name1', name1), ('name2', name2))
        if supplied is None
    ]
    if missing:
        return 'Missing query parameter(s): ' + ', '.join(missing), 400

    return str(who.ratio(name1, name2))
# NOTE(review): PriorityQueue and count are no longer used by the code below;
# the imports are kept in case other code in this file relies on them.
import heapq
from itertools import count
from queue import PriorityQueue

import pandas as pd
from tqdm import tqdm
from whoswho import who

# How many of the best-scoring candidates are recorded per top-50 author.
TOP_MATCHES = 3

repec_authors = pd.read_csv('csv/authors.csv')
top50_authors = pd.read_csv('csv/top50.csv')

match_results = []
for _, row in tqdm(top50_authors.iterrows(), total=top50_authors.shape[0]):
    name = row['Name']

    # Lazily score every candidate row with the 'strict' setting.
    scored_candidates = (
        (who.ratio(name, r['author_name'], 'strict'), r)
        for _, r in tqdm(repec_authors.iterrows(),
                         total=repec_authors.shape[0])
    )
    # nlargest keeps only TOP_MATCHES items in memory and never blocks when
    # there are fewer candidates than that. Candidates with equal scores
    # keep their file order.
    best = heapq.nlargest(
        TOP_MATCHES, scored_candidates, key=lambda pair: pair[0])

    match_result = row.to_dict()
    for rank, (score, candidate) in enumerate(best, start=1):
        # 'score' comes first; a candidate column with the same name
        # overrides it.
        match = {'score': score, **candidate.to_dict()}
        for key, field in match.items():
            match_result[f'match_{rank}_{key}'] = field
    match_results.append(match_result)

# One output row per top-50 author: its own columns followed by
# match_<rank>_<column> for each recorded candidate.
df = pd.DataFrame(match_results)
df.to_csv('csv/top50_matched.csv', index=False)