def get_matches(school, source):
    """Score every candidate row from a source's results CSV against `school`.

    Reads ../../teams/<source>/results_<source>.csv, builds a match object per
    data row, computes a Jaro-Winkler similarity against the school, appends
    (match, score) to school.matches_<source>, then sorts that list.

    Params:
        school: record with school_name / representation attributes and
                per-source matches_<source> lists plus a sort_lists() method
        source: data-source name (e.g. "rivals"); selects file and attribute

    NOTE(review): `reference_source` is a module-level global defined
    elsewhere in this file -- confirm it is set before calling.
    """
    with open('../../teams/' + source + '/results_' + source + '.csv') as csv_file:
        csv_reader = csv.reader(csv_file)
        is_head_row = True
        for row in csv_reader:
            if is_head_row:  # skip the one head row
                is_head_row = False
                continue
            my_match = match.match(source, row)
            # IMPORTANT LINE - CALCULATES SIMILARITY SCORE
            # rltk.jaro_winkler_similarity(s1, s2, <threshold for invoking prefix
            # score>, <scaling factor for prefix score>, <length of prefix score>)
            if (source == "rivals" or reference_source == "rivals"):
                # rivals has no "representation", so fall back to school name
                # distance_score = rltk.levenshtein_similarity(school.school_name, my_match.school_name)
                distance_score = rltk.jaro_winkler_similarity(
                    school.school_name, my_match.school_name, 0.6, 0.25, 1)
            else:
                # distance_score = rltk.levenshtein_similarity(school.representation, my_match.representation)
                distance_score = rltk.jaro_winkler_similarity(
                    school.representation, my_match.representation, 0.6, 0.25, 1)
            # FIX: replaced eval("school.matches_<source>.append(...)") with
            # getattr -- same attribute access, no code-string execution.
            getattr(school, 'matches_' + source).append((my_match, distance_score))
    # FIX: dropped the redundant csv_file.close(); the `with` block closes it.
    school.sort_lists(source)
def confirmed_match(name_1, name_2, school_1, school_2, hometown_1, hometown_2, pos_1, pos_2, source_1, source_2):
    """Decide whether two player records refer to the same person.

    Both names must be present and similar above jaro_threshold_players.
    When both hometowns are present, a hometown similarity below
    jaro_threshold_hometown vetoes the match.  Schools must then
    corroborate; when either school is missing, positions sharing a
    substring of at least position_substring_min characters corroborate
    instead.  Thresholds come from module-level globals.
    """
    # A match requires both names.
    if not (name_1 and name_2):
        return False

    # Hometown veto: present-but-dissimilar hometowns rule the pair out.
    if hometown_1 and hometown_2:
        town_a = hometown_1.strip(" '()").lower()
        town_b = hometown_2.strip(" '()").lower()
        if rltk.jaro_winkler_similarity(town_a, town_b) < jaro_threshold_hometown:
            return False

    # Name similarity gate.
    clean_a = name_1.strip(" '()").lower()
    clean_b = name_2.strip(" '()").lower()
    if not (rltk.jaro_winkler_similarity(clean_a, clean_b) > jaro_threshold_players):
        return False

    # Without both schools, fall back to comparing positions.
    if not (school_1 and school_2):
        if pos_1 and pos_2:
            # match iff pos_1 and pos_2 share a substring of 3+ chars
            longest = SequenceMatcher(None, pos_1, pos_2).find_longest_match(
                0, len(pos_1), 0, len(pos_2))[2]
            return longest >= position_substring_min
        return False  # no school and no position -> no match

    # School corroboration; a '/' means a record lists two schools, and
    # matching either half counts.
    if school_match(school_1, school_2, source_1, source_2):
        return True
    if '/' in school_1:
        for half in school_1.split('/')[:2]:
            if school_match(half, school_2, source_1, source_2):
                return True
    elif '/' in school_2:
        for half in school_2.split('/')[:2]:
            if school_match(school_1, half, source_1, source_2):
                return True
    return False
def featurize_record_pair(r1, r2, freq, doc_size):
    """
    Featurize a record pair and return a Series of the feature vectors

    Params:
        r1: (rltk.Record) record 1
        r2: (rltk.Record) record 2
        freq: (Dict) corpus frequency
        doc_size: (int) total size of dataset
    """
    # dtype=object silences the empty-Series dtype deprecation; values are mixed.
    fv = pd.Series(dtype=object)
    fv['id1'] = r1.id
    fv['id2'] = r2.id
    # Ground-truth label from the module-level gt table.
    if gt.is_member(r1.id, r2.id):
        fv['label'] = 1
    else:
        fv['label'] = 0
    # BUG FIX: original test was `(r1.manufacturer == '' or None)`, which
    # evaluates to just `r1.manufacturer == ''` ('' == x or None collapses),
    # so a None manufacturer fell into the else branch and crashed the
    # similarity calls.  Check for both '' and None explicitly.
    if r1.manufacturer in ('', None) or r2.manufacturer in ('', None):
        fv['manufacturer_jaro_winkler'] = None
        fv['manufacturer_levenshtien'] = None
        fv['manufacturer_jaccard'] = None
    else:
        fv['manufacturer_jaro_winkler'] = rltk.jaro_winkler_similarity(r1.manufacturer, r2.manufacturer)
        fv['manufacturer_levenshtien'] = rltk.levenshtein_similarity(r1.manufacturer, r2.manufacturer)
        fv['manufacturer_jaccard'] = rltk.jaccard_index_similarity(
            set(tokenize(r1.manufacturer)), set(tokenize(r2.manufacturer)))
    if r1.price is None or r2.price is None:
        fv['price_difference'] = None
    else:
        # NOTE(review): raises ZeroDivisionError if both prices are 0 -- confirm
        # upstream cleaning guarantees positive prices.
        fv['price_difference'] = abs(r1.price - r2.price) / max(r1.price, r2.price)
    fv['name_jaccard'] = rltk.jaccard_index_similarity(set(r1.name_tokenized), set(r2.name_tokenized))
    fv['name_jaro_winkler'] = rltk.jaro_winkler_similarity(
        " ".join(r1.name_tokenized), " ".join(r2.name_tokenized))
    fv['name_trigram'] = rltk.ngram_distance(r1.name, r2.name, 3)
    if r1.description_tokenized is None or r2.description_tokenized is None:
        fv['desc_tf_idf'] = None
        fv['desc_trigram'] = None
        fv['desc_jaccard'] = None
    else:
        fv['desc_tf_idf'] = rltk.tf_idf_similarity(
            r1.description_tokenized, r2.description_tokenized, freq, doc_size)
        fv['desc_trigram'] = rltk.ngram_distance(
            " ".join(r1.description_tokenized), " ".join(r2.description_tokenized), 3)
        fv['desc_jaccard'] = rltk.jaccard_index_similarity(
            set(r1.description_tokenized), set(r2.description_tokenized))
    return fv
def school_match(school_1, school_2, source_1, source_2):
    """True when the two school strings agree above jaro_threshold_schools,
    comparing the raw names first and their canonical forms second."""
    # Cheap path: raw names may already be close enough.
    if rltk.jaro_winkler_similarity(school_1.lower(), school_2.lower()) > jaro_threshold_schools:
        return True
    # Otherwise normalise both through the per-source canonical lookup.
    canon_1 = get_canonical_source(school_1, source_1)
    canon_2 = get_canonical_source(school_2, source_2)
    return rltk.jaro_winkler_similarity(canon_1.lower(), canon_2.lower()) > jaro_threshold_schools
def name_string_similarity_1(r_imdb, r_afi):
    """Jaro-Winkler similarity over the first 8 characters of the two
    lowercased name strings; the AFI side is memoised in cached_names_1."""
    imdb_prefix = r_imdb.name_string.lower()[:8]
    try:
        afi_prefix = cached_names_1[r_afi]
    except KeyError:
        afi_prefix = r_afi.name_string.lower()[:8]
        cached_names_1[r_afi] = afi_prefix
    return rltk.jaro_winkler_similarity(imdb_prefix, afi_prefix)
def name_string_similarity_3(r_imdb, r_afi):
    """Order-insensitive name similarity: each name is lowercased, split on
    hyphens/commas/whitespace, sorted, and re-joined before Jaro-Winkler.
    The AFI side is memoised in cached_names_3."""
    def _canonical(name):
        # Sorting the tokens makes "Doe, John" and "John Doe" compare equal.
        return ''.join(sorted(re.split(r'[-,\s]+', name.lower())))

    key_imdb = _canonical(r_imdb.name_string)
    key_afi = cached_names_3.get(r_afi)
    if key_afi is None:
        key_afi = _canonical(r_afi.name_string)
        cached_names_3[r_afi] = key_afi
    return rltk.jaro_winkler_similarity(key_imdb, key_afi)
def school_match(school_1, school_2, source_1, source_2):
    """True when the canonical forms of the two school strings agree above
    jaro_threshold_schools."""
    canon_1 = get_canonical_source(school_1, source_1)
    canon_2 = get_canonical_source(school_2, source_2)
    score = rltk.jaro_winkler_similarity(canon_1.lower(), canon_2.lower())
    return score > jaro_threshold_schools
def confirmed_match(name_1, name_2, school_1, school_2, hometown_1, hometown_2, source_1, source_2):
    """Decide whether two player records refer to the same person.

    Requires names and schools on both sides.  A present-but-dissimilar
    hometown pair vetoes the match; otherwise names must clear
    jaro_threshold_players and the schools must corroborate (either half
    of a '/'-separated school list counts).
    """
    # Names and schools are all mandatory for this variant.
    if not (name_1 and name_2 and school_1 and school_2):
        return False

    # COMPARE HOMETOWNS -- veto when both exist but disagree.
    if hometown_1 and hometown_2:
        town_a = hometown_1.strip(" '()").lower()
        town_b = hometown_2.strip(" '()").lower()
        if rltk.jaro_winkler_similarity(town_a, town_b) < jaro_threshold_hometown:
            return False

    # COMPARE NAMES -- gate on the player-name threshold.
    clean_a = name_1.strip(" '()").lower()
    clean_b = name_2.strip(" '()").lower()
    if not (rltk.jaro_winkler_similarity(clean_a, clean_b) > jaro_threshold_players):
        return False

    # COMPARE SCHOOLS -- direct match, then each half of a '/' pair.
    if school_match(school_1, school_2, source_1, source_2):
        return True
    if '/' in school_1:
        for half in school_1.split('/')[:2]:
            if school_match(half, school_2, source_1, source_2):
                return True
    elif '/' in school_2:
        for half in school_2.split('/')[:2]:
            if school_match(school_1, half, source_1, source_2):
                return True
    return False
def name_similarity(r_imdb, r_afi):
    """Case-insensitive Jaro-Winkler similarity of two name strings.

    NOTE(review): despite the record-style parameter names, this variant
    receives plain strings -- it calls .lower() directly on the arguments.
    """
    return rltk.jaro_winkler_similarity(r_imdb.lower(), r_afi.lower())
def entire_school_similarity(r1, r2):
    """Similarity of the two records' full school strings: 1 for an exact
    match, otherwise the Jaro-Winkler score floored at 0."""
    school_a = r1.school_string
    school_b = r2.school_string
    if school_a == school_b:
        return 1
    score = rltk.jaro_winkler_similarity(school_a, school_b)
    return score if score > 0 else 0
def school_similarity(r1, r2):
    """Similarity of the first half of each record's school string: 1 for an
    exact prefix match, otherwise the Jaro-Winkler score floored at 0."""
    # Compare only the leading half of each name, presumably to tolerate
    # differing suffixes such as mascots or state abbreviations.
    half_a = r1.school_string[:int(len(r1.school_string) / 2)]
    half_b = r2.school_string[:int(len(r2.school_string) / 2)]
    if half_a == half_b:
        return 1
    score = rltk.jaro_winkler_similarity(half_a, half_b)
    return score if score > 0 else 0
def similarity_match_by_name(record1, record2):
    """Return (is_match, score) for two records compared by name.

    An exact (case-insensitive) full-name match short-circuits to (True, 1).
    Otherwise the score is a weighted blend:
        0.65 * Jaccard over the raw name-token sets (dirty tokens)
      + 0.35 * Jaro-Winkler over the lowercased full names (clean strings)
    and the pair matches when the blend exceeds 0.7.
    """
    full_a = record1.full_name_string.lower()
    full_b = record2.full_name_string.lower()
    if full_a == full_b:
        return True, 1
    jaccard_score = rltk.jaccard_index_similarity(record1.name_tokens, record2.name_tokens)
    jw_score = rltk.jaro_winkler_similarity(full_a, full_b)
    blended = 0.65 * jaccard_score + 0.35 * jw_score
    return blended > 0.7, blended
def er_task(block):
    """For each blocked key, link every IGDB game to its best-scoring G2A game.

    Params:
        block: dict mapping a block key to a pair (val[0], val[1]) where each
               side is a list of {game_key: game_name} dicts -- IGDB records in
               val[0], G2A records in val[1].

    Returns a defaultdict(list) mapping each block key to a list of
    {(igdb_key, igdb_name): (g2a_key, g2a_name, score)} entries; unmatched
    IGDB games get ('', '', -1).  Score = levenshtein + jaro_winkler (range
    [0, 2]); a best score must exceed 1.2 to count as a match.
    """
    st = time.time()
    similar = defaultdict(lambda: [])
    for i, (key, val) in enumerate(block.items()):
        # NOTE(review): (i + 1) % 1 == 0 is always true, so this prints every
        # iteration -- a larger stride (e.g. % 100) was likely intended.
        if (i + 1) % 1 == 0:
            print("time taken for {} is {}".format(i, time.time() - st))
        for igdb_obj in val[0]:
            for igdb_game_key, igdb_game_name in igdb_obj.items():
                # Track the best-scoring G2A candidate for this IGDB game.
                max_score = -1
                matching_key = ''
                matching_name = ''
                # Kept for inspection; not used after the loop.
                max_lev_score = -1
                max_jw_score = -1
                if len(val[1]) != 0:
                    for g2a_obj in val[1]:
                        for g2a_game_key, g2a_game_name in g2a_obj.items():
                            lev_score = rltk.levenshtein_similarity(
                                igdb_game_name, g2a_game_name)
                            jw_score = rltk.jaro_winkler_similarity(
                                igdb_game_name, g2a_game_name)
                            score = lev_score + jw_score
                            if score > max_score:
                                max_score = score
                                max_lev_score = lev_score
                                max_jw_score = jw_score
                                matching_key = g2a_game_key
                                matching_name = g2a_game_name
                    # Accept the best candidate only above the 1.2 threshold.
                    if max_score > 1.2:
                        similar[key].append({
                            (igdb_game_key, igdb_game_name):
                            (matching_key, matching_name, max_score)
                        })
                    else:
                        similar[key].append({
                            (igdb_game_key, igdb_game_name): ('', '', -1)
                        })
                else:
                    # No G2A records in this block: record an explicit non-match.
                    similar[key].append({
                        (igdb_game_key, igdb_game_name): ('', '', -1)
                    })
    print("total time taken: ", time.time() - st)
    return similar
def address_similarity(r_usn, r_cc): if r_usn.address is not None and r_cc.address is not None: address1 = r_usn.address address2 = r_cc.address if address1 and address2: address1 = list(map(lambda s: s.strip(), r_usn.address.split(','))) address1 = address1[:-1] + [address1[-1].split()[0]] address1 = [address1[0][:int(len(address1[0]) / 2)]] + address1[1:] address2 = [address2[0][:int(len(address2[0]) / 2)]] + address2[1:] address1 = ' '.join(address1) address2 = ' '.join(address2) similarity = rltk.jaro_winkler_similarity(address1, address2) if address1 == address2: return 1 elif similarity > 0: return similarity else: return 0 return 0
def title_similarity(title1: str, title2: str) -> float:
    """Jaro-Winkler similarity between two title strings."""
    score = rltk.jaro_winkler_similarity(title1, title2)
    return score
def compare_cpu(g2a_min_cpu_1, techpowerup_cpu, g2a_min_cpu_2=None, max_score_1=-1, max_score_2=-1):
    """Find the TechPowerUp CPU entry most similar to one or two G2A CPU names.

    Params:
        g2a_min_cpu_1: CPU name string ('' skips scoring for that side)
        techpowerup_cpu: path to a jsonlines file whose records look like
                         {key: {'name': ..., ...}}
        g2a_min_cpu_2: optional second CPU name; non-None selects the
                       two-CPU branch and a two-entry result dict
        max_score_1, max_score_2: starting floors for the running best scores

    Returns {'most_sim_cpu_1': ..., 'most_sim_cpu_2': ...} in the two-CPU
    branch or {'most_sim_cpu': ...} in the one-CPU branch; each entry holds
    name / sim_id / sim_score.  Score = levenshtein + jaro_winkler similarity
    (range [0, 2]); 1.2 is the acceptance threshold, below which name/sim_id
    come back empty.
    NOTE(review): when g2a_min_cpu_1 is None neither branch runs and the
    function implicitly returns None -- confirm callers handle that.
    NOTE(review): at exactly 1.2 the >= / <= branch conditions overlap; the
    first matching branch wins.
    """
    # techpowerup_cpu_reader = jsonlines.open(techpowerup_cpu, 'r')
    similar_id_1 = ''
    similar_id_2 = ''
    most_similar_cpu_1 = ''
    most_similar_cpu_2 = ''
    # Sentinel scores: stay below any real score when a side is skipped.
    score_1 = -9999
    score_2 = -9999
    with jsonlines.open(techpowerup_cpu, 'r') as techpowerup_cpu_reader:
        # --- Branch 1: two G2A CPU names provided ---
        if g2a_min_cpu_1 != None and g2a_min_cpu_2 != None:
            for cpu in techpowerup_cpu_reader:
                # Each record is a single-item dict: unpack its key and value.
                cpu_key, cpu_value = list(cpu.items())[0][0], list(
                    cpu.items())[0][1]
                if len(g2a_min_cpu_1) != 0:
                    score_1 = rltk.levenshtein_similarity(
                        g2a_min_cpu_1, cpu_value['name'].lower(
                        )) + rltk.jaro_winkler_similarity(
                            g2a_min_cpu_1, cpu_value['name'].lower())
                    '''if "Intel" in g2a_min_cpu_1: if cpu_value["Company"] == "Intel": score_1 = rltk.levenshtein_similarity(g2a_min_cpu_1,cpu_value['name']) elif "AMD" in g2a_min_cpu_1: if cpu_value["Company"] == 'AMD': score_1 = rltk.levenshtein_similarity(g2a_min_cpu_1,cpu_value['name']) else: score_1 = rltk.levenshtein_similarity(g2a_min_cpu_1,cpu_value['name'])'''
                if len(g2a_min_cpu_2) != 0:
                    score_2 = rltk.levenshtein_similarity(
                        g2a_min_cpu_2, cpu_value['name'].lower(
                        )) + rltk.jaro_winkler_similarity(
                            g2a_min_cpu_2, cpu_value['name'].lower())
                    '''if "Intel" in g2a_min_cpu_2: if cpu_value["Company"] == "Intel": score_2 = rltk.levenshtein_similarity(g2a_min_cpu_2,cpu_value['name']) elif "AMD" in g2a_min_cpu_2: if cpu_value["Company"] == 'AMD': score_2 = rltk.levenshtein_similarity(g2a_min_cpu_2,cpu_value['name']) else: score_2 = rltk.levenshtein_similarity(g2a_min_cpu_2,cpu_value['name'])'''
                    # score_2 = rltk.levenshtein_similarity(g2a_min_cpu_2,cpu_value['name'])
                # Track the running best candidate for each side.
                if score_1 > max_score_1:
                    max_score_1 = score_1
                    similar_id_1 = cpu_key
                    most_similar_cpu_1 = cpu_value['name']
                if score_2 > max_score_2:
                    max_score_2 = score_2
                    similar_id_2 = cpu_key
                    most_similar_cpu_2 = cpu_value['name']
            # Report each side only if its best score cleared the threshold.
            if max_score_1 >= 1.2 and max_score_2 >= 1.2:
                return {
                    'most_sim_cpu_1': {
                        'name': most_similar_cpu_1,
                        'sim_id': similar_id_1,
                        'sim_score': max_score_1
                    },
                    'most_sim_cpu_2': {
                        'name': most_similar_cpu_2,
                        'sim_id': similar_id_2,
                        'sim_score': max_score_2
                    }
                }
            elif max_score_1 >= 1.2 and max_score_2 <= 1.2:
                return {
                    'most_sim_cpu_1': {
                        'name': most_similar_cpu_1,
                        'sim_id': similar_id_1,
                        'sim_score': max_score_1
                    },
                    'most_sim_cpu_2': {
                        'name': '',
                        'sim_id': '',
                        'sim_score': max_score_2
                    }
                }
            elif max_score_1 <= 1.2 and max_score_2 >= 1.2:
                return {
                    'most_sim_cpu_1': {
                        'name': '',
                        'sim_id': '',
                        'sim_score': max_score_1
                    },
                    'most_sim_cpu_2': {
                        'name': most_similar_cpu_2,
                        'sim_id': similar_id_2,
                        'sim_score': max_score_2
                    }
                }
            else:
                return {
                    'most_sim_cpu_1': {
                        'name': '',
                        'sim_id': '',
                        'sim_score': max_score_1
                    },
                    'most_sim_cpu_2': {
                        'name': '',
                        'sim_id': '',
                        'sim_score': max_score_2
                    }
                }
        # --- Branch 2: only the first G2A CPU name provided ---
        if g2a_min_cpu_1 != None and g2a_min_cpu_2 == None:
            for cpu in techpowerup_cpu_reader:
                cpu_key, cpu_value = list(cpu.items())[0][0], list(
                    cpu.items())[0][1]
                if len(g2a_min_cpu_1) != 0:
                    # NOTE(review): unlike branch 1, the levenshtein side uses
                    # the un-lowercased cpu_value['name'] here -- likely an
                    # oversight; confirm before relying on score parity.
                    score_1 = rltk.levenshtein_similarity(
                        g2a_min_cpu_1,
                        cpu_value['name']) + rltk.jaro_winkler_similarity(
                            g2a_min_cpu_1, cpu_value['name'].lower())
                    '''if "Intel" in g2a_min_cpu_1: if cpu_value["Company"] == "Intel": score_1 = rltk.levenshtein_similarity(g2a_min_cpu_1,cpu_value['name']) elif "AMD" in g2a_min_cpu_1: if cpu_value["Company"] == 'AMD': score_1 = rltk.levenshtein_similarity(g2a_min_cpu_1,cpu_value['name']) else: score_1 = rltk.levenshtein_similarity(g2a_min_cpu_1,cpu_value['name'])'''
                    # score_1 = rltk.levenshtein_similarity(g2a_min_cpu_1, cpu_value['name'])
                if score_1 > max_score_1:
                    max_score_1 = score_1
                    similar_id_1 = cpu_key
                    most_similar_cpu_1 = cpu_value['name']
            if max_score_1 >= 1.2:
                return {
                    'most_sim_cpu': {
                        'name': most_similar_cpu_1,
                        'sim_id': similar_id_1,
                        'sim_score': max_score_1
                    }
                }
            else:
                return {
                    'most_sim_cpu': {
                        'name': '',
                        'sim_id': '',
                        'sim_score': max_score_1
                    }
                }
def name_string_similarity(prod1, prod2):
    """Jaro-Winkler similarity between the two products' name strings."""
    return rltk.jaro_winkler_similarity(prod1.name_string, prod2.name_string)
def name_similarity(r_imdb, r_afi):
    """Jaro-Winkler similarity between the IMDB and AFI records' name strings
    (case-sensitive; no normalisation is applied here)."""
    return rltk.jaro_winkler_similarity(r_imdb.name_string, r_afi.name_string)
        # Phone column of the raw row.  NOTE(review): this `return` is the
        # tail of a @property whose `def` line is above this chunk -- the
        # full definition is not visible here.
        return self.raw_object['Phone']

    @property
    def cuisine(self):
        # Cuisine column of the raw row.
        return self.raw_object['Cuisine']


# Rebind ds1/ds2 from pandas DataFrames to in-memory RLTK datasets using the
# record classes defined above.
ds1 = rltk.Dataset(reader=rltk.DataFrameReader(ds1),
                   record_class=Record1,
                   adapter=rltk.MemoryKeyValueAdapter())
ds2 = rltk.Dataset(reader=rltk.DataFrameReader(ds2),
                   record_class=Record2,
                   adapter=rltk.MemoryKeyValueAdapter())
# Dead code from an earlier experiment with cuisine-based blocking; kept as-is.
'''bg = rltk.HashBlockGenerator() blocks = bg.generate(bg.block(ds1, property_='cuisine'), bg.block(ds2, property_='cuisine')) pairs = rltk.get_record_pairs(ds1, ds2, block=blocks)'''
# No blocking: iterate the full cross-product of record pairs.
pairs = rltk.get_record_pairs(ds1, ds2)
f = open('similarities.txt', 'w+')
for r1, r2 in pairs:
    # Per-pair feature scores: Levenshtein on address, Jaro-Winkler on phone
    # and cuisine; written as one CSV row per pair.
    a_d = rltk.levenshtein_similarity(r1.address, r2.address)
    p_d = rltk.jaro_winkler_similarity(r1.phone, r2.phone)
    c_d = rltk.jaro_winkler_similarity(r1.cuisine, r2.cuisine)
    f.write(r1.id + "," + r2.id + "," + str(a_d) + "," + str(p_d) + "," +
            str(c_d) + "\n")
f.close()
def main():
    """Entity-link DBLP co-author names against the professor list.

    Loads dblp_final_JSON.json, builds each professor's co-author list,
    fuzzy-matches co-author last names against professor last names with
    RLTK (blocked on first name), rewrites near-duplicate co-author names
    via the resulting mapping, and writes co_authors.json.
    """
    with open("dblp_final_JSON.json", "r") as f:
        dblp_dict = json.load(f)
    # Unique set of professor names (dblp_dict entries are dicts with a
    # 'person' field and a 'papers' list).
    professors = set()
    for key in dblp_dict:
        professors.add(key['person'])
    #print(professors)
    #print(len(professors))
    # Map each professor to the co-authors of all their papers, excluding
    # the professor themselves (remove() drops only the first occurrence).
    coauthor_dict = defaultdict(list)
    for key in dblp_dict:
        author = key['person']
        for items in key['papers']:
            co_authors = items['co_authors']
            if author in co_authors:
                co_authors.remove(author)
            if co_authors:
                coauthor_dict[author].extend(co_authors)
    # Flatten all co-author names (duplicates intentionally preserved).
    list_of_coauthors = []
    for key in coauthor_dict:
        list_of_coauthors.extend(coauthor_dict[key])
    #print(len(list_of_coauthors))
    ### String / Data Matching for Entity linking using RLTK
    ### Remove duplicates in the coauthor_dict using String Matching
    ### Compare with professors and do entity linking / remove duplicates
    df1 = pd.DataFrame(list(professors), columns=['name'])
    #print(df1)
    df2 = pd.DataFrame(list_of_coauthors, columns=['name'])
    #print(len(df2))
    # Split names into first token and remainder; ids are 1-based strings.
    df1['first_name'] = df1.apply(lambda x: x['name'].split()[0], axis=1)
    df1['last_name'] = df1.apply(lambda x: ' '.join(x['name'].split()[1:]),
                                 axis=1)
    df1['id'] = (df1.index + 1).astype(str)
    #print(df1)
    df2['first_name'] = df2.apply(lambda x: x['name'].split()[0], axis=1)
    df2['last_name'] = df2.apply(lambda x: ' '.join(x['name'].split()[1:]),
                                 axis=1)
    df2['id'] = (df2.index + 1).astype(str)
    # NOTE(review): blocking uses property_='fname' and the loop reads
    # r1.fname / r1.lname, while the DataFrames carry first_name/last_name --
    # presumably Record1/Record2 (defined elsewhere) map between the two;
    # confirm against those class definitions.
    ds1 = rltk.Dataset(reader=rltk.DataFrameReader(df1),
                       record_class=Record1,
                       adapter=rltk.MemoryKeyValueAdapter())
    ds2 = rltk.Dataset(reader=rltk.DataFrameReader(df2),
                       record_class=Record2,
                       adapter=rltk.MemoryKeyValueAdapter())
    # Block on first name so only same-first-name pairs are compared.
    bg = rltk.HashBlockGenerator()
    block = bg.generate(bg.block(ds1, property_='fname'),
                        bg.block(ds2, property_='fname'))
    pairs = rltk.get_record_pairs(ds1, ds2, block=block)
    num_pairs = 0
    sim_pairs = []
    sim_dict = {}
    for r1, r2 in pairs:
        num_pairs += 1
        sim = rltk.jaro_winkler_similarity(r1.lname, r2.lname)
        # Strictly between 0.9 and 1: near-duplicates only; exact matches
        # (sim == 1) are deliberately excluded.
        if 0.9 < sim < 1:
            sim_pairs.append(
                (r1.fname + ' ' + r1.lname, r2.fname + ' ' + r2.lname))
            # Maps the professor-side spelling (r1) to the co-author-side
            # spelling (r2).  NOTE(review): the rewrite loop below looks up
            # co-author names in these keys -- confirm the mapping direction
            # is the intended canonicalisation.
            sim_dict[r1.fname + ' ' + r1.lname] = r2.fname + ' ' + r2.lname
        #print(r1.lname,r2.lname,sim)
    #print(sim_pairs)
    #print("Blocking using Cuisine - Number of pairs:",num_pairs)
    # Rewrite co-author entries whose spelling matched a sim_dict key.
    for key in coauthor_dict:
        lis = coauthor_dict[key]
        for ind in range(len(lis)):
            if lis[ind] in sim_dict:
                lis[ind] = sim_dict[lis[ind]]
    with open("co_authors.json", "w") as jf:
        json.dump(coauthor_dict, jf, indent=2)