def SimilarityScore(record1, record2):
    """Weighted similarity of two restaurant records.

    Combines name (Jaccard), address (Levenshtein) and exact phone match:
    0.4 * phone + 0.4 * name + 0.2 * address.
    """
    name_sim = rltk.jaccard_index_similarity(record1.name, record2.name)
    address_sim = rltk.levenshtein_similarity(record1.address, record2.address)
    # Computed but not used in the final score (kept from the original).
    cuisine_sim = rltk.levenshtein_similarity(record1.cuisine, record2.cuisine)
    # phone = rltk.levenshtein_similarity(record1.phone, record2.phone)
    # Exact phone equality is treated as a binary feature.
    phone_sim = 1. if record1.phone == record2.phone else 0.
    # Weight-tuning history:
    # 0.7 0.2 0.1 > 0.8  -> 104 matches
    # 0.4 0.4 0.2 > 0.59 -> 106 matches
    # 0.4 0.4 0.2 > 0.53 -> 113 matches
    return 0.4 * phone_sim + 0.4 * name_sim + 0.2 * address_sim
def name_string_similarity_4(r_imdb, r_afi):
    """Levenshtein similarity of names normalized by lowercasing,
    splitting on '-', ',' and whitespace, sorting the tokens and
    concatenating them (order-insensitive comparison).

    The AFI side is memoized in cached_names_4 keyed by record.
    """
    imdb_key = ''.join(sorted(re.split(r'[-,\s]+', r_imdb.name_string.lower())))
    try:
        afi_key = cached_names_4[r_afi]
    except KeyError:
        afi_key = ''.join(sorted(re.split(r'[-,\s]+', r_afi.name_string.lower())))
        cached_names_4[r_afi] = afi_key
    return rltk.levenshtein_similarity(imdb_key, afi_key)
def name_string_similarity_2(r_imdb, r_afi):
    """Levenshtein similarity of the last 8 characters of the two
    lowercased name strings; the AFI side is memoized in cached_names_2.
    """
    imdb_tail = r_imdb.name_string.lower()[-8:]
    try:
        afi_tail = cached_names_2[r_afi]
    except KeyError:
        afi_tail = r_afi.name_string.lower()[-8:]
        cached_names_2[r_afi] = afi_tail
    return rltk.levenshtein_similarity(imdb_tail, afi_tail)
def featurize_record_pair(r1, r2, freq, doc_size):
    """
    Featurize a record pair and return a Series of the feature vectors

    Params:
        r1: (rltk.Record) record 1
        r2: (rltk.Record) record 2
        freq: (Dict) corpus frequency
        doc_size: (int) total size of dataset
    """
    # Explicit dtype avoids the deprecated default-dtype empty Series.
    fv = pd.Series(dtype=object)
    fv['id1'] = r1.id
    fv['id2'] = r2.id
    # Ground-truth label from the gt membership table.
    fv['label'] = 1 if gt.is_member(r1.id, r2.id) else 0

    # BUG FIX: the original `r1.manufacturer == '' or None` never detected
    # None (the `or None` operand is falsy). `not x` covers '' and None.
    # (Key 'manufacturer_levenshtien' keeps its original spelling: it is
    # an output column name consumers may depend on.)
    if not r1.manufacturer or not r2.manufacturer:
        fv['manufacturer_jaro_winkler'] = None
        fv['manufacturer_levenshtien'] = None
        fv['manufacturer_jaccard'] = None
    else:
        fv['manufacturer_jaro_winkler'] = rltk.jaro_winkler_similarity(
            r1.manufacturer, r2.manufacturer)
        fv['manufacturer_levenshtien'] = rltk.levenshtein_similarity(
            r1.manufacturer, r2.manufacturer)
        fv['manufacturer_jaccard'] = rltk.jaccard_index_similarity(
            set(tokenize(r1.manufacturer)), set(tokenize(r2.manufacturer)))

    if r1.price is None or r2.price is None:
        fv['price_difference'] = None
    else:
        # Relative price difference, normalized by the larger price.
        fv['price_difference'] = abs(r1.price - r2.price) / max(r1.price, r2.price)

    fv['name_jaccard'] = rltk.jaccard_index_similarity(
        set(r1.name_tokenized), set(r2.name_tokenized))
    fv['name_jaro_winkler'] = rltk.jaro_winkler_similarity(
        " ".join(r1.name_tokenized), " ".join(r2.name_tokenized))
    # NOTE: ngram_distance is a distance (0 = identical), unlike the
    # *_similarity features above.
    fv['name_trigram'] = rltk.ngram_distance(r1.name, r2.name, 3)

    if r1.description_tokenized is None or r2.description_tokenized is None:
        fv['desc_tf_idf'] = None
        fv['desc_trigram'] = None
        fv['desc_jaccard'] = None
    else:
        fv['desc_tf_idf'] = rltk.tf_idf_similarity(
            r1.description_tokenized, r2.description_tokenized, freq, doc_size)
        fv['desc_trigram'] = rltk.ngram_distance(
            " ".join(r1.description_tokenized),
            " ".join(r2.description_tokenized), 3)
        fv['desc_jaccard'] = rltk.jaccard_index_similarity(
            set(r1.description_tokenized), set(r2.description_tokenized))
    return fv
def main():
    """Match ESPN schools against SportsReference schools by Levenshtein
    similarity and print each ESPN entity with its best-scoring match.

    Reads 'results_sportsref.csv' and 'results_espn.csv'; writes the
    header row of 'levenshtein_matches_espn_sportsref.csv'.
    """
    # BUG FIX: output file was opened read-only and the writer was created
    # via the nonexistent csv.csv_writer; a distinct name also stops the
    # later `with open(...) as csv_file` blocks from shadowing the handle.
    out_file = open('levenshtein_matches_espn_sportsref.csv', 'w', newline='')
    csv_writer = csv.writer(out_file)
    csv_writer.writerow(
        ["ESPN school_name", "ESPN representation", "Levenshtein "])

    # read SportsReference file into a list of schools
    list_schools = []
    with open('results_sportsref.csv') as csv_file:
        csv_reader = csv.reader(csv_file)
        is_head_row = True
        for school in csv_reader:
            if is_head_row:  # skip the one head row
                is_head_row = False
                continue
            list_schools.append(get_full_name(school[3] + school[4]))  # *2 mascot

    dict_lev_distance = {}
    with open('results_espn.csv') as csv_file:
        csv_reader = csv.reader(csv_file)
        is_head_row = True
        for school in csv_reader:  # for each school in ESPN
            ref_1 = get_full_name(school[4] + school[5])  # *2 mascot
            if is_head_row:  # skip the one head row
                is_head_row = False
                continue
            dict_lev_distance[ref_1] = []  # new list for each school in espn
            for ref_2 in list_schools:  # each school_mascot in sportsref list
                lev_distance = rltk.levenshtein_similarity(ref_1, ref_2)
                dict_lev_distance[ref_1].append((ref_2, lev_distance))
            # Best match first.
            dict_lev_distance[ref_1].sort(reverse=True, key=lambda pair: pair[1])

    # OUTPUT
    print("{:<40}".format("Actual entity (ESPN)") + "lev\t" +
          "sportsref match name\n")
    # BUG FIX: original `sorted(dict_lev_distance, key=dict_lev_distance.get[0][1])`
    # was a TypeError (and missing its colon); sort the items by each
    # entity's best-match score instead.
    for key, value in sorted(dict_lev_distance.items(),
                             key=lambda kv: kv[1][0][1],
                             reverse=True):
        if value[0][1] == 1:  # exact match: print without the score
            print("{:<40}".format(key) + "\t" + value[0][0])
        else:
            print("{:<40}".format(key) + str(value[0][1]) + "\t" + value[0][0])
    # BUG FIX: csv writer objects have no closer()/close(); close the file.
    out_file.close()
def getMostSimilarGPU_Benchmark(input_gpu):
    """Scan the global benchmark_gpus table and return the entry whose
    cleaned videocard_name is most Levenshtein-similar to input_gpu.

    Returns (benchmark_id, score, original_videocard_name); the id and
    name are None with score -1 when the table is empty.
    """
    global benchmark_gpus
    best_id, best_score, best_name = None, -1, None
    for bench_id, bench_info in benchmark_gpus.items():
        candidate = cleanGPUText(bench_info["videocard_name"])
        score = rltk.levenshtein_similarity(input_gpu, candidate)
        if score > best_score:
            best_score = score
            best_id = bench_id
            best_name = bench_info["videocard_name"]
    return best_id, best_score, best_name
def er_task(block):
    """Entity-resolve IGDB games against G2A games inside pre-built blocks.

    block maps a blocking key to a pair of lists: val[0] holds IGDB
    {game_key: game_name} dicts and val[1] holds G2A ones. Returns a
    defaultdict mapping each block key to a list of
    {(igdb_key, igdb_name): (g2a_key, g2a_name, combined_score)} entries,
    with ('', '', -1) when no candidate clears the threshold.
    """
    st = time.time()
    similar = defaultdict(lambda: [])
    for i, (key, val) in enumerate(block.items()):
        # NOTE(review): (i + 1) % 1 == 0 is always true, so this logs on
        # every iteration — a larger modulus was probably intended.
        if (i + 1) % 1 == 0:
            print("time taken for {} is {}".format(i, time.time() - st))
        for igdb_obj in val[0]:
            for igdb_game_key, igdb_game_name in igdb_obj.items():
                max_score = -1
                matching_key = ''
                matching_name = ''
                # max_lev_score / max_jw_score are tracked but never read
                # after the loop.
                max_lev_score = -1
                max_jw_score = -1
                if len(val[1]) != 0:
                    for g2a_obj in val[1]:
                        for g2a_game_key, g2a_game_name in g2a_obj.items():
                            lev_score = rltk.levenshtein_similarity(
                                igdb_game_name, g2a_game_name)
                            jw_score = rltk.jaro_winkler_similarity(
                                igdb_game_name, g2a_game_name)
                            # Combined score lies in [0, 2].
                            score = lev_score + jw_score
                            if score > max_score:
                                max_score = score
                                max_lev_score = lev_score
                                max_jw_score = jw_score
                                matching_key = g2a_game_key
                                matching_name = g2a_game_name
                    # Accept only matches whose combined score exceeds 1.2.
                    if max_score > 1.2:
                        similar[key].append({
                            (igdb_game_key, igdb_game_name):
                            (matching_key, matching_name, max_score)
                        })
                    else:
                        similar[key].append({
                            (igdb_game_key, igdb_game_name): ('', '', -1)
                        })
                else:
                    # No G2A candidates in this block at all.
                    similar[key].append({
                        (igdb_game_key, igdb_game_name): ('', '', -1)
                    })
    print("total time taken: ", time.time() - st)
    return similar
def rule_based_method(bask_re, wikidata):
    """Decide whether a basketball-reference record and a wikidata record
    refer to the same player.

    Requires near-identical names (Levenshtein >= 0.95), then averages the
    similarities of whichever of height / weight / birth date / birth
    place are present on both sides (at least two required).

    Returns (is_match, score) where score is 0 on early rejection.
    """
    threshold = 0.85
    # Guard clauses: a usable, near-identical name is mandatory.
    if bask_re["player_name"] is None or wikidata["player_name"] is None:
        return False, 0
    if rltk.levenshtein_similarity(bask_re["player_name"],
                                   wikidata["player_name"]) < 0.95:
        return False, 0

    compared_attr_num = 0
    score = 0
    if bask_re["height"] is not None and wikidata["height"] is not None:
        score += 0.25 * calc_height_sim(bask_re["height"], wikidata["height"])
        compared_attr_num += 1
    if bask_re["weight"] is not None and wikidata["weight"] is not None:
        score += 0.25 * calc_weight_sim(bask_re["weight"], wikidata["weight"])
        compared_attr_num += 1
    if (bask_re["date_of_birth"] is not None
            and wikidata["date_of_birth"] is not None):
        score += 0.25 * calc_birthday_sim(bask_re["date_of_birth"],
                                          wikidata["date_of_birth"])
        compared_attr_num += 1
    if (bask_re["place_of_birth"] is not None
            and wikidata["place_of_birth"] is not None):
        # Compare only the city part (text before the first comma).
        city = re.split(r"[,,]\s+", bask_re["place_of_birth"])[0]
        score += 0.25 * calc_place_of_birth_sim(city,
                                                wikidata["place_of_birth"])
        compared_attr_num += 1

    # Need at least two comparable attributes; rescale to a mean in [0, 1].
    if compared_attr_num < 2:
        return False, 0
    score = score / (compared_attr_num * 0.25)
    return score > threshold, score
# PERF FIX: the original re-opened and re-read the whole benchmark file
# for every techpowerup row. Load the (key, cpu_name) pairs once up front;
# the matching results are unchanged.
# sys.argv[2] -----> path to cpu benchmark mapping file
with jsonlines.open(sys.argv[2]) as cpubenchmark_reader:
    cpu_entries = [
        (list(cpu_obj.items())[0][0], list(cpu_obj.items())[0][1]['cpu_name'])
        for cpu_obj in cpubenchmark_reader
    ]

for i, tech_obj in enumerate(techpowerup_reader):
    if i % 5 == 0:
        print('time taken for {} is {}'.format(i, time.time() - st))
    # Each jsonlines row is a single {id: {...}} mapping.
    tech_key = list(tech_obj.items())[0][0]
    tech_value = list(tech_obj.items())[0][1]['name']
    similar_cpu = {}
    max_score = -1
    similar_key = ''
    similar_cpu_name = ''
    # Case-insensitive Levenshtein match against every benchmark CPU.
    for cpu_key, cpu_value in cpu_entries:
        score = rltk.levenshtein_similarity(tech_value.lower(),
                                            cpu_value.lower())
        if score > max_score:
            max_score = score
            similar_key = cpu_key
            similar_cpu_name = cpu_value
    # Accept the best candidate only above a 0.5 similarity floor.
    if max_score >= 0.5:
        sim_count += 1
        similar_cpu['techpowerup'] = {
            'id': tech_key,
            'name': tech_value
        }
        similar_cpu['similarity'] = {
            'similar_cpubenchmark_key': similar_key,
            'similar_cpubenchmark_name': similar_cpu_name,
            'sim_score': max_score
        }
print(c)
"""
# NOTE(review): the triple quote above appears to close a block-commented
# section that begins before this chunk — confirm against the full file.

# In[34]:

count = 0
bucket_keys = set(list(calpoly.keys()) + list(ashford.keys()))
# Common course-title words ignored when comparing names.
frequent_words = set([
    'Introduction', 'Advanced', 'Intermediate', 'I', 'II', 'III', 'Principles'
])
result = []
for k in bucket_keys:
    # Compare only courses that share a blocking key in both catalogs.
    if k in calpoly and k in ashford:
        for ca, ash in itertools.product(calpoly[k], ashford[k]):
            # Similarity of course numbers; the ashford value has an
            # 8-character tail stripped — TODO confirm that suffix's meaning.
            sub = rltk.levenshtein_similarity(ca[0], ash[0][:-8])  #name
            # Title words minus stop words and boilerplate course words.
            ca_name = set(ca[1].split()) - stop_words - frequent_words
            ash_name = set(ash[1].split()) - stop_words - frequent_words
            name = rltk.monge_elkan_similarity(
                list(ca_name), list(ash_name),
                function=rltk.levenshtein_similarity)
            score = 0.5 * sub + 0.5 * name
            # Accept pairs above the tuned 0.77 threshold.
            if score > 0.77:
                count += 1
                ans = (ca[2], ash[2])
                #print(ans)
                result.append(ans)
def location_similarity(string1, string2):
    """Levenshtein similarity between two location strings."""
    return rltk.levenshtein_similarity(string1, string2)
def title_similarity(string1, string2):
    """Levenshtein similarity between two title strings."""
    return rltk.levenshtein_similarity(string1, string2)
def zipcode_similarity(string1, string2):
    """Levenshtein similarity between two zipcode strings."""
    return rltk.levenshtein_similarity(string1, string2)
def calc_place_of_birth_sim(p1, p2):
    """Case-insensitive Levenshtein similarity of two birthplace strings."""
    return rltk.levenshtein_similarity(p1.lower(), p2.lower())
def calc_birthday_sim(b1, b2):
    """Levenshtein similarity of two birth-date strings (compared as-is)."""
    return rltk.levenshtein_similarity(b1, b2)
return self.raw_object['Phone'] @property def cuisine(self): return self.raw_object['Cuisine'] ds1 = rltk.Dataset(reader=rltk.DataFrameReader(ds1), record_class=Record1, adapter=rltk.MemoryKeyValueAdapter()) ds2 = rltk.Dataset(reader=rltk.DataFrameReader(ds2), record_class=Record2, adapter=rltk.MemoryKeyValueAdapter()) '''bg = rltk.HashBlockGenerator() blocks = bg.generate(bg.block(ds1, property_='cuisine'), bg.block(ds2, property_='cuisine')) pairs = rltk.get_record_pairs(ds1, ds2, block=blocks)''' pairs = rltk.get_record_pairs(ds1, ds2) f = open('similarities.txt', 'w+') for r1, r2 in pairs: a_d = rltk.levenshtein_similarity(r1.address, r2.address) p_d = rltk.jaro_winkler_similarity(r1.phone, r2.phone) c_d = rltk.jaro_winkler_similarity(r1.cuisine, r2.cuisine) f.write(r1.id + "," + r2.id + "," + str(a_d) + "," + str(p_d) + "," + str(c_d) + "\n") f.close()
def compare_cpu(g2a_min_cpu_1,
                techpowerup_cpu,
                g2a_min_cpu_2=None,
                max_score_1=-1,
                max_score_2=-1):
    """Match one or two G2A minimum-CPU strings against the techpowerup
    CPU file (jsonlines of {id: {'name': ...}} rows).

    Scores each candidate as levenshtein + jaro_winkler similarity (range
    [0, 2]) and accepts a best match only when its score reaches 1.2.
    Returns a dict with 'most_sim_cpu_1'/'most_sim_cpu_2' entries in the
    two-CPU path, or a single 'most_sim_cpu' entry in the one-CPU path;
    unmatched slots carry empty name/id.

    NOTE(review): `!= None` / `== None` should be `is not None` / `is None`;
    the 1.2 boundary is inclusive in both >= and <= branches (the first
    matching branch wins at exactly 1.2); and the single-CPU path compares
    Levenshtein against the *un*-lowercased name while the two-CPU path
    lowercases it — confirm which is intended.
    """
    # techpowerup_cpu_reader = jsonlines.open(techpowerup_cpu, 'r')
    similar_id_1 = ''
    similar_id_2 = ''
    most_similar_cpu_1 = ''
    most_similar_cpu_2 = ''
    # Sentinel scores; stay at -9999 if the corresponding input is empty.
    score_1 = -9999
    score_2 = -9999
    with jsonlines.open(techpowerup_cpu, 'r') as techpowerup_cpu_reader:
        # --- Two-CPU path ---
        if g2a_min_cpu_1 != None and g2a_min_cpu_2 != None:
            for cpu in techpowerup_cpu_reader:
                # Each row is a single {id: payload} mapping.
                cpu_key, cpu_value = list(cpu.items())[0][0], list(
                    cpu.items())[0][1]
                if len(g2a_min_cpu_1) != 0:
                    score_1 = rltk.levenshtein_similarity(
                        g2a_min_cpu_1, cpu_value['name'].lower(
                        )) + rltk.jaro_winkler_similarity(
                            g2a_min_cpu_1, cpu_value['name'].lower())
                    '''if "Intel" in g2a_min_cpu_1:
                        if cpu_value["Company"] == "Intel":
                            score_1 = rltk.levenshtein_similarity(g2a_min_cpu_1,cpu_value['name'])
                    elif "AMD" in g2a_min_cpu_1:
                        if cpu_value["Company"] == 'AMD':
                            score_1 = rltk.levenshtein_similarity(g2a_min_cpu_1,cpu_value['name'])
                    else:
                        score_1 = rltk.levenshtein_similarity(g2a_min_cpu_1,cpu_value['name'])'''
                if len(g2a_min_cpu_2) != 0:
                    score_2 = rltk.levenshtein_similarity(
                        g2a_min_cpu_2, cpu_value['name'].lower(
                        )) + rltk.jaro_winkler_similarity(
                            g2a_min_cpu_2, cpu_value['name'].lower())
                    '''if "Intel" in g2a_min_cpu_2:
                        if cpu_value["Company"] == "Intel":
                            score_2 = rltk.levenshtein_similarity(g2a_min_cpu_2,cpu_value['name'])
                    elif "AMD" in g2a_min_cpu_2:
                        if cpu_value["Company"] == 'AMD':
                            score_2 = rltk.levenshtein_similarity(g2a_min_cpu_2,cpu_value['name'])
                    else:
                        score_2 = rltk.levenshtein_similarity(g2a_min_cpu_2,cpu_value['name'])'''
                # score_2 = rltk.levenshtein_similarity(g2a_min_cpu_2,cpu_value['name'])
                # Track the running best candidate for each input CPU.
                if score_1 > max_score_1:
                    max_score_1 = score_1
                    similar_id_1 = cpu_key
                    most_similar_cpu_1 = cpu_value['name']
                if score_2 > max_score_2:
                    max_score_2 = score_2
                    similar_id_2 = cpu_key
                    most_similar_cpu_2 = cpu_value['name']
            # Report each slot only when its best score reaches 1.2.
            if max_score_1 >= 1.2 and max_score_2 >= 1.2:
                return {
                    'most_sim_cpu_1': {
                        'name': most_similar_cpu_1,
                        'sim_id': similar_id_1,
                        'sim_score': max_score_1
                    },
                    'most_sim_cpu_2': {
                        'name': most_similar_cpu_2,
                        'sim_id': similar_id_2,
                        'sim_score': max_score_2
                    }
                }
            elif max_score_1 >= 1.2 and max_score_2 <= 1.2:
                return {
                    'most_sim_cpu_1': {
                        'name': most_similar_cpu_1,
                        'sim_id': similar_id_1,
                        'sim_score': max_score_1
                    },
                    'most_sim_cpu_2': {
                        'name': '',
                        'sim_id': '',
                        'sim_score': max_score_2
                    }
                }
            elif max_score_1 <= 1.2 and max_score_2 >= 1.2:
                return {
                    'most_sim_cpu_1': {
                        'name': '',
                        'sim_id': '',
                        'sim_score': max_score_1
                    },
                    'most_sim_cpu_2': {
                        'name': most_similar_cpu_2,
                        'sim_id': similar_id_2,
                        'sim_score': max_score_2
                    }
                }
            else:
                return {
                    'most_sim_cpu_1': {
                        'name': '',
                        'sim_id': '',
                        'sim_score': max_score_1
                    },
                    'most_sim_cpu_2': {
                        'name': '',
                        'sim_id': '',
                        'sim_score': max_score_2
                    }
                }
        # --- Single-CPU path ---
        if g2a_min_cpu_1 != None and g2a_min_cpu_2 == None:
            for cpu in techpowerup_cpu_reader:
                cpu_key, cpu_value = list(cpu.items())[0][0], list(
                    cpu.items())[0][1]
                if len(g2a_min_cpu_1) != 0:
                    score_1 = rltk.levenshtein_similarity(
                        g2a_min_cpu_1,
                        cpu_value['name']) + rltk.jaro_winkler_similarity(
                            g2a_min_cpu_1, cpu_value['name'].lower())
                    '''if "Intel" in g2a_min_cpu_1:
                        if cpu_value["Company"] == "Intel":
                            score_1 = rltk.levenshtein_similarity(g2a_min_cpu_1,cpu_value['name'])
                    elif "AMD" in g2a_min_cpu_1:
                        if cpu_value["Company"] == 'AMD':
                            score_1 = rltk.levenshtein_similarity(g2a_min_cpu_1,cpu_value['name'])
                    else:
                        score_1 = rltk.levenshtein_similarity(g2a_min_cpu_1,cpu_value['name'])'''
                # score_1 = rltk.levenshtein_similarity(g2a_min_cpu_1, cpu_value['name'])
                if score_1 > max_score_1:
                    max_score_1 = score_1
                    similar_id_1 = cpu_key
                    most_similar_cpu_1 = cpu_value['name']
            if max_score_1 >= 1.2:
                return {
                    'most_sim_cpu': {
                        'name': most_similar_cpu_1,
                        'sim_id': similar_id_1,
                        'sim_score': max_score_1
                    }
                }
            else:
                return {
                    'most_sim_cpu': {
                        'name': '',
                        'sim_id': '',
                        'sim_score': max_score_1
                    }
                }
c+=1 #print(len(cscu[k]), len(calpoly[k])) print(c) # In[57]: count = 0 bucket_keys = set(list(csus.keys()) + list(ashford.keys())) result = [] for k in bucket_keys: if k in csus and k in ashford: for cs, ash in itertools.product(csus[k], ashford[k]): sub = rltk.levenshtein_similarity(cs[0],ash[0]) name = rltk.levenshtein_similarity(cs[1],ash[1]) score = 0.4 * sub + 0.6 * name if score > 0.7: count += 1 #if score < 0.9: ans = (cs[2],ash[2]) print(ans) result.append(ans) print(cs,ash,score,'count:' + str(count)) # In[26]:
def year_similarity(year1: str, year2: str) -> float:
    """Similarity of two year strings.

    A missing (empty-string) year on either side is treated as a perfect
    match; otherwise the Levenshtein similarity of the strings is used.
    """
    if year1 == "" or year2 == "":
        return 1.0
    return rltk.levenshtein_similarity(year1, year2)
class Record2(rltk.Record):
    """rltk record over one JSON-lines row of ds2.jl."""

    @rltk.cached_property
    def id(self):
        # The record key comes from the row's 'ident' field.
        return self.raw_object['ident']

    @rltk.cached_property
    def value(self):
        # First entry of the optional 'values' list; 'empty' placeholder
        # when the list is missing or empty.
        v = self.raw_object.get('values', list())
        return v[0] if len(v) > 0 else 'empty'


# ds1 from a CSV held in memory; ds2 from JSON lines, indexed on disk
# via a DBM adapter at 'file_index'.
ds1 = rltk.Dataset(reader=rltk.CSVReader('ds1.csv'),
                   record_class=Record1,
                   adapter=rltk.MemoryAdapter())
ds2 = rltk.Dataset(reader=rltk.JsonLinesReader('ds2.jl'),
                   record_class=Record2,
                   adapter=rltk.DBMAdapter('file_index'))

# Full cross product of the two datasets (no blocking).
pairs = rltk.get_record_pairs(ds1, ds2)
for r1, r2 in pairs:
    print('-------------')
    print(r1.id, r1.value, '\t', r2.id, r2.value)
    # Record1 exposes a parent_id link; resolve it back through ds1.
    if r1.parent_id:
        print('r1\'s parent', r1.parent_id, ds1.get_record(r1.parent_id).value)
    print('levenshtein_distance:',
          rltk.levenshtein_distance(r1.value, r2.value))
    print('levenshtein_similarity:',
          rltk.levenshtein_similarity(r1.value, r2.value))
def getMostSimilarGPU_Techpowerup(input_gpu):
    """Find the closest techpowerup GPU(s) for a requirements string.

    input_gpu may name one or two GPUs separated by " / ", ", " or " or ".
    Returns a pair of dicts (gpu1, gpu2); each populated dict carries
    'actual_val', 'max_score', 'max_match_id' and 'max_match_val'. Both
    are empty for the placeholder input "-"; gpu2 is empty when only one
    GPU was named.
    """
    global gpu_blocks, techpowerup_gpus
    split_words = [" / ", ", ", " or "]
    game_gpus = [input_gpu]
    for cur_word in split_words:
        if cur_word in input_gpu:
            game_gpus = input_gpu.split(cur_word)
            break
    gpu1 = {}
    gpu2 = {}
    # Base Cases:
    # Invalid input gpu
    if game_gpus[0] == "-" and len(game_gpus) == 1:
        return gpu1, gpu2
    # DEDUP: the original repeated the whole matching loop for GPU1 and
    # GPU2 verbatim; both now share one helper.
    gpu1 = _best_techpowerup_match(game_gpus[0])
    if len(game_gpus) >= 2:
        gpu2 = _best_techpowerup_match(game_gpus[1])
    return gpu1, gpu2


def _best_techpowerup_match(raw_gpu):
    # Score one GPU string against the techpowerup candidates in its block
    # and return the best match as a result dict.
    result = {"actual_val": raw_gpu.lower()}
    max_score = -1
    max_match_id = None
    max_match_val = None
    cur_game_gpu = cleanGPUText(raw_gpu.lower())
    cur_block_key = getGPUBlockKey(cur_game_gpu)
    for tgpu_id in gpu_blocks[cur_block_key].keys():
        # Was a bare `except: pass`; narrowed so Ctrl-C etc. still work,
        # while malformed entries are still skipped best-effort.
        try:
            cur_product_name = techpowerup_gpus[tgpu_id]["Product Name"].lower()
            cur_score = rltk.levenshtein_similarity(cur_game_gpu,
                                                    cur_product_name)
        except Exception:
            continue
        if cur_score > max_score:
            max_score = cur_score
            max_match_id = tgpu_id
            max_match_val = cur_product_name
    result["max_score"] = max_score
    result["max_match_id"] = max_match_id
    result["max_match_val"] = max_match_val
    return result
# c+=1 # #print(len(cscu[k]), len(calpoly[k])) # print(c) # In[32]: count = 0 bucket_keys = set(list(cscu.keys()) + list(calpoly.keys())) result,allr = [],[] for k in bucket_keys: if k in cscu and k in calpoly: for cs, ca in itertools.product(cscu[k], calpoly[k]): sub = rltk.levenshtein_similarity(cs[0],ca[0]) title = rltk.levenshtein_similarity(cs[1],ca[1]) score = 0.6 * sub + 0.4 * title mat = [] if score > 0.867: count += 1 #if score < 0.9: mat.append((cs)) mat.append((ca)) # print(ans) result.append(mat) print(result) # print(cs,ca,score,'count:' + str(count))