Exemple #1
0
def SimilarityScore(record1, record2):
    """Return a weighted similarity score for a pair of records.

    The score is a fixed linear combination of three signals:

    * phone   -- 1.0 when the ``phone`` attributes compare equal, else 0.0
                 (weight 0.4)
    * name    -- ``rltk.jaccard_index_similarity`` of the ``name``
                 attributes (weight 0.4)
    * address -- ``rltk.levenshtein_similarity`` of the ``address``
                 attributes (weight 0.2)

    The ``cuisine`` attribute does not contribute to the score, so no
    similarity is computed for it.

    Params:
        record1: record exposing ``name``, ``address`` and ``phone``
        record2: record exposing ``name``, ``address`` and ``phone``

    Returns:
        The weighted sum of the three signals.
    """
    names = rltk.jaccard_index_similarity(record1.name, record2.name)
    address = rltk.levenshtein_similarity(record1.address, record2.address)

    # Phone numbers are compared exactly rather than by edit distance.
    phone = 0. if record1.phone != record2.phone else 1.

    # Earlier tuning notes, kept verbatim (presumably: weights, decision
    # threshold, resulting match count -- TODO confirm):
    #0.7  0.2 0.1 > 0.8 104
    #0.4 0.4 0.2 >0.59 106
    #0.4 0.4 0.2 >0.53 113
    return 0.4 * phone + 0.4 * names + 0.2 * address
def name_string_similarity_4(r_imdb, r_afi):
    """Compare two records by their sorted, concatenated name tokens.

    Each ``name_string`` is lower-cased, split on runs of hyphens, commas and
    whitespace, and the pieces are sorted and joined without a separator.
    The key for ``r_afi`` is memoised in ``cached_names_4`` (keyed by the
    record object); the key for ``r_imdb`` is recomputed on every call.

    Returns:
        ``rltk.levenshtein_similarity`` of the two keys.
    """
    def _sorted_token_key(name_string):
        tokens = re.split(r'[-,\s]+', name_string.lower())
        tokens.sort()
        return ''.join(tokens)

    imdb_key = _sorted_token_key(r_imdb.name_string)

    afi_key = cached_names_4.get(r_afi)
    if afi_key is None:
        afi_key = cached_names_4[r_afi] = _sorted_token_key(r_afi.name_string)

    return rltk.levenshtein_similarity(imdb_key, afi_key)
def name_string_similarity_2(r_imdb, r_afi):
    """Compare two records by the last eight characters of their names.

    Both ``name_string`` values are lower-cased and truncated to their final
    eight characters. The suffix for ``r_afi`` is memoised in
    ``cached_names_2`` (keyed by the record object); the suffix for
    ``r_imdb`` is recomputed on every call.

    Returns:
        ``rltk.levenshtein_similarity`` of the two suffixes.
    """
    def _tail(name_string):
        lowered = name_string.lower()
        return lowered[-8:]

    imdb_tail = _tail(r_imdb.name_string)

    afi_tail = cached_names_2.get(r_afi)
    if afi_tail is None:
        afi_tail = cached_names_2[r_afi] = _tail(r_afi.name_string)

    return rltk.levenshtein_similarity(imdb_tail, afi_tail)
Exemple #4
0
def featurize_record_pair(r1, r2, freq, doc_size):
    """
    Featurize a record pair and return a Series of the feature vectors

    Params:
        r1: (rltk.Record) record 1
        r2: (rltk.Record) record 2
        freq: (Dict) corpus frequency
        doc_size: (int) total size of dataset

    Returns:
        (pd.Series) object-dtype Series holding the two record ids, the
        ground-truth ``label`` (1 if ``gt.is_member`` reports the pair, else
        0) and the manufacturer / price / name / description features.
        A feature is ``None`` when one of its inputs is missing.
    """
    # Explicit dtype: the Series mixes ids, ints, floats and None, and an
    # untyped empty Series triggers a pandas deprecation warning.
    fv = pd.Series(dtype=object)
    fv['id1'] = r1.id
    fv['id2'] = r2.id

    fv['label'] = 1 if gt.is_member(r1.id, r2.id) else 0

    # Treat both '' and None as "manufacturer missing". The previous test
    # (`x == '' or None`) only ever compared against '', so a None
    # manufacturer was passed on to the similarity functions.
    if not r1.manufacturer or not r2.manufacturer:
        fv['manufacturer_jaro_winkler'] = None
        fv['manufacturer_levenshtien'] = None
        fv['manufacturer_jaccard'] = None
    else:
        fv['manufacturer_jaro_winkler'] = rltk.jaro_winkler_similarity(r1.manufacturer, r2.manufacturer)
        fv['manufacturer_levenshtien'] = rltk.levenshtein_similarity(r1.manufacturer, r2.manufacturer)
        fv['manufacturer_jaccard'] = rltk.jaccard_index_similarity(set(tokenize(r1.manufacturer)),
                                set(tokenize(r2.manufacturer)))

    if r1.price is None or r2.price is None:
        fv['price_difference'] = None
    else:
        larger_price = max(r1.price, r2.price)
        if larger_price == 0:
            # The relative difference is undefined when the larger price is
            # 0; two zero prices are identical, anything else has no value.
            fv['price_difference'] = 0.0 if r1.price == r2.price else None
        else:
            fv['price_difference'] = abs(r1.price - r2.price) / larger_price

    fv['name_jaccard'] = rltk.jaccard_index_similarity(set(r1.name_tokenized), set(r2.name_tokenized))
    fv['name_jaro_winkler'] = rltk.jaro_winkler_similarity(" ".join(r1.name_tokenized), " ".join(r2.name_tokenized))
    fv['name_trigram'] = rltk.ngram_distance(r1.name, r2.name, 3)

    if r1.description_tokenized is None or r2.description_tokenized is None:
        fv['desc_tf_idf'] = None
        fv['desc_trigram'] = None
        fv['desc_jaccard'] = None
    else:
        fv['desc_tf_idf'] = rltk.tf_idf_similarity(r1.description_tokenized,
                                                r2.description_tokenized, freq, doc_size)
        fv['desc_trigram'] = rltk.ngram_distance(" ".join(r1.description_tokenized), " ".join(r2.description_tokenized), 3)
        fv['desc_jaccard'] = rltk.jaccard_index_similarity(set(r1.description_tokenized), set(r2.description_tokenized))

    return fv
def main():
    """Match every ESPN school against the SportsReference schools.

    Reads ``results_sportsref.csv`` and ``results_espn.csv`` (each with one
    header row), builds a name for every row with ``get_full_name`` and, for
    each ESPN name, ranks all SportsReference names by
    ``rltk.levenshtein_similarity``. The best match per ESPN school is printed,
    ordered by ascending best score. ``levenshtein_matches_espn_sportsref.csv``
    is (re)created with its header row only; no match rows are written to it.
    """
    # The output file must be opened for writing; newline='' is what the csv
    # module expects for files handed to csv.writer.
    with open('levenshtein_matches_espn_sportsref.csv', 'w', newline='') as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(["ESPN school_name", "ESPN representation", "Levenshtein "])

    # read SportsReference file into a list of schools
    with open('results_sportsref.csv') as csv_file:
        csv_reader = csv.reader(csv_file)
        next(csv_reader, None)                  # skip the one head row
        # columns 3 and 4 are concatenated (presumably school + mascot -- TODO confirm)
        list_schools = [get_full_name(school[3] + school[4]) for school in csv_reader]

    dict_lev_distance = {}

    with open('results_espn.csv') as csv_file:
        csv_reader = csv.reader(csv_file)
        next(csv_reader, None)                  # skip the head row before touching its cells

        for school in csv_reader:               # for each school in ESPN
            ref_1 = get_full_name(school[4] + school[5])
            matches = [(ref_2, rltk.levenshtein_similarity(ref_1, ref_2))
                       for ref_2 in list_schools]
            # best (highest similarity) candidate first
            matches.sort(reverse=True, key=lambda match: match[1])
            dict_lev_distance[ref_1] = matches

    # OUTPUT
    print("{:<40}".format("Actual entity (ESPN)") + "lev\t" + "sportsref match name\n")
    # Schools without any candidate (empty SportsReference list) have no best
    # match to report, so they are left out instead of raising IndexError.
    ranked = sorted(
        ((key, value) for key, value in dict_lev_distance.items() if value),
        key=lambda item: item[1][0][1])
    for key, value in ranked:
        best_name, best_score = value[0]
        if best_score == 1:
            # identical match: the score column is left blank
            print("{:<40}".format(key) + "\t" + best_name)
        else:
            print("{:<40}".format(key) + str(best_score) + "\t" + best_name)
def getMostSimilarGPU_Benchmark(input_gpu):
    """Find the benchmark GPU whose cleaned name is closest to ``input_gpu``.

    Every entry of the global ``benchmark_gpus`` mapping is compared via
    ``rltk.levenshtein_similarity`` after its ``"videocard_name"`` has been
    passed through ``cleanGPUText``. On ties the first entry encountered wins.

    Returns:
        ``(key, score, videocard_name)`` of the best entry, or
        ``(None, -1, None)`` when ``benchmark_gpus`` is empty. The returned
        name is the raw (uncleaned) ``"videocard_name"``.
    """
    global benchmark_gpus

    best_id, best_score, best_name = None, -1, None

    for gpu_id, gpu_info in benchmark_gpus.items():
        raw_name = gpu_info["videocard_name"]
        similarity = rltk.levenshtein_similarity(input_gpu,
                                                 cleanGPUText(raw_name))
        if similarity > best_score:
            best_id, best_score, best_name = gpu_id, similarity, raw_name

    return best_id, best_score, best_name
Exemple #7
0
def er_task(block, log_every=1):
    """Link every IGDB game in ``block`` to its most similar G2A game.

    Params:
        block: mapping of block key -> pair-like value where ``val[0]`` is an
            iterable of ``{igdb_key: igdb_name}`` dicts and ``val[1]`` an
            iterable of ``{g2a_key: g2a_name}`` dicts.
        log_every: print the elapsed time after every ``log_every`` block
            keys (default 1, i.e. after each key); 0 disables the
            per-key progress output.

    Returns:
        ``defaultdict(list)`` mapping each block key to a list of single-entry
        dicts ``{(igdb_key, igdb_name): (g2a_key, g2a_name, score)}``. The
        score is Levenshtein similarity plus Jaro-Winkler similarity; when no
        candidate scores above 1.2 (or there are no candidates) the value is
        ``('', '', -1)``.
    """
    st = time.time()
    similar = defaultdict(list)
    no_match = ('', '', -1)

    for i, (key, val) in enumerate(block.items()):
        if log_every and (i + 1) % log_every == 0:
            print("time taken for {} is {}".format(i, time.time() - st))
        for igdb_obj in val[0]:
            for igdb_game_key, igdb_game_name in igdb_obj.items():
                max_score = -1
                matching_key = ''
                matching_name = ''

                # An empty candidate list simply leaves max_score at -1.
                for g2a_obj in val[1]:
                    for g2a_game_key, g2a_game_name in g2a_obj.items():
                        score = (rltk.levenshtein_similarity(
                            igdb_game_name, g2a_game_name) +
                                 rltk.jaro_winkler_similarity(
                                     igdb_game_name, g2a_game_name))
                        if score > max_score:
                            max_score = score
                            matching_key = g2a_game_key
                            matching_name = g2a_game_name

                # The two similarities sum to at most 2; 1.2 is the
                # acceptance threshold for a match.
                if max_score > 1.2:
                    match = (matching_key, matching_name, max_score)
                else:
                    match = no_match
                similar[key].append({(igdb_game_key, igdb_game_name): match})
    print("total time taken: ", time.time() - st)

    return similar
Exemple #8
0
def rule_based_method(bask_re, wikidata):
    """Decide whether two player records describe the same person.

    The records must both carry a ``"player_name"`` whose
    ``rltk.levenshtein_similarity`` is at least 0.95; otherwise the answer is
    ``(False, 0)``. After that, height, weight, date of birth and place of
    birth are compared wherever both sides have a value. For the place of
    birth only the part of the ``bask_re`` value before the first
    comma-plus-whitespace is used.

    Returns:
        ``(is_match, score)``. ``(False, 0)`` when fewer than two attributes
        could be compared; otherwise ``score`` is the equally weighted
        combination of the compared attribute similarities, rescaled by the
        number of compared attributes, and ``is_match`` is ``score > 0.85``.
    """
    MY_TRESH = 0.85

    if bask_re["player_name"] is None or wikidata["player_name"] is None:
        return False, 0
    name_similarity = rltk.levenshtein_similarity(bask_re["player_name"],
                                                  wikidata["player_name"])
    if name_similarity < 0.95:
        return False, 0

    # (attribute key, comparison) in evaluation order. Lambdas keep the
    # helper lookups lazy: a helper is only touched when it is needed.
    comparisons = (
        ("height", lambda left, right: calc_height_sim(left, right)),
        ("weight", lambda left, right: calc_weight_sim(left, right)),
        ("date_of_birth", lambda left, right: calc_birthday_sim(left, right)),
        ("place_of_birth", lambda left, right: calc_place_of_birth_sim(
            re.split(r"[,,]\s+", left)[0], right)),
    )

    weighted_total = 0
    attrs_compared = 0
    for attr, compare in comparisons:
        if bask_re[attr] is None or wikidata[attr] is None:
            continue
        weighted_total += 0.25 * compare(bask_re[attr], wikidata[attr])
        attrs_compared += 1

    # rescale
    if attrs_compared < 2:
        return False, 0
    final_score = weighted_total / (attrs_compared * 0.25)
    return final_score > MY_TRESH, final_score
Exemple #9
0
        for i, tech_obj in enumerate(techpowerup_reader):
            if i % 5 == 0:
                print('time taken for {} is {}'.format(i, time.time() - st))
            tech_key, tech_value = list(tech_obj.items())[0][0], list(
                tech_obj.items())[0][1]['name']
            similar_cpu = {}
            max_score = -1
            similar_key = ''
            similar_cpu_name = ''
            # sys.argv[2] -----> path to cpu benchmark mapping file
            with jsonlines.open(sys.argv[2]) as cpubenchmark_reader:
                for cpu_obj in cpubenchmark_reader:
                    cpu_key, cpu_value = list(cpu_obj.items())[0][0], list(
                        cpu_obj.items())[0][1]['cpu_name']
                    score = rltk.levenshtein_similarity(
                        tech_value.lower(), cpu_value.lower())
                    if score > max_score:
                        max_score = score
                        similar_key = cpu_key
                        similar_cpu_name = cpu_value
                if max_score >= 0.5:
                    sim_count += 1
                    similar_cpu['techpowerup'] = {
                        'id': tech_key,
                        'name': tech_value
                    }
                    similar_cpu['similarity'] = {
                        'similar_cpubenchmark_key': similar_key,
                        'similar_cpubenchmark_name': similar_cpu_name,
                        'sim_score': max_score
                    }
print(c)
"""

# In[34]:

# Pairwise matching between the `calpoly` and `ashford` buckets: within every
# bucket key present in both mappings, each (calpoly, ashford) item pair is
# scored and the pairs scoring above 0.77 are collected in `result`.
# NOTE(review): items are indexed as [0], [1], [2]; presumably subject code,
# course name and identifier -- confirm against the code that builds the buckets.
count = 0

bucket_keys = set(list(calpoly.keys()) + list(ashford.keys()))
# Words removed from both names before they are compared.
frequent_words = set([
    'Introduction', 'Advanced', 'Intermediate', 'I', 'II', 'III', 'Principles'
])
result = []
for k in bucket_keys:
    # Only buckets that exist on both sides can produce pairs.
    if k in calpoly and k in ashford:
        for ca, ash in itertools.product(calpoly[k], ashford[k]):
            # The last 8 characters of the ashford field are dropped before
            # comparing (reason not visible here -- TODO confirm).
            sub = rltk.levenshtein_similarity(ca[0], ash[0][:-8])
            # name: compare the whitespace-split word sets, minus stop words
            # and the frequent words above, with Monge-Elkan over Levenshtein
            ca_name = set(ca[1].split()) - stop_words - frequent_words
            ash_name = set(ash[1].split()) - stop_words - frequent_words
            name = rltk.monge_elkan_similarity(
                list(ca_name),
                list(ash_name),
                function=rltk.levenshtein_similarity)

            # Equal weighting of the two similarities.
            score = 0.5 * sub + 0.5 * name

            if score > 0.77:
                count += 1
                ans = (ca[2], ash[2])
                #print(ans)
                result.append(ans)
Exemple #11
0
def location_similarity(string1, string2):
    """Return ``rltk.levenshtein_similarity`` of two location strings."""
    return rltk.levenshtein_similarity(string1, string2)
Exemple #12
0
def title_similarity(string1, string2):
    """Return ``rltk.levenshtein_similarity`` of two title strings."""
    return rltk.levenshtein_similarity(string1, string2)
Exemple #13
0
def zipcode_similarity(string1, string2):
    """Return ``rltk.levenshtein_similarity`` of two zip-code strings."""
    return rltk.levenshtein_similarity(string1, string2)
Exemple #14
0
def calc_place_of_birth_sim(p1, p2):
    """Return the case-insensitive Levenshtein similarity of two places.

    Both strings are lower-cased before being handed to
    ``rltk.levenshtein_similarity``.
    """
    first_place = p1.lower()
    second_place = p2.lower()
    return rltk.levenshtein_similarity(first_place, second_place)
Exemple #15
0
def calc_birthday_sim(b1, b2):
    """Return ``rltk.levenshtein_similarity`` of two birthday values.

    The values are compared exactly as given; no parsing or normalisation is
    applied here.
    """
    similarity = rltk.levenshtein_similarity(b1, b2)
    return similarity
Exemple #16
0
        return self.raw_object['Phone']

    @property
    def cuisine(self):
        """Return the ``'Cuisine'`` entry of this record's ``raw_object``."""
        return self.raw_object['Cuisine']


# Wrap both sources as in-memory rltk datasets. Note that `ds1` / `ds2` are
# rebound here: on the right-hand side they are the inputs handed to
# rltk.DataFrameReader, afterwards they are rltk.Dataset objects.
ds1 = rltk.Dataset(reader=rltk.DataFrameReader(ds1),
                   record_class=Record1,
                   adapter=rltk.MemoryKeyValueAdapter())
ds2 = rltk.Dataset(reader=rltk.DataFrameReader(ds2),
                   record_class=Record2,
                   adapter=rltk.MemoryKeyValueAdapter())

# Hash blocking on `cuisine` is disabled; every record pair is compared.
# To re-enable it:
#   bg = rltk.HashBlockGenerator()
#   blocks = bg.generate(bg.block(ds1, property_='cuisine'), bg.block(ds2, property_='cuisine'))
#   pairs = rltk.get_record_pairs(ds1, ds2, block=blocks)
pairs = rltk.get_record_pairs(ds1, ds2)

# One CSV-style line per pair: id1,id2,address,phone,cuisine similarity.
# The context manager closes the file even if a similarity call raises.
with open('similarities.txt', 'w+') as f:
    for r1, r2 in pairs:
        a_d = rltk.levenshtein_similarity(r1.address, r2.address)
        p_d = rltk.jaro_winkler_similarity(r1.phone, r2.phone)
        c_d = rltk.jaro_winkler_similarity(r1.cuisine, r2.cuisine)
        f.write(r1.id + "," + r2.id + "," + str(a_d) + "," + str(p_d) + "," +
                str(c_d) + "\n")
# Minimum combined (Levenshtein + Jaro-Winkler) score for a CPU to count as a match.
_CPU_MATCH_THRESHOLD = 1.2


def _cpu_name_similarity(query, cpu_name):
    """Return Levenshtein + Jaro-Winkler similarity of ``query`` vs lower-cased ``cpu_name``."""
    candidate = cpu_name.lower()
    return (rltk.levenshtein_similarity(query, candidate) +
            rltk.jaro_winkler_similarity(query, candidate))


def _cpu_match_entry(name, sim_id, score):
    """Build one result entry, blanking name and id when ``score`` is below the threshold."""
    if score >= _CPU_MATCH_THRESHOLD:
        return {'name': name, 'sim_id': sim_id, 'sim_score': score}
    return {'name': '', 'sim_id': '', 'sim_score': score}


def compare_cpu(g2a_min_cpu_1,
                techpowerup_cpu,
                g2a_min_cpu_2=None,
                max_score_1=-1,
                max_score_2=-1):
    """Find the most similar TechPowerUp CPU for one or two CPU names.

    Params:
        g2a_min_cpu_1: first CPU name to look up. ``None`` means there is
            nothing to match and the function returns ``None`` without
            opening the file. An empty string never matches anything.
        techpowerup_cpu: path of a JSON-lines file, opened with
            ``jsonlines.open``; each record is a mapping whose first item is
            ``cpu_id -> {'name': ..., ...}``.
        g2a_min_cpu_2: optional second CPU name, matched independently.
        max_score_1: starting best score for the first name; a candidate must
            score strictly higher to be taken.
        max_score_2: same, for the second name.

    Returns:
        With two names: ``{'most_sim_cpu_1': entry, 'most_sim_cpu_2': entry}``.
        With one name: ``{'most_sim_cpu': entry}``.
        Each entry is ``{'name', 'sim_id', 'sim_score'}``; ``name`` and
        ``sim_id`` are ``''`` when the best score is below 1.2.

    The candidate name is lower-cased for both similarity measures in both
    modes (the single-name mode used to run Levenshtein on the original-case
    name). The query strings are used as given -- presumably the caller
    lower-cases them; verify against the caller.
    """
    if g2a_min_cpu_1 is None:
        return None

    two_cpus = g2a_min_cpu_2 is not None

    similar_id_1 = similar_id_2 = ''
    most_similar_cpu_1 = most_similar_cpu_2 = ''
    # Sentinel scores: they stay in place when the corresponding query is
    # empty, so no candidate can beat the starting max score.
    score_1 = score_2 = -9999

    with jsonlines.open(techpowerup_cpu, 'r') as techpowerup_cpu_reader:
        for cpu in techpowerup_cpu_reader:
            cpu_key, cpu_value = list(cpu.items())[0]
            cpu_name = cpu_value['name']

            if len(g2a_min_cpu_1) != 0:
                score_1 = _cpu_name_similarity(g2a_min_cpu_1, cpu_name)
            if score_1 > max_score_1:
                max_score_1 = score_1
                similar_id_1 = cpu_key
                most_similar_cpu_1 = cpu_name

            if not two_cpus:
                continue

            if len(g2a_min_cpu_2) != 0:
                score_2 = _cpu_name_similarity(g2a_min_cpu_2, cpu_name)
            if score_2 > max_score_2:
                max_score_2 = score_2
                similar_id_2 = cpu_key
                most_similar_cpu_2 = cpu_name

    if two_cpus:
        return {
            'most_sim_cpu_1': _cpu_match_entry(most_similar_cpu_1,
                                               similar_id_1, max_score_1),
            'most_sim_cpu_2': _cpu_match_entry(most_similar_cpu_2,
                                               similar_id_2, max_score_2),
        }
    return {
        'most_sim_cpu': _cpu_match_entry(most_similar_cpu_1, similar_id_1,
                                         max_score_1)
    }
        c+=1
        #print(len(cscu[k]), len(calpoly[k]))
print(c)


# In[57]:


# Pairwise matching between the `csus` and `ashford` buckets: within every
# bucket key present in both mappings, each (csus, ashford) item pair is
# scored and the pairs scoring above 0.7 are printed and collected in `result`.
# NOTE(review): items are indexed as [0], [1], [2]; presumably subject, name
# and identifier -- confirm against the code that builds the buckets.
count = 0

bucket_keys = set(list(csus.keys()) + list(ashford.keys()))
result = []
for k in bucket_keys:
    # Only buckets that exist on both sides can produce pairs.
    if k in csus and k in ashford:
        for cs, ash in itertools.product(csus[k], ashford[k]):
            sub = rltk.levenshtein_similarity(cs[0],ash[0])
            name = rltk.levenshtein_similarity(cs[1],ash[1])

            # Field [1] is weighted more heavily than field [0].
            score = 0.4 * sub + 0.6 * name

            if score > 0.7:
                count += 1
                #if score < 0.9:
                ans = (cs[2],ash[2])
                print(ans)
                result.append(ans)
                # Debug output: both items, their score and the running match count.
                print(cs,ash,score,'count:' + str(count))


# In[26]:
def year_similarity(year1: str, year2: str) -> float:
    """Return the similarity of two year strings.

    An empty string on either side yields 1.0; otherwise the result is
    ``rltk.levenshtein_similarity`` of the two strings.
    """
    if year1 == "" or year2 == "":
        return 1.0
    return rltk.levenshtein_similarity(year1, year2)
Exemple #20
0

class Record2(rltk.Record):
    """Record whose fields are read from a mapping-style ``raw_object``."""

    @rltk.cached_property
    def id(self):
        """Identifier taken from the raw object's ``'ident'`` entry."""
        raw = self.raw_object
        return raw['ident']

    @rltk.cached_property
    def value(self):
        """First element of the raw object's ``'values'`` entry.

        Falls back to the string ``'empty'`` when the entry is missing or has
        no elements.
        """
        candidates = self.raw_object.get('values', [])
        if len(candidates) > 0:
            return candidates[0]
        return 'empty'


# Build the two datasets: ds1 is read from a CSV file and kept behind a
# MemoryAdapter, ds2 is read from a JSON-lines file and stored through a
# DBMAdapter named 'file_index'.
ds1 = rltk.Dataset(reader=rltk.CSVReader('ds1.csv'),
                   record_class=Record1,
                   adapter=rltk.MemoryAdapter())
ds2 = rltk.Dataset(reader=rltk.JsonLinesReader('ds2.jl'),
                   record_class=Record2,
                   adapter=rltk.DBMAdapter('file_index'))

# No blocking is passed, so rltk decides which pairs are produced
# (presumably the full cross product -- confirm in the rltk docs).
pairs = rltk.get_record_pairs(ds1, ds2)
for r1, r2 in pairs:
    print('-------------')
    print(r1.id, r1.value, '\t', r2.id, r2.value)
    # When r1 has a truthy parent_id, look the parent up in ds1 and show its value.
    if r1.parent_id:
        print('r1\'s parent', r1.parent_id, ds1.get_record(r1.parent_id).value)
    # Show both the Levenshtein distance and the Levenshtein similarity
    # of the two values.
    print('levenshtein_distance:',
          rltk.levenshtein_distance(r1.value, r2.value))
    print('levenshtein_similarity:',
          rltk.levenshtein_similarity(r1.value, r2.value))
Exemple #21
0
def _best_techpowerup_match(raw_gpu):
    """Return the best TechPowerUp match for one GPU name as a summary dict."""
    actual_val = raw_gpu.lower()
    cur_game_gpu = cleanGPUText(actual_val)
    cur_block_key = getGPUBlockKey(cur_game_gpu)

    max_score = -1
    max_match_id = None
    max_match_val = None
    for tgpu_id in gpu_blocks[cur_block_key].keys():
        try:
            cur_product_name = techpowerup_gpus[tgpu_id]["Product Name"].lower()
            cur_score = rltk.levenshtein_similarity(cur_game_gpu,
                                                    cur_product_name)
        except (KeyError, TypeError, AttributeError, ValueError):
            # Best effort: a candidate that is missing from techpowerup_gpus,
            # has no usable "Product Name", or is rejected by rltk is skipped.
            # Unlike a bare `except`, this lets KeyboardInterrupt/SystemExit
            # and unrelated errors propagate.
            continue

        if cur_score > max_score:
            max_score = cur_score
            max_match_id = tgpu_id
            max_match_val = cur_product_name

    return {
        "actual_val": actual_val,
        "max_score": max_score,
        "max_match_id": max_match_id,
        "max_match_val": max_match_val,
    }


def getMostSimilarGPU_Techpowerup(input_gpu):
    """Match the GPU(s) named in ``input_gpu`` against TechPowerUp GPUs.

    ``input_gpu`` is split on the first of ``" / "``, ``", "`` or ``" or "``
    that occurs in it; the first two resulting names are matched, any further
    names are ignored. Each name is lower-cased, cleaned with
    ``cleanGPUText``, mapped to a block with ``getGPUBlockKey`` and compared
    by ``rltk.levenshtein_similarity`` with the lower-cased
    ``"Product Name"`` of every GPU id in that block of ``gpu_blocks``.

    Returns:
        ``(gpu1, gpu2)``. Each is a dict with ``actual_val`` (lower-cased
        input name), ``max_score`` (-1 if nothing matched), ``max_match_id``
        and ``max_match_val`` (lower-cased product name). ``gpu2`` is ``{}``
        when only one name is present; both are ``{}`` when the input is just
        ``"-"``.
    """
    split_words = [" / ", ", ", " or "]

    game_gpus = [input_gpu]
    for cur_word in split_words:
        if cur_word in input_gpu:
            game_gpus = input_gpu.split(cur_word)
            break

    # Base case: "-" on its own marks an invalid / absent GPU.
    if game_gpus[0] == "-" and len(game_gpus) == 1:
        return {}, {}

    gpu1 = _best_techpowerup_match(game_gpus[0])
    gpu2 = _best_techpowerup_match(game_gpus[1]) if len(game_gpus) >= 2 else {}

    return gpu1, gpu2
Exemple #22
0
#         c+=1
#         #print(len(cscu[k]), len(calpoly[k]))
# print(c)


# In[32]:


# Pairwise matching between the `cscu` and `calpoly` buckets: within every
# bucket key present in both mappings, each (cscu, calpoly) item pair is
# scored and the pairs scoring above 0.867 are collected in `result` as
# two-element lists [cs, ca]. `allr` is initialised but not used in this cell.
# NOTE(review): items are indexed as [0] and [1]; presumably subject and
# title -- confirm against the code that builds the buckets.
count = 0

bucket_keys = set(list(cscu.keys()) + list(calpoly.keys()))
result,allr = [],[]
for k in bucket_keys:
    # Only buckets that exist on both sides can produce pairs.
    if k in cscu and k in calpoly:
        for cs, ca in itertools.product(cscu[k], calpoly[k]):
            sub = rltk.levenshtein_similarity(cs[0],ca[0])
            title = rltk.levenshtein_similarity(cs[1],ca[1])

            # Field [0] is weighted more heavily than field [1].
            score = 0.6 * sub + 0.4 * title
            # Fresh container for this pair; only kept when the pair matches.
            mat = []
            if score > 0.867:
                count += 1
                #if score < 0.9:
                mat.append((cs))
                mat.append((ca))
#                 print(ans)
                result.append(mat)
# Print every accepted pair once all buckets have been processed.
print(result)
#                 print(cs,ca,score,'count:' + str(count))