def featurize_record_pair(r1, r2, freq, doc_size):
    """
    Featurize a record pair and return a Series of the feature vectors.

    Params:
        r1: (rltk.Record) record 1
        r2: (rltk.Record) record 2
        freq: (Dict) corpus term frequency, passed to tf-idf
        doc_size: (int) total size of dataset, passed to tf-idf

    Returns:
        pd.Series with ids, ground-truth label, and similarity features;
        features whose inputs are missing are set to None.
    """
    # Build a plain dict first; constructing pd.Series() empty and growing it
    # item-by-item is deprecated and slow. Insertion order is preserved.
    fv = {}
    fv['id1'] = r1.id
    fv['id2'] = r2.id
    # Ground-truth label: 1 when the pair is a known match, else 0.
    fv['label'] = 1 if gt.is_member(r1.id, r2.id) else 0

    # BUG FIX: the original test `(x == '' or None)` reduced to `x == ''`
    # because `or None` is always falsy, so a None manufacturer leaked into
    # the similarity branch. `not x` covers both '' and None.
    # (Key 'manufacturer_levenshtien' is misspelled but preserved — it is a
    # runtime data key that downstream consumers may rely on.)
    if not r1.manufacturer or not r2.manufacturer:
        fv['manufacturer_jaro_winkler'] = None
        fv['manufacturer_levenshtien'] = None
        fv['manufacturer_jaccard'] = None
    else:
        fv['manufacturer_jaro_winkler'] = rltk.jaro_winkler_similarity(
            r1.manufacturer, r2.manufacturer)
        fv['manufacturer_levenshtien'] = rltk.levenshtein_similarity(
            r1.manufacturer, r2.manufacturer)
        fv['manufacturer_jaccard'] = rltk.jaccard_index_similarity(
            set(tokenize(r1.manufacturer)), set(tokenize(r2.manufacturer)))

    # Relative price gap, normalized by the larger price.
    if r1.price is None or r2.price is None:
        fv['price_difference'] = None
    else:
        fv['price_difference'] = abs(r1.price - r2.price) / max(r1.price, r2.price)

    # Name features are always computable (name is assumed present).
    fv['name_jaccard'] = rltk.jaccard_index_similarity(
        set(r1.name_tokenized), set(r2.name_tokenized))
    fv['name_jaro_winkler'] = rltk.jaro_winkler_similarity(
        " ".join(r1.name_tokenized), " ".join(r2.name_tokenized))
    fv['name_trigram'] = rltk.ngram_distance(r1.name, r2.name, 3)

    # Description features are None when either description is missing.
    if r1.description_tokenized is None or r2.description_tokenized is None:
        fv['desc_tf_idf'] = None
        fv['desc_trigram'] = None
        fv['desc_jaccard'] = None
    else:
        fv['desc_tf_idf'] = rltk.tf_idf_similarity(
            r1.description_tokenized, r2.description_tokenized, freq, doc_size)
        fv['desc_trigram'] = rltk.ngram_distance(
            " ".join(r1.description_tokenized),
            " ".join(r2.description_tokenized), 3)
        fv['desc_jaccard'] = rltk.jaccard_index_similarity(
            set(r1.description_tokenized), set(r2.description_tokenized))
    return pd.Series(fv)
def match_records_using_string_similarity(record_1, field_1, record_2, field_2):
    """Return True when the two named fields match on 3-gram Jaccard similarity.

    The field values are lowercased, tokenized into character 3-grams, and
    compared with Jaccard index; a score above 0.8 counts as a match.
    """
    left = getattr(record_1, field_1).lower()
    right = getattr(record_2, field_2).lower()
    tokenizer = rltk.NGramTokenizer()
    grams_left = tokenizer.basic(left, 3)
    grams_right = tokenizer.basic(right, 3)
    return rltk.jaccard_index_similarity(grams_left, grams_right) > 0.8
def similarity_match_by_name(record1, record2):
    """Decide whether two records match by name.

    Returns a (matched, score) pair. An exact lowercase full-name match
    short-circuits to (True, 1); otherwise the score is a weighted blend of
    token-set Jaccard (0.65, tolerant of dirty tokens) and Jaro-Winkler on
    the reassembled full names (0.35), matched when it exceeds 0.7.
    """
    name_a = record1.full_name_string.lower()
    name_b = record2.full_name_string.lower()

    # Exact full-name agreement is a guaranteed match.
    if name_a == name_b:
        return True, 1

    jaccard_score = rltk.jaccard_index_similarity(
        record1.name_tokens, record2.name_tokens)
    jw_score = rltk.jaro_winkler_similarity(name_a, name_b)
    combined = jaccard_score * 0.65 + jw_score * 0.35
    return combined > 0.7, combined
def SimilarityScore(record1, record2):
    """Weighted similarity score for two restaurant-style records.

    Combines exact phone equality (weight 0.4), name Jaccard similarity
    (0.4), and address Levenshtein similarity (0.2). Returns a float in
    [0, 1]; thresholds around 0.5-0.8 were tuned empirically (see history).

    Removed: a `cuisine` Levenshtein score that was computed but never used
    in the result, and commented-out experiment notes.
    """
    names = rltk.jaccard_index_similarity(record1.name, record2.name)
    address = rltk.levenshtein_similarity(record1.address, record2.address)
    # Phone numbers only count when they agree exactly.
    phone = 1. if record1.phone == record2.phone else 0.
    return 0.4 * phone + 0.4 * names + 0.2 * address
def similarity(self, id1, id2):
    """Similarity in [0, 1] between two graph nodes.

    Averages an attribute-based score (per-key comparator scores over the
    shared attribute keys, penalized by key-set Jaccard overlap) with a
    class score (token overlap of normalized ontology types).

    Raises:
        ValueError: if either id is not present in the graph.
    """
    gm = self._gm
    if not (gm.in_graph(id1) and gm.in_graph(id2)):
        raise ValueError('Invalid id1 or id2')
    if id1 == id2:
        return 1.0

    # Nodes of different kinds (item vs property) never match.
    if gm.get_node_type(id1) != gm.get_node_type(id2):
        return 0.0

    attrs1 = gm.get_node_attributes(id1)
    attrs2 = gm.get_node_attributes(id2)
    keys1, keys2 = set(attrs1), set(attrs2)
    shared = keys1 & keys2

    # Average the per-key comparator scores over the shared keys.
    # TODO: attribute weights should be configurable instead of uniform.
    attr_score = sum(
        Comparator.get(key)(attrs1[key], attrs2[key]) / len(shared)
        for key in shared
    )
    # Penalize attribute keys that appear on only one side.
    attr_score *= rltk.jaccard_index_similarity(keys1, keys2)

    # Class score: AIDA entity/event types are special-cased elsewhere;
    # a more general class-similarity comparison is still an open item.
    tokens1 = set(normalize_ontology_type(gm.get_node_class(id1)).split())
    tokens2 = set(normalize_ontology_type(gm.get_node_class(id2)).split())
    cls_score = len(tokens1 & tokens2) / max(len(tokens1 | tokens2), 1)

    return (attr_score + cls_score) / 2
def name_sim(a, b):
    """Similarity of two multi-lingual label strings.

    Each argument encodes labels as `en:"label1",ru:"label2"`. Identical
    strings score 1; if either side is missing, returns None (unknown).
    Otherwise, averages the per-language Jaccard similarity of the
    normalized label sets over the languages both sides share.
    """
    def _parse(raw):
        # Input format: en:"label1",ru:"label2"
        per_lang = defaultdict(list)
        for chunk in raw.split(','):
            lang = chunk[:2]
            label = chunk[4:-1]  # strip the `xx:"` prefix and trailing quote
            per_lang[lang].append(normalize_text(label))
        return per_lang

    if a == b:
        return 1
    if not a or not b:
        return None  # explicit: similarity is undefined when a side is empty

    labels_a = _parse(a)
    labels_b = _parse(b)
    common_langs = set(labels_a.keys()) & set(labels_b.keys())
    return sum(
        rltk.jaccard_index_similarity(
            set(labels_a[lang]), set(labels_b[lang])) / len(common_langs)
        for lang in common_langs
    )
def genre_similarity(r_imdb, r_afi):
    """Jaccard similarity between the genre sets of an IMDB and an AFI record."""
    return rltk.jaccard_index_similarity(r_imdb.genre_set, r_afi.genre_set)
# Look up the shortest-path sets for the current key. A key missing from one
# of the mappings means "no path found" and is scored as the empty set.
# FIX: narrowed the original bare `except:` clauses to `except KeyError` so
# unrelated errors (bad types, interrupts) are no longer silently swallowed.
try:
    gt_path = set(gt_sp[cur_key])
except KeyError:
    gt_path = set()
try:
    base_gen_path = set(base_gen_sp[cur_key])
except KeyError:
    base_gen_path = set()
try:
    comp_gen_path = set(comp_gen_sp[cur_key])
except KeyError:
    comp_gen_path = set()

# Jaccard overlap of each method's generated path set against ground truth.
base_jaccard_sim = rltk.jaccard_index_similarity(gt_path, base_gen_path)
baseline_jaccard_sims.append(base_jaccard_sim)
comp_jaccard_sim = rltk.jaccard_index_similarity(gt_path, comp_gen_path)
comparison_jaccard_sims.append(comp_jaccard_sim)

# NOTE(review): the lookup/append section above presumably runs inside a loop
# over `cur_key`, with the aggregate reporting below outside it — the original
# indentation was lost, so confirm the loop boundary against the full script.
baseline_JACCARD = np.mean(baseline_jaccard_sims) * 100
comparison_JACCARD = np.mean(comparison_jaccard_sims) * 100
print("\nBaseline Method Jaccard Similarity metric = ", baseline_JACCARD)
print("Comparison Method Jaccard Similarity metric = ", comparison_JACCARD)
# Paired t-test: are the per-key similarity differences significant?
print("MAP significance test = ",
      stats.ttest_rel(baseline_jaccard_sims, comparison_jaccard_sims), "\n")
def ingredient_set(prod1, prod2):
    """Jaccard similarity between the ingredient sets of two products."""
    return rltk.jaccard_index_similarity(prod1.ingredients, prod2.ingredients)
def record_score(r1, r2):
    """Jaccard similarity over the records' concatenated label tokens."""
    labels_1 = set(r1.concatenated_labels)
    labels_2 = set(r2.concatenated_labels)
    return rltk.jaccard_index_similarity(labels_1, labels_2)