def find_relation_class_attr_name_sem_matchings(network, kr_handlers):
    """Match attribute (column) names against ontology class names using
    word-embedding semantic similarity.

    Args:
        network: model exposing iterate_values() yielding
            (db_name, source_name, field_name, _) tuples.
        kr_handlers: dict mapping knowledge-representation name -> handler
            exposing classes().

    Returns:
        List of matches of the form
        ((db_name, source_name, field_name), (kr_name, class_name)).
    """
    st = time.time()

    # Hoist the stopword list into a set once: the previous version rebuilt
    # the full English stopword list for EVERY token (O(list) per test).
    stop_words = set(stopwords.words('english'))

    def _embeddings_for(text):
        # Normalize camelCase/kebab/snake into lowercase words, then collect
        # the embedding vector of every non-stopword token that has one.
        text = nlp.camelcase_to_snakecase(text)
        text = text.replace('-', ' ').replace('_', ' ').lower()
        svs = []
        for token in text.split():
            if token not in stop_words:
                sv = glove_api.get_embedding_for_word(token)
                if sv is not None:
                    svs.append(sv)
        return svs

    names = []
    seen_fields = set()  # set gives O(1) dedup; original used a list
    for (db_name, source_name, field_name, _) in network.iterate_values():
        if field_name not in seen_fields:
            seen_fields.add(field_name)
            names.append(('attribute',
                          (db_name, source_name, field_name),
                          _embeddings_for(field_name)))

    num_attributes_inserted = len(names)

    # Append every ontology class name after the attributes.
    for kr_name, kr_handler in kr_handlers.items():
        for cl in kr_handler.classes():
            names.append(('class', (kr_name, cl), _embeddings_for(cl)))

    matchings = []
    for idx_rel in range(0, num_attributes_inserted):
        # Compare attributes only against classes (the tail of `names`).
        for idx_class in range(num_attributes_inserted, len(names)):
            svs_rel = names[idx_rel][2]
            svs_cla = names[idx_class][2]
            semantic_sim = SS.compute_semantic_similarity(svs_rel, svs_cla)
            if semantic_sim > 0.8:
                # match format: (db_name, source_name, field_name) -> class
                match = ((names[idx_rel][1][0], names[idx_rel][1][1],
                          names[idx_rel][1][2]), names[idx_class][1])
                matchings.append(match)

    et = time.time()
    print("Time to relation-class (sem): " + str(et - st))
    return matchings
def find_relation_class_name_matchings(network, kr_handlers):
    """Match relation (table) names to ontology class names by syntactic
    similarity, using MinHash signatures indexed in an LSH structure.

    Args:
        network: model exposing iterate_values() yielding
            (db_name, source_name, field_name, _) tuples.
        kr_handlers: dict mapping KR name -> handler exposing classes().

    Returns:
        List of matches of the form
        ((db_name, source_name, "_"), (kr_name, class_name)).
    """
    st = time.time()

    # Build the stopword set once instead of per token (the previous
    # version re-materialized the English stopword list in the inner loop).
    stop_words = set(stopwords.words('english'))

    def _minhash_of(text):
        # Normalize the identifier to lowercase words and hash each
        # non-stopword token into a 32-permutation MinHash signature.
        text = nlp.camelcase_to_snakecase(text)
        text = text.replace('-', ' ').replace('_', ' ').lower()
        m = MinHash(num_perm=32)
        for token in text.split():
            if token not in stop_words:
                m.update(token.encode('utf8'))
        return m

    names = []
    seen_sources = set()  # O(1) dedup of relation names
    for (db_name, source_name, _, _) in network.iterate_values():
        if source_name not in seen_sources:
            seen_sources.add(source_name)
            names.append(('relation', (db_name, source_name),
                          _minhash_of(source_name)))

    num_relations_inserted = len(names)

    # Append every ontology class name after the relations.
    for kr_name, kr_handler in kr_handlers.items():
        for cl in kr_handler.classes():
            names.append(('class', (kr_name, cl), _minhash_of(cl)))

    # Index all the minhash signatures.
    lsh_index = MinHashLSH(threshold=0.5, num_perm=32)
    for idx in range(len(names)):
        lsh_index.insert(idx, names[idx][2])

    matchings = []
    for idx in range(0, num_relations_inserted):
        # Query neighbors of each relation; keep only cross-kind hits
        # (relation vs class) so we never match a relation to a relation.
        for n in lsh_index.query(names[idx][2]):
            if names[n][0] != names[idx][0]:
                # match format: (db_name, source_name, "_") -> class
                match = ((names[idx][1][0], names[idx][1][1], "_"),
                         names[n][1])
                matchings.append(match)

    et = time.time()
    print("Time to relation-class (name): " + str(et - st))
    return matchings
def minhash(str_values):
    """Compute a k-permutation MinHash signature for a collection of strings.

    Relies on module-level globals: `k` (signature length), `mersenne_prime`,
    and `random_seeds` (k pairs of (multiplier, offset) seeds). The arithmetic
    deliberately emulates Java signed 64-bit overflow and truncated-division
    remainder — presumably to stay bit-compatible with a Java producer of the
    same signatures (TODO confirm against that implementation).
    """
    def java_long(number):
        # Wrap an arbitrary int into the signed 64-bit range, emulating
        # Java `long` overflow semantics.
        return (number + 2**63) % 2**64 - 2**63

    def remainder(a, b):
        # Truncated-division remainder (C/Java `%` semantics); Python's
        # built-in `%` uses floored division, which differs for negatives.
        return a - (b * int(a / b))

    def hash_this(value):
        # Java String.hashCode-style rolling hash, seeded with the
        # module-level mersenne_prime instead of 0.
        h = mersenne_prime
        length = len(value)
        for i in range(length):
            h = 31 * h + ord(value[i])
        return h

    # Initialize every slot to Long.MAX_VALUE; each slot keeps the minimum
    # hash seen under its permutation.
    mh = [9223372036854775807 for i in range(k)]
    for v in str_values:
        # Normalize camelCase / snake_case / kebab-case to lowercase words.
        v = nlp.camelcase_to_snakecase(v)
        v = v.replace('_', ' ')
        v = v.replace('-', ' ')
        v = v.lower()
        for token in v.split(' '):
            if token not in stopwords.words('english'):
                raw_hash = hash_this(token)
                for i in range(k):
                    # Universal hashing: (a * x + b) mod p, computed in
                    # Java-long arithmetic at every step.
                    first_part = java_long(random_seeds[i][0] * raw_hash)
                    second_part = java_long(random_seeds[i][1])
                    nomodule = java_long(first_part + second_part)
                    h = java_long(remainder(nomodule, mersenne_prime))
                    if h < mh[i]:
                        mh[i] = h  # keep the minimum per permutation
    return mh
def find_sem_coh_matchings(network, kr_handlers):
    """Match tables to ontology classes via semantically cohesive groups of
    attribute tokens.

    For each table, SS.extract_cohesive_groups yields (score, tokens) groups;
    a class matches a table when its similarity to a group exceeds that
    group's own cohesion score.

    Args:
        network: model passed through to SS.read_table_columns.
        kr_handlers: dict mapping KR name -> handler exposing classes().

    Returns:
        Tuple (matchings, table_groups) where matchings contains
        ((db_name, table_name, "_"), (kr_name, class_name)) entries and
        table_groups maps (db, table) -> extracted cohesive groups.
    """
    # Build the stopword set once; the previous version rebuilt the
    # English stopword list for every token.
    stop_words = set(stopwords.words('english'))

    # Get all relations with their cohesive token groups.
    table_groups = dict()
    for db, t, attrs in SS.read_table_columns(None, network=network):
        # value shape: (score, [set()]) per SS.extract_cohesive_groups
        table_groups[(db, t)] = SS.extract_cohesive_groups(t, attrs)

    # Embed every ontology class name once, up front.
    names = []
    for kr_name, kr_handler in kr_handlers.items():
        for cl in kr_handler.classes():
            original_cl_name = cl
            cl = nlp.camelcase_to_snakecase(cl)
            cl = cl.replace('-', ' ').replace('_', ' ').lower()
            svs = []
            for token in cl.split():
                if token not in stop_words:
                    sv = glove_api.get_embedding_for_word(token)
                    if sv is not None:
                        svs.append(sv)
            names.append(('class', (kr_name, original_cl_name), svs))

    matchings = []
    for (db_name, table_name), groups in table_groups.items():
        class_seen = set()  # emit at most one match per class per table
        for g_score, g_tokens in groups:
            g_svs = []
            for t in g_tokens:
                sv = glove_api.get_embedding_for_word(t)
                if sv is not None:
                    g_svs.append(sv)
            for _, class_info, class_svs in names:
                kr_name, class_name = class_info
                sim = SS.compute_semantic_similarity(class_svs, g_svs)
                # The group's cohesion score doubles as its match threshold.
                if sim > g_score and class_name not in class_seen:
                    class_seen.add(class_name)
                    match = ((db_name, table_name, "_"),
                             (kr_name, class_name))
                    matchings.append(match)
    return matchings, table_groups
def test_find_semantic_sim():
    """Interactive sanity check: load an ontology and a GloVe model, then
    loop reading words from stdin and print classes whose embedding
    similarity to the input exceeds 0.4. Type EXIT to quit.
    """
    # Load onto
    om = SSAPI(None, None, None, None)
    # Load parsed ontology
    om.add_krs([("dbpedia", "cache_onto/schemaorg.pkl")], parsed=True)

    # Load glove model
    print("Loading language model...")
    path_to_glove_model = "../glove/glove.6B.100d.txt"
    glove_api.load_model(path_to_glove_model)
    print("Loading language model...OK")

    print("Loading ontology classes...")
    # Build the stopword set once rather than per token.
    stop_words = set(stopwords.words('english'))
    names = []
    # Load classes and pre-compute their token embeddings.
    for kr_name, kr_handler in om.kr_handlers.items():
        for cl in kr_handler.classes():
            cl = nlp.camelcase_to_snakecase(cl)
            cl = cl.replace('-', ' ').replace('_', ' ').lower()
            svs = []
            for token in cl.split():
                if token not in stop_words:
                    sv = glove_api.get_embedding_for_word(token)
                    if sv is not None:
                        svs.append(sv)
            # NOTE: stores the normalized (lowercased) class name.
            names.append(('class', cl, svs))
    print("Loading ontology classes...OK")

    while True:
        # Get words
        i = input("introduce two words separated by space to get similarity. EXIT to exit")
        tokens = i.split(' ')
        if tokens[0] == "EXIT":
            print("bye!")
            break
        svs = []
        for t in tokens:
            sv = glove_api.get_embedding_for_word(t)
            if sv is not None:
                svs.append(sv)
            else:
                print("No vec for : " + str(t))
        for _, cl, vecs in names:
            sim = SS.compute_semantic_similarity(svs, vecs)
            if sim > 0.4:
                print(str(cl) + " -> " + str(sim))
def retrieve_class_names(kr_handlers, num_perm=32):
    """Build MinHash signatures for every ontology class name.

    Args:
        kr_handlers: dict mapping KR name -> handler exposing classes().
        num_perm: number of MinHash permutations per signature.

    Returns:
        List of ('class', (kr_name, original_class_name), MinHash) tuples.
    """
    # Hoist the stopword list into a set once; the previous version
    # rebuilt the English stopword list for every token.
    stop_words = set(stopwords.words('english'))
    names = list()
    for kr_name, kr_handler in kr_handlers.items():
        for cl in kr_handler.classes():
            original_cl_name = cl
            # Normalize camelCase/kebab/snake to lowercase words.
            cl = nlp.camelcase_to_snakecase(cl)
            cl = cl.replace('-', ' ').replace('_', ' ').lower()
            m = MinHash(num_perm=num_perm)
            for token in cl.split():
                if token not in stop_words:
                    m.update(token.encode('utf8'))
            names.append(('class', (kr_name, original_cl_name), m))
    return names
def find_matching_to_text(network, semantic_sim_threshold=0.5,
                          sensitivity_neg_signal=0.5,
                          negative_signal_threshold=0.4,
                          penalize_unknown_word=False,
                          add_exact_matches=True,
                          reference_name="",
                          reference_gen=None):
    """Match attribute names against an external reference vocabulary using
    embedding similarity, returning both positive and negative evidence.

    Args:
        network: model exposing iterate_values() yielding
            (db_name, source_name, field_name, _) tuples.
        semantic_sim_threshold: minimum similarity for a positive match.
        sensitivity_neg_signal: signal-strength threshold passed to SS.
        negative_signal_threshold: below this (with a strong signal) the
            pair is recorded as a negative match.
        penalize_unknown_word / add_exact_matches: forwarded to
            SS.compute_semantic_similarity.
        reference_name: label recorded for the reference vocabulary.
        reference_gen: iterable of reference names (e.g. class names).

    Returns:
        Tuple (pos_matchings, neg_matchings); each entry is
        ((db_name, source_name, field_name), (reference_name, ref_value)).
    """
    st = time.time()

    # Build the stopword set once instead of per token in the loops below.
    stop_words = set(stopwords.words('english'))

    def _embeddings_for(text):
        # Lowercase, split on dashes/underscores, keep embeddings of
        # non-stopword tokens that the model knows.
        text = text.replace('-', ' ').replace('_', ' ').lower()
        svs = []
        for token in text.split():
            if token not in stop_words:
                sv = glove_api.get_embedding_for_word(token)
                if sv is not None:
                    svs.append(sv)
        return svs

    names = []
    seen_fields = set()
    for (db_name, source_name, field_name, _) in network.iterate_values():
        orig_field_name = field_name
        # Dedup on (source, field) so the same column name in different
        # tables is still considered.
        key_seen = source_name + field_name
        if key_seen not in seen_fields:
            seen_fields.add(key_seen)
            # NOTE: unlike the class loop below, field names also go
            # through camelcase normalization first.
            field_name = nlp.camelcase_to_snakecase(field_name)
            names.append(('attribute',
                          (db_name, source_name, orig_field_name),
                          _embeddings_for(field_name)))

    num_attributes_inserted = len(names)

    # Append the reference vocabulary after the attributes.
    for cl in reference_gen:
        names.append(('class', (reference_name, cl), _embeddings_for(cl)))

    print("N equals: " + str(len(names)))

    pos_matchings = []
    neg_matchings = []
    for idx_class in range(num_attributes_inserted, len(names)):
        # Compare only with attributes (the head of `names`).
        for idx_rel in range(0, num_attributes_inserted):
            ban_index1, ban_index2 = get_ban_indexes(names[idx_rel][1][2],
                                                     names[idx_class][1][1])
            svs_rel = remove_banned_vectors(ban_index1, names[idx_rel][2])
            svs_cla = remove_banned_vectors(ban_index2, names[idx_class][2])
            semantic_sim, strong_signal = SS.compute_semantic_similarity(
                svs_rel, svs_cla,
                penalize_unknown_word=penalize_unknown_word,
                add_exact_matches=add_exact_matches,
                signal_strength_threshold=sensitivity_neg_signal)
            if strong_signal and semantic_sim > semantic_sim_threshold:
                # match format: (db_name, source_name, field_name) -> class
                match = ((names[idx_rel][1][0], names[idx_rel][1][1],
                          names[idx_rel][1][2]), names[idx_class][1])
                pos_matchings.append(match)
                continue  # FIXME: one matching per entity
            elif strong_signal and semantic_sim < negative_signal_threshold:
                match = ((names[idx_rel][1][0], names[idx_rel][1][1],
                          names[idx_rel][1][2]), names[idx_class][1])
                neg_matchings.append(match)

    et = time.time()
    print("l52: " + str(et - st))
    return pos_matchings, neg_matchings
def find_relation_class_name_sem_matchings(network, kr_handlers):
    """Match relation (table) names to ontology class names using
    word-embedding similarity with unknown-word penalization.

    Unlike the MinHash variant, token embeddings are appended even when
    the model has no vector (None), so SS.compute_semantic_similarity can
    penalize unknown words.

    Args:
        network: model exposing iterate_values() yielding
            (db_name, source_name, field_name, _) tuples.
        kr_handlers: dict mapping KR name -> handler exposing classes().

    Returns:
        List of ((db_name, source_name, "_"), (kr_name, class_name)).
    """
    st = time.time()

    # Build the stopword set once; the previous version re-created the
    # English stopword list for every token.
    stop_words = set(stopwords.words('english'))

    def _embeddings_with_unknowns(text):
        # Lowercase, split on dashes/underscores; keep a slot for EVERY
        # non-stopword token, including None for out-of-vocabulary ones,
        # so unknown-word penalization can be applied later.
        text = text.replace('-', ' ').replace('_', ' ').lower()
        svs = []
        for token in text.split():
            if token not in stop_words:
                svs.append(glove_api.get_embedding_for_word(token))
        return svs

    names = []
    seen_sources = set()
    for (db_name, source_name, _, _) in network.iterate_values():
        if source_name not in seen_sources:
            seen_sources.add(source_name)
            # NOTE: relation names are NOT camelcase-normalized here,
            # unlike the class names below — preserved as found.
            names.append(('relation', (db_name, source_name),
                          _embeddings_with_unknowns(source_name)))

    num_relations_inserted = len(names)

    # Append every ontology class name after the relations.
    for kr_name, kr_handler in kr_handlers.items():
        for cl in kr_handler.classes():
            original_cl_name = cl
            cl = nlp.camelcase_to_snakecase(cl)
            names.append(('class', (kr_name, original_cl_name),
                          _embeddings_with_unknowns(cl)))

    matchings = []
    for idx_rel in range(0, num_relations_inserted):
        # Compare relations only against classes (the tail of `names`).
        for idx_class in range(num_relations_inserted, len(names)):
            svs_rel = names[idx_rel][2]
            svs_cla = names[idx_class][2]
            semantic_sim = SS.compute_semantic_similarity(
                svs_rel, svs_cla,
                penalize_unknown_word=True,
                add_exact_matches=False)
            if semantic_sim > 0.5:
                # match format: (db_name, source_name, "_") -> class
                match = ((names[idx_rel][1][0], names[idx_rel][1][1], "_"),
                         names[idx_class][1])
                matchings.append(match)

    et = time.time()
    print("Time to relation-class (sem): " + str(et - st))
    return matchings