def extract_cohesive_groups1(table_name, attrs):
    """Build one cohesive token group from a table name and its attributes.

    Tokens are curated, filtered against English stopwords, embedded with
    GloVe, and every token pair whose semantic distance exceeds a fixed
    threshold is merged into a single group.

    :param table_name: name of the relation
    :param attrs: iterable of attribute (column) names
    :return: list containing one (threshold, set_of_tokens) tuple
    """
    tokens = set()
    ctb = nlp.curate_string(table_name)
    tokens |= set(ctb.split(' '))
    for attr in attrs:
        cattr = nlp.curate_string(attr)
        tokens |= set(cattr.split(' '))
    # Hoist stopwords into a set once: stopwords.words() builds a fresh list
    # on every call and membership tests on a list are O(n).
    english_stopwords = set(stopwords.words('english'))
    # Embed each surviving token exactly once (the original queried
    # glove_api twice per token inside a single comprehension).
    token_vector = []
    for t in tokens:
        if t in english_stopwords or len(t) <= 1:
            continue
        vec = glove_api.get_embedding_for_word(t)
        if vec is not None:
            token_vector.append((t, vec))
    threshold = 0.5
    group = set()
    for a, b in itertools.combinations(token_vector, 2):
        sim = glove_api.semantic_distance(a[1], b[1])
        if sim > threshold:
            group.add(a[0])
            group.add(b[0])
    return [(threshold, group)]
def find_relation_class_attr_name_sem_matchings(network, kr_handlers):
    """Match attribute (column) names against ontology class names by
    semantic similarity of their GloVe embeddings.

    :param network: model exposing iterate_values() -> (db, source, field, _)
    :param kr_handlers: dict of knowledge-representation name -> handler
    :return: list of ((db_name, source_name, field_name), (kr_name, class_name))
             matches whose semantic similarity exceeds 0.8
    """
    st = time.time()
    names = []
    english_stopwords = set(stopwords.words('english'))  # hoisted: one build, O(1) lookups
    seen_fields = set()  # set instead of list: O(1) membership instead of O(n) scans
    # Retrieve relation attribute names, one entry per distinct field name
    for (db_name, source_name, field_name, _) in network.iterate_values():
        orig_field_name = field_name
        if field_name not in seen_fields:
            seen_fields.add(field_name)
            field_name = nlp.camelcase_to_snakecase(field_name)
            field_name = field_name.replace('-', ' ')
            field_name = field_name.replace('_', ' ')
            field_name = field_name.lower()
            svs = []
            for token in field_name.split():
                if token not in english_stopwords:
                    sv = glove_api.get_embedding_for_word(token)
                    if sv is not None:
                        svs.append(sv)
            names.append(
                ('attribute', (db_name, source_name, orig_field_name), svs))
    num_attributes_inserted = len(names)
    # Retrieve class names from every knowledge representation
    for kr_name, kr_handler in kr_handlers.items():
        all_classes = kr_handler.classes()
        for cl in all_classes:
            original_cl_name = cl
            cl = nlp.camelcase_to_snakecase(cl)
            cl = cl.replace('-', ' ')
            cl = cl.replace('_', ' ')
            cl = cl.lower()
            svs = []
            for token in cl.split():
                if token not in english_stopwords:
                    sv = glove_api.get_embedding_for_word(token)
                    if sv is not None:
                        svs.append(sv)
            names.append(('class', (kr_name, original_cl_name), svs))
    matchings = []
    for idx_rel in range(0, num_attributes_inserted):
        # Compare attributes only against classes (second half of names)
        for idx_class in range(num_attributes_inserted, len(names)):
            svs_rel = names[idx_rel][2]
            svs_cla = names[idx_class][2]
            semantic_sim = SS.compute_semantic_similarity(svs_rel, svs_cla)
            if semantic_sim > 0.8:
                # match format: (db_name, source_name, field_name) -> class
                match = ((names[idx_rel][1][0], names[idx_rel][1][1],
                          names[idx_rel][1][2]), names[idx_class][1])
                matchings.append(match)
    et = time.time()
    print("Time to relation-class (sem): " + str(et - st))
    return matchings
def extract_cohesive_groups(table_name, attrs, sem_sim_threshold=0.7, group_size_cutoff=0):
    """Extract groups of semantically cohesive tokens from a table name and
    its attributes.

    Token pairs whose semantic distance exceeds ``sem_sim_threshold`` are
    added to every existing group they keep coherent, or seed a new group.

    :param table_name: name of the relation
    :param attrs: iterable of attribute (column) names
    :param sem_sim_threshold: minimum pairwise similarity to group tokens
    :param group_size_cutoff: only groups strictly larger than this are returned
    :return: list of (sem_sim_threshold, group_set) tuples
    """
    _emb_cache = {}

    def emb(word):
        # Memoized embedding lookup; the original re-queried glove_api for
        # the same token many times per call.
        if word not in _emb_cache:
            _emb_cache[word] = glove_api.get_embedding_for_word(word)
        return _emb_cache[word]

    def does_it_keep_group_coherent(running_group, a, b, threshold):
        # Empty groups accept any pair.
        if len(running_group) == 0:
            return True
        av = emb(a)
        bv = emb(b)
        # NOTE(review): every branch below returns during the FIRST loop
        # iteration, so coherence is decided by one arbitrary element of an
        # unordered set. Kept as-is to preserve behavior — confirm intent.
        for el in running_group:
            elv = emb(el)
            sim_a = glove_api.semantic_distance(elv, av)
            if sim_a > threshold:
                sim_b = glove_api.semantic_distance(elv, bv)
                if sim_b > threshold:
                    return True
                else:
                    return False
            else:
                return False

    tokens = set()
    ctb = nlp.curate_string(table_name)
    tokens |= set(ctb.split(' '))
    for attr in attrs:
        cattr = nlp.curate_string(attr)
        tokens |= set(cattr.split(' '))
    # Hoisted stopword set: one build instead of one per token.
    english_stopwords = set(stopwords.words('english'))
    tokens = [t for t in tokens if t not in english_stopwords and len(t) > 1]
    running_groups = [set()]
    for a, b in itertools.combinations(tokens, 2):
        av = emb(a)
        bv = emb(b)
        if av is None or bv is None:
            continue  # tokens without embeddings cannot be compared
        sim = glove_api.semantic_distance(av, bv)
        if sim > sem_sim_threshold:
            # Try to add the pair to every existing group it keeps coherent
            added_to_existing_group = False
            for running_group in running_groups:
                ans = does_it_keep_group_coherent(running_group, a, b,
                                                  sem_sim_threshold)
                if ans:  # add to as many groups as necessary
                    added_to_existing_group = True
                    running_group.add(a)
                    running_group.add(b)
            if not added_to_existing_group:
                running_group = set()
                running_group.add(a)
                running_group.add(b)
                running_groups.append(running_group)
    return [(sem_sim_threshold, group) for group in running_groups
            if len(group) > group_size_cutoff]
def find_sem_coh_matchings(network, kr_handlers):
    """Match tables to ontology classes via cohesive token groups.

    Each table's cohesive token groups are embedded and compared against
    every class name; a class matches when the group-to-class semantic
    similarity exceeds the group's own cohesiveness score.

    :param network: model exposing its relations via SS.read_table_columns
    :param kr_handlers: dict of knowledge-representation name -> handler
    :return: (matchings, table_groups) where matchings is a list of
             ((db, table, "_"), (kr_name, class_name)) and table_groups maps
             (db, table) -> list of (score, token_set) groups
    """
    matchings = []
    english_stopwords = set(stopwords.words('english'))  # hoisted, set for O(1) lookups
    # Get all relations with their cohesive token groups
    table_groups = dict()
    for db, t, attrs in SS.read_table_columns(None, network=network):
        groups = SS.extract_cohesive_groups(t, attrs)
        table_groups[(db, t)] = groups  # (score, [set()])
    names = []
    # Retrieve class names
    for kr_name, kr_handler in kr_handlers.items():
        all_classes = kr_handler.classes()
        for cl in all_classes:
            original_cl_name = cl
            cl = nlp.camelcase_to_snakecase(cl)
            cl = cl.replace('-', ' ')
            cl = cl.replace('_', ' ')
            cl = cl.lower()
            svs = []
            for token in cl.split():
                if token not in english_stopwords:
                    sv = glove_api.get_embedding_for_word(token)
                    if sv is not None:
                        svs.append(sv)
            names.append(('class', (kr_name, original_cl_name), svs))
    for db_table_info, groups in table_groups.items():
        db_name, table_name = db_table_info
        class_seen = set()  # set: O(1) dedup of already-matched classes
        for g_score, g_tokens in groups:
            g_svs = []
            for t in g_tokens:
                sv = glove_api.get_embedding_for_word(t)
                if sv is not None:
                    g_svs.append(sv)
            for _, class_info, class_svs in names:
                kr_name, class_name = class_info
                sim = SS.compute_semantic_similarity(class_svs, g_svs)
                # The group's cohesiveness score doubles as the match threshold
                if sim > g_score and class_name not in class_seen:
                    class_seen.add(class_name)
                    match = ((db_name, table_name, "_"), (kr_name, class_name))
                    matchings.append(match)
    return matchings, table_groups
def extract_cohesive_groups2(table_name, attrs):
    """Extract cohesive token groups, one candidate group per anchor token.

    For each anchor token, every other token whose semantic distance to the
    anchor exceeds a fixed threshold joins the anchor's group. Duplicate and
    subsumed groups are filtered before being returned.

    :param table_name: name of the relation
    :param attrs: iterable of attribute (column) names
    :return: list of (threshold, token_set) groups
    """
    def maybe_add_new_set(groups, current):
        # Filter duplicate sets, and subsumed sets as well.
        score, current_set = current
        for _, set_attrs in groups:  # renamed loop var: was shadowing `score`
            if len(current_set) == len(set_attrs) and len(current_set - set_attrs) == 0:
                return  # repeated: return without adding
            len_a = len(current_set)
            len_b = len(set_attrs)
            # NOTE(review): when the new set strictly contains an existing
            # one this keeps the SMALLER set and drops the larger candidate;
            # kept as-is to preserve behavior — confirm intent.
            if len_a > len_b:
                if len(set_attrs - current_set) == 0:
                    return
            else:
                if len((current_set - set_attrs)) == 0:
                    return
        groups.append(current)  # otherwise add and finish

    groups = []
    tokens = set()
    ctb = nlp.curate_string(table_name)
    tokens |= set(ctb.split(' '))
    for attr in attrs:
        cattr = nlp.curate_string(attr)
        tokens |= set(cattr.split(' '))
    # Hoisted stopword set: one build instead of one per token.
    english_stopwords = set(stopwords.words('english'))
    tokens = [t for t in tokens if t not in english_stopwords and len(t) > 1]
    for anchor in tokens:
        threshold = 0.7
        # keeps (score, set): cohesiveness score and the tokens that honor it
        current = (threshold, set())
        # Hoisted: the anchor embedding is loop-invariant for the inner loop
        anchor_v = glove_api.get_embedding_for_word(anchor)
        for t in tokens:
            if anchor == t:  # not interested in self-comparison
                continue
            t_v = glove_api.get_embedding_for_word(t)
            if anchor_v is not None and t_v is not None:
                ss = glove_api.semantic_distance(anchor_v, t_v)
                if ss > current[0]:
                    new_set = current[1]
                    new_set.add(anchor)
                    new_set.add(t)
                    current = (threshold, new_set)
        if len(current[1]) > 0:
            maybe_add_new_set(groups, current)
    return groups
def test_find_semantic_sim(path_to_glove_model="../glove/glove.6B.100d.txt"):
    """Interactive sanity check: read word pairs from stdin and print
    ontology classes whose embeddings are similar to the input tokens.

    :param path_to_glove_model: path to the GloVe text model to load
        (parameterized; default preserves the previous hard-coded path)
    """
    # Load ontology
    om = SSAPI(None, None, None, None)
    # Load parsed ontology
    om.add_krs([("dbpedia", "cache_onto/schemaorg.pkl")], parsed=True)
    # Load glove model
    print("Loading language model...")
    glove_api.load_model(path_to_glove_model)
    print("Loading language model...OK")
    print("Loading ontology classes...")
    english_stopwords = set(stopwords.words('english'))  # hoisted, O(1) lookups
    names = []
    # Load classes
    for kr_name, kr_handler in om.kr_handlers.items():
        all_classes = kr_handler.classes()
        for cl in all_classes:
            cl = nlp.camelcase_to_snakecase(cl)
            cl = cl.replace('-', ' ')
            cl = cl.replace('_', ' ')
            cl = cl.lower()
            svs = []
            for token in cl.split():
                if token not in english_stopwords:
                    sv = glove_api.get_embedding_for_word(token)
                    if sv is not None:
                        svs.append(sv)
            names.append(('class', cl, svs))
    print("Loading ontology classes...OK")
    while True:
        # Get words from the user; EXIT breaks the loop
        i = input("introduce two words separated by space to get similarity. EXIT to exit")
        tokens = i.split(' ')
        if tokens[0] == "EXIT":
            print("bye!")
            break
        svs = []
        for t in tokens:
            sv = glove_api.get_embedding_for_word(t)
            if sv is not None:
                svs.append(sv)
            else:
                print("No vec for : " + str(t))
        for _, cl, vecs in names:
            sim = SS.compute_semantic_similarity(svs, vecs)
            if sim > 0.4:
                print(str(cl) + " -> " + str(sim))
def does_it_keep_group_coherent(running_group, a, b, threshold):
    """Return True iff adding tokens *a* and *b* keeps *running_group* coherent,
    i.e. both are semantically close to EVERY current member.

    :param running_group: set of tokens already in the group
    :param a: first candidate token
    :param b: second candidate token
    :param threshold: minimum semantic similarity required
    :return: True when the pair can join the group (an empty group accepts any pair)
    """
    if len(running_group) == 0:
        return True
    av = glove_api.get_embedding_for_word(a)
    bv = glove_api.get_embedding_for_word(b)
    # BUGFIX: guard missing embeddings instead of passing None into
    # semantic_distance.
    if av is None or bv is None:
        return False
    for el in running_group:
        elv = glove_api.get_embedding_for_word(el)
        if elv is None:
            return False
        # BUGFIX: the original returned inside the first loop iteration on
        # every branch, so coherence was decided by one arbitrary element of
        # the (unordered) set. Now every member must accept both candidates.
        if glove_api.semantic_distance(elv, av) <= threshold:
            return False
        if glove_api.semantic_distance(elv, bv) <= threshold:
            return False
    return True
def get_semantic_vectors_for(tokens):
    """Return the GloVe embedding of every token that has one.

    Tokens without an embedding are silently skipped.

    :param tokens: iterable of token strings
    :return: list of embedding vectors, in token order
    """
    lookups = (glove_api.get_embedding_for_word(token) for token in tokens)
    return [vector for vector in lookups if vector is not None]
def generate_table_vectors(path_to_serialized_model, network=False):
    """Build, per table, the list of GloVe vectors of its distinct column tokens.

    :param path_to_serialized_model: path to the serialized model, or None
        when reading directly from *network*
    :param network: optional model to read table columns from
    :return: dict mapping (db_name, table_name) -> list of embedding vectors
    """
    table_vectors = dict()
    english_stopwords = set(stopwords.words('english'))  # hoisted, O(1) lookups
    for db_name, table_name, cols in read_table_columns(
            path_to_serialized_model, network=network):
        semantic_vectors = []
        seen_tokens = set()  # set: O(1) dedup instead of O(n) list scans
        for c in cols:
            c = c.replace('_', ' ')
            tokens = c.split(' ')
            for token in tokens:
                token = token.lower()
                # Each distinct non-stopword token contributes one vector
                if token not in english_stopwords and token not in seen_tokens:
                    seen_tokens.add(token)
                    vec = glove_api.get_embedding_for_word(token)
                    if vec is not None:
                        semantic_vectors.append(vec)
        print("Table: " + str(table_name) + " has: " + str(len(semantic_vectors)))
        table_vectors[(db_name, table_name)] = semantic_vectors
    return table_vectors
def _get_kr_classes_vectors(self):
    """Embed every ontology class's bag-of-words representation.

    :return: dict mapping class name -> list of GloVe vectors for the
             distinct non-stopword tokens in its bag of words; classes with
             no embeddable tokens are omitted
    """
    class_vectors = dict()
    english_stopwords = set(stopwords.words('english'))  # hoisted, O(1) lookups
    for kr_name, kr in self.kr_handlers.items():
        for class_name in kr.classes_id():
            # Get bag-of-words representation of the class
            success, ret = kr.bow_repr_of(class_name, class_id=True)
            if success:
                label, bow = ret
                seen_tokens = set()  # filtering out already seen tokens
                sem_vectors = []
                for el in bow:
                    el = el.replace('_', ' ')
                    tokens = el.split(' ')
                    for token in tokens:
                        token = token.lower()
                        # BUGFIX: the original appended to seen_tokens but
                        # never consulted it, so duplicate tokens were NOT
                        # filtered despite the comment stating that intent.
                        if token not in english_stopwords and token not in seen_tokens:
                            seen_tokens.add(token)
                            sem_vector = glove_api.get_embedding_for_word(token)
                            if sem_vector is not None:
                                sem_vectors.append(sem_vector)
                # otherwise just no context generated for this class
                if len(sem_vectors) > 0:
                    class_vectors[kr.name_of_class(class_name)] = sem_vectors
            else:
                print(ret)
    return class_vectors
def find_matching_to_text(network, semantic_sim_threshold=0.5,
                          sensitivity_neg_signal=0.5,
                          negative_signal_threshold=0.4,
                          penalize_unknown_word=False,
                          add_exact_matches=True,
                          reference_name="",
                          reference_gen=None):
    """Match relation attributes against an external reference vocabulary.

    :param network: model exposing iterate_values() -> (db, source, field, _)
    :param semantic_sim_threshold: similarity above which a positive match is emitted
    :param sensitivity_neg_signal: signal-strength threshold passed to the similarity
    :param negative_signal_threshold: similarity below which a negative match is emitted
    :param penalize_unknown_word: forwarded to SS.compute_semantic_similarity
    :param add_exact_matches: forwarded to SS.compute_semantic_similarity
    :param reference_name: label stored with every reference entry
    :param reference_gen: iterable of reference class/terms to compare against
    :return: (pos_matchings, neg_matchings), each a list of
             ((db_name, source_name, field_name), (reference_name, class_name))
    """
    # Retrieve relation names
    st = time.time()
    names = []
    english_stopwords = set(stopwords.words('english'))  # hoisted, O(1) lookups
    seen_fields = set()
    for (db_name, source_name, field_name, _) in network.iterate_values():
        orig_field_name = field_name
        # Dedup by (source, field): the same field name may appear in
        # several sources and each occurrence is kept.
        key_seen = source_name + field_name
        if key_seen not in seen_fields:
            seen_fields.add(key_seen)
            field_name = nlp.camelcase_to_snakecase(field_name)
            field_name = field_name.replace('-', ' ')
            field_name = field_name.replace('_', ' ')
            field_name = field_name.lower()
            svs = []
            for token in field_name.split():
                if token not in english_stopwords:
                    sv = glove_api.get_embedding_for_word(token)
                    if sv is not None:
                        svs.append(sv)
            names.append(
                ('attribute', (db_name, source_name, orig_field_name), svs))
    num_attributes_inserted = len(names)
    # Retrieve class names from the reference generator
    for cl in reference_gen:
        original_cl_name = cl
        cl = cl.replace('-', ' ')
        cl = cl.replace('_', ' ')
        cl = cl.lower()
        svs = []
        for token in cl.split():
            if token not in english_stopwords:
                sv = glove_api.get_embedding_for_word(token)
                if sv is not None:
                    svs.append(sv)
        names.append(('class', (reference_name, original_cl_name), svs))
    print("N equals: " + str(len(names)))
    pos_matchings = []
    neg_matchings = []
    for idx_class in range(num_attributes_inserted, len(names)):
        # Compare only with attributes (first half of names)
        for idx_rel in range(0, num_attributes_inserted):
            ban_index1, ban_index2 = get_ban_indexes(names[idx_rel][1][2],
                                                     names[idx_class][1][1])
            svs_rel = remove_banned_vectors(ban_index1, names[idx_rel][2])
            svs_cla = remove_banned_vectors(ban_index2, names[idx_class][2])
            semantic_sim, strong_signal = SS.compute_semantic_similarity(
                svs_rel, svs_cla,
                penalize_unknown_word=penalize_unknown_word,
                add_exact_matches=add_exact_matches,
                signal_strength_threshold=sensitivity_neg_signal)
            if strong_signal and semantic_sim > semantic_sim_threshold:
                # match format: (db_name, source_name, field_name) -> class
                match = ((names[idx_rel][1][0], names[idx_rel][1][1],
                          names[idx_rel][1][2]), names[idx_class][1])
                pos_matchings.append(match)
                continue  # FIXME: one matching per entity
            elif strong_signal and semantic_sim < negative_signal_threshold:
                match = ((names[idx_rel][1][0], names[idx_rel][1][1],
                          names[idx_rel][1][2]), names[idx_class][1])
                neg_matchings.append(match)
    et = time.time()
    print("l52: " + str(et - st))
    return pos_matchings, neg_matchings
def find_relation_class_name_sem_matchings(network, kr_handlers):
    """Match relation (table) names against ontology class names by
    semantic similarity of their GloVe embeddings.

    Unlike the attribute variant, None embeddings are kept in the vectors so
    that compute_semantic_similarity can penalize unknown words.

    :param network: model exposing iterate_values() -> (db, source, field, _)
    :param kr_handlers: dict of knowledge-representation name -> handler
    :return: list of ((db_name, source_name, "_"), (kr_name, class_name))
             matches whose semantic similarity exceeds 0.5
    """
    # Retrieve relation names
    st = time.time()
    names = []
    english_stopwords = set(stopwords.words('english'))  # hoisted, O(1) lookups
    seen_sources = set()  # set: O(1) membership instead of O(n) list scans
    for (db_name, source_name, _, _) in network.iterate_values():
        original_source_name = source_name
        if source_name not in seen_sources:
            seen_sources.add(source_name)
            source_name = source_name.replace('-', ' ')
            source_name = source_name.replace('_', ' ')
            source_name = source_name.lower()
            svs = []
            for token in source_name.split():
                if token not in english_stopwords:
                    sv = glove_api.get_embedding_for_word(token)
                    # append even None, to apply penalization later
                    svs.append(sv)
            names.append(('relation', (db_name, original_source_name), svs))
    num_relations_inserted = len(names)
    # Retrieve class names
    for kr_name, kr_handler in kr_handlers.items():
        all_classes = kr_handler.classes()
        for cl in all_classes:
            original_cl_name = cl
            cl = nlp.camelcase_to_snakecase(cl)
            cl = cl.replace('-', ' ')
            cl = cl.replace('_', ' ')
            cl = cl.lower()
            svs = []
            for token in cl.split():
                if token not in english_stopwords:
                    sv = glove_api.get_embedding_for_word(token)
                    # append even None, to apply penalization later
                    svs.append(sv)
            names.append(('class', (kr_name, original_cl_name), svs))
    matchings = []
    for idx_rel in range(0, num_relations_inserted):
        # Compare only with classes (second half of names)
        for idx_class in range(num_relations_inserted, len(names)):
            svs_rel = names[idx_rel][2]
            svs_cla = names[idx_class][2]
            semantic_sim = SS.compute_semantic_similarity(
                svs_rel, svs_cla,
                penalize_unknown_word=True,
                add_exact_matches=False)
            if semantic_sim > 0.5:
                # match format: (db_name, source_name, "_") -> class
                match = ((names[idx_rel][1][0], names[idx_rel][1][1], "_"),
                         names[idx_class][1])
                matchings.append(match)
    et = time.time()
    print("Time to relation-class (sem): " + str(et - st))
    return matchings