# Standard imports required by the helpers below; `nlp` (string curation) and
# `glove_api` (GloVe embeddings) are project-local modules assumed to be
# imported at the top of this file.
import itertools

import numpy as np
from nltk.corpus import stopwords


def extract_cohesive_groups1(table_name, attrs):
    # Collect curated tokens from the table name and all attribute names
    tokens = set()
    ctb = nlp.curate_string(table_name)
    tokens |= set(ctb.split(' '))
    for attr in attrs:
        cattr = nlp.curate_string(attr)
        tokens |= set(cattr.split(' '))
    # Keep only non-stopword tokens longer than one character that have an embedding
    english_stopwords = set(stopwords.words('english'))
    token_vector = []
    for t in tokens:
        if t in english_stopwords or len(t) <= 1:
            continue
        vec = glove_api.get_embedding_for_word(t)
        if vec is not None:
            token_vector.append((t, vec))
    threshold = 0.5
    group = set()
    # Any pair of tokens above the similarity threshold joins the single group
    for a, b in itertools.combinations(token_vector, 2):
        sim = glove_api.semantic_distance(a[1], b[1])
        if sim > threshold:
            group.add(a[0])
            group.add(b[0])
    return [(threshold, group)]

def does_it_keep_group_coherent(running_group, a, b, threshold):
    # The pair (a, b) keeps the group coherent only if both words are closer
    # than `threshold` to every word already in the group
    if len(running_group) == 0:
        return True
    av = glove_api.get_embedding_for_word(a)
    bv = glove_api.get_embedding_for_word(b)
    for el in running_group:
        elv = glove_api.get_embedding_for_word(el)
        if glove_api.semantic_distance(elv, av) <= threshold:
            return False
        if glove_api.semantic_distance(elv, bv) <= threshold:
            return False
    return True

def groupwise_semantic_sim(sv1, sv2, threshold):
    to_ret = False  # the default is False when there is nothing to compare
    for a, b in itertools.product(sv1, sv2):
        sim = glove_api.semantic_distance(a, b)
        if sim < threshold:
            # terminate as soon as one pair fails to satisfy the threshold
            return False
        to_ret = True  # after at least one successful comparison, the default becomes True
    return to_ret

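# Illustrative sketch, not part of the original module: groupwise_semantic_sim
# is an all-pairs check, so a single weak pair rejects the two groups entirely.
# Assumes a GloVe model has already been loaded through glove_api; the example
# words and the 0.5 threshold are arbitrary.
def _demo_groupwise_semantic_sim():
    sv1 = [glove_api.get_embedding_for_word(w) for w in ('car', 'vehicle')]
    sv2 = [glove_api.get_embedding_for_word(w) for w in ('truck', 'bus')]
    sv1 = [v for v in sv1 if v is not None]  # drop out-of-vocabulary words
    sv2 = [v for v in sv2 if v is not None]
    # True only if every (v1, v2) pair clears the threshold
    return groupwise_semantic_sim(sv1, sv2, 0.5)
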
def compute_internal_cohesion(sv):
    # Internal cohesion: mean pairwise semantic distance within one set of vectors
    semantic_sim_array = []
    for a, b in itertools.combinations(sv, 2):
        sem_sim = glove_api.semantic_distance(a, b)
        semantic_sim_array.append(sem_sim)
    coh = 0
    if len(semantic_sim_array) > 0:  # avoid taking the mean of an empty slice
        coh = np.mean(semantic_sim_array)
    return coh

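# Illustrative sketch, not part of the original module: near-synonyms yield a
# high cohesion score, while unrelated tokens drag the mean down. The example
# words are arbitrary and assume a loaded GloVe model.
def _demo_internal_cohesion():
    words = ['city', 'town', 'village']
    vectors = [glove_api.get_embedding_for_word(w) for w in words]
    vectors = [v for v in vectors if v is not None]  # drop out-of-vocabulary words
    return compute_internal_cohesion(vectors)
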
def compute_sem_distance_with(x, sv):
    # Mean semantic distance between a single vector x and every vector in sv
    semantic_sim_array = []
    for el in sv:
        if x is not None and el is not None:
            sem_sim = glove_api.semantic_distance(x, el)
            semantic_sim_array.append(sem_sim)
    ssim = 0
    if len(semantic_sim_array) > 0:  # avoid taking the mean of an empty slice
        ssim = np.mean(semantic_sim_array)
    return ssim

def compute_internal_cohesion_elementwise(x, sv):
    # Mean semantic distance between x and each element of sv, used as the
    # elementwise view of internal cohesion
    semantic_sim_array = []
    for el in sv:
        if x is not None and el is not None:
            sem_sim = glove_api.semantic_distance(x, el)
            semantic_sim_array.append(sem_sim)
    coh = 0
    if len(semantic_sim_array) > 0:  # avoid taking the mean of an empty slice
        coh = np.mean(semantic_sim_array)
    return coh

def extract_cohesive_groups(table_name, attrs, sem_sim_threshold=0.7, group_size_cutoff=0):

    def does_it_keep_group_coherent(running_group, a, b, threshold):
        # Same coherence test as the module-level helper: both words must be
        # closer than `threshold` to every word already in the group
        if len(running_group) == 0:
            return True
        av = glove_api.get_embedding_for_word(a)
        bv = glove_api.get_embedding_for_word(b)
        for el in running_group:
            elv = glove_api.get_embedding_for_word(el)
            if glove_api.semantic_distance(elv, av) <= threshold:
                return False
            if glove_api.semantic_distance(elv, bv) <= threshold:
                return False
        return True

    # Collect curated tokens from the table name and all attribute names
    tokens = set()
    ctb = nlp.curate_string(table_name)
    tokens |= set(ctb.split(' '))
    for attr in attrs:
        cattr = nlp.curate_string(attr)
        tokens |= set(cattr.split(' '))
    tokens = [t for t in tokens if t not in stopwords.words('english') and len(t) > 1]

    running_groups = [set()]
    for a, b in itertools.combinations(tokens, 2):
        av = glove_api.get_embedding_for_word(a)
        bv = glove_api.get_embedding_for_word(b)
        if av is None or bv is None:
            continue
        sim = glove_api.semantic_distance(av, bv)
        if sim > sem_sim_threshold:
            # Try to add the pair to every existing group it keeps coherent
            added_to_existing_group = False
            for running_group in running_groups:
                if does_it_keep_group_coherent(running_group, a, b, sem_sim_threshold):
                    added_to_existing_group = True  # add to as many groups as necessary
                    running_group.add(a)
                    running_group.add(b)
            if not added_to_existing_group:
                running_groups.append({a, b})  # seed a new group with the pair
    return [(sem_sim_threshold, group) for group in running_groups if len(group) > group_size_cutoff]

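# Illustrative sketch, not part of the original module: typical call for
# extract_cohesive_groups on one table's schema. The table and attribute names
# are made up; the thresholds shown are the function defaults.
def _demo_extract_cohesive_groups():
    table_name = 'employee_salary'
    attrs = ['employee name', 'salary amount', 'department', 'hire date']
    # Returns [(threshold, group_set), ...], one entry per cohesive token group
    return extract_cohesive_groups(table_name, attrs, sem_sim_threshold=0.7, group_size_cutoff=0)
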
def extract_cohesive_groups2(table_name, attrs):

    def maybe_add_new_set(groups, current):
        # Filter out duplicate sets, as well as sets subsumed by (or subsuming)
        # an already-recorded one
        _, current_set = current
        for _, set_attrs in groups:
            if len(current_set) == len(set_attrs) and len(current_set - set_attrs) == 0:
                return  # repeated set: return without adding
            if len(current_set) > len(set_attrs):
                if len(set_attrs - current_set) == 0:
                    return
            else:
                if len(current_set - set_attrs) == 0:
                    return
        groups.append(current)  # otherwise add and finish

    groups = []
    # Collect curated tokens from the table name and all attribute names
    tokens = set()
    ctb = nlp.curate_string(table_name)
    tokens |= set(ctb.split(' '))
    for attr in attrs:
        cattr = nlp.curate_string(attr)
        tokens |= set(cattr.split(' '))
    tokens = [t for t in tokens if t not in stopwords.words('english') and len(t) > 1]

    threshold = 0.7
    for anchor in tokens:
        # `current` keeps (cohesiveness score, set of tokens that honor it)
        current = (threshold, set())
        anchor_v = glove_api.get_embedding_for_word(anchor)
        for t in tokens:
            if anchor == t:  # not interested in self-comparison
                continue
            t_v = glove_api.get_embedding_for_word(t)
            if anchor_v is not None and t_v is not None:
                ss = glove_api.semantic_distance(anchor_v, t_v)
                if ss > current[0]:
                    new_set = current[1]
                    new_set.add(anchor)
                    new_set.add(t)
                    current = (threshold, new_set)
        if len(current[1]) > 0:
            maybe_add_new_set(groups, current)
    return groups

def compute_semantic_similarity_min_average(sv1, sv2):
    # Score each vector in sv1 by its *worst* match in sv2, then average
    global_sim = []
    for v1 in sv1:
        local_sim = [glove_api.semantic_distance(v1, v2) for v2 in sv2]
        if len(local_sim) == 0:
            continue
        global_sim.append(min(local_sim))
    gs = 0
    if len(global_sim) > 1:
        gs = np.mean(global_sim)
    elif len(global_sim) == 1:
        gs = global_sim[0]
    return gs

def compute_semantic_similarity_median(sv1, sv2):
    # Median of per-vector medians: robust to outlier word pairs
    global_sim = []
    for v1 in sv1:
        local_sim = [glove_api.semantic_distance(v1, v2) for v2 in sv2]
        ls = 0
        if len(local_sim) > 1:
            ls = np.median(local_sim)
        elif len(local_sim) == 1:
            ls = local_sim[0]
        global_sim.append(ls)
    gs = 0
    if len(global_sim) > 1:
        gs = np.median(global_sim)
    elif len(global_sim) == 1:
        gs = global_sim[0]
    return gs

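# Illustrative sketch, not part of the original module: the two aggregations
# above answer different questions over the same pairwise distances. The
# min-average variant scores each word in sv1 by its worst match in sv2, so it
# tends to be the more conservative of the two. Example words are arbitrary
# and assume a loaded GloVe model.
def _demo_compare_aggregations():
    sv1 = [v for v in (glove_api.get_embedding_for_word(w) for w in ('student', 'teacher')) if v is not None]
    sv2 = [v for v in (glove_api.get_embedding_for_word(w) for w in ('school', 'classroom')) if v is not None]
    return (compute_semantic_similarity_min_average(sv1, sv2),
            compute_semantic_similarity_median(sv1, sv2))
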
def compute_semantic_similarity(sv1, sv2,
                                penalize_unknown_word=False,
                                add_exact_matches=True,
                                signal_strength_threshold=0.5):
    total_comparisons = 0
    skipped_comparisons = 0
    accum = []
    for a, b in itertools.product(sv1, sv2):
        if a is not None and b is not None:
            if not (a == b).all() or add_exact_matches:
                total_comparisons += 1
                sim = glove_api.semantic_distance(a, b)
                accum.append(sim)
            elif (a == b).all() and not add_exact_matches:
                skipped_comparisons += 1  # exact matches do not add information
        elif penalize_unknown_word:
            # one side is None (word not in the dictionary): penalize with sim = 0
            skipped_comparisons += 1
            accum.append(0)
    sim = 0
    if len(accum) > 0:
        sim = np.mean(accum)
    strong_signal = False
    if total_comparisons == 0:
        # In this case we cannot judge the semantics, as no word was in the
        # dictionary. Capture the case of [] vs [a, ..., n] with many words:
        # the intuition is that many words convey a lot of "meaning".
        if len(sv1) > 2 or len(sv2) > 2:
            return sim, True
        return sim, strong_signal
    total_of_all_comparisons = skipped_comparisons + total_comparisons
    ratio_of_strong_signal = 0
    if total_of_all_comparisons > 0:
        ratio_of_strong_signal = float(total_comparisons / total_of_all_comparisons)
    # If not many comparisons were skipped, then this is a strong signal
    if ratio_of_strong_signal >= signal_strength_threshold:
        strong_signal = True
    return sim, strong_signal

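# Illustrative sketch, not part of the original module: callers of
# compute_semantic_similarity are expected to gate on the strong_signal flag
# before trusting the score. None entries stand for out-of-vocabulary words and
# only contribute when penalize_unknown_word=True. Example inputs are arbitrary
# and assume a loaded GloVe model.
def _demo_semantic_similarity_with_signal():
    sv1 = [glove_api.get_embedding_for_word(w) for w in ('price', 'cost')]
    sv2 = [glove_api.get_embedding_for_word(w) for w in ('amount', 'total')]
    sim, strong_signal = compute_semantic_similarity(sv1, sv2, penalize_unknown_word=True)
    if strong_signal:
        return sim
    return None  # too few real comparisons to trust the score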