def extract_jaccard_ngrams_word(tweet1, tweet2, threshold, ngram_num):
    """Extract pairs of similar n-grams between two tweets.

    Compares every n-gram of ``tweet1`` against every n-gram of ``tweet2``
    using Jaccard distance, keeping the best match for each source n-gram
    when its distance falls below a length-adjusted threshold. The threshold
    is relaxed because Jaccard distance penalises shorter n-grams. Longer
    n-grams are processed first so matches carry more context.

    Args:
        tweet1, tweet2: Tweet objects exposing ``tokenize()`` and
            ``strip_out()``. Both are MUTATED: matched text is stripped
            out as pairs are found so it is not re-matched at smaller n.
        threshold: base Jaccard-distance cut-off.
        ngram_num: largest n-gram size to consider (smallest is 5).

    Returns:
        List of (source, target) string pairs, longest-n matches first,
        duplicates excluded.
    """
    # Relax the base threshold proportionally to the maximum n-gram size.
    new_thres = threshold / (ngram_num * .1)
    result = []
    for n in reversed(range(5, ngram_num + 1)):
        tokens_a, tokens_b = tweet1.tokenize(), tweet2.tokenize()
        # BUG FIX: materialise both n-gram sequences. ``ngrams`` (NLTK-style)
        # returns a one-shot generator; the original code iterated ngrams_b
        # once per ngram_a, exhausting it after the first outer iteration so
        # every later comparison list came up empty.
        ngrams_a = list(ngrams(tokens_a, n))
        ngrams_b = list(ngrams(tokens_b, n))
        for ngram_a in ngrams_a:
            joint_a = " ".join(ngram_a)
            # NOTE(review): set() over the joined string compares *character*
            # sets, not word/token sets, despite the function name — confirm
            # this is the intended granularity.
            set_a = set(joint_a)
            candidates = []
            for ngram_b in ngrams_b:
                joint_b = " ".join(ngram_b)
                distance = jaccard_distance(set_a, set(joint_b))
                candidates.append((distance, joint_a, joint_b))
            # Use None as the "no candidates" sentinel instead of comparing a
            # (distance, str, str) tuple against the integer 1.
            best = min(candidates, key=lambda t: t[0], default=None)
            if best is not None and best[0] < new_thres and best[0] != 0:
                # Put the sentence with more out-of-vocabulary words on the
                # source side so the cleaner sentence becomes the target.
                cand_a = Tweet(best[1])
                cand_b = Tweet(best[2])
                cand_a.filter("*")
                cand_b.filter("*")
                if cand_a.oov_words() > cand_b.oov_words():
                    bi_combination = cand_a.source_filter(), cand_b.target_filter()
                else:
                    bi_combination = cand_b.source_filter(), cand_a.target_filter()
                if bi_combination not in result:
                    result.append(bi_combination)
                # Strip the matched text so smaller n values don't re-match it.
                tweet1.strip_out(cand_a.clean_text)
                tweet2.strip_out(cand_b.clean_text)
    return result
from tweet import Tweet

# A tweet and a truncated variant of the same tweet text.
s = "Aurah : “ Ves a Miriam bipolar ? ” Verdeliss : “ Las enfermedades mentales las tiene que diagnosticar un profesional , no yo , así que no ” Aurah hija , te cubres de gloria"
z = "Aurah : “ Ves a Miriam bipolar ? ” Verdeliss : “ Las enfermedades mentales las tiene que diagnosticar un profesional , no yo , …"

# Wrap both texts and apply the same "*" filter before measuring.
a, b = Tweet(s), Tweet(z)
for tw in (a, b):
    tw.filter("*")

print(a.tweet_len())
print(b.tweet_len())

# True when the filtered lengths differ by fewer than 4 units
# (can't tell from here whether tweet_len counts tokens or characters).
result = a.tweet_len() - b.tweet_len() < 4
print(result)