def url_similarity(url1, url2):
    result = 0
    if "://github.com" not in url1 and "://github.com" not in url2:
        url1_main_part = extract_url_main_part(url1)
        url2_main_part = extract_url_main_part(url2)
        result = textdistance.jaccard(url1_main_part, url2_main_part)
    elif "://github.com" in url1 and "://github.com" not in url2:
        url1_main_part = get_name_by_github_url(url1)
        # url2_main_part = extract_url_main_part(url2)
        # print "url1: ", url1, " url1_main: ", url1_main_part, " url2: ", url2, " url2_main: ", url2
        lcsstr = textdistance.lcsstr(url1_main_part, url2)
        min_len = min(len(url1_main_part), len(url2))
        if min_len == 0:
            min_len = 1
        result = len(lcsstr) * 1.0 / min_len
    elif "://github.com" not in url1 and "://github.com" in url2:
        # url1_main_part = extract_url_main_part(url1)
        url2_main_part = get_name_by_github_url(url2)
        # print "url1: ", url1, " url1_main: ", url1, " url2: ", url2, " url2_main: ", url2_main_part
        lcsstr = textdistance.lcsstr(url1, url2_main_part)
        min_len = min(len(url1), len(url2_main_part))
        if min_len == 0:
            min_len = 1
        result = len(lcsstr) * 1.0 / min_len
    else:
        url1_main_part = get_name_by_github_url(url1)
        url2_main_part = get_name_by_github_url(url2)
        result = textdistance.jaccard(url1_main_part, url2_main_part)
    return result
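# A minimal, self-contained sketch of the normalized longest-common-substring
# ratio used in the mixed GitHub/non-GitHub branches of url_similarity above.
# The values below are made-up examples; extract_url_main_part and
# get_name_by_github_url are project helpers defined elsewhere and are not
# needed for this illustration.
import textdistance

repo_name = "scikit-learn"                 # hypothetical repo name taken from a GitHub URL
other_url = "https://scikit-learn.org"     # hypothetical non-GitHub URL
lcs = textdistance.lcsstr(repo_name, other_url)  # longest common substring
ratio = len(lcs) / max(min(len(repo_name), len(other_url)), 1)
print(ratio)  # 1.0 -- the repo name appears verbatim inside the other URL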
def jaccard_euclidean(df, sample, num=15):
    vectors = np.zeros((len(df), len(sample)))
    start = time()
    checkpoint = time()
    for idx, row in df.iterrows():
        if idx % 10000 == 0:
            print('Extracting sample ' + str(idx) + '/' + str(len(df)))
            print('Time pass: ' + '{:.1f}'.format(time() - start) +
                  '. Time left: ' +
                  '{:.1f}'.format((time() - checkpoint) * (len(df) - idx) / 10000))
            checkpoint = time()
        vectors[idx] = np.asarray(
            [tdc.jaccard(row[key], sample[key]) for key in sample.keys()])
    param1 = np.asarray([
        7 / 66, 5 / 66, 11 / 66, 1 / 66, 3 / 66, 4 / 66, 6 / 66, 8 / 66,
        9 / 66, 10 / 66, 2 / 66
    ])
    # param2 is defined but currently unused below
    param2 = np.asarray([
        10 / 64, 5 / 64, 16 / 64, 1 / 64, 3 / 64, 4 / 64, 6 / 64, 3 / 64,
        4 / 64, 10 / 64, 2 / 64
    ])
    vectors = vectors * param1
    score = np.linalg.norm(vectors, axis=1) / np.linalg.norm(param1)
    # score /= np.max(score)
    score_sorted = np.flip(np.sort(score)[-num:], axis=0)
    indices = np.flip(np.argsort(score)[-num:], axis=0)
    most_similar = df.loc[indices, :]
    most_similar['SCORE'] = score_sorted
    return most_similar
def add_query_features(df, inc, exc, k1list, k2list):
    """Return a copy of a dataframe with summary features added for
    the named text files defining the query."""
    df_new = df.copy()
    k1lens = list(map(len, k1list))
    k2lens = list(map(len, k2list))
    k1max = max(k1lens)
    k2max = max(k2lens)
    k1count = len(k1list)
    k2count = len(k2list)
    df_new['k1_count'] = k1count
    df_new['k2_count'] = k2count
    df_new['k1_max'] = k1max
    df_new['k2_max'] = k2max
    jaro_dist = jellyfish.jaro_distance(inc, exc)
    lev_dist = jellyfish.levenshtein_distance(inc, exc)
    ji = textdistance.jaccard(inc, exc)
    sd = textdistance.sorensen(inc, exc)
    ro = textdistance.ratcliff_obershelp(inc, exc)
    # jellyfish.damerau_levenshtein_distance(inc, exc)
    # jellyfish.jaro_winkler(inc, exc)
    df_new['inc_jaro_exc'] = jaro_dist
    df_new['inc_lev_exc'] = lev_dist
    df_new['inc_ji_exc'] = ji
    df_new['inc_sd_exc'] = sd
    df_new['inc_ro_exc'] = ro
    return df_new
def compare_lines2headings(lines, headings):
    if headings.shape[0] == 0:
        print('Headings are empty')
        return np.zeros(len(lines)), np.zeros(len(lines)), np.zeros(len(lines))
    max_similarities = []
    for line in lines:
        ln_similarities = []
        ln_words = line.lower().split()
        # save info on whether the best comparison is to a heading or subheading
        for i, heading in headings.iterrows():
            hd = heading.Text.lower()
            hd, _ = heading_id_toc.split_pagenum(hd)
            hd_words = hd.split()
            # similarity defined by intersection/union, see "Jaccard index"
            similarity = textdistance.jaccard(ln_words, hd_words)
            ln_similarities.append([similarity, heading.Heading, i])
        best = np.array(ln_similarities)[:, 0].argmax()
        bestsim = ln_similarities[best]
        if bestsim[0] == 0:  # essentially no similarity found
            bestsim = np.array([0, 0, 0])
        max_similarities.append(bestsim)
    max_similarities = np.array(max_similarities)
    # return similarity, type matched, and i of heading matched
    return max_similarities[:, 0], max_similarities[:, 1], max_similarities[:, 2]
def search(self, word, limit=0.7):
    """Search the vocabulary for a close word, using a distance
    acceptance threshold.

    Arguments:
        word {string} -- input word

    Keyword Arguments:
        limit {float} -- distance value for accepting a word, in (0, 1)
            (default: {0.7})

    Returns:
        [new_embed] -- embedding of the accepted word
    """
    dist = list()
    for index, token in enumerate(self.words):
        if self.algorithm == "levenshtein":
            distance = textdistance.levenshtein.normalized_similarity(word, token)
            if distance > limit:
                return self.embeds[index]
            dist.append(distance)
        elif self.algorithm == "jaccard":
            distance = textdistance.jaccard(word, token)
            if distance > limit:
                return self.embeds[index]
            dist.append(distance)
        elif self.algorithm == "ratcliff_obershelp":
            distance = textdistance.ratcliff_obershelp(word, token)
            if distance > limit:
                return self.embeds[index]
            dist.append(distance)
    index = dist.index(max(dist))
    return self.embeds[index]
def similarity(line1, line2):
    """Return a closeness score in (0, 1)."""
    distance = textdistance.jaccard(line1, line2)
    # Sigmoid mapping (0, 1) to (0, 1)
    closeness = 1 - 1 / (1 + math.exp(-30 * (distance - 0.5)))
    return closeness
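# A small, self-contained check of the sigmoid mapping in similarity() above,
# on made-up inputs. Note that textdistance.jaccard returns a similarity in
# [0, 1] (1.0 for identical inputs), and the steep sigmoid centred at 0.5
# pushes the result towards 0 or 1.
import math
import textdistance

def sigmoid_closeness(a, b):
    d = textdistance.jaccard(a, b)
    return 1 - 1 / (1 + math.exp(-30 * (d - 0.5)))

print(sigmoid_closeness("abc", "xyz"))    # no shared characters: jaccard 0.0 -> closeness ~1.0
print(sigmoid_closeness("same", "same"))  # identical: jaccard 1.0 -> closeness ~0.0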
def knn_search(self, word):
    """Search the vocabulary for the closest word using KNN-style logic.

    Arguments:
        word {string} -- input word

    Returns:
        the embedding of the closest word in the vocabulary
    """
    dist = list()
    for token in self.vocabulary:
        if self.algorithm == "levenshtein":
            dist.append(
                textdistance.levenshtein.normalized_similarity(word, token))
        elif self.algorithm == "jaccard":
            dist.append(textdistance.jaccard(word, token))
        elif self.algorithm == "ratcliff_obershelp":
            dist.append(textdistance.ratcliff_obershelp(word, token))
    index = dist.index(max(dist))
    return self.embeds[index]
def calculate_similarity_matrix(selected_data_list, vector_map, weight,
                                current_index, total_length):
    full_result = []
    lack_result = []
    if selected_data_list is not None:
        # print("there are total " + str(len(selected_data_list)) + " data.")
        for i in range(0, len(selected_data_list)):
            # print("execute the NO." + str(current_index) + " list now, there are total " + str(total_length) + " list.")
            # print("calculate the NO." + str(i) + " data's similarity, there are total " + str(len(selected_data_list)) + " data.")
            first = str(selected_data_list[i]["paragraph_id"])
            first_noun_phrase = selected_data_list[i]["noun_phrase"]
            full_temp = []
            lack_temp = []
            for j in range(0, len(selected_data_list)):
                second = str(selected_data_list[j]["paragraph_id"])
                second_noun_phrase = selected_data_list[j]["noun_phrase"]
                if i == j:
                    similarity = 0
                else:
                    similarity = (vector_map.similarity(first, second) + 1) * weight / 2 + \
                        textdistance.jaccard(first_noun_phrase,
                                             second_noun_phrase) * (1 - weight)
                similarity = round(similarity, 6) - 1
                lack_temp.append(similarity)
                full_temp.append(similarity)
            full_result.append(full_temp)
            lack_result.append(lack_temp)
    return full_result, lack_result
def Jaccard_Index(Cell1, Cell2):
    # lowercase and split each string into a list of words
    Cell1 = Cell1.lower().split()
    Cell2 = Cell2.lower().split()
    # calculate the Jaccard index of the two word lists
    return jaccard(Cell1, Cell2)
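# Hypothetical usage of word-level Jaccard as in Jaccard_Index above: the
# score is the number of shared words divided by the number of words in the
# union of the two token lists.
import textdistance

a = "The quick brown fox".lower().split()
b = "the quick red fox".lower().split()
print(textdistance.jaccard(a, b))  # 3 shared words out of 5 distinct -> 0.6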
def extract_by_similarity(self, concepts, sentences, alpha=0.75, top_k=3, min_sim=0.5):
    alias2concept = {}
    for concept in concepts:
        for alias in concept.aliases:
            alias2concept[alias.lower()] = concept
    terms = set(alias2concept.keys())
    term2sents = dict()
    for sent in sentences:
        rs = sent.find_spans(*terms, ignore_case=True)
        for term in rs.keys():
            if term not in term2sents:
                term2sents[term] = set()
            term2sents[term].add(sent)
    term2emb = {
        term: sum(sent.emb() for sent in sents) / len(sents)
        for term, sents in term2sents.items()
    }
    concept2emb = {
        concept: sum(term2emb[alias.lower()] for alias in concept.aliases)
        for concept in concepts
    }

    def __cosine(vector1, vector2):
        norm = np.linalg.norm(vector1) * np.linalg.norm(vector2)
        if norm == 0:
            return 0
        cos = np.dot(vector1, vector2) / norm
        return 0.5 + 0.5 * cos

    start2scores = dict()
    for start, end in itertools.combinations(concepts, 2):
        cos = __cosine(concept2emb[start], concept2emb[end])
        start_words = set()
        for alias in start.aliases:
            start_words.update(self.name_handler.normalize(alias).split())
        end_words = set()
        for alias in end.aliases:
            end_words.update(self.name_handler.normalize(alias).split())
        jaccard = textdistance.jaccard(start_words, end_words)
        score = alpha * cos + (1 - alpha) * jaccard
        if start not in start2scores:
            start2scores[start] = set()
        start2scores[start].add((end, score))
    relations = set()
    for start, pairs in start2scores.items():
        pairs = list(sorted(pairs, key=lambda item: item[1], reverse=True))
        for end, score in pairs[:top_k]:
            if score < min_sim:
                break
            relations.add(Relation(start, end, RelType.RELATED_TO))
    return relations
def criteria_features(x, col):
    raw_text = x[col].lower()
    jd = jellyfish.jaro_distance(raw_text, crit)
    ld = jellyfish.levenshtein_distance(raw_text, crit)
    ji = textdistance.jaccard(raw_text, crit)
    sd = textdistance.sorensen(raw_text, crit)
    ro = textdistance.ratcliff_obershelp(raw_text, crit)
    return jd, ld, ji, sd, ro
def similarity(phrase1, phrase2):
    s1 = ""
    s2 = ""
    for note in phrase1:
        s1 += chr(note)
    for note in phrase2:
        s2 += chr(note)
    return jaccard(s1, s2)
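# Hypothetical usage of the note-to-character trick above: each integer note
# is mapped to a single character, so the two phrases are compared as
# character multisets and note order is ignored.
import textdistance

phrase1 = [60, 62, 64]   # made-up MIDI-style note numbers
phrase2 = [60, 62, 65]
s1 = "".join(chr(n) for n in phrase1)
s2 = "".join(chr(n) for n in phrase2)
print(textdistance.jaccard(s1, s2))  # 2 shared notes out of 4 distinct -> 0.5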
def prova():
    l = sorted(rf.keys())
    dm = get_entry(l)
    joint = {}
    for word in dm:
        joint[word] = [
            td.jaccard(rf[word].split(), dm[word].split()),
            rf[word],
            dm[word],
        ]
    return joint
def getSimilarity(self, desired, given):
    # If the given string contains desired as a substring
    if desired in given:
        return 0.9
    # Strings have to be split up into character arrays for this algorithm
    desired_attr = [char for char in desired]
    given_attr = [char for char in given]
    # Returns a number representing how similar the two strings are
    return textdistance.jaccard(desired_attr, given_attr)
def map_marie(input_data, target_data, input_bert_weights, target_bert_weights,
              string_match='edit', alpha=0.8, bert_layers=1, top_n=5):
    inp_txt2idx, inp_idx2txt = _create_txt2idx(input_data)
    tgt_txt2idx, tgt_idx2txt = _create_txt2idx(target_data)
    inp_bert_vectors = _get_bert_vectors(input_bert_weights, input_data,
                                         inp_txt2idx, bert_layers)
    tgt_bert_vectors = _get_bert_vectors(target_bert_weights, target_data,
                                         tgt_txt2idx, bert_layers)
    mapper = dict()
    for cnt, (inp_txt, inp_idx) in enumerate(inp_txt2idx.items()):
        inp_bert_vector = inp_bert_vectors[inp_idx]
        cal_dist = []
        cos_dist = [
            alpha * (_cal_cosine(inp_bert_vector, tgt_bert_vectors[tgt_idx]))
            for tgt_txt, tgt_idx in tgt_txt2idx.items()
        ]
        if string_match == 'edit':
            str_match = [
                (1 - alpha) * (1 - editdistance.eval(inp_txt, tgt_txt) /
                               max(len(inp_txt), len(tgt_txt)))
                for tgt_txt, tgt_idx in tgt_txt2idx.items()
            ]
        if string_match == 'jaccard':
            str_match = [(1 - alpha) * textdistance.jaccard(inp_txt, tgt_txt)
                         for tgt_txt, tgt_idx in tgt_txt2idx.items()]
        if string_match == 'ob':
            str_match = [(1 - alpha) *
                         textdistance.ratcliff_obershelp(inp_txt, tgt_txt)
                         for tgt_txt, tgt_idx in tgt_txt2idx.items()]
        ord2idx = [tgt_idx for _, tgt_idx in tgt_txt2idx.items()]
        cal_dist = np.add(cos_dist, str_match)
        topn_ord_idx = cal_dist.argsort()[::-1][:top_n]
        mapper[inp_idx] = [(ord2idx[idx], cal_dist[idx]) for idx in topn_ord_idx]
        if cnt % 100 == 0:
            print("...Processed %i mappings" % (cnt))
    return mapper, inp_idx2txt, tgt_idx2txt
def answer(self, msg):
    if not self.active or self.correct:
        return
    if not self.correct:
        for ans in self.a:
            ans = " ".join(ans.split()).strip().lower()
            guess = " ".join(msg.args[1].split()).strip().lower()
            if guess == ans:
                self.correct = True
                break
            elif not self.correct:
                answer = self.clean(ans)
                guess = self.clean(guess)
                if not self.correct and guess == answer:
                    self.correct = True
                    break
                elif (not self.correct and self.flexibility < 1
                      and self.flexibility > 0.5):
                    dist = textdistance.jaro_winkler(guess, answer)
                    log.debug(
                        "Jeopardy: guess: {0}, answer: {1}, length: {2}, "
                        "distance: {3}, flexibility: {4}".format(
                            guess, answer, len(answer), dist,
                            self.flexibility))
                    if dist >= self.flexibility:
                        self.correct = True
                        break
                    elif (dist < self.flexibility and "," in self.a[0]
                          or "&" in self.a[0]):
                        dist = textdistance.jaccard(guess, answer)
                        if dist >= self.flexibility:
                            self.correct = True
                            break
    if self.correct:
        if msg.nick not in self.scores:
            self.scores[msg.nick] = 0
        self.scores[msg.nick] += self.p
        if msg.nick not in self.roundscores:
            self.roundscores[msg.nick] = 0
        self.roundscores[msg.nick] += self.p
        self.unanswered = 0
        reply = self.correct_template.render(
            nick=msg.nick,
            answer=self.a[0],
            points=self.p,
            round=self.roundscores[msg.nick],
            total=self.scores[msg.nick],
        )
        self.reply(reply)
        self.correct = True
        self.answered += 1
        self.clear()
        self.newquestion()
def check_match(data):
    gr_row_no = data["gr_row_no"]
    gr_row = data["gr_row"]
    lib_rows = data["lib_rows"]
    score = int(data["score"])
    match_mode = data["match_mode"].upper().strip()
    matches = []
    for idx, lr in enumerate(lib_rows):
        auth_score = int(TD.jaccard(gr_row["auth_tok"], lr["auth_tok"]) * 100)
        title_score = int(TD.jaccard(gr_row["title_tok"], lr["title_tok"]) * 100)
        gr_comb = gr_row["auth_tok"] + gr_row["title_tok"]
        lr_comb = lr["auth_tok"] + lr["title_tok"]
        total_score = int(TD.jaccard(gr_comb, lr_comb) * 100)
        cond = False
        if match_mode == "TA":
            cond = total_score > score
        elif match_mode == "T":
            cond = title_score > score
        elif match_mode == "TTA":
            cond = title_score > score or total_score > score
        elif match_mode == "A":
            cond = auth_score > score
        else:
            raise Exception("Unsupported matching condition: " + match_mode)
        if cond:
            row = dict(
                zip(OUT_CSV_HEADER, [
                    gr_row_no, gr_row["author"], gr_row["title"], idx,
                    lr['author'], lr['title'], auth_score, title_score,
                    total_score
                ]))
            matches.append(row)
    return matches
def test_similarity_calculation(self):
    str1 = "AbstractInputMethodService provides a abstract base class for inut methods."
    str2 = "The default implementation in this abstract class returns 1.0 for all components."
    vector_map = EntityVectorModel.load(
        "mean_vector_api_paragraph.plain.txt", binary=False)
    vector1 = vector_map.compute_mean_vector(str1)
    vector2 = vector_map.compute_mean_vector(str2)
    semantic_similarity = dot(matutils.unitvec(vector1),
                              matutils.unitvec(vector2))
    print("semantic similarity is " + str(semantic_similarity))
    structure_similarity = textdistance.jaccard(str1, str2)
    print("structure similarity is " + str(structure_similarity))
def sm_features(x, col1, col2):
    if (x[col1] != x[col1]) or (x[col2] != x[col2]):  # NaN check: NaN != NaN
        jd = np.nan
        ld = np.nan
        ji = np.nan
        sd = np.nan
    else:
        raw_text1 = x[col1].lower()
        raw_text2 = x[col2].lower()
        jd = jellyfish.jaro_distance(raw_text1, raw_text2)
        ld = jellyfish.levenshtein_distance(raw_text1, raw_text2)
        ji = textdistance.jaccard(raw_text1, raw_text2)
        sd = textdistance.sorensen(raw_text1, raw_text2)
    return jd, ld, ji, sd
def match_jaccard(ee, platforms):
    for index, row in ee.iterrows():
        if row.possible_stops == '':
            subset = platforms[platforms.routes_wkd.str.contains(row.line)]
            if subset.shape[0] > 0:
                subset_stop_names = pd.DataFrame(subset.stop_name.unique(),
                                                 columns=['stop_name'])
                name_dist = [
                    textdistance.jaccard(row.station_name, y)
                    for y in subset_stop_names.stop_name
                ]
                matched_station_name = subset_stop_names.iloc[np.argmax(name_dist), 0]
                matched_stop_ids = subset[subset.stop_name == matched_station_name][['stop_id']]
                score = max(name_dist)
                if score >= 0.8:
                    ee.loc[index, 'possible_stops'] = ', '.join(matched_stop_ids.stop_id)
    return ee
def similarity(str1, str2, type):
    """
    Similarity score calculated by either the Jaccard or Sorensen method.

    :param str1: string
    :param str2: string
    :param type: ['jaccard'|'sorensen']
    :return: a float between 0 (inclusive) and 1 (inclusive)
    """
    tokens_1 = tokenize(str1)
    tokens_2 = tokenize(str2)
    if type == 'jaccard':
        return textdistance.jaccard(tokens_1, tokens_2)
    elif type == 'sorensen':
        return textdistance.sorensen_dice(tokens_1, tokens_2)
    return 0
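# Hypothetical usage of the jaccard/sorensen switch above. tokenize() is a
# project-specific helper; a plain whitespace split stands in for it here just
# to show how the two scores relate (Dice counts the overlap twice).
import textdistance

t1 = "the quick brown fox".split()
t2 = "the quick red fox".split()
print(textdistance.jaccard(t1, t2))        # 3 / 5 = 0.6
print(textdistance.sorensen_dice(t1, t2))  # 2 * 3 / (4 + 4) = 0.75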
def best_match_bert(orig_title, names_pool):
    '''
    Best matching using BERT plus some simple text distance measures.

    :param orig_title: searched occupation name
    :param names_pool: where we are searching
    :return: best match occupation name from the pool
    '''
    scores1 = []
    scores2 = []
    scores3 = []
    scores4 = []
    scores5 = []
    scores6 = []
    scores7 = []
    # 1, 2, 8 - scores > 1
    title = pre_process(orig_title)
    bb = BertEncoder()
    l1 = bb.bert_encoder(title)
    for name in names_pool:
        proc_name = pre_process(name)
        l2 = bb.bert_encoder(proc_name)
        l1 = np.array(l1).reshape(-1, 1)
        l2 = np.array(l2).reshape(-1, 1)
        scores1.append(np.sum(euclidean_distances(l1, l2)))
        scores2.append(np.sum(paired_distances(l1, l2)))
        scores3.append(np.sum(cosine(l1, l2)))
        scores4.append(np.sum(naive_metric(title, proc_name)))
        scores5.append(np.sum(textdistance.jaccard(title, proc_name)))
        scores6.append(np.sum(textdistance.sorensen_dice(title, proc_name)))
        scores7.append(
            np.sum(textdistance.damerau_levenshtein(title, proc_name)))
    chosen_idx = [
        names_pool[np.argmin(scores1)], names_pool[np.argmin(scores2)],
        names_pool[np.argmax(scores3)], names_pool[np.argmax(scores4)],
        names_pool[np.argmax(scores5)], names_pool[np.argmax(scores6)],
        names_pool[np.argmin(scores7)]
    ]
    print("-----------------------------------------------------------------------------------------")
    print("All title similarity candidates \n", chosen_idx)
    c = Counter(chosen_idx)
    frequency = c.most_common(1)[0][1]
    if frequency < 3:
        return names_pool[np.argmax(scores3)]
    return c.most_common(1)[0][0]
def compare(self, str1, str2):
    if self.debug:
        self.log("jaccard comparison")
    self.start_time()
    self.result.distance = jaccard(str1, str2)
    self.end_time()
    self.result.nos = max(len(str1), len(str2))
    self.result.threshold = 90
    self.result.similarity = self.result.distance * 100
    return self.result
def single_compare(bsl, our, j, label, count, id2vocab, id2response_dict):
    # golden data
    golden_context = [[id2vocab[w.item()] for w in sent if w != 0]
                      for sent in our[0]['context'][j] if not all(sent == 0)]
    golden_query = [id2vocab[w.item()] for w in our[0]['query'][0] if w != 0]
    golden_response = id2response_dict[our[0]['response_id'][j].item()]
    golden_profile = vector2profile(our[0]['profile'][j])
    golden_incomplete_profile = vector2profile(our[0]['incomplete_profile'][j])
    # prediction
    our_pred_response = id2response_dict[our[1]['pred_response_id'][j].item()]
    # our_pred_profile = vector2profile(our[1]['pred_profile'][j])
    our_pred_profile = vector2profile(
        pred_profile_to_onehot_vec(
            our[1]['pred_profile_prob'][j].unsqueeze(0)).squeeze(0))
    our_pred_profile_prob = vector2probdict(our[1]['pred_profile_prob'][j])
    bsl_pred_response = id2response_dict[bsl[1]['pred_response_id'][j].item()]
    # similarity
    # similarity = round(textdistance.hamming.normalized_similarity(
    #     tokenizer(bsl_pred_response, type=None),
    #     tokenizer(our_pred_response, type=None)), 3)
    similarity = round(
        textdistance.jaccard(tokenizer(bsl_pred_response, type='word'),
                             tokenizer(our_pred_response, type='word')), 3)
    # similarity = round(textdistance.cosine(bsl_pred_response, our_pred_response), 3)
    print_str = ''
    if len(golden_incomplete_profile) < len(golden_profile) and len(
            golden_incomplete_profile) != 0:
        print_str = '%s %s %s\n' % (count, '=' * 50, label)
        print_str += 'CONTEXT:\n'
        for sent in golden_context:  # [-5:] to avoid too much context
            print_str += '%s: %s\n' % (''.join(sent[-2:]), ' '.join(sent[:-2]))
            # if sent[-2] == '$kb' and (sent[0] in our_pred_response or sent[0] in bsl_pred_response):
            #     pass  # restaurant already mentioned in the response
            # else:
            #     print_str += '%s: %s\n' % (''.join(sent[-2:]), ' '.join(sent[:-2]))
        print_str += 'QUERY: %s\n' % ' '.join(golden_query)
        print_str += 'INCOMPLETE:%s\n COMPLETE:%s\n PRED:%s\n PRED_PROB:%s\n' % (
            golden_incomplete_profile, golden_profile, our_pred_profile,
            our_pred_profile_prob)
        print_str += 'GOLD: %s\n BSL: %s\n OUR: %s\n' % (
            golden_response, bsl_pred_response, our_pred_response)
        print_str += 'SIM: %s\n' % similarity
        print(print_str)
    # if label == 'GOOD':
    #     fw_good.write(print_str)
    # elif label == 'BAD':
    #     fw_bad.write(print_str)
    return similarity, print_str
def jaccardDist(name1, name2):
    """
    info: calculate the jaccard distance between the two strings name1, name2
    input: name1:String, name2:String
    output: distance value dist (real number within [0, 1])
    """
    """
    dist = 0
    for i in range(len(name1)):
        if not (name1[i] == name2[i]):
            dist += 1
    # still have to calculate the real Jaccard distance
    """
    dist = 1 - td.jaccard(name1, name2)
    return dist
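# A quick, self-contained check of the conversion in jaccardDist above:
# textdistance.jaccard returns a similarity, so 1 - similarity is a distance.
# By default the comparison is over character multisets, so anagrams come out
# as identical.
import textdistance as td

print(1 - td.jaccard("martha", "martha"))  # identical strings -> distance 0.0
print(1 - td.jaccard("martha", "marhta"))  # same characters, different order -> still 0.0
print(1 - td.jaccard("abc", "xyz"))        # nothing in common -> distance 1.0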
def compare(s_inp, s_out):
    '''nlp = spacy.load("en_core_web_sm")
    str_inp = nlp(s_inp)
    srt_inp = " ".join([token.lemma_ for token in str_inp])
    #print(str_inp)
    #inp_lower = str_inp.lower()
    print("Lower String Input: {}".format(str_inp))
    str_out = nlp(s_out)
    srt_out = " ".join([token.lemma_ for token in str_out])
    #print(str_out)
    #out_lower = str_out.lower()
    print("Lower String Output: {}".format(str_out))
    #print("Inside Compare")
    #print("Str1: ", s_inp)
    #print("Str2: ", s_out)'''
    # Jaccard index
    jacc = textdistance.jaccard(s_inp, s_out)
    print("jaccard: ", jacc)
    # Sorensen
    soren = textdistance.sorensen(s_inp, s_out)
    print("Sorensen: ", soren)
    # Tversky value
    tvr = textdistance.tversky(s_inp, s_out)
    print("Tversky: ", tvr)
    # Overlap index
    overlap = textdistance.overlap(s_inp, s_out)
    print("overlap_coefficient: ", overlap)
    # Tanimoto distance
    # tanimoto_distance = textdistance.tanimoto(str_inp, str_out)
    # print("Tanimoto: ", tanimoto_distance)
    res = (jacc + soren + tvr + overlap) / 4
    if res == 0:
        pass
    else:
        lst.append(res)
    print("Result: {}".format(res))
    # if (res >= 0.6):
def fuzzy_match(term, term_list):
    best_match = 0
    if term == 'nan':
        return False
    if 'unnamed' in term:
        return False
    for t in term_list:
        match = textdistance.jaccard(term, t)
        # match = textdistance.damerau_levenshtein.normalized_similarity(term, t)
        if match > best_match:
            best_match = match
        if best_match == 1:
            print("Found match for: ", term, " with score of: ", best_match)
            return True  # don't need to keep searching once an exact match is found
    if best_match >= 0.8:  # best match is above a threshold for similarity - can modify this number
        print("Found match for: ", term, " with score of: ", best_match)
        return True
    if best_match >= 0.6:
        print("Didn't find match for: ", term, " with score of: ", best_match)
    return False
def get_similarity_score(title, videos, movie_description, keywords=None):
    '''
    Main method that takes the list of video data from a YouTube query and
    sorts the results based on a set of similarities defined by the Jaccard
    index and description keywords.

    Parameters
    ==========
    title: the title of the movie
    videos: list of youtube video objects
    keywords: list of top-5 cast, top-5 characters, and director(s)
    movie_description: string of description from tmdb

    Return
    ==========
    list of YouTubeVideo objects with similarity score
    '''
    youtube_videos = []
    for video in videos:
        # first, all items are converted to lowercase and tokenized
        lowercase_title = title.lower()
        movie_desc = strip_characters(movie_description.lower()).split()
        trailer_desc = strip_characters(video.description.lower()).split()
        movie_title = (strip_characters(lowercase_title) + " official trailer").split()
        trailer_title = strip_characters(video.title.lower()).split()
        similarity_score = is_clip(trailer_title, lowercase_title.split()) * 0.5 * (
            textdistance.sorensen_dice(movie_desc, trailer_desc) +
            textdistance.jaccard(movie_title, trailer_title))
        video.set_similarity_score(similarity_score)
        youtube_videos.append(video)
    return youtube_videos
def super_similiar(es1, es2, sim_factor=0.8, sim_box=0.6):
    """Check if two elements are super similar by text (Jaccard) and
    visually (compare bbox)."""
    text1 = only_text(es1)
    text2 = only_text(es2)
    points1 = only_points(es1)
    points2 = only_points(es2)
    if min(len(points1), len(points2)) < 4:
        return False
    logger.debug("points")
    logger.debug(points1)
    logger.debug(points2)
    j_sim = jaccard(text1, text2)
    b_sim = sim_bbox(points1, points2)
    logger.debug(f"footer/header sims {j_sim} {b_sim}")
    return j_sim > sim_factor and b_sim > sim_box
def calculate_similarity_matrix(selected_data_list, vector_map, weight):
    full_result = []
    id_list = []
    if selected_data_list is not None:
        vector_matrix = calculate_matrix(selected_data_list, vector_map).tolist()
        for i in range(0, len(selected_data_list)):
            first_noun_phrase = selected_data_list[i]["noun_phrase"]
            id_list.append(selected_data_list[i]["id"])
            full_temp = []
            for j in range(0, len(selected_data_list)):
                second_noun_phrase = selected_data_list[j]["noun_phrase"]
                if i == j:
                    similarity = 0
                else:
                    similarity = (vector_matrix[i][j] + 1) * weight / 2 + \
                        textdistance.jaccard(first_noun_phrase,
                                             second_noun_phrase) * (1 - weight)
                similarity = -round(similarity, 6) + 1
                full_temp.append(similarity)
            full_result.append(full_temp)
    return full_result, id_list