def _score_topics(filtered_content, topics):
    """Map each topic to its Ratcliff-Obershelp similarity with ``filtered_content``."""
    return {
        topic: textdistance.ratcliff_obershelp(filtered_content, topic)
        for topic in topics
    }


def topicsPrint(URL, filtered_content):
    """Return the topic that is most similar to ``filtered_content``.

    The topics are read from the ``names`` list of the JSON document returned
    by ``URL`` (requested with the GitHub ``mercy-preview`` Accept header).
    When the request fails or yields no topics, the user is prompted to type
    candidate topics until ``exit`` is entered, and the best of those is
    returned instead.

    Args:
        URL: Endpoint to query for the topic names.
        filtered_content: Text every candidate topic is compared against.

    Returns:
        The best-scoring topic, or ``None`` when neither the endpoint nor the
        user supplied any candidate.
    """
    headers = {
        'Accept': 'application/vnd.github.mercy-preview+json',
        'Authorization': [USERNAME],
    }
    req = requests.get(URL, headers=headers)

    # Only parse the body of a successful response: error payloads are not
    # guaranteed to contain 'names' (or to be JSON at all).
    content = []
    if req.status_code == requests.codes.ok:
        content = req.json().get('names', [])

    if content:
        scores = _score_topics(filtered_content, content)
        return max(scores, key=scores.get)

    print('Content was not found')
    content = []
    while True:
        currContent = input(
            "Enter a topic for this repo or 'exit' to submit current suggestions: ")
        if currContent == "exit":
            break
        content.append(currContent)

    if not content:
        # Nothing to rank; max() on an empty mapping would raise ValueError.
        return None

    scores = _score_topics(filtered_content, content)
    print(scores)
    return max(scores, key=scores.get)
def _passes_selected_metrics(word, candidate, limit, first_flag, second_flag):
    """Evaluate all seven metrics for ``candidate`` and AND together the ones
    selected by the 1-based digits in ``limit``."""
    verdicts = [
        metric.qualified("jw_sim",
                         textdistance.jaro_winkler(word, candidate),
                         first_flag, second_flag),
        metric.qualified("levenshtein_sim",
                         textdistance.levenshtein.normalized_similarity(word, candidate),
                         first_flag, second_flag),
        metric.qualified("ro_sim",
                         textdistance.ratcliff_obershelp(word, candidate),
                         first_flag, second_flag),
        metric.qualified("needleman_wunsch",
                         textdistance.needleman_wunsch(word, candidate),
                         first_flag, second_flag),
        metric.qualified("smith_waterman",
                         textdistance.smith_waterman(word, candidate),
                         first_flag, second_flag),
        metric.qualified("gotoh",
                         textdistance.gotoh(word, candidate),
                         first_flag, second_flag),
        metric.qualified("strcmp95",
                         textdistance.strcmp95(word, candidate),
                         first_flag, second_flag),
    ]
    accepted = True
    for digit in limit:
        accepted &= verdicts[int(digit) - 1]
    return accepted


def filter_blends(word, first_blend, second_blend, limit="1237"):
    """Keep only the blend candidates that pass the selected similarity checks.

    Each candidate is scored against ``word`` with seven textdistance metrics
    and each score is handed to ``metric.qualified``. The two trailing flags
    passed to ``metric.qualified`` are ``(True, False)`` for ``first_blend``
    candidates and ``(False, True)`` for ``second_blend`` candidates.

    Args:
        word: Reference word.
        first_blend: Candidates for the first blend component.
        second_blend: Candidates for the second blend component.
        limit: String of 1-based digits choosing which of the seven metrics
            (jaro-winkler, levenshtein, ratcliff-obershelp, needleman-wunsch,
            smith-waterman, gotoh, strcmp95) must all be satisfied.

    Returns:
        A ``(refined_first_blend, refined_second_blend)`` tuple of lists.
    """
    kept_first = [
        cand for cand in first_blend
        if _passes_selected_metrics(word, cand, limit, True, False)
    ]
    kept_second = [
        cand for cand in second_blend
        if _passes_selected_metrics(word, cand, limit, False, True)
    ]
    return kept_first, kept_second
def valid_song(*input):
    """Tell whether two (track, artist) descriptions refer to the same song.

    The first two positional values are compared with each other, and so are
    the third and fourth, after lower-casing and stripping whitespace. Both
    Ratcliff-Obershelp scores must reach their minimum (0.5 for the track
    pair, 0.5 for the artist pair).

    Returns:
        ``True`` when both pairs are similar enough, ``False`` otherwise.
    """
    cleaned = [value.lower().strip() for value in input]
    track_minimum, artist_minimum = 0.5, 0.5

    track_score = ratcliff_obershelp(cleaned[0], cleaned[1])
    artist_score = ratcliff_obershelp(cleaned[2], cleaned[3])

    return track_score >= track_minimum and artist_score >= artist_minimum
def add_query_features(df, inc, exc, k1list, k2list):
    """Return a copy of ``df`` with query-level summary features added.

    Every added column holds one scalar broadcast to all rows.

    Args:
        df: Source dataframe; it is copied, not modified.
        inc: First query string (compared against ``exc``).
        exc: Second query string.
        k1list: Sequence of strings; contributes ``k1_count`` and ``k1_max``.
        k2list: Sequence of strings; contributes ``k2_count`` and ``k2_max``.

    Returns:
        The copied dataframe with the columns ``k1_count``, ``k2_count``,
        ``k1_max``, ``k2_max`` (length of the longest entry, 0 for an empty
        list), ``inc_jaro_exc``, ``inc_lev_exc``, ``inc_ji_exc``,
        ``inc_sd_exc`` and ``inc_ro_exc``.
    """
    df_new = df.copy()

    df_new['k1_count'] = len(k1list)
    df_new['k2_count'] = len(k2list)
    # default=0 keeps an empty keyword list from raising ValueError in max().
    df_new['k1_max'] = max(map(len, k1list), default=0)
    df_new['k2_max'] = max(map(len, k2list), default=0)

    df_new['inc_jaro_exc'] = jellyfish.jaro_distance(inc, exc)
    df_new['inc_lev_exc'] = jellyfish.levenshtein_distance(inc, exc)
    df_new['inc_ji_exc'] = textdistance.jaccard(inc, exc)
    df_new['inc_sd_exc'] = textdistance.sorensen(inc, exc)
    df_new['inc_ro_exc'] = textdistance.ratcliff_obershelp(inc, exc)

    return df_new
def knn_search(self, word):
    """Find the vocabulary entry closest to ``word`` and return its embedding.

    Every token of ``self.vocabulary`` is scored against ``word`` with the
    similarity selected by ``self.algorithm`` ("levenshtein", "jaccard" or
    "ratcliff_obershelp"); the first token with the highest score wins.

    Arguments:
        word {string} -- Input word.

    Returns:
        The element of ``self.embeds`` at the position of the best token.

    Raises:
        ValueError -- when no score was produced (empty vocabulary or an
        algorithm name that is not one of the three above).
    """
    vocabulary = self.vocabulary
    algorithm = self.algorithm

    if algorithm == "levenshtein":
        scorer = textdistance.levenshtein.normalized_similarity
    elif algorithm == "jaccard":
        scorer = textdistance.jaccard
    elif algorithm == "ratcliff_obershelp":
        scorer = textdistance.ratcliff_obershelp
    else:
        scorer = None

    if scorer is None:
        similarities = []
    else:
        similarities = [scorer(word, token) for token in vocabulary]

    best_position = similarities.index(max(similarities))
    return self.embeds[best_position]
def search(self, word, limit=0.7):
    """Look up a close word in ``self.words`` using an acceptance threshold.

    Tokens are scored in order with the similarity chosen by
    ``self.algorithm`` ("levenshtein", "jaccard" or "ratcliff_obershelp").
    The first token whose score exceeds ``limit`` is accepted immediately;
    if none does, the first token with the highest score is used.

    Arguments:
        word {string} -- Input word.

    Keyword Arguments:
        limit {float} -- Score a token must exceed to be accepted at once,
            in (0, 1) (default: {0.7})

    Returns:
        The element of ``self.embeds`` at the position of the chosen token.

    Raises:
        ValueError -- when no score was produced (no words, or an algorithm
        name that is not one of the three above).
    """
    words = self.words
    algorithm = self.algorithm

    if algorithm == "levenshtein":
        scorer = textdistance.levenshtein.normalized_similarity
    elif algorithm == "jaccard":
        scorer = textdistance.jaccard
    elif algorithm == "ratcliff_obershelp":
        scorer = textdistance.ratcliff_obershelp
    else:
        scorer = None

    seen_scores = []
    if scorer is not None:
        for position, token in enumerate(words):
            current = scorer(word, token)
            if current > limit:
                # Good enough: stop scanning and accept this token.
                return self.embeds[position]
            seen_scores.append(current)

    return self.embeds[seen_scores.index(max(seen_scores))]
def compare(self, statement, other_statement):
    """Return the Ratcliff-Obershelp similarity of two statements' texts.

    Both ``.text`` attributes are lower-cased before the comparison, so the
    result is case-insensitive.
    """
    lowered_pair = [
        str(item.text.lower()) for item in (statement, other_statement)
    ]
    return ratcliff_obershelp(*lowered_pair)
def criteria_features(x, col):
    """Score the lower-cased text in ``x[col]`` against the global ``crit``.

    Returns:
        A 5-tuple ``(jaro, levenshtein, jaccard, sorensen,
        ratcliff_obershelp)``; the first two come from jellyfish and the
        rest from textdistance.
    """
    text = x[col].lower()
    scorers = (
        jellyfish.jaro_distance,
        jellyfish.levenshtein_distance,
        textdistance.jaccard,
        textdistance.sorensen,
        textdistance.ratcliff_obershelp,
    )
    return tuple(scorer(text, crit) for scorer in scorers)
def sm_features(x, col1, col2):
    """Return the Ratcliff-Obershelp similarity of two text fields of ``x``.

    A value that is not equal to itself (i.e. NaN) in either field makes the
    result ``np.nan``; otherwise both texts are lower-cased and compared.
    """
    first = x[col1]
    if first != first:  # NaN is the only value that differs from itself
        return np.nan

    second = x[col2]
    if second != second:
        return np.nan

    return textdistance.ratcliff_obershelp(first.lower(), second.lower())
def EvalSequences(df):
    """Score each row's stemmed pair and keep only the dissimilar rows.

    A ``seq_score`` column (Ratcliff-Obershelp similarity of ``ST_stemmed``
    and ``Item_stemmed``, scaled to 0-100) is written onto ``df`` itself.
    Rows scoring below 40 are then returned with a fresh 0..n-1 index.
    """
    stemmed_pairs = df[["ST_stemmed", "Item_stemmed"]]
    df["seq_score"] = stemmed_pairs.apply(
        lambda pair: textdistance.ratcliff_obershelp(*pair), axis=1)
    df["seq_score"] = df["seq_score"] * 100

    low_scoring = df[df["seq_score"] < 40].reset_index(drop=True)
    print("eval sequences done")
    return low_scoring
def RatcliffObershelp(str1, match_against):
    """Pick the candidate most similar to ``str1``.

    Returns:
        A two-item list ``[candidate, score]`` for the first candidate with
        the highest Ratcliff-Obershelp score, or ``['', 0]`` when no
        candidate scores above 0.
    """
    winner = ['', 0]
    scored = [
        [candidate, textdistance.ratcliff_obershelp(str1, candidate)]
        for candidate in match_against
    ]
    for pair in scored:
        _, score = pair
        if score > winner[1]:
            winner = pair
    return winner
def map_marie(input_data,
              target_data,
              input_bert_weights,
              target_bert_weights,
              string_match='edit',
              alpha=0.8,
              bert_layers=1,
              top_n=5):
    """Map every input text to its ``top_n`` closest target texts.

    The score of an (input, target) pair is
    ``alpha * cosine(bert vectors) + (1 - alpha) * string similarity``.

    Args:
        input_data: Input texts, indexed through ``_create_txt2idx``.
        target_data: Target texts, indexed the same way.
        input_bert_weights: Passed to ``_get_bert_vectors`` for the inputs.
        target_bert_weights: Passed to ``_get_bert_vectors`` for the targets.
        string_match: ``'edit'`` (normalised edit-distance similarity),
            ``'jaccard'`` or ``'ob'`` (Ratcliff-Obershelp).
        alpha: Weight of the cosine term; ``1 - alpha`` weights the string term.
        bert_layers: Forwarded to ``_get_bert_vectors``.
        top_n: Number of best targets kept per input.

    Returns:
        ``(mapper, inp_idx2txt, tgt_idx2txt)`` where ``mapper[input_idx]`` is
        a list of ``(target_idx, score)`` pairs, best first.

    Raises:
        ValueError: If ``string_match`` is not one of the supported names.
    """
    # Validate first: an unknown mode used to leave ``str_match`` unbound
    # (NameError) or silently reuse the previous iteration's values.
    if string_match not in ('edit', 'jaccard', 'ob'):
        raise ValueError(
            "string_match must be 'edit', 'jaccard' or 'ob', got %r"
            % (string_match,))

    inp_txt2idx, inp_idx2txt = _create_txt2idx(input_data)
    tgt_txt2idx, tgt_idx2txt = _create_txt2idx(target_data)

    inp_bert_vectors = _get_bert_vectors(input_bert_weights, input_data,
                                         inp_txt2idx, bert_layers)
    tgt_bert_vectors = _get_bert_vectors(target_bert_weights, target_data,
                                         tgt_txt2idx, bert_layers)

    # The target ordering is the same for every input, so build it once.
    tgt_texts = list(tgt_txt2idx.keys())
    ord2idx = list(tgt_txt2idx.values())

    mapper = dict()
    for cnt, (inp_txt, inp_idx) in enumerate(inp_txt2idx.items()):
        inp_bert_vector = inp_bert_vectors[inp_idx]

        cos_dist = [
            alpha * (_cal_cosine(inp_bert_vector, tgt_bert_vectors[tgt_idx]))
            for tgt_idx in ord2idx
        ]

        if string_match == 'edit':
            # ``or 1`` avoids dividing by zero when both strings are empty
            # (edit distance 0 -> similarity 1).
            str_match = [
                (1 - alpha) *
                (1 - editdistance.eval(inp_txt, tgt_txt) /
                 (max(len(inp_txt), len(tgt_txt)) or 1))
                for tgt_txt in tgt_texts
            ]
        elif string_match == 'jaccard':
            str_match = [(1 - alpha) * textdistance.jaccard(inp_txt, tgt_txt)
                         for tgt_txt in tgt_texts]
        else:  # 'ob'
            str_match = [(1 - alpha) *
                         textdistance.ratcliff_obershelp(inp_txt, tgt_txt)
                         for tgt_txt in tgt_texts]

        cal_dist = np.add(cos_dist, str_match)
        # argsort is ascending; reverse to get the highest scores first.
        topn_ord_idx = cal_dist.argsort()[::-1][:top_n]
        mapper[inp_idx] = [(ord2idx[idx], cal_dist[idx])
                           for idx in topn_ord_idx]

        if cnt % 100 == 0:
            print("...Processed %i mappings" % (cnt))

    return mapper, inp_idx2txt, tgt_idx2txt
def is_brand(word: str, ignore_keywords: list, score=0.70):
    """Check whether ``word`` matches one of the known brand keywords.

    A keyword (lower-cased) matches when both ``compare(word, keyword)``
    exceeds ``score * 100`` and the Ratcliff-Obershelp similarity of the
    keyword and ``word`` exceeds ``score``.

    Args:
        word: Word to test.
        ignore_keywords: Brand keywords to test against.
        score: Similarity threshold; ``compare`` is checked against
            ``score * 100``.

    Returns:
        True when at least one keyword matches, i.e. ``word`` is a brand.
    """
    lowered_keywords = (keyword.lower() for keyword in ignore_keywords)
    return any(
        compare(word, keyword) > score * 100
        and textdistance.ratcliff_obershelp(keyword, word) > score
        for keyword in lowered_keywords
    )
def ratcliff_obershelp_sim_of_blends():
    """Plot Ratcliff-Obershelp similarities for the blends in data/blends.txt.

    Each line of the file is split on whitespace; the first token is compared
    with the second (red triangles) and with the third (blue squares). The
    x axis is the zero-based line number.
    """
    first_scores, second_scores = [], []
    with open("data/blends.txt", 'r') as f:
        for line in f:
            tokens = line.split()
            origin, first, second = tokens[0], tokens[1], tokens[2]
            first_scores.append(textdistance.ratcliff_obershelp(origin, first))
            second_scores.append(textdistance.ratcliff_obershelp(origin, second))

    x = np.arange(len(first_scores))
    plt.plot(x, np.array(first_scores),
             color="r", linestyle="-", marker="^", linewidth=1)
    plt.plot(x, np.array(second_scores),
             color="b", linestyle="-", marker="s", linewidth=1)

    plt.xlabel("x")
    plt.ylabel("y")
    plt.title("ratcliff-obershelp similarity", fontsize=12, color='g')
    print("# first blend: 0.35 ~ 0.85\nsecond blend: 0.45 ~ 0.95")
    plt.show()
def get_skill_header(headers, skills):
    """Pick the header that best matches any of ``skills`` and remove it.

    Each header is given the highest Ratcliff-Obershelp score it reaches
    against any skill; the first header with the top score overall is
    removed from ``headers`` (in place) and returned.

    Raises:
        ValueError: When ``headers`` or ``skills`` is empty.
    """
    best_per_header = []
    if len(skills) > 0:
        for header in headers:
            top_score = max(
                textdistance.ratcliff_obershelp(header, skill)
                for skill in skills)
            best_per_header.append((header, top_score))

    winner, _ = max(best_per_header, key=lambda pair: pair[1])
    headers.remove(winner)
    return winner
def concordance_search(tm_objects, searchCon, matchRate, search_lang):
    """Search translation memories for segments matching ``searchCon``.

    Stop words are removed from the query and from every source segment.
    Each query token found verbatim in a segment counts 1; a token with a
    close match (difflib cutoff 0.85) counts its Ratcliff-Obershelp
    similarity to that match. The sum is divided by the larger token count
    and expressed as an integer percentage.

    Args:
        tm_objects: Objects exposing ``file_url``, ``s_lang``, ``t_lang``
            and ``name``; entries whose file is missing are skipped.
        searchCon: Query text.
        matchRate: Minimum percentage a segment must reach to be returned.
        search_lang: Unused; kept for interface compatibility.

    Returns:
        A list of dicts with ``source``, ``target``, ``tm_name`` and
        ``match_rate``, sorted with ``compare_matchrate`` in reverse order.
    """
    out_sequences = []
    q_tokens = removeStopwords(searchCon).split()

    for tm_object in tm_objects:
        tm_url = os.path.join(settings.MEDIA_ROOT, tm_object.file_url.name)
        if not os.path.isfile(tm_url):
            continue
        tm_name = tm_object.name

        # ``with`` guarantees the TMX file is closed, even if parsing fails.
        with open(tm_url, 'rb') as fin:
            tmx_file = tmxfile(fin, tm_object.s_lang, tm_object.t_lang)
            for node in tmx_file.unit_iter():
                sequence = node.getsource()
                s_tokens = removeStopwords(sequence).split()

                total = 0
                for q_token in q_tokens:
                    if q_token in s_tokens:
                        total += 1
                        continue
                    matched = difflib.get_close_matches(q_token,
                                                        s_tokens,
                                                        n=1,
                                                        cutoff=0.85)
                    if matched:
                        total += float(
                            textdistance.ratcliff_obershelp(
                                q_token, matched[0]))

                # Both token lists can be empty (e.g. stop words only);
                # treat that as a 0% match instead of dividing by zero.
                longest = max(len(s_tokens), len(q_tokens))
                average_rate = int(total / longest * 100) if longest else 0

                if average_rate >= matchRate:
                    out_sequences.append({
                        'source': sequence,
                        'target': node.gettarget(),
                        'tm_name': tm_name,
                        'match_rate': average_rate
                    })

    out_sequences.sort(key=compare_matchrate, reverse=True)
    return out_sequences
def get_otherHeaders(headers, otherHeaders):
    """Return the headers that closely match any entry of ``otherHeaders``.

    A header is chosen when its best Ratcliff-Obershelp score against the
    entries of ``otherHeaders`` is greater than 0.7.

    Returns:
        A set of the chosen headers (empty when ``otherHeaders`` is empty).
    """
    chosen = set()
    if len(otherHeaders) == 0:
        return chosen

    for header in headers:
        best_score = max(
            textdistance.ratcliff_obershelp(header, known)
            for known in otherHeaders)
        if best_score > 0.7:
            chosen.add(header)
    return chosen
def similarity(type, a, b):
    """Compute a string similarity between ``a`` and ``b``.

    Args:
        type: Metric name: 'hamming', 'levenshtein', 'jaro_winkler',
            'jaccard', 'sorensen' or 'ratcliff_obershelp'.
        a: First string, e.g. "John".
        b: Second string, e.g. "John Snow".

    Returns:
        The similarity score. 'jaccard' and 'sorensen' work on the
        whitespace-separated tokens of the inputs; the other metrics work
        on the raw strings. An unrecognised ``type`` yields ``None``.
    """
    if type in ('jaccard', 'sorensen'):
        tokens_1, tokens_2 = a.split(), b.split()
        if type == 'jaccard':
            return textdistance.jaccard(tokens_1, tokens_2)
        return textdistance.sorensen(tokens_1, tokens_2)

    if type == 'hamming':
        return textdistance.hamming.normalized_similarity(a, b)
    if type == 'levenshtein':
        return textdistance.levenshtein.normalized_similarity(a, b)
    if type == 'jaro_winkler':
        return textdistance.jaro_winkler(a, b)
    if type == 'ratcliff_obershelp':
        return textdistance.ratcliff_obershelp(a, b)
    return None
def comparar_textos(self, str1, str2, metodo):
    """Print the percentage similarity of two texts for the chosen method.

    Supported values of ``metodo`` are "jaccard", "levenshtein" (single
    words only, compared case-insensitively) and "ratcliff_obershelp".
    An explanatory message is printed instead of a score when the method is
    unknown or when Levenshtein is asked to compare multi-word texts.
    Nothing is returned.
    """
    if metodo == "jaccard":
        similaridade = 100 * textdistance.jaccard(str1, str2)
    elif metodo == "levenshtein":
        if len(str1.split()) > 1 or len(str2.split()) > 1:
            print(
                "A similaridade pelo método de Levenshtein pode comparar apenas palavras, não textos"
            )
            return
        similaridade = 100 * textdistance.levenshtein.normalized_similarity(
            str1.lower(), str2.lower())
    elif metodo == "ratcliff_obershelp":
        similaridade = 100 * textdistance.ratcliff_obershelp(str1, str2)
    else:
        print("O método informado não está implementado!")
        return

    print(
        "A similaridade pelo método %s entre os 2 textos informados é de %.2f %%"
        % (metodo, similaridade))
def fakeTextDetect(request, format=None):
    """Rate how likely the submitted text is to be fake news.

    The request payload is validated with ``TextSerializer``; invalid input
    gets a 400 response carrying the serializer errors. Otherwise the
    ``fake_text`` field is compared (Ratcliff-Obershelp) with every stored
    ``FakeText`` row. If the best similarity is above 50%, the response
    includes that row's two feedback fields; otherwise only a description
    is returned. Both outcomes use HTTP 200.
    """
    print(request.data)
    serializer = TextSerializer(data=request.data)

    if not serializer.is_valid():
        print(serializer.errors)
        return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)

    best_similarity = -1
    best_feedback = ("", "")
    for record in FakeText.objects.all():
        current = textdistance.ratcliff_obershelp(
            serializer.data['fake_text'], record.fake_text)
        if current > best_similarity:
            best_similarity = current
            best_feedback = (record.feedback_one, record.feedback_two)

    print(best_similarity)

    # More than 50% similar to a known fake text: most likely fake news.
    if best_similarity * 100 > 50:
        payload = {
            'Description': 'Strong Likeley hood of fake news.',
            'Feedback_1': best_feedback[0],
            'Feedback_2': best_feedback[1]
        }
    else:
        payload = {'Description': 'Less Likeley hood of fake news.'}
    return Response(payload, status=status.HTTP_200_OK)
# Build one report entry comparing `smiles` with `atp` and flush it to `res`.
# Each metric is written as <label><value><nl>; the labels (dl, lev, over,
# lcsseq, lcsstr, gest) are strings defined earlier in the script.
diff = abs(len(smiles) - len(atp))
lns = [
    name, nl,
    i[-1], nl,
    dl, str(td.damerau_levenshtein(smiles, atp)), nl,
    lev, str(td.levenshtein(smiles, atp)), nl,
    over, str(td.overlap(smiles, atp)), nl,
    # The two LCS metrics return the common sequence itself; report its length.
    lcsseq, str(len(td.lcsseq(smiles, atp))), nl,
    lcsstr, str(len(td.lcsstr(smiles, atp))), nl,
    gest, str(td.ratcliff_obershelp(smiles, atp)), nl,
    nl,
    '________________________________', nl,
]
res.writelines(lns)
res.close()
# Public method name -> name of the textdistance algorithm that implements it.
_SEQ_DISTANCE_ALGORITHMS = {
    "hamming": "hamming",
    "levenshtein": "levenshtein",
    "damerau_lev": "damerau_levenshtein",
    "j-winkler": "jaro_winkler",
    "smith-waterman": "smith_waterman",
    "jaccard": "jaccard",
    "sorensen-dice": "sorensen_dice",
    "tversky": "tversky",
    "tanimoto": "tanimoto",
    "cosine": "cosine",
    "ratcliff": "ratcliff_obershelp",
    "bwt": "bwtrle_ncd",
}


def Seq_StringDistance(str_seq, str_ref, method="hamming"):
    """Compare every string of ``str_seq`` with ``str_ref``.

    Args:
        str_seq: Iterable of strings to score.
        str_ref: Reference string each item is compared against.
        method: One of "hamming", "levenshtein", "damerau_lev", "j-winkler",
            "smith-waterman", "jaccard", "sorensen-dice", "tversky",
            "tanimoto", "cosine", "ratcliff" or "bwt".

    Returns:
        A list with one textdistance result per item of ``str_seq``, or
        ``None`` when ``method`` is not recognised.
    """
    # Look the method up by value. The previous ``method is "..."`` checks
    # compared object identity, which only works for interned literals and
    # fails for strings built at run time (e.g. read from a config file).
    algorithm_name = _SEQ_DISTANCE_ALGORITHMS.get(method)
    if algorithm_name is None:
        return None
    return [
        getattr(textdistance, algorithm_name)(item, str_ref)
        for item in str_seq
    ]
# Compare every .txt file of input_txt_a with every .txt file of input_txt_b
# using five similarity metrics, print the scores and save them as CSV.
result_array.append(
    ['file_A', 'file_B', 'hamming_normalized', 'levenshtein_normalized',
     'jaro_winkler', 'ratcliff_obershelp', 'jaccard'])

for filename_a in os.listdir('/app/input_txt_a'):
    if not filename_a.endswith(".txt"):
        continue
    path_a = '/app/input_txt_a' + '/' + filename_a
    with open(path_a, 'r') as file:
        data_a = file.read()

    for filename_b in os.listdir('/app/input_txt_b'):
        if not filename_b.endswith(".txt"):
            continue
        path_b = '/app/input_txt_b' + '/' + filename_b
        with open(path_b, 'r') as file:
            data_b = file.read()

        print('(A: ' + filename_a + ') VS (B: ' + filename_b + ')')

        hamming_normalized = round(
            (textdistance.hamming.normalized_similarity(data_b, data_a)), 2)
        print(' Hamming percent normalized similarity: ' + str(hamming_normalized))

        levenshtein_normalized = round(
            (textdistance.levenshtein.normalized_similarity(data_b, data_a)), 2)
        print(' Levenshtein percent normalized similarity: ' + str(levenshtein_normalized))

        jaro_winkler = round(
            (textdistance.jaro_winkler(data_b, data_a)), 2)
        print(' Jaro/Winkler percent similarity: ' + str(jaro_winkler))

        ratcliff_obershelp = round(
            (textdistance.ratcliff_obershelp(data_b, data_a)), 2)
        print(' Ratcliff/Obershelp percent similarity: ' + str(ratcliff_obershelp))

        jaccard = round(
            (textdistance.jaccard(data_b, data_a)), 2)
        # Report the Jaccard score itself (the Ratcliff/Obershelp value was
        # printed here by mistake).
        print(' Jaccard percent similarity: ' + str(jaccard))

        result_array.append([filename_a, filename_b, hamming_normalized,
                             levenshtein_normalized, jaro_winkler,
                             ratcliff_obershelp, jaccard])

now = datetime.now()
timestamp = datetime.timestamp(now)

# newline='' lets the csv module control line endings (no blank rows on Windows).
with open("/app/output_csv/confrontation" + str(timestamp) + ".csv", "w+", newline='') as my_csv:
    csvWriter = csv.writer(my_csv, delimiter=',')
    csvWriter.writerows(result_array)
def _strip_honorifics(full_name):
    """Remove the honorifics "Mr.", "Dr." and "Ms." from a director's name."""
    for title in ("Mr.", "Dr.", "Ms."):
        full_name = full_name.replace(title, '')
    return full_name


def _last_name(full_name):
    """Return the last space-separated part, or the last dot-separated part
    when the name contains no space."""
    parts = full_name.split(' ')
    if len(parts) == 1:
        return full_name.split('.')[-1]
    return parts[-1]


def _name_letters(full_name):
    """Return the set of characters in the name, ignoring ',', '.' and ' '."""
    return {ch for ch in full_name if ch not in ",. "}


def SearchResultsView(request):
    """Render the company overview page for the company named in ``?q=``.

    Besides filings, funds and current-year executives, the view lists for
    every director of the company the other companies on whose board a
    director with a matching name sits. Two names match when their last
    names are equal, the Jaccard index of their character sets is above
    0.65 and their Ratcliff-Obershelp similarity is above 0.75.

    Authenticated users (and anyone querying 'HD') get the overview page;
    everybody else is sent to ``about.html``.
    """
    template_name = 'companyOverview.html'
    extended_template = 'base_company.html'
    if request.user.is_authenticated:
        extended_template = 'base_company_member.html'

    query = request.GET.get('q')

    # NOTE(review): .get() presumably raises Company.DoesNotExist for an
    # unknown or missing ``q`` -- confirm whether a 404 page is wanted here.
    mycompany = Company.objects.get(name=query)
    filings = Filing.objects.filter(cik=mycompany.cik).order_by('-filingdate')

    # Normalise the company name to the spelling used in the funds table.
    name = (mycompany.name.upper()
            .replace('INTERNATIONAL', 'INTL')
            .replace(' /DE', '')
            .replace('/DE', '')
            .replace('INC.', 'INC')
            .replace(',', ''))

    funds = Funds.objects.raw(
        'SELECT * FROM edgarapp_funds WHERE company = %s ORDER BY share_prn_amount+0 DESC LIMIT 100',
        [name])

    directors = Directors.objects.filter(
        company=mycompany.name).order_by('-director')
    executives = Executives.objects.filter(company=mycompany.name)

    current_year = str(datetime.today().year)
    exectable = [
        executive for executive in executives
        if executive.filingdate.split('-')[0] == current_year
    ]

    # Pre-compute the comparison keys of every known director once instead
    # of once per director of this company.
    candidates = []
    for check in Directors.objects.all():
        personB = _strip_honorifics(check.director)
        candidates.append(
            (check.company, personB, _last_name(personB),
             _name_letters(personB)))

    matches = []
    for person in directors:
        if not person:
            continue
        personA = _strip_honorifics(person.director)
        a = _name_letters(personA)
        aLast = _last_name(personA)

        comps = []
        for company, personB, bLast, b in candidates:
            if aLast != bLast:
                continue

            # Cheap Jaccard pre-filter on the character sets (threshold 0.65).
            union = a | b
            jaccard = len(a & b) / len(union) if union else 1
            if jaccard <= 0.65:
                continue

            # Ratcliff-Obershelp for the final decision (threshold 0.75);
            # the company being viewed is excluded to prevent self-matches.
            sequence = textdistance.ratcliff_obershelp(personA, personB)
            if sequence > 0.75 and mycompany.name != company:
                comps.append(company)

        if not comps:
            comps.append('Director is not on the board of any other companies')
        matches.append(comps)

    # object_list is (q, (companyname, name), filings, funds,
    # directors+matches, executives+matches).
    # NOTE(review): ``matches`` is built per director, yet it is also zipped
    # with the executives below -- confirm that pairing is intended.
    object_list = [
        query,
        (mycompany.name, mycompany.name),
        filings,
        funds,
        zip(directors, matches),
        zip(exectable, matches),
    ]

    if request.user.is_authenticated or query == 'HD':
        return render(request, template_name, {
            'object_list': object_list,
            'extended_template': extended_template
        })
    return render(request, 'about.html', {'extended_template': 'base.html'})
# End of the Jaccard timing: n1 was taken just before the call (out of view).
# NOTE(review): timedelta.microseconds is only the sub-second component of
# the interval; a call lasting one second or more would be under-reported.
n2 = dt.datetime.now()
ji_time.append((n2 - n1).microseconds)
#end = timeit.timeit()
#ji_time.append(end - start)

# Time one Sorensen comparison of the two texts.
#start = timeit.timeit()
n1 = dt.datetime.now()
sd = textdistance.sorensen(raw_text1, raw_text2)
n2 = dt.datetime.now()
sd_time.append((n2 - n1).microseconds)
#end = timeit.timeit()
#sd_time.append(end - start)

# Time one Ratcliff-Obershelp comparison of the two texts.
#start = timeit.timeit()
n1 = dt.datetime.now()
ro = textdistance.ratcliff_obershelp(raw_text1, raw_text2)
n2 = dt.datetime.now()
ro_time.append((n2 - n1).microseconds)
#end = timeit.timeit()
#ro_time.append(end - start)

# Report the mean time per metric: each sum is divided by 50000
# (presumably the number of timed comparisons -- confirm against the loop).
print("jellyfish.jaro_distance")
print(sum(jd_time) / 50000)
print("jellyfish.levenshtein_distance")
print(sum(ld_time) / 50000)
print("textdistance.jaccard")
print(sum(ji_time) / 50000)
print("textdistance.sorensen")
def randcliff(string1, string2):
    """Return the Ratcliff-Obershelp similarity of the two strings."""
    score = textdistance.ratcliff_obershelp(string1, string2)
    return score
# Pair up similar transactions of `statement` by their 'Clean Txn' text.
# Result columns are accumulated in parallel lists (date, row_data,
# similarity, right_side, credit, debit, left_side); the ones not created
# here are defined earlier in the script.
credit = []
debit = []
row_data = []
i = 0
for ind, row in statement.iloc[:].iterrows():
    string1 = row['Clean Txn']
    for ind_copy, row_copy in statement.iloc[i:].iterrows():
        string2 = row_copy['Clean Txn']
        # `table` remembers concatenated pairs so that each unordered pair
        # of texts is evaluated only once.
        if (string1 + string2) not in table and (string2 + string1) not in table:
            table.append(string1 + string2)
            trans_date = row_copy['Trans Date']
            cred = row_copy['Deposit(CR)']
            deb = row_copy['Withdrawal(DR)']
            # Only consider withdrawals above 5000, or rows with no withdrawal.
            if deb > 5000 or deb == 0:
                score = td.ratcliff_obershelp(string1, string2)
                # Keep pairs that are at least 70% similar and whose second
                # text has not been matched before.
                if score >= 0.7 and string2 not in right_side:
                    date.append(trans_date)
                    row_data.append(string1)
                    similarity.append(score)
                    right_side.append(string2)
                    credit.append(cred)
                    debit.append(deb)
                    # Show the left-hand text once; repeats get a '-' placeholder.
                    if string1 not in left_side:
                        left_side.append(string1)
                    else:
                        left_side.append('-')
# similairity.append(process.extract(string1, string2.split(), scorer=fuzz.ratio)[0][1])
# print (table[i], ':', score[i])
# for index in range(0, nr_matches):
def simple_example(groups=("edit",)):
    """Print textdistance results for the fixed pair 'test' / 'text'.

    For each algorithm of every selected family, one line is printed for the
    bare call and one each for its ``distance``, ``similarity``,
    ``normalized_distance`` and ``normalized_similarity`` methods. Where the
    table lists a class expression, one more line shows the ``distance`` of
    an instance built with exactly those arguments.

    Args:
        groups: Family name, or iterable of family names, to run. Known
            names are "edit", "token", "sequence", "compression",
            "phonetic" and "simple"; selected families always run in that
            order. The default runs only "edit", the single family the
            former hard-coded ``if True:`` / ``if False:`` switches enabled.

    Raises:
        ValueError: If ``groups`` names a family that is not in the table.
    """
    str1, str2 = 'test', 'text'
    qval = 2

    # Each entry is (function name, class expression shown in the output,
    # factory building that instance). A "{}" inside the class expression is
    # filled with qval. Factories are lambdas so that nothing from
    # textdistance is looked up or constructed unless its family is selected.
    families = (
        ("edit", (
            ("hamming",
             "textdistance.Hamming(qval={}, test_func=None, truncate=False, external=True)",
             lambda: textdistance.Hamming(
                 qval=qval, test_func=None, truncate=False, external=True)),
            ("mlipns",
             "textdistance.MLIPNS(threshold=0.25, maxmismatches=2, qval={}, external=True)",
             lambda: textdistance.MLIPNS(
                 threshold=0.25, maxmismatches=2, qval=qval, external=True)),
            ("levenshtein",
             "textdistance.Levenshtein(qval={}, test_func=None, external=True)",
             lambda: textdistance.Levenshtein(
                 qval=qval, test_func=None, external=True)),
            ("damerau_levenshtein",
             "textdistance.DamerauLevenshtein(qval={}, test_func=None, external=True)",
             lambda: textdistance.DamerauLevenshtein(
                 qval=qval, test_func=None, external=True)),
            ("jaro",
             "textdistance.Jaro(long_tolerance=False, qval={}, external=True)",
             lambda: textdistance.Jaro(
                 long_tolerance=False, qval=qval, external=True)),
            ("jaro_winkler",
             "textdistance.JaroWinkler(long_tolerance=False, winklerize=True, qval={}, external=True)",
             lambda: textdistance.JaroWinkler(
                 long_tolerance=False, winklerize=True, qval=qval,
                 external=True)),
            ("strcmp95",
             "textdistance.StrCmp95(long_strings=False, external=True)",
             lambda: textdistance.StrCmp95(long_strings=False, external=True)),
            ("needleman_wunsch",
             "textdistance.NeedlemanWunsch(gap_cost=1.0, sim_func=None, qval={}, external=True)",
             lambda: textdistance.NeedlemanWunsch(
                 gap_cost=1.0, sim_func=None, qval=qval, external=True)),
            ("gotoh",
             "textdistance.Gotoh(gap_open=1, gap_ext=0.4, sim_func=None, qval={}, external=True)",
             lambda: textdistance.Gotoh(
                 gap_open=1, gap_ext=0.4, sim_func=None, qval=qval,
                 external=True)),
            ("smith_waterman",
             "textdistance.SmithWaterman(gap_cost=1.0, sim_func=None, qval={}, external=True)",
             lambda: textdistance.SmithWaterman(
                 gap_cost=1.0, sim_func=None, qval=qval, external=True)),
        )),
        ("token", (
            ("jaccard",
             "textdistance.Jaccard(qval={}, as_set=False, external=True)",
             lambda: textdistance.Jaccard(
                 qval=qval, as_set=False, external=True)),
            ("sorensen",
             "textdistance.Sorensen(qval={}, as_set=False, external=True)",
             lambda: textdistance.Sorensen(
                 qval=qval, as_set=False, external=True)),
            # The original kept its textdistance.SorensenDice() line
            # commented out, so no class line is printed here.
            ("sorensen_dice", None, None),
            ("tversky",
             "textdistance.Tversky(qval={}, ks=None, bias=None, as_set=False, external=True)",
             lambda: textdistance.Tversky(
                 qval=qval, ks=None, bias=None, as_set=False, external=True)),
            ("overlap",
             "textdistance.Overlap(qval={}, as_set=False, external=True)",
             lambda: textdistance.Overlap(
                 qval=qval, as_set=False, external=True)),
            # This is identical to the Jaccard similarity coefficient and
            # the Tversky index for alpha=1 and beta=1.
            ("tanimoto",
             "textdistance.Tanimoto(qval={}, as_set=False, external=True)",
             lambda: textdistance.Tanimoto(
                 qval=qval, as_set=False, external=True)),
            ("cosine",
             "textdistance.Cosine(qval={}, as_set=False, external=True)",
             lambda: textdistance.Cosine(
                 qval=qval, as_set=False, external=True)),
            ("monge_elkan",
             "textdistance.MongeElkan(algorithm=textdistance.DamerauLevenshtein(), symmetric=False, qval={}, external=True)",
             lambda: textdistance.MongeElkan(
                 algorithm=textdistance.DamerauLevenshtein(),
                 symmetric=False, qval=qval, external=True)),
            ("bag",
             "textdistance.Bag(qval={})",
             lambda: textdistance.Bag(qval=qval)),
        )),
        ("sequence", (
            # The original kept the LCSSeq(qval=..., test_func=None,
            # external=True) variant commented out and used the default
            # instance instead.
            ("lcsseq",
             "textdistance.LCSSeq()",
             lambda: textdistance.LCSSeq()),
            ("lcsstr",
             "textdistance.LCSStr(qval={})",
             lambda: textdistance.LCSStr(qval=qval)),
            ("ratcliff_obershelp",
             "textdistance.RatcliffObershelp()",
             lambda: textdistance.RatcliffObershelp()),
        )),
        ("compression", (
            # The original kept the ArithNCD(base=2, terminator=None,
            # qval=...) variant commented out and used the default instance.
            ("arith_ncd",
             "textdistance.ArithNCD()",
             lambda: textdistance.ArithNCD()),
            ("rle_ncd",
             "textdistance.RLENCD()",
             lambda: textdistance.RLENCD()),
            # The backslash is escaped in the label so the output shows the
            # text \0; unescaped, the label would contain a real NUL
            # character. The constructor argument itself stays a NUL.
            ("bwtrle_ncd",
             "textdistance.BWTRLENCD(terminator='\\0')",
             lambda: textdistance.BWTRLENCD(terminator='\0')),
            ("sqrt_ncd",
             "textdistance.SqrtNCD(qval={})",
             lambda: textdistance.SqrtNCD(qval=qval)),
            ("entropy_ncd",
             "textdistance.EntropyNCD(qval={}, coef=1, base=2)",
             lambda: textdistance.EntropyNCD(qval=qval, coef=1, base=2)),
            ("bz2_ncd",
             "textdistance.BZ2NCD()",
             lambda: textdistance.BZ2NCD()),
            ("lzma_ncd",
             "textdistance.LZMANCD()",
             lambda: textdistance.LZMANCD()),
            ("zlib_ncd",
             "textdistance.ZLIBNCD()",
             lambda: textdistance.ZLIBNCD()),
        )),
        ("phonetic", (
            ("mra",
             "textdistance.MRA()",
             lambda: textdistance.MRA()),
            ("editex",
             "textdistance.Editex(local=False, match_cost=0, group_cost=1, mismatch_cost=2, groups=None, ungrouped=None, external=True)",
             lambda: textdistance.Editex(
                 local=False, match_cost=0, group_cost=1, mismatch_cost=2,
                 groups=None, ungrouped=None, external=True)),
        )),
        ("simple", (
            ("prefix",
             "textdistance.Prefix(qval={}, sim_test=None)",
             lambda: textdistance.Prefix(qval=qval, sim_test=None)),
            # The original kept the Postfix(qval=..., sim_test=None) variant
            # commented out and used the default instance instead.
            ("postfix",
             "textdistance.Postfix()",
             lambda: textdistance.Postfix()),
            ("length",
             "textdistance.Length()",
             lambda: textdistance.Length()),
            ("identity",
             "textdistance.Identity()",
             lambda: textdistance.Identity()),
            ("matrix",
             "textdistance.Matrix(mat=None, mismatch_cost=0, match_cost=1, symmetric=True, external=True)",
             lambda: textdistance.Matrix(
                 mat=None, mismatch_cost=0, match_cost=1, symmetric=True,
                 external=True)),
        )),
    )

    # Accept a single family name as well as an iterable of names.
    if isinstance(groups, str):
        groups = (groups,)
    selected = set(groups)
    unknown = selected.difference(key for key, _ in families)
    if unknown:
        # Reject typos before anything is printed.
        raise ValueError("unknown algorithm families: {}".format(
            ", ".join(sorted(repr(item) for item in unknown))))

    def report(expression, scorer):
        """Print ``<expression>(<str1>, <str2>) = <scorer result>.``"""
        print("{}({}, {}) = {}.".format(
            expression, str1, str2, scorer(str1, str2)))

    def run_family(entries):
        """Print every line of one family, in table order."""
        for name, class_expression, factory in entries:
            algorithm = getattr(textdistance, name)
            report("textdistance." + name, algorithm)
            for method in ("distance", "similarity", "normalized_distance",
                           "normalized_similarity"):
                report("textdistance.{}.{}".format(name, method),
                       getattr(algorithm, method))
            if factory is not None:
                # str.format ignores the argument when the expression has
                # no "{}" placeholder.
                report(class_expression.format(qval) + ".distance",
                       factory().distance)

    for key, entries in families:
        if key in selected:
            run_family(entries)
def ratcliff_obershelp_similarity(row, attr):
    """Score the ``<attr>_l`` and ``<attr>_r`` fields of ``row`` against each other.

    Both fields are lower-cased first, then passed (left, right) to
    ``td.ratcliff_obershelp``, whose result is returned unchanged.
    """
    left, right = (row[attr + suffix].lower() for suffix in ("_l", "_r"))
    return td.ratcliff_obershelp(left, right)
# Honorifics removed from director names before they are compared.
_HONORIFICS = ("Mr.", "Dr.", "Ms.")
# Characters ignored when building the character set used for the Jaccard check.
_IGNORED_NAME_CHARS = frozenset(",. ")
# Cheap character-set pre-filter; only pairs above this get the costlier check.
_JACCARD_THRESHOLD = 0.65
# Minimum Ratcliff-Obershelp score for two names to count as the same person.
_SEQUENCE_THRESHOLD = 0.75


def _strip_honorifics(full_name):
    """Return ``full_name`` without "Mr.", "Dr." or "Ms." and without outer whitespace."""
    cleaned = full_name or ''
    for title in _HONORIFICS:
        cleaned = cleaned.replace(title, '')
    return cleaned.strip()


def _last_name(full_name):
    """Return the last space-separated token, or the text after the last '.' when there is no space."""
    tokens = full_name.split(' ')
    if len(tokens) == 1:
        return full_name.split('.')[-1]
    return tokens[-1]


def _name_letters(full_name):
    """Return the set of characters in ``full_name``, ignoring commas, periods and spaces."""
    return {ch for ch in full_name if ch not in _IGNORED_NAME_CHARS}


def _other_boards(director_name, own_company, candidates):
    """Return the companies, other than ``own_company``, of candidates matching ``director_name``.

    ``candidates`` is an iterable of ``(name, last_name, letters, company)``
    tuples built with the helpers above.
    """
    surname = _last_name(director_name)
    letters = _name_letters(director_name)
    companies = []
    for cand_name, cand_surname, cand_letters, cand_company in candidates:
        # Cheapest rejections first: different surname, or the same company
        # (which would only be a self-match).
        if cand_surname != surname or cand_company == own_company:
            continue
        union = letters | cand_letters
        jaccard = len(letters & cand_letters) / len(union) if union else 1
        if jaccard <= _JACCARD_THRESHOLD:
            continue
        score = textdistance.ratcliff_obershelp(director_name, cand_name)
        if score > _SEQUENCE_THRESHOLD:
            companies.append(cand_company)
    return companies


def SearchFilingView(request):
    """Render the filing page for the company and filing named in the query string.

    Reads ``q`` (company ticker) and ``fid`` (filing id) from ``request.GET``,
    loads the company, its filings, funds, directors and current-year
    executives, looks for the company's directors on other companies' boards
    by fuzzy name matching, and reads the filing HTML from disk to extract a
    table of contents.

    The template context contains:
      * ``object_list``: ``[(q, fid), (company name, ticker), filings, filing,
        funds, zip(directors, matches), zip(exectable, matches), links]``
      * ``extended_template``: base template, chosen by authentication state
      * ``table_of_contents``: extracted table, or ``""`` if extraction failed
      * ``filing_html``: raw contents of the filing file

    Exceptions raised by the ``.get()`` lookups or by opening the filing file
    are not handled here and propagate to the caller.
    """
    template_name = 'companyFiling.html'
    extended_template = 'base_company.html'
    if request.user.is_authenticated:
        extended_template = 'base_company_member.html'

    query = request.GET.get('q')
    fid = request.GET.get('fid')

    mycompany = Company.objects.get(ticker=query)
    filings = Filing.objects.filter(cik=mycompany.cik).order_by('-filingdate')
    filing = Filing.objects.get(id=fid)  # the filing requested by fid

    # Normalise the company name into the form used for the funds lookup.
    name = mycompany.name.upper()
    name = name.replace('INTERNATIONAL', 'INTL')
    name = name.replace(' /DE', '')
    name = name.replace('/DE', '')
    name = name.replace('INC.', 'INC')
    name = name.replace(',', '')

    funds = Funds.objects.raw(
        'SELECT * FROM edgarapp_funds WHERE company = %s ORDER BY share_prn_amount+0 DESC LIMIT 100',
        [name])

    directors = Directors.objects.filter(
        company=mycompany.name).order_by('-director')
    executives = Executives.objects.filter(company=mycompany.name)

    # Keep only executives whose filing date falls in the current year.
    current_year = str(datetime.today().year)
    exectable = [
        executive for executive in executives
        if executive.filingdate.split('-')[0] == current_year
    ]

    # Normalise every known director once instead of once per director of
    # this company. Records without a name cannot be matched and are skipped.
    candidates = []
    for check in Directors.objects.all():
        if not check.director:
            continue
        cand_name = _strip_honorifics(check.director)
        candidates.append(
            (cand_name, _last_name(cand_name), _name_letters(cand_name),
             check.company))

    # One entry per director, in the same order as ``directors``, so the two
    # can be zipped together below.
    matches = []
    for person in directors:
        comps = _other_boards(
            _strip_honorifics(person.director), mycompany.name, candidates)
        if not comps:
            comps.append('Director is not on the board of any other companies')
        matches.append(comps)

    url = '/mnt/filings-static/capitalrap/edgarapp/static/filings/' + filing.filingpath

    # NOTE: nothing in this view populates ``links``; it is always passed to
    # the template as an empty list.
    links = []

    object_list = [
        (query, fid),
        (mycompany.name, mycompany.ticker),
        filings,
        filing,
        funds,
        zip(directors, matches),
        # NOTE(review): this pairs executives with the *director* match lists;
        # kept as-is because the template may depend on it - confirm intent.
        zip(exectable, matches),
        links,
    ]

    toc_extractor = TOCExtractor()
    with open(url) as file:
        filing_html = file.read()

    # Table-of-contents extraction is best effort: any ordinary failure falls
    # back to an empty value, but KeyboardInterrupt/SystemExit still propagate.
    try:
        table_of_contents = toc_extractor.extract(filing_html).table
    except Exception:
        table_of_contents = ""

    return render(
        request, template_name, {
            'object_list': object_list,
            'extended_template': extended_template,
            'table_of_contents': table_of_contents,
            'filing_html': filing_html
        })