def GetScore(src_name, input_name, min_score):
    """Return a fuzzy-match score between two names, scaled by 100.

    Punctuation (``string.punctuation``) is stripped from both names. Each
    whitespace-separated token of ``input_name`` is then compared with every
    token of ``src_name``; a pair is rated as the mean of its Jaro-Winkler
    similarity and ``ParsedDifference``. The best rating per input token is
    kept, the ratings are combined with ``Average``, and the result is
    replaced by the whole-string Jaro-Winkler similarity when that is higher.

    Args:
        src_name: Reference name to compare against.
        input_name: Name supplied by the caller.
        min_score: Not used by this function; retained so existing callers
            keep working.

    Returns:
        The winning score multiplied by 100.

    Note:
        ``ParsedDifference`` and ``Average`` are helpers defined outside this
        block; ``Average`` is presumably an arithmetic mean - TODO confirm.
    """
    # Build the translation table once and reuse it for both names.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    src_name = src_name.translate(strip_punctuation)
    input_name = input_name.translate(strip_punctuation)

    jarowinkler = JaroWinkler()
    full_inputted_jaro = jarowinkler.similarity(input_name, src_name)

    src_parts = src_name.split()
    input_parts = input_name.split()

    # With no tokens on either side there is nothing to compare pairwise:
    # max() over an empty sequence raises ValueError and averaging an empty
    # list is undefined, so fall back to the whole-string similarity.
    if not src_parts or not input_parts:
        return full_inputted_jaro * 100

    result = []
    for input_name_part in input_parts:
        # Best pairwise rating of this input token against any source token.
        best_match = max(
            (jarowinkler.similarity(input_name_part, src_name_part)
             + ParsedDifference(input_name_part, src_name_part)) / 2
            for src_name_part in src_parts
        )
        result.append(best_match)

    score = Average(result)
    # The whole-string comparison wins when it beats the per-token average.
    if full_inputted_jaro > score:
        score = full_inputted_jaro
    return score * 100
def fuzzy_line_equality_detection(self, lines):
    """Suffix every token with ``_T`` or ``_N`` based on line similarity.

    ``lines`` is split on ``"\\n"`` and each line is compared, lower-cased,
    against every other line using Jaro-Winkler similarity. When the highest
    similarity found for a line is at least ``self.similarity_threshold`` and
    the line contains something other than spaces, all of its tokens are
    marked technical (``_T``); otherwise they are marked ``_N``.

    Args:
        lines: Newline-separated text.

    Returns:
        The text with the same line layout, each space-separated token
        (including empty tokens produced by consecutive spaces) carrying its
        ``_T`` or ``_N`` suffix.
    """
    jarowinkler = JaroWinkler()

    # Split and lower-case once; the original redid both for every comparison.
    raw_lines = lines.split("\n")
    lowered_lines = [line.lower() for line in raw_lines]

    new_lines = []
    # Compare all lines against each other
    for k, line in enumerate(raw_lines):
        # Get maximum similarity to any other line
        max_sim = 0
        for l, other in enumerate(lowered_lines):
            if k == l:
                continue
            jaro_sim = jarowinkler.similarity(lowered_lines[k], other)
            if jaro_sim > max_sim:
                max_sim = jaro_sim

        # If maximum similarity >= similarity threshold (and the line is not
        # blank): make all tokens technical (T), otherwise non-technical (N)
        if max_sim >= self.similarity_threshold and line.replace(" ", ""):
            suffix = "_T"
        else:
            suffix = "_N"
        new_lines.append(" ".join(w + suffix for w in line.split(" ")))

    return "\n".join(new_lines)
# Collect the first field of every record in `data` for similarity matching.
temp_article.extend(row[0] for row in data)
print(len(data))

# Query text every article is compared against.
# NOTE(review): "moblity" looks like a typo for "mobility"; left unchanged
# because editing the query would change every similarity score.
my_string = "human moblity prediction spatiotemporal next place future location point-of-interest hotspot forecasting modelling mobility behaviors traffic trajectory mobile phone"
p = []

# Keep the records whose first field has Jaro-Winkler similarity > 0.45 to
# the query. The matcher is built once instead of once per article.
filter_thresh_45 = []
jarowinkler = JaroWinkler()
for i, article in enumerate(temp_article):
    sim = jarowinkler.similarity(my_string, article)
    if sim > 0.45:
        # temp_article[i] and data[i] are paired by index, as in the original.
        filter_thresh_45.append(data[i])

# Second pass: normalized Levenshtein distance over the surviving records.
normalized_levenshtein = NormalizedLevenshtein()
filter_normalized_levenshtein = []
# Only this first statement of the loop body is visible in this view, and the
# distance is not used within it.
for i in range(len(filter_thresh_45)):
    sim = normalized_levenshtein.distance(my_string, filter_thresh_45[i][0])