def similarityMetrics(df_train):
    """Add search-term similarity columns to ``df_train`` and return them.

    The frame is modified in place.  For the product title (``_PT``), the
    description (``_PD``) and the attributes (``_Atrr``) three columns are
    added: a Levenshtein similarity (one minus the normalized distance), a
    Jaccard similarity and a cosine similarity, the latter two computed by
    ``utils``.  ``Atrr_stem`` entries that are not lists are replaced by an
    empty list before the attribute similarities are computed.

    Returns:
        A tuple ``(df_train, df_similarities)``.  ``df_similarities`` holds
        ``id``, ``product_uid`` and the similarity columns; note that
        ``Leven_sim_PT`` is computed but is not part of that selection.
    """

    def add_rowwise(column, func):
        # Evaluate ``func`` on every row and store the result as ``column``.
        df_train[column] = df_train.apply(func, axis=1)

    def lev_sim(left, right):
        # Similarity = 1 - normalized Levenshtein distance.
        return 1 - stringdist.levenshtein_norm(left, right)

    def spaced(tokens):
        return ' '.join(tokens)

    # -- Search term vs product title
    add_rowwise('Leven_sim_PT',
                lambda r: lev_sim(r['PT_text'], r['ST_text']))
    add_rowwise('JC_sim_PT',
                lambda r: utils.get_jaccard_sim(r['PT_stem'], r['ST_stem']))
    add_rowwise('Cosine_sim_PT',
                lambda r: utils.get_cosine_sim(spaced(r['PT_stem']),
                                               spaced(r['ST_stem'])))

    # -- Search term vs description (Levenshtein uses the joined keywords)
    add_rowwise('Leven_sim_PD',
                lambda r: lev_sim(spaced(r['Keywords_Descr']), r['ST_text']))
    add_rowwise('JC_sim_PD',
                lambda r: utils.get_jaccard_sim(r['PD_stem'], r['ST_stem']))
    add_rowwise('Cosine_sim_PD',
                lambda r: utils.get_cosine_sim(spaced(r['PD_stem']),
                                               spaced(r['ST_stem'])))

    # -- Search term vs attributes
    # Products without attributes do not carry a list; normalise to [].
    df_train['Atrr_stem'] = df_train['Atrr_stem'].apply(
        lambda stems: stems if isinstance(stems, list) else [])
    add_rowwise('Leven_sim_Atrr', lambda r: utils.get_leven(r))
    add_rowwise('JC_sim_Atrr',
                lambda r: utils.get_jaccard_sim(r['Atrr_stem'], r['ST_stem']))
    add_rowwise('Cosine_sim_Atrr',
                lambda r: utils.get_cosine_sim(spaced(r['Atrr_stem']),
                                               spaced(r['ST_stem'])))

    similarity_columns = [
        'id', 'product_uid', 'JC_sim_PT', 'Cosine_sim_PT', 'Leven_sim_PD',
        'JC_sim_PD', 'Cosine_sim_PD', 'JC_sim_Atrr', 'Cosine_sim_Atrr',
        'Leven_sim_Atrr'
    ]
    df_similarities = df_train[similarity_columns]
    return df_train, df_similarities
def classify_enum_value(col_value, cde_index, g):
    """Map an observed column value to a permissible value of a CDE.

    Two full-text index queries are run through ``query_graph``: one against
    answer texts (``ansindex``) and one against concept synonyms
    (``nameindex``), both restricted to the CDE node whose id is
    ``cde_index``.  Hits with a score above ``FT_SEARCH_CUTOFF`` are kept and
    the answer with the best score is selected.  The reported value is then:

    * the best synonym hit for that answer (by score, then by normalized
      Levenshtein distance to the observed value), or
    * if no synonym hit exists, the synonym of the answer's concept closest
      to the observed value (case-insensitive), or
    * if the answer has no concept synonyms, the matched answer text with a
      ``None`` concept code.

    Args:
        col_value: Observed value; it is converted with ``str()`` wherever it
            is used as text.
        cde_index: Integer node id of the CDE (formatted with ``{0:d}``).
        g: Graph handle passed through to ``query_graph``.

    Returns:
        ``{'observedValue': str, 'permissibleValue': {'value': ...,
        'conceptCode': ...}}``; ``value`` is ``'NOMATCH'`` when nothing
        scores above the cutoff.
    """
    observed = str(col_value)
    output_dict = {'observedValue': observed, 'permissibleValue': {}}
    permissible = output_dict['permissibleValue']

    # NOTE(review): the observed value is interpolated directly into the
    # query text; a value containing a double quote or full-text query syntax
    # will break or alter the query.  Parameterising requires support in
    # query_graph, which is not visible here.
    query = "CALL db.index.fulltext.queryNodes(\"ansindex\",\"{0:s}\") YIELD node as a, score ".format(
        observed)
    query += "MATCH (n:CDE) - [:PERMITS] - (ans:Answer) - [:CAN_BE] - (a:AnswerText) WHERE ID(n) = {0:d} ".format(
        cde_index)
    query += "RETURN ID(ans), 'Answer', score, a.name"
    answer_values = query_graph(query, g).values()

    query = "CALL db.index.fulltext.queryNodes(\"nameindex\",\"{0:s}\") YIELD node as s, score ".format(
        observed)
    query += "MATCH (n:CDE) - [:PERMITS] - (ans:Answer) - [:EQUALS] - (con:Concept) - [:IS_CALLED] - (s:Synonym) WHERE ID(n) = {0:d} ".format(
        cde_index)
    query += "RETURN ID(ans), 'Synonym', score, s.name, con.CODE"
    syn_values = query_graph(query, g).values()

    # Row layout: (answer id, 'Answer'|'Synonym', score, name[, concept code])
    all_results = answer_values + syn_values
    all_results = [hit for hit in all_results if hit[2] > FT_SEARCH_CUTOFF]

    if len(all_results) == 0:
        permissible['value'] = 'NOMATCH'
        permissible['conceptCode'] = None
        return output_dict

    all_results.sort(key=lambda z: z[2], reverse=True)
    ans_index = all_results[0][0]
    ans_results = [hit for hit in all_results if hit[0] == ans_index]

    # Choose the best synonym for the winning answer.
    synonyms = [hit for hit in ans_results if hit[1] == 'Synonym']
    if len(synonyms) > 0:
        # Rank by 1: search score (descending), 2: string distance.
        # The observed value is compared as text so that non-string inputs
        # (e.g. numbers) do not make the distance computation fail.
        synonyms.sort(key=lambda z: (
            -z[2], stringdist.levenshtein_norm(z[3], observed)))
        permissible['value'] = str(synonyms[0][3])
        permissible['conceptCode'] = 'ncit:' + str(synonyms[0][4])
        return output_dict

    query = "MATCH (a:Answer) - [:EQUALS] - (c:Concept) - [:IS_CALLED] - (s:Synonym) WHERE ID(a) = {0:d} RETURN c.CODE,s.name".format(
        ans_index)
    values = query_graph(query, g).values()
    if len(values) > 0:
        values.sort(key=lambda z: stringdist.levenshtein_norm(
            observed.lower(), str(z[1]).lower()))
        permissible['value'] = str(values[0][1])
        permissible['conceptCode'] = 'ncit:' + str(values[0][0])
    else:
        permissible['value'] = str(ans_results[0][3])
        permissible['conceptCode'] = None
    return output_dict
def genetic_diversity(pop):
    """Return the population's genetic diversity as a rounded percentage.

    The diversity is the mean normalized Levenshtein distance between the
    ``genome`` attributes of every unordered pair of individuals, multiplied
    by 100 and rounded.

    Args:
        pop: Iterable of individuals exposing a ``genome`` attribute.

    Returns:
        The rounded percentage, or ``0`` when ``pop`` has fewer than two
        individuals (there is no pair to compare, which previously caused a
        ``ZeroDivisionError``).
    """
    from itertools import combinations

    dists = [
        stringdist.levenshtein_norm(first.genome, second.genome)
        for first, second in combinations(pop, 2)
    ]
    if not dists:
        return 0
    return round(sum(dists) * 100 / len(dists))
def find_best_match(orig_name, names_list: list) -> dict:
    """Find the entry of ``names_list`` that best matches ``orig_name``.

    Known release postfixes and a trailing two-letter language tag are
    stripped from ``orig_name`` first.  Every variant produced by
    ``permut`` for each candidate is then compared with the stripped name:
    a variant that, with spaces removed and lower-cased, starts with the
    stripped name is treated as a perfect match (distance 0) and ends the
    search; otherwise the candidate with the smallest normalized Levenshtein
    distance below 1.0 wins.

    Returns:
        ``{"res": best distance, "retro": orig_name, "lemon": best candidate}``
        where the candidate is ``""`` if nothing scored below 1.0.
    """
    known_postfixes = (
        "AGA", "Levels", "Activision", "Sega", "Demo", "Demo1", "Demo2",
        "CD32", "CDTV", "Psygnosis", "NTSC", "Disk", "DemoPlay"
    )
    stripped = orig_name
    # Postfixes are tried once each, in order; several may be removed.
    for postfix in known_postfixes:
        if stripped.endswith(postfix):
            stripped = stripped[:-len(postfix)]
    if stripped.endswith(("Fr", "De", "Pl")):
        stripped = stripped[:-2]

    lowest = 1.0
    winner = ""
    for candidate in names_list:
        if lowest == 0.0:
            # A perfect match was already found.
            break
        for variant in permut(candidate):
            distance = stringdist.levenshtein_norm(stripped, variant)
            compact = variant.replace(" ", "").lower()
            if compact.startswith(stripped.lower()):
                lowest = 0
                winner = candidate
                break
            if distance < lowest:
                lowest = distance
                winner = candidate
    return {"res": lowest, "retro": orig_name, "lemon": winner}
def query_editdist(nsentence):
    """Return the ``Topic`` of the training row closest to ``nsentence``.

    Closeness is the normalized Levenshtein distance between ``nsentence``
    and the string form of each ``Keyword`` in the module-level
    ``train_set``; the first row with the smallest distance is used.
    """
    distances = []
    for keyword in train_set['Keyword'].values:
        distances.append(sd.levenshtein_norm(str(keyword), nsentence))
    closest_row = train_set.iloc[np.argmin(distances)]
    return closest_row['Topic']
def matching_NED(gold_fragments, matches_df, plot_hist=False):
    """Return the mean normalized edit distance (NED) over matched pairs.

    For every row of ``matches_df`` the two fragments (``f1``/``f1_start``/
    ``f1_end`` and ``f2``/``f2_start``/``f2_end``) are converted to label
    sequences with ``fragment_tokenizer`` and compared.  Normalization is
    done with respect to the number of labels, not frame counts.

    Args:
        gold_fragments: Passed through to ``fragment_tokenizer``.
        matches_df: DataFrame with one matched fragment pair per row.
        plot_hist: When true, show a histogram of the per-pair distances.

    Returns:
        The mean distance, or ``1.0`` when ``matches_df`` is empty.
    """
    if len(matches_df) == 0:
        return 1.0

    neds = np.zeros(len(matches_df))
    # Use the row position, not the index label yielded by iterrows(): the
    # label is only a valid offset into ``neds`` for a default RangeIndex.
    for pos, (_, row) in enumerate(matches_df.iterrows()):
        labels1 = fragment_tokenizer(gold_fragments, row['f1'],
                                     row['f1_start'], row['f1_end'])
        labels2 = fragment_tokenizer(gold_fragments, row['f2'],
                                     row['f2_start'], row['f2_end'])
        try:
            neds[pos] = stringdist.levenshtein_norm(labels1, labels2)
        except Exception:
            # Fall back to the local implementation when stringdist cannot
            # handle the label sequences.
            neds[pos] = strdist(labels1, labels2)

    if plot_hist:
        plt.hist(neds)
        plt.title('Normalized Edit Distance Histogram')
        plt.show()

    return sum(neds) / len(neds)
def clus_NED(gold_fragments, nodes_df, clusters_list):
    """Return the mean normalized edit distance over within-cluster pairs.

    The distance is computed on the transcriptions (``nodes_df.types``),
    not on frames.  Every unordered pair of members of each cluster is
    compared; a pair in which either transcription is empty counts as
    distance 1 so that garbage classes from different sequences are not
    treated as the same label.

    Args:
        gold_fragments: Unused by this function.
        nodes_df: Object exposing ``types`` indexable by cluster member.
        clusters_list: Iterable of clusters, each an iterable of members.

    Returns:
        The mean distance, or ``1.0`` when ``nodes_df`` is empty or no
        cluster contains at least two members (previously the latter raised
        ``ZeroDivisionError``).
    """
    if len(nodes_df) == 0:
        return 1.0

    pairs = [
        pair for clus in clusters_list
        for pair in itertools.combinations(clus, 2)
    ]
    if not pairs:
        return 1.0

    neds = np.zeros(len(pairs))
    for i, (first, second) in enumerate(pairs):
        labels_a = nodes_df.types[first]
        labels_b = nodes_df.types[second]
        if len(labels_a) == 0 or len(labels_b) == 0:
            neds[i] = 1.
            continue
        try:
            neds[i] = stringdist.levenshtein_norm(labels_a, labels_b)
        except Exception:
            # Fall back to the local implementation when stringdist cannot
            # handle the label sequences.
            neds[i] = strdist(labels_a, labels_b)

    return sum(neds) / len(neds)
def compare_with_levenshtein_distance(value_a, value_b):
    """Return the normalized Levenshtein distance between two values.

    Both values are converted with ``_to_str_for_distance_calculation`` and
    prefixed with ``'a'`` before comparison.  If the distance computation
    fails, the two compared strings are printed and the exception is
    re-raised.
    """
    # NOTE(review): the shared 'a' prefix presumably keeps both strings
    # non-empty for the normalization -- confirm.
    str_a, str_b = (
        'a' + _to_str_for_distance_calculation(value)
        for value in (value_a, value_b)
    )
    try:
        return stringdist.levenshtein_norm(str_a, str_b)
    except Exception:
        print(str_a, str_b)
        raise
def compute_levenshtein_sim(str1, str2):
    """Return a Levenshtein-based similarity between two strings.

    The similarity is ``1`` when either string is contained in the other;
    otherwise it is one minus the normalized Levenshtein distance.
    """
    one_contains_other = str1 in str2 or str2 in str1
    if not one_contains_other:
        return 1 - stringdist.levenshtein_norm(str1, str2)
    return 1
def preview_author(author="rembrandt"):
    """Preview a random artwork of the favourite author closest to ``author``.

    The non-empty keys of the module-level ``favorites`` mapping are ranked
    by normalized Levenshtein distance to ``author``; one art id of the
    closest author is picked at random and sent to ``preview_item``.

    Returns:
        ``(author_artwork[chosen_art_id], chosen_author)``.
    """
    distance = {
        name: stringdist.levenshtein_norm(author, name)
        for name in favorites.keys() if name
    }
    chosen_author = min(distance, key=distance.get)
    chosen_art_id = random.choice(favorites[chosen_author])
    # Preview the selected item on the device.
    preview_item(token, chosen_art_id)
    return (author_artwork[chosen_art_id], chosen_author)
def mergeNames(row):
    """Merge the ``product_name`` and ``Name`` fields of ``row``.

    If one of the two is not exactly a ``str`` the other one is returned.
    If both are strings and differ, they are truncated to the shorter length
    and compared with the normalized Levenshtein distance; a distance above
    0.3 yields ``NaN``.  Otherwise ``product_name`` is returned.
    """
    product_name, other_name = row['product_name'], row['Name']
    if type(product_name) != str:
        return other_name
    if type(other_name) != str:
        return product_name
    if product_name == other_name:
        return product_name

    # Names differ: compare only the common-length prefixes.
    common_len = min(len(product_name), len(other_name))
    distance = stringdist.levenshtein_norm(product_name[:common_len],
                                           other_name[:common_len])
    if distance > 0.3:
        return float('NaN')
    return product_name
def compare_tweets(tweet):
    """Check ``tweet`` against the first item yielded by ``search_tweets()``.

    Raises ``GetTweetsError`` (after printing and logging a message) when the
    normalized Levenshtein distance to that item exceeds ``DISTANCE``;
    returns ``True`` otherwise.  Returns ``None`` when ``search_tweets()``
    yields nothing.
    """
    message = 'GetTweetsError: New Tweet is too similar to old tweets. Trying again.'
    for old_tweet in search_tweets():
        # NOTE(review): a *larger* normalized distance means the texts are
        # *less* alike, yet this branch reports "too similar" -- confirm the
        # intended direction of the comparison.
        if stringdist.levenshtein_norm(old_tweet, tweet) > DISTANCE:
            cprint(message)
            logging.error(str(message))
            raise GetTweetsError
        # NOTE(review): returning here means only the first old tweet is
        # ever compared -- confirm whether all of them should be checked.
        return True
def common_words_leven(tokens_1, tokens_2):
    """Return the tokens of ``tokens_2`` that closely match ``tokens_1``.

    Both inputs are de-duplicated first.  A token of ``tokens_2`` is appended
    once for every distinct token of ``tokens_1`` whose Levenshtein
    similarity (one minus the normalized distance) with it exceeds 0.85, so
    the result may contain repeats.
    """
    unique_1 = list(set(tokens_1))
    unique_2 = list(set(tokens_2))
    return [
        candidate
        for reference in unique_1
        for candidate in unique_2
        if 1 - stringdist.levenshtein_norm(reference, candidate) > 0.85
    ]
def metaphone_suggestions(word, count):
    """Return up to ``count`` spelling suggestions for ``word``.

    Words stored in ``metaphone_dict`` under the phonetic key of ``word``
    come first, followed by the words stored under keys produced by
    ``edit_distance_1`` for that key, the latter sorted by normalized
    Levenshtein distance to ``word``.  Duplicates and single-character
    entries are dropped, and suggestions are capitalised when ``word``
    starts with an upper-case character.
    """
    primary_key = mphone(word)
    exact = []
    if primary_key in metaphone_dict:
        exact.extend(metaphone_dict[primary_key])

    near = []
    for variant in edit_distance_1(primary_key):
        variant_key = variant.upper()
        if variant_key in metaphone_dict:
            near.extend(metaphone_dict[variant_key])
    near = sorted(near,
                  key=lambda cand: stringdist.levenshtein_norm(cand, word))

    result = []
    # dict.fromkeys removes duplicates while keeping first-seen order.
    for cand in dict.fromkeys(exact + near):
        if len(cand) <= 1:
            continue
        if word[0].upper() == word[0]:
            cand = cand[0].upper() + cand[1:]
        result.append(cand)
    return result[:count]
def suggestions(word, count=5):
    """Return up to ``count`` spelling suggestions for ``word``.

    Words stored in ``phonetic_dict`` under the phonetic key of ``word`` are
    used first.  Only when these are fewer than ``count`` are the words under
    keys produced by ``edit_distance_1`` added, sorted by normalized
    Levenshtein distance to ``word``.  Duplicates and single-character
    entries are dropped, and suggestions are capitalised when ``word``
    starts with an upper-case character.
    """
    primary_key = pkey(word)

    # Primary keys
    candidates = []
    if primary_key in phonetic_dict:
        candidates.extend(phonetic_dict[primary_key])

    # Supplementary keys
    if len(candidates) < count:
        near = []
        for variant in edit_distance_1(primary_key):
            variant_key = variant.upper()
            if variant_key in phonetic_dict:
                near.extend(phonetic_dict[variant_key])
        candidates.extend(
            sorted(near,
                   key=lambda cand: stringdist.levenshtein_norm(cand, word)))

    result = []
    # dict.fromkeys removes duplicates while keeping first-seen order.
    for cand in dict.fromkeys(candidates):
        if len(cand) <= 1:
            continue
        if word[0].upper() == word[0]:
            cand = cand[0].upper() + cand[1:]
        result.append(cand)
    return result[:count]
def getClosestDocs(wiki_entities, entities):
    """Map each entity to the closest wiki entity name, escaped as a doc id.

    Entities are converted to ``str`` and NFC-normalized.  An entity present
    in ``wiki_entities`` matches itself; otherwise the first wiki entity with
    the smallest normalized Levenshtein distance is used (``""`` when
    ``wiki_entities`` is empty).  In the match, spaces become ``_``, ``/``
    becomes ``-SLH-`` and parentheses become ``-LRB-`` / ``-RRB-``.

    Returns:
        ``(selected_docs, entities)`` where ``entities`` is the list of
        stringified (not normalized) inputs.
    """
    entities = [str(entity) for entity in entities]
    # None of the replacement texts contains a character that is itself
    # replaced, so a single translation pass equals chained replaces.
    escapes = str.maketrans({
        " ": "_",
        "/": "-SLH-",
        "(": "-LRB-",
        ")": "-RRB-",
    })

    selected_docs = []
    for raw in entities:
        ent = ud.normalize('NFC', raw)
        if ent in wiki_entities:
            best_match = ent
        else:
            smallest = 1.1
            best_match = ""
            for wiki_entity in wiki_entities:
                dist = stringdist.levenshtein_norm(wiki_entity, ent)
                if dist < smallest:
                    smallest = dist
                    best_match = wiki_entity
        selected_docs.append(best_match.translate(escapes))
    return selected_docs, entities
def get_corpus_dist_set(query, idf_dict, normalized_tf_list, soundex_dict,
                        avg, inverse_mapping, doc_title_list, C, thresh=0.2,
                        K=5, wt=0.5):
    """Rewrite a query with close corpus terms and run the retrieval on it.

    Steps performed:

    1. Preprocess the query with ``preprocess_query`` and tokenize it.
    2. Score every corpus term (the keys of ``idf_dict``) against each query
       token as ``wt * levenshtein + (1 - wt) * soundex_distance``, where
       both parts are normalized Levenshtein distances (on the words and on
       their soundex notations respectively).
    3. Keep, per token, at most ``K`` best terms whose score is
       ``<= thresh`` and append them to the rewritten query.
    4. Call ``get_top_K`` with the rewritten query (top 10).

    ``avg`` is accepted but not used by this function.

    Returns:
        ``None``; the result of ``get_top_K`` is not returned.
    """
    query = preprocess_query(query, C)
    query_tokens = nltk.word_tokenize(query)
    corpus_tokens = list(idf_dict.keys())

    lev_query = ""
    # Defined up front so that a query without tokens no longer raises a
    # NameError at the check below.
    flg = False
    for token in query_tokens:
        # The token's soundex notation does not depend on the corpus term.
        soundex_notation_token = get_soundex(token)

        query_lev_dict = {}
        for term in corpus_tokens:
            soundex_distance = stringdist.levenshtein_norm(
                soundex_dict[term], soundex_notation_token)
            lev_distance = stringdist.levenshtein_norm(token, term)
            query_lev_dict[term] = (wt * lev_distance
                                    + (1 - wt) * soundex_distance)

        ranked = sorted(query_lev_dict.items(), key=operator.itemgetter(1))
        close_terms = []
        for rank, (term, score) in enumerate(ranked):
            if rank == K or score > thresh:
                break
            close_terms.append(str(term))

        # NOTE(review): the flag is reset for every token and each close term
        # is compared with the *first* query token rather than the current
        # one -- kept as in the original; confirm the intent.
        flg = False
        for close_term in close_terms:
            if close_term != query_tokens[0]:
                flg = True
            lev_query = lev_query + close_term + " "

    if flg:
        print("Searching instead for " + lev_query + ":")
    get_top_K(lev_query, 10, normalized_tf_list, idf_dict, inverse_mapping,
              doc_title_list, C)
def levenshtein_norm(gold, system):
    """Return the normalized Levenshtein distance between two lemmas.

    ``gold`` and ``system`` are objects exposing a ``lemma`` attribute.
    """
    gold_lemma, system_lemma = gold.lemma, system.lemma
    return stringdist.levenshtein_norm(gold_lemma, system_lemma)
def getFeatures(df_train, df_descr, df_attr):
    """Build count / overlap features from train set, descriptions, attributes.

    ``df_descr`` and ``df_attr`` are left-joined onto ``df_train`` by
    ``product_uid``; the feature columns are then added to the merged frame:

    * counts of numeric / non-numeric terms in product title and search term,
    * non-numeric terms shared by title and search term (fuzzy match via
      ``utils.common_words_leven``), their count and a Jaccard similarity,
    * search-term terms that are substrings of title, description and
      attribute text, for non-numeric (``_x``) and numeric (``_y``) terms,
      with counts and an overall percentage from ``utils.perc_xxx``,
    * the Levenshtein similarity between title and search term,
    * description keywords that fuzzily appear in the search term.

    Returns:
        ``(df_train, df_train2)``: the merged frame with every intermediate
        column, and a frame with only ``product_uid``, the numeric features
        and ``relevance``.
    """
    for extra in (df_descr, df_attr):
        df_train = df_train.merge(extra,
                                  left_on='product_uid',
                                  right_on='product_uid',
                                  how='left')

    def add_len(target, source):
        # Store the length of every value of ``source`` under ``target``.
        df_train[target] = df_train[source].apply(len)

    def add_rowwise(target, func):
        # Evaluate ``func`` on every row and store the result as ``target``.
        df_train[target] = df_train.apply(func, axis=1)

    def add_substrings(list_col, count_col, terms_col, text_col):
        # Terms of ``terms_col`` that are substrings of ``text_col``, plus
        # how many there are.
        add_rowwise(list_col,
                    lambda r: utils.n_substrings(r[terms_col], r[text_col]))
        add_len(count_col, list_col)

    # -- General counts of numeric / non-numeric terms
    add_len('N_numerics_PT', 'PT_numerics')
    add_len('N_numerics_ST', 'ST_numerics')
    add_len('N_non_numerics_PT', 'PT_Non_numerics')
    add_len('N_non_numerics_ST', 'ST_Non_numerics')

    # -- Non-numeric terms common to title and search term (fuzzy match)
    add_rowwise('Common_words_leven',
                lambda r: utils.common_words_leven(r['PT_Non_numerics'],
                                                   r['ST_Non_numerics']))
    add_len('N_common_words_leven', 'Common_words_leven')
    add_rowwise('JC_sim',
                lambda r: utils.get_jaccard_sim(r['PT_Non_numerics'],
                                                r['ST_Non_numerics'],
                                                r['Common_words_leven']))

    # -- Non-numeric search terms found in title / description / attributes
    add_substrings('Substrs_PT_x', 'N_substrs_PT_x',
                   'ST_Non_numerics', 'PT_lower')
    add_substrings('Substrs_PD_x', 'N_substrs_PD_x',
                   'ST_Non_numerics', 'PD_lower')
    add_substrings('Substrs_Atr_x', 'N_substrs_Atr_x',
                   'ST_Non_numerics', 'Atrr_text')
    add_rowwise('Perc_substrs_x',
                lambda r: utils.perc_xxx(r['Substrs_PT_x'], r['Substrs_PD_x'],
                                         r['Substrs_Atr_x'],
                                         r['ST_Non_numerics']))

    # -- Numeric search terms found in title / description / attributes
    add_substrings('Substrs_PT_y', 'N_substrs_PT_y',
                   'ST_numerics', 'PT_lower')
    add_substrings('Substrs_PD_y', 'N_substrs_PD_y',
                   'ST_numerics', 'PD_lower')
    add_substrings('Substrs_Atr_y', 'N_substrs_Atr_y',
                   'ST_numerics', 'Atrr_text')
    add_rowwise('Perc_substrs_y',
                lambda r: utils.perc_xxx(r['Substrs_PT_y'], r['Substrs_PD_y'],
                                         r['Substrs_Atr_y'],
                                         r['ST_numerics']))

    # -- Levenshtein similarity between product title and search term
    add_rowwise('Leven_sim_ST_PT',
                lambda r: 1 - stringdist.levenshtein_norm(r['PT_text'],
                                                          r['ST_text']))

    # -- Description keywords that appear in the search term (fuzzy match)
    add_rowwise('Keywords_leven',
                lambda r: utils.common_words_leven(r['Keywords_Descr'],
                                                   r['ST_Non_numerics']))
    add_len('N_keywords_leven', 'Keywords_leven')

    # Keep only the identifier, the numeric features and the target.
    feature_columns = [
        'product_uid', 'N_numerics_PT', 'N_numerics_ST', 'N_non_numerics_PT',
        'N_non_numerics_ST', 'N_common_words_leven', 'JC_sim',
        'N_substrs_PT_x', 'N_substrs_PD_x', 'N_substrs_Atr_x',
        'Perc_substrs_x', 'N_substrs_PT_y', 'N_substrs_PD_y',
        'N_substrs_Atr_y', 'Perc_substrs_y', 'Leven_sim_ST_PT',
        'N_keywords_leven', 'relevance'
    ]
    df_train2 = df_train[feature_columns]
    return df_train, df_train2
def get_leven(x):
    """Return the Levenshtein similarity of attributes and search term.

    The similarity is one minus the normalized Levenshtein distance between
    the space-joined ``x['Atrr_stem']`` and ``x['ST_text']``.

    Returns:
        The similarity, or ``0`` when it cannot be computed (for example when
        ``Atrr_stem`` is not a sequence of strings).  This is a deliberate
        best-effort fallback; only ``Exception`` subclasses are absorbed so
        that interrupts such as ``KeyboardInterrupt`` still propagate.
    """
    try:
        attr_text = ' '.join(x['Atrr_stem'])
        return 1 - stringdist.levenshtein_norm(attr_text, x['ST_text'])
    except Exception:
        return 0
def test_levenshtein_norm_matching(self):
    """Identical strings must have a normalized distance of zero."""
    text = 'abcde'
    distance = levenshtein_norm(text, 'abcde')
    self.assertEqual(distance, 0)
def _count_nld_matches(X_list, Y_list):
    """Count entries of X_list that fuzzily match a distinct entry of Y_list.

    Two non-empty entries match when their normalized Levenshtein distance
    is at most 0.1.  Each entry of ``Y_list`` can be matched once; the
    caller's list is not modified.
    """
    remaining = Y_list.copy()
    common_len = 0
    for x_item in X_list:
        for j in range(len(remaining)):
            if x_item != '' and remaining[j] != '':
                if stringdist.levenshtein_norm(str(x_item),
                                               str(remaining[j])) <= .1:
                    remaining[j] = ''  # consume so it cannot match again
                    common_len += 1
                    break
    return common_len


def evaluateMetadata(folder, golden):
    """Score extracted metadata JSON files against a golden spreadsheet.

    The first sheet of the workbook ``golden`` is read into memory.  For
    rows ``1 .. n`` (``n`` = number of ``.json`` files in ``folder``) the
    file ``<folder>/metadata_<first cell of the row>.json`` is compared with
    the golden row, field by field.  Golden columns used: 50 title,
    16 abstract, 5 editor, 6 illustrator, 22 ISBN, 7 copyright, 57 education
    level, 43 DDC, 14 table of contents.

    Per-file problems in a field (missing key, unexpected cell type, ...)
    are skipped on purpose so that one bad record does not abort the
    evaluation.  For illustrator, ISBN and copyright the file is opened
    outside that guard, so a missing file raises.

    Args:
        folder: Directory containing the ``metadata_*.json`` files.
        golden: Path of the golden workbook.

    Returns:
        A dict with ``matrix1`` (rows title, abstract, editor, illustrator;
        columns precision, NLD precision, recall, NLD recall, F1, NLD F1),
        ``matrix2`` (rows contents, ISBN, copyright, education level, DDC;
        columns precision, recall, F1) and ``error`` set to ``False``.
        "NLD" columns count values whose normalized Levenshtein distance to
        the golden value is at most 0.1.

    Raises:
        ZeroDivisionError: if, for some field, no golden value or no
            extracted value was counted (the ratios are not guarded).
    """
    jsonfiles = [f for f in os.listdir(folder) if f.endswith('.json')]
    # Add 1 for use in range(1, number_of_files): row 0 is skipped.
    number_of_files = len(jsonfiles) + 1
    rb = open_workbook(golden)
    sheet1 = rb.sheet_by_index(0)
    # Golden database as a list of rows.
    database = [[sheet1.cell_value(r, c) for c in range(sheet1.ncols)]
                for r in range(sheet1.nrows)]
    data = ""

    def fbase(index):
        # Golden cell of the row currently being evaluated.
        return database[row][index]

    def fdata(index):
        # Extracted value of the file currently being evaluated.
        return data[index]

    # ---- title
    title = 0
    title_extracted = 0
    title_extracted_correctly = 0
    title_NLD = 0
    for i in range(1, number_of_files):
        row = i
        filename = folder + "/" + "metadata_" + database[i][0] + ".json"
        try:
            with open(filename) as f:
                data = json.load(f)
            try:
                X = fbase(50).lower().strip()
            except Exception:
                X = ""
            try:
                Y = "".join(fdata("dc.title")).lower().strip()
            except Exception:
                Y = ""
            # Replace punctuation by spaces and collapse whitespace.
            for p in string.punctuation + "–" + "—":
                p = "\\" + str(p)
                X = re.sub(p, " ", re.sub(r"\s+", " ", X)).strip()
                Y = re.sub(p, " ", re.sub(r"\s+", " ", Y)).strip()
            if X != "":
                title += 1
            if Y != "":
                title_extracted += 1
            if X != "" and Y != "" and stringdist.levenshtein_norm(
                    str(X).lower(), str(Y).lower()) <= .1:
                title_NLD += 1
            if X != "" and X == Y:
                title_extracted_correctly += 1
        except Exception:
            data = ""
    title_precision = title_extracted_correctly / title_extracted
    NLD_precision = title_NLD / title_extracted
    recall = title_extracted_correctly / title
    NLD_recall = title_NLD / title
    try:
        F1 = 2.0 * title_precision * recall / (title_precision + recall)
    except ZeroDivisionError:
        F1 = 0.0
    try:
        NLD_F1 = 2.0 / (1.0 / NLD_precision + 1.0 / NLD_recall)
    except ZeroDivisionError:
        NLD_F1 = 0.0
    title = [title_precision, NLD_precision, recall, NLD_recall, F1, NLD_F1]

    # ---- abstract
    description = 0
    description_extracted = 0
    description_extracted_correctly = 0
    description_NLD = 0
    for i in range(1, number_of_files):
        row = i
        filename = folder + "/" + "metadata_" + database[i][0] + ".json"
        try:
            with open(filename) as f:
                data = json.load(f)
            X = fbase(16).lower()
            Y = "".join(fdata("dc.description.abstract")).lower()
            for p in string.punctuation:
                p = "\\" + str(p)
                X = re.sub(p, " ", re.sub(r"\s+", " ", X)).strip()
                Y = re.sub(p, " ", re.sub(r"\s+", " ", Y)).strip()
            if X != "":
                description += 1
            if Y != "":
                description_extracted += 1
            if X != "" and stringdist.levenshtein_norm(str(X),
                                                       str(Y)) <= .1:
                description_NLD += 1
            if X != "" and X == Y:
                description_extracted_correctly += 1
        except Exception:
            pass
    description_precision = (description_extracted_correctly
                             / description_extracted)
    NLD_precision = description_NLD / description_extracted
    recall = description_extracted_correctly / description
    NLD_recall = description_NLD / description
    try:
        F1 = 2.0 * description_precision * recall / (description_precision
                                                     + recall)
    except ZeroDivisionError:
        F1 = 0.0
    try:
        NLD_F1 = 2.0 / (1.0 / NLD_precision + 1.0 / NLD_recall)
    except ZeroDivisionError:
        NLD_F1 = 0.0
    abstract = [
        description_precision, NLD_precision, recall, NLD_recall, F1, NLD_F1
    ]

    # ---- editor (values are "||"-separated lists)
    editor = 0
    editor_extracted = 0
    edit_precision = 0.0
    edit_recall = 0.0
    NLD_precision = 0
    NLD_recall = 0
    for i in range(1, number_of_files):
        row = i
        filename = folder + "/" + "metadata_" + database[i][0] + ".json"
        try:
            with open(filename) as f:
                data = json.load(f)
            X = str(fbase(5)).strip()
            if X != "":
                editor += 1
            try:
                Y = "".join(fdata("dc.contributor.editor")).strip()
            except Exception:
                Y = ""
            X_list = X.split("||")
            Y_list = Y.split("||")
            Common = list(set(X_list) & set(Y_list))
            if len(Y_list) != 0:
                edit_precision += len(Common) / len(Y_list)
            if len(X_list) != 0:
                edit_recall += len(Common) / len(X_list)
            # NLD similarity calculation
            common_len = _count_nld_matches(X_list, Y_list)
            if len(Y_list) != 0:
                NLD_precision += common_len / len(Y_list)
            if len(X_list) != 0:
                NLD_recall += common_len / len(X_list)
            if Y != "":
                editor_extracted += 1
        except Exception:
            pass
    edit_precision /= editor_extracted
    edit_recall /= editor
    NLD_precision /= editor_extracted
    NLD_recall /= editor
    try:
        edit_F1 = 2.0 * edit_precision * edit_recall / (edit_precision
                                                        + edit_recall)
    except ZeroDivisionError:
        edit_F1 = 0.0
    try:
        NLD_F1 = 2.0 / (1.0 / NLD_precision + 1.0 / NLD_recall)
    except ZeroDivisionError:
        NLD_F1 = 0.0
    editor = [
        edit_precision, NLD_precision, edit_recall, NLD_recall, edit_F1,
        NLD_F1
    ]

    # ---- illustrator (values are "||"-separated lists)
    illustrator = 0
    illustrator_extracted = 0
    illus_precision = 0.0
    illus_recall = 0.0
    NLD_precision = 0.0
    NLD_recall = 0.0
    for i in range(1, number_of_files):
        row = i
        filename = folder + "/" + "metadata_" + database[i][0] + ".json"
        with open(filename) as f:
            try:
                data = json.load(f)
                X = fbase(6)
                Y = "".join(fdata("dc.contributor.illustrator"))
                X_list = X.split("||")
                Y_list = Y.split("||")
                Common = list(set(X_list) & set(Y_list))
                if len(Y_list) != 0:
                    illus_precision += len(Common) / len(Y_list)
                if len(X_list) != 0:
                    illus_recall += len(Common) / len(X_list)
                # NLD similarity calculation
                common_len = _count_nld_matches(X_list, Y_list)
                if len(Y_list) != 0:
                    NLD_precision += common_len / len(Y_list)
                if len(X_list) != 0:
                    NLD_recall += common_len / len(X_list)
                if X != "":
                    illustrator += 1
                if Y != "":
                    illustrator_extracted += 1
            except Exception:
                pass
    illus_precision /= illustrator_extracted
    illus_recall /= illustrator
    NLD_precision /= illustrator_extracted
    NLD_recall /= illustrator
    try:
        illus_F1 = 2.0 * illus_precision * illus_recall / (illus_precision
                                                           + illus_recall)
    except ZeroDivisionError:
        illus_F1 = 0.0
    try:
        NLD_F1 = 2.0 / (1.0 / NLD_precision + 1.0 / NLD_recall)
    except ZeroDivisionError:
        NLD_F1 = 0.0
    illustrator = [
        illus_precision, NLD_precision, illus_recall, NLD_recall, illus_F1,
        NLD_F1
    ]

    # ---- isbn
    isbn = 0
    isbn_extracted = 0
    isbn_extracted_correctly = 0
    for i in range(1, number_of_files):
        row = i
        filename = folder + "/" + "metadata_" + database[i][0] + ".json"
        with open(filename) as f:
            try:
                data = json.load(f)
                X = database[i][22]
                Y = "".join(fdata("dc.identifier.isbn"))
                if X != "":
                    isbn += 1
                if Y != "":
                    isbn_extracted += 1
                if X != "" and str(int(X)) == str(Y):
                    isbn_extracted_correctly += 1
            except Exception:
                pass
    isbn_precision = isbn_extracted_correctly / isbn_extracted
    recall = isbn_extracted_correctly / isbn
    try:
        F1 = 2.0 * isbn_precision * recall / (isbn_precision + recall)
    except ZeroDivisionError:
        F1 = 0.0
    isbn = [isbn_precision, recall, F1]

    # ---- copyright
    copyright = 0
    copyright_extracted = 0
    copyright_extracted_correctly = 0
    for i in range(1, number_of_files):
        row = i
        filename = folder + "/" + "metadata_" + database[i][0] + ".json"
        with open(filename) as f:
            try:
                data = json.load(f)
                X = int(fbase(7))
                Y = fdata("dc.date.copyright")
                if X != "":
                    copyright += 1
                if Y != "":
                    copyright_extracted += 1
                if X != "" and str(int(X)) == str(Y):
                    copyright_extracted_correctly += 1
            except Exception:
                pass
    copyright_precision = copyright_extracted_correctly / copyright_extracted
    recall = copyright_extracted_correctly / copyright
    try:
        F1 = 2.0 * copyright_precision * recall / (copyright_precision
                                                   + recall)
    except ZeroDivisionError:
        F1 = 0.0
    copyright = [copyright_precision, recall, F1]

    # ---- education level
    Educational_level = 0
    Educational_level_extracted = 0
    Educational_level_extracted_correctly = 0
    for i in range(1, number_of_files):
        row = i
        filename = folder + "/" + "metadata_" + database[i][0] + ".json"
        try:
            with open(filename) as f:
                data = json.load(f)
            X = fbase(57).lstrip('"')
            Y = fdata("dcterm.educationlevel")
            X = str(X).lower()
            Y = str(Y).lower()
            if X != "":
                Educational_level += 1
            if Y != "":
                Educational_level_extracted += 1
            if X is not None and X == Y:
                Educational_level_extracted_correctly += 1
        except Exception:
            pass
    Educational_level_precision = (Educational_level_extracted_correctly
                                   / Educational_level_extracted)
    recall = Educational_level_extracted_correctly / Educational_level
    try:
        F1 = 2.0 * Educational_level_precision * recall / (
            Educational_level_precision + recall)
    except ZeroDivisionError:
        F1 = 0.0
    education_level = [Educational_level_precision, recall, F1]

    # ---- DDC (only the level-1 classes are compared)
    DDC = 0
    DDC_extracted = 0
    ddc_precision = 0.0
    ddc_recall = 0.0
    for i in range(1, number_of_files):
        row = i
        filename = folder + "/" + "metadata_" + database[i][0] + ".json"
        try:
            with open(filename, 'r') as f:
                data = json.load(f)
            X = fbase(43).lstrip('"')
            Y = fdata("dc.subject.ddc")
            X = str(X).lower()
            Y = str(Y).lower()
            X = re.findall("level1.*?{.*?}", X)[0]  # take only level 1 DDC
            Y = re.findall("level1.*?{.*?}", Y)[0]  # take only level 1 DDC
            X_list = re.findall(r"\d00", X)
            Y_list = re.findall(r"\d00", Y)
            Common = list(set(X_list) & set(Y_list))
            if len(Y_list) != 0:
                ddc_precision += len(Common) / len(Y_list)
            if len(X_list) != 0:
                ddc_recall += len(Common) / len(X_list)
            if len(X_list) != 0:
                DDC += 1
            if len(Y_list) != 0:
                DDC_extracted += 1
        except Exception:
            pass
    ddc_precision /= DDC_extracted
    ddc_recall /= DDC
    try:
        ddc_F1 = 2.0 / (1.0 / ddc_precision + 1.0 / ddc_recall)
    except ZeroDivisionError:
        ddc_F1 = 0.0
    ddc = [ddc_precision, ddc_recall, ddc_F1]

    # ---- contents (comma-separated table of contents)
    content = 0
    content_extracted = 0
    content_extracted_correctly = 0
    cont_precision = 0.0
    cont_recall = 0.0
    for i in range(1, number_of_files):
        row = i
        filename = folder + "/" + "metadata_" + database[i][0] + ".json"
        try:
            with open(filename) as f:
                data = json.load(f)
            X = fbase(14)
            Y = fdata("dc.description.toc")
            X = str(X).lower()
            Y = str(Y).lower()
            X = re.sub(r"x\d+", "", X)
            for p in string.punctuation + "–" + "—" + "‘":
                # The comma is the entry separator and must survive.
                if p != ",":
                    p = "\\" + str(p)
                    X = re.sub(p, " ", str(X).lower().strip())
                    Y = re.sub(p, " ", str(Y).lower().strip())
            X = re.sub(r"\s+", " ", X)
            Y = re.sub(r"\s+", " ", Y)
            X_list = [elem.strip() for elem in X.split(',')]
            Y_list = [elem.strip() for elem in Y.split(',')]
            Common = list(set(X_list) & set(Y_list))
            if Y != '':
                cont_precision += len(Common) / len(Y_list)
            if X != '':
                cont_recall += len(Common) / len(X_list)
            if X != '':
                content += 1
            if Y != '':
                content_extracted += 1
            if X is not None and str(X) == str(Y):
                content_extracted_correctly += 1
        except Exception:
            pass
    cont_precision /= content_extracted
    cont_recall /= content
    try:
        cont_F1 = 2.0 * cont_precision * cont_recall / (cont_precision
                                                        + cont_recall)
    except ZeroDivisionError:
        cont_F1 = 0.0
    content = [cont_precision, cont_recall, cont_F1]

    # The np.float alias no longer exists in current NumPy releases; the
    # builtin float selects the same float64 dtype.
    perfMatrix1 = np.array([title, abstract, editor, illustrator], float)
    perfMatrix2 = np.array([content, isbn, copyright, education_level, ddc],
                           float)

    # Return the matrices
    d = dict()
    d['matrix1'] = perfMatrix1
    d['matrix2'] = perfMatrix2
    d['error'] = False
    return d
def test_levenshtein_norm_substitution(self):
    """One substitution in five characters must give a distance of 0.2."""
    source, target = 'abcd!', 'abcde'
    distance = levenshtein_norm(source, target)
    self.assertEqual(distance, 0.2)