def matcher(self, source, target, matrix, i, j):
    from similarity.ngram import NGram
    twogram = NGram(2)
    sim_score = 1 - twogram.distance(source, target)
    matrix[i, j] = sim_score
    return matrix
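# Usage sketch (hypothetical driver, not from the original source): `self` is
# unused above, so None can stand in for it while filling a score matrix
# cell by cell.
if __name__ == '__main__':
    import numpy as np

    sources = ['name', 'address']
    targets = ['full_name', 'addr']
    scores = np.zeros((len(sources), len(targets)))
    for i, s in enumerate(sources):
        for j, t in enumerate(targets):
            scores = matcher(None, s, t, scores, i, j)
    print(scores)  # each cell is 1 - bigram distance, a similarity in [0, 1]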
def test_similarity():
    from similarity.ngram import NGram
    twogram = NGram(2)
    print(twogram.distance('ABCD', 'ABTUIO'))
    s1 = 'Adobe CreativeSuite 5 Master Collection from cheap 4zp'
    s2 = 'Adobe CreativeSuite 5 Master Collection from cheap d1x'
    fourgram = NGram(4)
    print(fourgram.distance(s1, s2))
    # Similarity is the complement of the normalized n-gram distance.
    print(1 - fourgram.distance(s1, s2))
from similarity.ngram import NGram


def compute_similarity_ngram(word1, word2, n):
    # Note: despite the name, this returns the n-gram *distance*
    # (0 = identical, 1 = maximally different), as in the original.
    ngram = NGram(n)
    sim = ngram.distance(word1, word2)
    return sim
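# Quick check (hypothetical words, not from the original source): since the
# function returns a distance, identical words score 0.0.
if __name__ == '__main__':
    print(compute_similarity_ngram('match', 'match', 2))  # 0.0
    print(compute_similarity_ngram('match', 'mutch', 2))  # > 0.0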
DEBUG_MODE = False

import math

import numpy as np
import pandas as pd

from similarity.ngram import NGram

twogram = NGram(2)


def matcher_name(src, tar, function):
    sim_score = 1 - function.distance(src, tar)
    return sim_score


def matcher_name_matrix(srcs, tars, function=twogram):
    sim_matrix = np.zeros((len(srcs), len(tars)))
    for i, s in enumerate(srcs):
        for j, t in enumerate(tars):
            sim_score = 1 - function.distance(s, t)
            sim_matrix[i, j] += sim_score
    sim_scores = pd.DataFrame(data=sim_matrix, columns=tars, index=srcs)
    return sim_scores


def sigmoid(x):
    # The body was cut off in the original; this is the standard logistic sigmoid.
    return 1 / (1 + math.exp(-x))
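# Usage sketch for matcher_name_matrix() (hypothetical attribute names, not
# from the original source): rows are labelled with sources, columns with
# targets, so the result can be inspected directly.
if __name__ == '__main__':
    demo = matcher_name_matrix(['customer_id', 'birth_date'],
                               ['cust_id', 'dob', 'email'])
    print(demo.round(2))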
def similarity(self, question, answer):
    # Load the stopword list (上证专用停用词.txt, an SSE-specific stopword
    # file), strip whitespace, then compare the cleaned strings under a
    # battery of string-similarity measures.
    stopwords = []
    for sw in self.read_from(folder_path + '上证专用停用词.txt'):
        stopwords.append(sw.strip('\n').strip(' '))

    meaningful_words1 = [w for w in jieba.cut(str(question)) if w not in stopwords]
    meaningful_words2 = [w for w in jieba.cut(str(answer)) if w not in stopwords]
    s2 = ''.join(meaningful_words1)
    s3 = ''.join(meaningful_words2)

    a1 = Cosine(1)
    b1 = Damerau()
    c1 = Jaccard(1)
    d1 = JaroWinkler()
    e1 = Levenshtein()
    f1 = LongestCommonSubsequence()
    g1 = MetricLCS()
    h1 = NGram(2)
    i1 = NormalizedLevenshtein()
    j1 = OptimalStringAlignment()
    k1 = QGram(1)
    l1 = SorensenDice(2)
    m1 = WeightedLevenshtein(character_substitution=CharSub())

    # One feature per measure, in a fixed order (distances and similarities mixed).
    line_sim = [
        a1.similarity(s2, s3), a1.distance(s2, s3),
        b1.distance(s2, s3),
        c1.distance(s2, s3), c1.similarity(s2, s3),
        d1.distance(s2, s3), d1.similarity(s2, s3),
        e1.distance(s2, s3),
        f1.distance(s2, s3),
        g1.distance(s2, s3),
        h1.distance(s2, s3),
        i1.distance(s2, s3), i1.similarity(s2, s3),
        j1.distance(s2, s3),
        k1.distance(s2, s3),
        l1.distance(s2, s3), l1.similarity(s2, s3),
        m1.distance(s2, s3),
    ]
    return line_sim
# The measures below come from textdistance; the last section uses strsim
# (python-string-similarity).
from textdistance import (Hamming, MLIPNS, JaroWinkler, Jaro, Jaccard,
                          Sorensen, Tversky, Overlap, Cosine,
                          RatcliffObershelp, EntropyNCD, BZ2NCD, LZMANCD,
                          ZLIBNCD, Prefix, Postfix, StrCmp95)
from similarity.normalized_levenshtein import NormalizedLevenshtein
from similarity.metric_lcs import MetricLCS
from similarity.ngram import NGram


def main():
    v1 = 'text'
    v2 = 'text'

    # ------------------------------- Edit based -------------------------------
    print("-------------------------------- Edit based ----------------------------------")
    print("------- HAMMING ---------")
    ed = Hamming()
    # The return value is a float between 0 and 1, where 0 means totally
    # different and 1 means equal.
    print("Hamming Similarity: ", ed.normalized_similarity(v1, v2))

    print("\n-------- MLIPNS --------")
    ed = MLIPNS()
    print("MLIPNS similarity: ", ed.similarity(v1, v2))

    print("\n-------- JaroWinkler --------")
    ed = JaroWinkler()
    print("JaroWinkler similarity: ", ed.similarity(v1, v2))

    print("\n-------- Jaro --------")
    ed = Jaro()
    print("Jaro similarity: ", ed.similarity(v1, v2))

    # ------------------------------- Token based ------------------------------
    print("-------------------------------- Token based ----------------------------------")
    print("\n-------- JACCARD --------")
    ed = Jaccard()
    print("JACCARD similarity: ", ed.similarity(v1, v2))  # takes letter counts into account

    print("\n-------- Sorensen --------")
    ed = Sorensen()
    print("Sorensen similarity: ", ed.similarity(v1, v2))

    print("\n-------- Tversky --------")
    ed = Tversky()
    print("Tversky similarity: ", ed.similarity(v1, v2))

    print("\n-------- Overlap --------")
    ed = Overlap()
    print("Overlap similarity: ", ed.similarity(v1, v2))

    print("\n-------- Cosine --------")
    ed = Cosine()
    print("Cosine similarity: ", ed.similarity(v1, v2))

    # ------------------------------ Sequence based ----------------------------
    print("-------------------------------- Sequence based ----------------------------------")
    print("\n-------- RatcliffObershelp --------")
    ed = RatcliffObershelp()
    print("RatcliffObershelp similarity: ", ed.similarity(v1, v2))

    # ---------------------------- Compression based ---------------------------
    print("-------------------------------- Compression based ----------------------------------")
    print("\n-------- EntropyNCD --------")
    ed = EntropyNCD()
    print("EntropyNCD similarity: ", ed.similarity(v1, v2))

    print("\n-------- BZ2NCD --------")
    ed = BZ2NCD()
    print("BZ2NCD similarity: ", ed.similarity(v1, v2))

    print("\n-------- LZMANCD --------")
    ed = LZMANCD()
    print("LZMANCD similarity: ", ed.similarity(v1, v2))

    print("\n-------- ZLIBNCD --------")
    ed = ZLIBNCD()
    print("ZLIBNCD similarity: ", ed.similarity(v1, v2))

    # ------------------------------- Simple based -----------------------------
    print("-------------------------------- Simple based ----------------------------------")
    print("\n-------- Prefix --------")
    ed = Prefix()
    print("Prefix similarity: ", ed.similarity(v1, v2))

    print("\n-------- Postfix --------")
    ed = Postfix()
    print("Postfix similarity: ", ed.similarity(v1, v2))

    # ----------------------------- strsim functions ---------------------------
    print("-------------------------------- strsim function ----------------------------------")
    print("\n-------- Normalized Levenshtein --------")
    ed = NormalizedLevenshtein()
    print("Normalized Levenshtein similarity: ", ed.similarity(v1, v2))

    print("\n-------- MetricLCS --------")
    ed = MetricLCS()
    # Note: MetricLCS and NGram expose distance(), not similarity().
    print("MetricLCS similarity: ", ed.distance(v1, v2))

    print("\n-------- NGram --------")
    ed = NGram()
    print("NGram similarity: ", ed.distance(v1, v2))

    print("\n-------- Sorensen --------")
    ed = Sorensen()
    print("Sorensen similarity: ", ed.similarity(v1, v2))
# Dispatch table for the measures exercised in main(). Note that 'MetricLCS',
# 'NGram', and 'StrCmp95' return a distance rather than a similarity,
# mirroring the original if/elif chain.
_MEASURES = {
    'Hamming': lambda a, b: Hamming().normalized_similarity(a, b),
    'MLIPNS': lambda a, b: MLIPNS().similarity(a, b),
    'JaroWinkler': lambda a, b: JaroWinkler().similarity(a, b),
    'Jaro': lambda a, b: Jaro().similarity(a, b),
    'Jaccard': lambda a, b: Jaccard().similarity(a, b),
    'Sorensen': lambda a, b: Sorensen().similarity(a, b),
    'Tversky': lambda a, b: Tversky().similarity(a, b),
    'Overlap': lambda a, b: Overlap().similarity(a, b),
    'Cosine': lambda a, b: Cosine().similarity(a, b),
    'RatcliffObershelp': lambda a, b: RatcliffObershelp().similarity(a, b),
    'EntropyNCD': lambda a, b: EntropyNCD().similarity(a, b),
    'BZ2NCD': lambda a, b: BZ2NCD().similarity(a, b),
    'LZMANCD': lambda a, b: LZMANCD().similarity(a, b),
    'ZLIBNCD': lambda a, b: ZLIBNCD().similarity(a, b),
    'Prefix': lambda a, b: Prefix().similarity(a, b),
    'Postfix': lambda a, b: Postfix().similarity(a, b),
    'NormalizedLevenshtein': lambda a, b: NormalizedLevenshtein().similarity(a, b),
    'MetricLCS': lambda a, b: MetricLCS().distance(a, b),
    'NGram': lambda a, b: NGram().distance(a, b),
    'StrCmp95': lambda a, b: StrCmp95().distance(a, b),
}


def similaridade(function_name, string_1, string_2):
    fn = _MEASURES.get(function_name)
    # As in the original, an unknown name falls through and returns None.
    return fn(string_1, string_2) if fn else None
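# Usage sketch (hypothetical string pair, not from the original source):
# sweep a few measure names over one pair of words.
if __name__ == '__main__':
    for name in ('Hamming', 'JaroWinkler', 'Jaccard', 'NGram'):
        print(name, similaridade(name, 'night', 'nacht'))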
import numpy as np

from difflib import SequenceMatcher
from similarity.ngram import NGram
from similarity.metric_lcs import MetricLCS

twogram = NGram(2)
fourgram = NGram(4)
metric_lcs = MetricLCS()


def build_local_similarity_matrix(source_schema, target_schema):
    matrix = np.zeros((len(source_schema), len(target_schema)))
    for i in range(len(source_schema)):
        for j in range(len(target_schema)):
            # TODO call matcher
            sim_score = 1 - twogram.distance(source_schema[i], target_schema[j])
            # Alternative: SequenceMatcher ratio scaled to 0-100:
            # matrix[i, j] = np.int(100 * SequenceMatcher(None, source_schema[i], target_schema[j]).ratio())
            matrix[i, j] = sim_score
    return matrix


DEBUG_MODE = False  # TODO <=
def pre_clustering(stats_path, source, target, instance_matching_output):
    twogram = NGram(2)
    threshold = 0.7
    weights = [0.5, 0.5]  # [instance-value similarity, attribute-name similarity]
    for src_table in source:
        src_path = stats_path + src_table + '.json'
        with open(src_path) as f:
            src_data = json.load(f)
        src_attrs = list(src_data.keys())
        for tar_table in target:
            print('-----')
            print(src_table, tar_table)
            tar_path = stats_path + tar_table + '.json'
            with open(tar_path) as f:
                tar_data = json.load(f)
            tar_attrs = list(tar_data.keys())
            sim_matrix = np.zeros((len(src_data), len(tar_data)))
            for i in range(len(src_attrs)):
                src_vals = src_data[src_attrs[i]]
                src_datatype = find_datatype(src_vals)
                for j in range(len(tar_attrs)):
                    tar_vals = tar_data[tar_attrs[j]]
                    tar_datatype = find_datatype(tar_vals)
                    print(src_attrs[i], tar_attrs[j])
                    if src_datatype == 'str' and tar_datatype == 'str':
                        n_a, n_b, D, n_D, t, n_t = dh.compute_sets(
                            src_vals, tar_vals, threshold, matcher_name, twogram)
                        U_set = dh.cdf(n_t, n_a, n_b, n_D)
                    else:
                        U_set = 0.0
                    name_sim = matcher_name(src_attrs[i], tar_attrs[j], twogram)
                    print(U_set, name_sim)
                    if U_set > 1.0:
                        U_set = 1.0
                    # Weighted combination of instance and name similarity.
                    sim_matrix[i, j] = U_set * weights[0] + name_sim * weights[1]
            df_sim_matrix = pd.DataFrame(data=sim_matrix, columns=tar_attrs,
                                         index=src_attrs)
            filename = instance_matching_output + src_table + '/'
            if not os.path.exists(filename):
                os.makedirs(filename)
            filename += '%s||%s.csv' % (src_table, tar_table)
            df_sim_matrix.to_csv(filename, sep=',', encoding='utf-8')
            msg = 'Matrix saved for src=%s tar=%s to %s' % (
                src_table, tar_table, filename)
            logging.info(msg)
    return
def select_datasources():
    import os
    import json

    from similarity.ngram import NGram

    # Normalize metadata filenames into lookup keys: drop the first and last
    # dot/dash-separated tokens and lower-case the rest.
    ls = os.listdir('./metadata')
    ls_dict = {}
    for st in ls:
        st = st.replace('-', ' ')
        st = st.replace('.', ' ')
        st = st.split(' ')
        st = st[1:-1]
        st = ' '.join(st)
        st = st.lower()
        ls_dict[st] = {'csv': [], 'json': []}

    twogram = NGram(2)
    metadata_sources = ls_dict.keys()
    for root, dirs, files in os.walk("../thesis_project_dataset"):
        curr_dir_path = root.split("/")
        curr_dir_name = curr_dir_path[-1]
        for file in files:
            filename, file_extension = os.path.splitext(file)
            dataset = root.split('/')
            dataset = dataset[2:3]
            if len(dataset) != 0 and dataset[0] != '.git':
                dataset = dataset[0]
                dataset = dataset.replace('-', ' ')
                found = False
                found_val = None
                curr_score = 0
                found_datasource = None
                # Exact key match wins with a perfect score.
                if dataset in ls_dict:
                    found = True
                if found:
                    dataset_collection = ls_dict[dataset]
                    found_val = dataset_collection
                    found_datasource = dataset
                    curr_score = 1
                if not found:
                    # Otherwise fall back to fuzzy matching on bigram similarity.
                    curr_score = 0
                    for metadata_source in metadata_sources:
                        dist = 1 - twogram.distance(dataset, metadata_source)
                        if dist < 0.85:
                            print('skip', root + '/' + file)
                            continue
                        if dist > curr_score:
                            found = True
                            found_val = ls_dict[metadata_source]
                            curr_score = dist
                            found_datasource = metadata_source
                print('found', found, found_datasource, curr_score,
                      file_extension, root + '/' + file)
                if not found:
                    continue
                if file_extension == '.json':
                    found_val['json'].append((root + '/' + file, curr_score))
                if file_extension == '.csv':
                    found_val['csv'].append((root + '/' + file, curr_score))

    print(ls_dict)
    for key in ls_dict:
        val = ls_dict[key]
        val['csv'] = sorted(val['csv'], key=lambda x: x[1])
        val['json'] = sorted(val['json'], key=lambda x: x[1])
    with open('datasource_and_metadata.json', 'w') as fp:
        json.dump(ls_dict, fp, sort_keys=True, indent=2)
from pathlib import Path
import csv

import pandas as pd
from sklearn import metrics
from similarity.ngram import NGram  # this import was missing in the original

cpath = open(
    "/Users/shwetha/Desktop/Desktop/2019S2-COMP90049_proj1-data/candidates.txt",
    "r")
dpath = open(
    "/Users/shwetha/Desktop/Desktop/2019S2-COMP90049_proj1-data/dict.txt", "r")
bpath = open(
    "/Users/shwetha/Desktop/Desktop/2019S2-COMP90049_proj1-data/blends_org.txt",
    "r")

wordscheck = []
twogram = NGram(2)

candidates = csv.reader(cpath, dialect="excel")
dictionary = csv.reader(dpath, dialect="excel")
blends = pd.read_table(bpath, names=("blends", "w1", "w2"))
# Notebook-style inspection calls; they have no effect in a plain script.
blends.head()
blends.tail()
blends.head(20)
dictwords = list(dictionary)
# Note: iterating a DataFrame yields its column labels, so this produces
# ['blends', 'w1', 'w2'] rather than the blend words themselves.
blendwords = list(blends)
def test_instance_matching():
    import numpy as np
    import pandas as pd

    # First row of each nested list is the header; the rest are values.
    tar = [['attr1', 'attr2', 'attr3'], ['aaaa', 'bbb', 'ccc'],
           ['xxx', 'yyyy', 'zzz']]
    src = [['attr4'], ['xxx'], ['aaa'], ['mmm']]
    data_tar = np.array([np.array(xi) for xi in tar])
    df_tar = pd.DataFrame(data=data_tar[1:, 0:], columns=data_tar[0, 0:])
    data_src = np.array([np.array(xi) for xi in src])
    df_src = pd.DataFrame(data=data_src[1:, 0:], columns=data_src[0, 0:])
    print(df_tar.to_string())
    print(df_src.to_string())

    schema_tar = list(df_tar.columns.values)
    schema_src = list(df_src.columns.values)
    print(schema_tar)
    print(schema_src)

    src_values = []
    tar_values = []
    src_val_len = 0
    tar_val_len = 0
    for attr in schema_src:
        src_values.extend(list(df_src[attr]))
        src_val_len = len(list(df_src[attr]))
    for attr in schema_tar:
        tar_values.extend(list(df_tar[attr]))
        tar_val_len = len(list(df_tar[attr]))

    from similarity.ngram import NGram
    twogram = NGram(2)
    match_threshold = 0.6
    sim_matrix = np.zeros((len(schema_src), len(schema_tar)))
    for i in range(len(src_values)):
        src_value = src_values[i]
        src_ind = i // src_val_len  # map flat value index back to its attribute
        src_attr = schema_src[src_ind]
        for j in range(len(tar_values)):
            tar_value = tar_values[j]
            tar_ind = j // tar_val_len
            tar_attr = schema_tar[tar_ind]
            sim_score = 1 - twogram.distance(str(src_value), str(tar_value))
            if str(src_value) == 'None' or str(tar_value) == 'None':
                sim_score = 0
            if sim_score > match_threshold:
                sim_matrix[src_ind, tar_ind] += sim_score
                print('sim_score > ', match_threshold, ': ', src_attr,
                      tar_attr, src_value, tar_value, sim_score)
    df_sim_matrix = pd.DataFrame(data=sim_matrix, columns=schema_tar,
                                 index=schema_src)
    print(df_sim_matrix.to_string())
def fourgram(self, s0, s1):
    # The original constructed NGram(3) here; NGram(4) matches the
    # function's name and the comment below.
    fourgram = NGram(4)
    # print('Fourgram similarity "%s" vs "%s"' % (s0, s1))
    return 1 - fourgram.distance(s0, s1)
from similarity.levenshtein import Levenshtein
from similarity.normalized_levenshtein import NormalizedLevenshtein
from similarity.damerau import Damerau
from similarity.optimal_string_alignment import OptimalStringAlignment
from similarity.jarowinkler import JaroWinkler
from similarity.longest_common_subsequence import LongestCommonSubsequence
from similarity.metric_lcs import MetricLCS
from similarity.ngram import NGram
from similarity.qgram import QGram
from similarity.cosine import Cosine
from similarity.jaccard import Jaccard
from similarity.sorensen_dice import SorensenDice
from scipy.spatial.distance import euclidean, cosine, cityblock
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
import numpy as np

# Initialized at import time
levenshtein = Levenshtein()
norm_levenshtein = NormalizedLevenshtein()
damerau = Damerau()
optimal_string_alignment = OptimalStringAlignment()
jarowinkler = JaroWinkler()
lcs = LongestCommonSubsequence()
metric_lcs = MetricLCS()
ngram = NGram()
qgram = QGram()
dice = SorensenDice()
cos = Cosine(5)
jaccard = Jaccard(5)

# Distances are flipped to similarities so every entry scores in [0, 1].
similarity_functions = [
    norm_levenshtein.similarity,
    lambda a, b: 1 - metric_lcs.distance(a, b),
    lambda a, b: 1 - ngram.distance(a, b),
    cos.similarity,
    dice.similarity,
]


def mono_vector0(tup1, tup2):
    str1 = ' '.join(tup1).lower()
    str2 = ' '.join(tup2).lower()
    # The body was truncated in the original; a plausible reconstruction is
    # to apply every configured measure to the joined, lower-cased tuples.
    return [sim(str1, str2) for sim in similarity_functions]
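# Usage sketch (hypothetical tuples, not from the original source): each call
# returns one value per entry of similarity_functions, all in [0, 1].
if __name__ == '__main__':
    print(mono_vector0(('John', 'Smith'), ('Jon', 'Smyth')))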