def RatingSorensen(revista):
    # Get the path to the data file shipped next to this module
    dirname = os.path.dirname(__file__)
    loc = os.path.join(dirname, 'JCR2018.xlsx')
    # Initialize the reader
    workbook = xlrd.open_workbook(loc)
    sheet = workbook.sheet_by_index(0)
    tuplas = []
    start_time = time()
    for i in range(sheet.nrows):
        valor = (sheet.cell_value(i, 1), sorensen(revista, sheet.cell_value(i, 1)))
        tuplas.append(valor)
    final_time = time()
    execution_time = round(final_time - start_time, 2)
    # Sort by Sorensen distance, ascending, and keep the closest matches
    tuplas.sort(key=lambda revista: revista[1])
    top_5 = tuplas[:5]
    result = (top_5, execution_time)
    return result
def compute_similarity(X):
    """
    Compute the similarity matrix as the mean of three distances
    :param X: List of contracts ssdeep hashes
    :return: Similarity matrix
    """
    jaccard_matrix = pdist(X, lambda x, y: distance.jaccard(x[0], y[0]))
    np.savetxt("../data/jaccard_matrix.csv",
               np.asarray(squareform(jaccard_matrix)), delimiter=",")

    sorensen_matrix = pdist(X, lambda x, y: distance.sorensen(x[0], y[0]))
    np.savetxt("../data/sorensen_matrix.csv",
               np.asarray(squareform(sorensen_matrix)), delimiter=",")

    # normalized, so that the results can be meaningfully compared
    # method=1 means the shortest alignment between the sequences is taken as factor
    levenshtein_matrix = pdist(
        X, lambda x, y: distance.nlevenshtein(x[0], y[0], method=1))
    np.savetxt("../data/levenshtein_matrix.csv",
               np.asarray(squareform(levenshtein_matrix)), delimiter=",")

    mean_matrix = 1 - np.mean(np.array(
        [jaccard_matrix, sorensen_matrix, levenshtein_matrix]), axis=0)
    np.savetxt("../data/similarity_matrix.csv",
               np.asarray(mean_matrix), delimiter=",")
    print("Similarity matrix computed.")
    return mean_matrix
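# A hypothetical driver for compute_similarity above. Each hash is wrapped in a
# single-element list because the pdist callbacks index x[0]; this sketch assumes
# a scipy version whose pdist hands rows straight to a callable metric, and that
# the ../data output directory exists. The hash strings are made up.
hashes = ["3:abcdef:ghijk", "3:abcdxf:ghijk", "3:zzzzzz:yyyyy"]
sim = compute_similarity([[h] for h in hashes])
print(sim)  # condensed vector with one similarity value per pair of hashes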
def sorensen_plus(a: str, b: str) -> float:
    length = min(len(a), len(b))
    ng = [
        distance.sorensen(ngrams(a, n), ngrams(b, n))
        for n in range(1, length + 1)
    ]
    return 1 - np.sum(ng) / length
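# sorensen_plus above relies on an ngrams helper that is not shown. A minimal
# stand-in (hypothetical; the original may return lists rather than sets) is the
# set of contiguous character n-grams:
def ngrams(s: str, n: int) -> set:
    return {s[i:i + n] for i in range(len(s) - n + 1)}

# With that helper, identical strings score 1.0 (every per-n distance is 0),
# e.g. sorensen_plus("night", "night") -> 1.0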
def title_similarity_np(row1, row2, method="difflib"):
    if method.lower() == "levenshtein":
        return 1 - distance.nlevenshtein(row1[1], row2[1], method=1)
    if method.lower() == "sorensen":
        return 1 - distance.sorensen(row1[1], row2[1])
    if method.lower() == "jaccard":
        return 1 - distance.jaccard(row1[1], row2[1])
    return difflib.SequenceMatcher(None, row1[1], row2[1]).quick_ratio()
def compare_ocr_strings_sorensen(ocr_string1, ocr_string2):
    """
    Sorensen distance between two OCR result strings
    :param ocr_string1: first string to compare
    :param ocr_string2: second string to compare
    :return: Sorensen distance in [0, 1]
    """
    result = distpkg.sorensen(ocr_string1, ocr_string2)
    return result
def extract_basic_distance_feat(df):
    ## jaccard coef/dice dist of n-gram
    print("generate jaccard coef and dice dist for n-gram")
    dists = ["jaccard_coef", "dice_dist"]
    grams = ["unigram", "bigram", "trigram"]
    feat_names = ["origsent", "candsent"]
    for stem in ["", "_stem"]:
        for dist in dists:
            for gram in grams:
                for i in range(len(feat_names) - 1):
                    for j in range(i + 1, len(feat_names)):
                        target_name = feat_names[i]
                        obs_name = feat_names[j]
                        df["%s_of_%s_between_%s_%s%s" % (dist, gram, target_name, obs_name, stem)] = list(
                            df.apply(lambda x: compute_dist(
                                x[target_name + "_" + gram + stem],
                                x[obs_name + "_" + gram + stem], dist),
                                axis=1))

    print("generate all remaining features")
    gram_ext = [
        "_unigram", "_bigram", "_trigram",
        "_char_unigram", "_char_bigram", "_char_trigram"
    ]
    for stem in ["", "_stem"]:
        for gram in gram_ext:
            df["levenshtein_%s%s" % (gram, stem)] = list(
                df.apply(lambda x: distance.nlevenshtein(
                    x["origsent" + gram + stem],
                    x["candsent" + gram + stem], method=2), axis=1))
            df["sorensen_%s%s" % (gram, stem)] = list(
                df.apply(lambda x: distance.sorensen(
                    x["origsent" + gram + stem],
                    x["candsent" + gram + stem]), axis=1))
            df["cosine_%s%s" % (gram, stem)] = list(
                df.apply(lambda x: cosine(
                    x["origsent" + gram + stem],
                    x["candsent" + gram + stem]), axis=1))
            df["precision_%s%s" % (gram, stem)] = list(
                df.apply(lambda x: precision_recall(
                    x["origsent" + gram + stem],
                    x["candsent" + gram + stem],
                    x["origsent" + gram + stem]), axis=1))
            df["recall1gram_%s%s" % (gram, stem)] = list(
                df.apply(lambda x: precision_recall(
                    x["origsent" + gram + stem],
                    x["candsent" + gram + stem],
                    x["candsent" + gram + stem]), axis=1))
            df["f1gram_%s%s" % (gram, stem)] = list(
                df.apply(lambda x: fmeasure(
                    x["precision_%s%s" % (gram, stem)],
                    x["recall1gram_%s%s" % (gram, stem)]), axis=1))
def similarity_string(a, b, measure):
    a = a.lower()
    b = b.lower()
    measure = measure.lower()
    if measure == "matcher":
        return SequenceMatcher(None, a, b).ratio()
    elif measure == "sorensen":
        return 1 - distance.sorensen(a, b)
    else:
        return 0
def title_similarity_pd(row, method='difflib'):
    if method.lower() == "levenshtein":
        return 1 - distance.nlevenshtein(row["title"], row["title_R"], method=1)
    if method.lower() == "sorensen":
        return 1 - distance.sorensen(row["title"], row["title_R"])
    if method.lower() == "jaccard":
        return 1 - distance.jaccard(row["title"], row["title_R"])
    return difflib.SequenceMatcher(None, row["title"], row["title_R"]).quick_ratio()
def findPizza(pizzaType):
    lowestScore = 1
    match = ''
    for pizza in PIZZAS:
        score = sorensen(pizza.lower(), pizzaType.lower())
        # print(pizza, score)
        if score < lowestScore:
            lowestScore = score
            match = pizza
    return match
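# Hypothetical menu for findPizza above; sorensen returns a distance in [0, 1],
# so the pizza with the lowest score is the closest character-level match.
PIZZAS = ["Margherita", "Pepperoni", "Quattro Formaggi", "Hawaiian"]
print(findPizza("peperoni"))  # -> "Pepperoni"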
def get_features(raw_data):
    fet_data = pd.DataFrame()
    print("extracting count features...")
    fet_data["q_len"] = raw_data["query"].map(word_len)
    fet_data["t_len"] = raw_data["product_title"].map(word_len)
    fet_data["d_len"] = raw_data["product_description"].map(word_len)

    print("extracting basic distance features from q and t...")
    fet_data["nleven1"] = raw_data.apply(
        lambda x: distance.nlevenshtein(x.q, x.t, method=1), axis=1)
    fet_data["nleven2"] = raw_data.apply(
        lambda x: distance.nlevenshtein(x.q, x.t, method=2), axis=1)
    fet_data["sorensen"] = raw_data.apply(
        lambda x: distance.sorensen(x.q, x.t), axis=1)
    fet_data["jaccard"] = raw_data.apply(
        lambda x: distance.jaccard(x.q, x.t), axis=1)
    fet_data["ncd"] = raw_data.apply(lambda x: ncd(x.q, x.t), axis=1)

    print("extracting basic distance features from q_ex and t...")
    fet_data["sorensen_ex"] = raw_data.apply(
        lambda x: distance.sorensen(get_uniq_words_text(x.q_ex), x.t), axis=1)
    fet_data["jaccard_ex"] = raw_data.apply(
        lambda x: distance.jaccard(get_uniq_words_text(x.q_ex), x.t), axis=1)
    fet_data["ncd_ex"] = raw_data.apply(
        lambda x: ncd(get_uniq_words_text(x.q_ex), x.t), axis=1)
    return fet_data
def calculate_distance_numeric(u1, u2, d_type, weights):
    # if the data is preprocessed and all fields are converted to numeric
    return {
        "jaccard": distance.jaccard(u1, u2),
        "euclidean": sqrt(sum(
            pow((1 / w) * (a - b), 2)
            for a, b, w in zip(u1, u2, weights))),
        "cosine": spatial.distance.cosine(u1, u2),
        "sorensen": distance.sorensen(u1, u2),
        "hamming": distance.hamming(u1, u2, normalized=True)
    }[d_type]
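# Hypothetical call into calculate_distance_numeric: every metric in the dict
# literal is computed eagerly before the lookup, so only equal-length numeric
# vectors are safe to pass; weights are used by the "euclidean" branch only.
u1, u2 = [1.0, 0.5, 0.2], [0.9, 0.4, 0.6]
print(calculate_distance_numeric(u1, u2, "euclidean", weights=[1.0, 2.0, 1.0]))
print(calculate_distance_numeric(u1, u2, "sorensen", weights=[1.0, 2.0, 1.0]))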
def similarity_sentence_ngram(self, s1, s2):
    ng1 = self.init_list_of_objects(min(len(s1.split()) + 1, self.max_ngrams) - 2)
    ng2 = self.init_list_of_objects(min(len(s2.split()) + 1, self.max_ngrams) - 2)
    # Collect the word n-grams of each sentence, for n = 2 .. max_ngrams - 1
    for j in range(2, min(len(s1.split()) + 1, self.max_ngrams)):
        for ngram in ngrams(s1.split(), j):
            ng1[j - 2].append(ngram)
    for j in range(2, min(len(s2.split()) + 1, self.max_ngrams)):
        for ngram in ngrams(s2.split(), j):
            ng2[j - 2].append(ngram)
    # Average the Sorensen distance over aligned n-grams at each order
    total = 0
    for j in range(min(min(len(s1.split()) + 1, len(s2.split()) + 1),
                       self.max_ngrams) - 2):
        total += sum(
            distance.sorensen(ng1[j][i], ng2[j][i])
            for i in range(min(len(ng1[j]), len(ng2[j])))
        ) / min(len(ng1[j]), len(ng2[j]))
    total = total / min(min(len(s1.split()) + 1, len(s2.split()) + 1),
                        self.max_ngrams)
    return 1 - total
def calculate_edit_distance(code_block1, code_block2, ignore_literals,
                            distance_metric, verbose=False):
    if ignore_literals:
        # TODO: just ignore differences in strings if they are substantially different
        block1 = abstract(code_block1)
        block2 = abstract(code_block2)
        if verbose:
            print("[.] Abstracted code blocks:")
            print(block1.strip())
            print(block2.strip())
    else:
        block1 = code_block1
        block2 = code_block2

    # Tokenize
    tokens1 = tokenize_fine_grained(block1, keep_whitespace=False)
    tokens2 = tokenize_fine_grained(block2, keep_whitespace=False)
    if not tokens1 or not tokens2:
        return float('inf')
    if not has_alpha(tokens1) or not has_alpha(tokens2):
        return float('inf')
    if verbose:
        print(tokens1)
        print(tokens2)

    # https://github.com/doukremt/distance
    if distance_metric == "j":
        return distance.jaccard(tokens1, tokens2)
    elif distance_metric == "l":
        return distance.levenshtein(tokens1, tokens2)
    elif distance_metric == "h":
        return distance.hamming(tokens1, tokens2)
    elif distance_metric == "s":
        return distance.sorensen(tokens1, tokens2)
    elif distance_metric == "n":
        # Normalized Levenshtein
        return distance.nlevenshtein(tokens1, tokens2)
    elif distance_metric == "c":
        # Collapsed Levenshtein edit distance
        return collapse_edit_distance(tokens1, tokens2, verbose=verbose)
    elif distance_metric == "nc":
        # Normalized collapsed Levenshtein edit distance
        collapsed = collapse_edit_distance(tokens1, tokens2, verbose=verbose)
        return collapsed / max(len(tokens1), len(tokens2))
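# calculate_edit_distance above depends on project helpers (abstract,
# tokenize_fine_grained, has_alpha, collapse_edit_distance). Hypothetical
# stand-ins for the two needed by the "n" metric, so the function can be tried
# in isolation; the real tokenizer is presumably more fine-grained:
import re

def tokenize_fine_grained(code, keep_whitespace=False):
    return re.findall(r"\w+|[^\w\s]", code)

def has_alpha(tokens):
    return any(any(c.isalpha() for c in t) for t in tokens)

# Two one-line blocks differing in one identifier and one literal:
print(calculate_edit_distance("x = a + 1", "x = b + 2",
                              ignore_literals=False, distance_metric="n"))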
def train(tfidf_matrix_train, dictTrain, tfidf_matrix_trainBigrams,
          dictTrainBigrams, lenGram, delete=[]):
    allTrainX = list()
    allTrainY = list()
    with open("./data/train.csv") as f:
        for line in f:
            lin = line.split(",")
            if len(lin) == 3:
                st1 = lin[0].lower()
                st2 = lin[1].lower()
                temp = [
                    1. - (lev.distance(st1, st2) * 2 / (len(st1) + len(st2))),
                    lev.jaro(st1, st2),
                    lev.jaro_winkler(st1, st2),
                    lev.ratio(st1, st2),
                    distance.sorensen(st1, st2),
                    jaccard(set(st1), set(st2)),
                    1. - distance.nlevenshtein(st1, st2, method=1),
                    1. - distance.nlevenshtein(st1, st2, method=2),
                    dice_coefficient(st1, st2, lenGram=2),
                    dice_coefficient(st1, st2, lenGram=3),
                    dice_coefficient(st1, st2, lenGram=4),
                    cosineWords(st1, st2, dictTrain, tfidf_matrix_train),
                    cosineBigrams(st1, st2, dictTrainBigrams,
                                  tfidf_matrix_trainBigrams, lenGram)
                ]
                if len(delete) > 0:
                    for elem in delete:
                        temp[elem] = 0.
                allTrainX.append(temp)
                allTrainY.append(int(lin[2]))

    X = np.array(allTrainX, dtype=float)
    y = np.array(allTrainY, dtype=float)
    # 'squared_hinge' is the current spelling of the old loss='l2' option;
    # the liblinear solver is required for L1-penalized logistic regression
    clf = svm.LinearSVC(C=1., dual=False, loss='squared_hinge', penalty='l1')
    clf2 = linear_model.LogisticRegression(C=1., dual=False, penalty='l1',
                                           solver='liblinear')
    clf.fit(X, y)
    clf2.fit(X, y)
    weights = np.array(clf.coef_[0])
    print(weights)
    weights = np.array(clf2.coef_[0])
    print(weights)
    return clf, clf2
def update_page_recs(cls, document):
    res = {}
    for site in Page.objects():
        if site != document:
            kwa, kwb = document.label_model, site.label_model
            ca = set(kwa)
            cb = set(kwb)
            if len(ca) > 0 and len(cb) > 0:
                res[site.id] = distance.sorensen(ca, cb)
            else:
                res[site.id] = 1  # 1 = totally different
    best = sorted(res.items(), key=operator.itemgetter(1), reverse=False)[:10]
    ret = []
    for (obj, score) in best:
        s = Page.objects(id=obj).first()
        ret.append(s)
    document.recs = ret
    document.save()
def update_site_recs(cls, document, sites=None):
    res = {}
    if not sites:
        sites = Site.objects()
        sites.timeout(False)
    for site in sites:
        if site != document:
            kwa, kwb = document.keywords, site.keywords
            ca = set(kwa)
            cb = set(kwb)
            if len(ca) > 0 and len(cb) > 0:
                res[site.id] = distance.sorensen(ca, cb)
            else:
                res[site.id] = 1  # 1 = totally different
    best = sorted(res.items(), key=operator.itemgetter(1), reverse=False)[:10]
    ret = []
    for (obj, score) in best:
        s = Site.objects(id=obj).first()
        ret.append(s)
    document.recs = ret
    document.save()
def similarity_sentence_ngram(s1, s2):
    ng1 = init_list_of_objects(min(len(s1.split()) + 1, MAX_NGRAM) - 2)
    ng2 = init_list_of_objects(min(len(s2.split()) + 1, MAX_NGRAM) - 2)
    # Collect the word n-grams of each sentence, for n = 2 .. MAX_NGRAM - 1
    for j in range(2, min(len(s1.split()) + 1, MAX_NGRAM)):
        for ngram in ngrams(s1.split(), j):
            ng1[j - 2].append(ngram)
    for j in range(2, min(len(s2.split()) + 1, MAX_NGRAM)):
        for ngram in ngrams(s2.split(), j):
            ng2[j - 2].append(ngram)
    # Average the Sorensen distance over aligned n-grams at each order
    summ = 0
    for j in range(
            min(min(len(s1.split()) + 1, len(s2.split()) + 1), MAX_NGRAM) - 2):
        summ += sum(
            distance.sorensen(ng1[j][i], ng2[j][i])
            for i in range(min(len(ng1[j]), len(ng2[j])))
        ) / min(len(ng1[j]), len(ng2[j]))
    summ = summ / min(min(len(s1.split()) + 1, len(s2.split()) + 1), MAX_NGRAM)
    print(summ)
    return 1 - summ
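# Both variants of similarity_sentence_ngram assume an init_list_of_objects
# helper and nltk's ngrams generator. Plausible reconstructions (hypothetical;
# MAX_NGRAM = 4 is an assumed setting, not taken from the original project):
from nltk.util import ngrams

def init_list_of_objects(size):
    return [[] for _ in range(size)]

MAX_NGRAM = 4
print(similarity_sentence_ngram("the quick brown fox", "the quick red fox"))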
def distance_vec(s1, s2):
    edit_distance = distance.levenshtein(s1, s2)
    jaccard_distance = distance.jaccard(s1, s2)
    sorensen_distance = distance.sorensen(s1, s2)
    # hamming_distnace = distance.hamming(s1, s2)
    fc_distance = distance.fast_comp(s1, s2, transpositions=True)
    substring_distince = distance.lcsubstrings(s1, s2, positions=True)[0]
    common_words_distcance = len(get_common_words(s1, s2))
    tf_distance = tf_similarity(s1, s2)
    tfidf_distance = tfidf_similarity(s1, s2)
    vec = np.array([
        edit_distance,           # edit (Levenshtein) distance
        jaccard_distance,        # Jaccard distance
        sorensen_distance,       # Sorensen distance
        # hamming_distnace,      # Hamming distance
        fc_distance,             # fast comparison
        substring_distince,      # length of the longest common substring
        common_words_distcance,  # number of common words
        tf_distance,             # single-document TF similarity
        tfidf_distance           # single-document TF-IDF similarity
    ])
    return vec
def str_sorensen(str1, str2):
    str1_list = str1.split(' ')
    str2_list = str2.split(' ')
    res = distance.sorensen(str1_list, str2_list)
    return res
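# Note the split: distance.sorensen works on any sequence, so str_sorensen above
# compares *word* sets, while calling it on the raw strings compares *character*
# sets. Anagram-like inputs make the difference obvious:
print(distance.sorensen("dog cat", "cat dog"))        # 0.0 - same character set
print(str_sorensen("the quick fox", "the lazy fox"))  # 1/3 - one word differs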
def sorensen_word(s1, s2):
    ng1 = [ngrams_word(s1, j) for j in range(2, min(len(s1) + 1, MAX_NGRAM))]
    ng2 = [ngrams_word(s2, j) for j in range(2, min(len(s2) + 1, MAX_NGRAM))]
    # Compare aligned bigrams only; cap N at the shorter bigram list so the
    # indexing below cannot run past the end
    N = min(len(ng1[0]), len(ng2[0]))
    return 1 - sum(
        distance.sorensen(ng1[0][i], ng2[0][i]) for i in range(N)) / N
if not entry.name:
    delete_empty(id, entry)
    continue
if len(entry.email) < 3:
    # most likely not an email leakage
    continue
full_name = entry.name.full_name or entry.name.family_name or entry.name.given_name
if full_name is not None:
    full_name = full_name.text
if not full_name:
    print("empty full name")
    continue
l_full_name = full_name.lower()
min_distance = 0.5
keep_emails = []
for email in entry.email:
    username = email.address.split('@')[0]
    #d = distance.nlevenshtein(username.lower(), l_full_name)
    #d2 = distance.jaccard(username.lower(), l_full_name)
    d3 = distance.sorensen(username.lower(), l_full_name)
    if d3 <= min_distance:
        keep_emails.append(email.address)
if len(keep_emails) != len(entry.email):
    delete_extra_emails(id, keep_emails, full_name, entry)
def calculate(self, row):
    seq1 = str(row['question1'])
    seq2 = str(row['question2'])
    jaccard = distance.jaccard(seq1, seq2)
    sorensen = distance.sorensen(seq1, seq2)
    return [jaccard, sorensen]
def sorensen(doc1, doc2):
    z = distance.sorensen(doc1.lower().strip(), doc2.lower().strip())
    return z
def str_sorensen(str1, str2):
    res = distance.sorensen(str1, str2)
    return res
def dist_fn(self, xs, ys):
    try:
        return distance.sorensen(xs, ys)
    except ZeroDivisionError:
        return 1
def calsulateDistances(st1, st2):
    diffl = difflib.SequenceMatcher(None, st1, st2).ratio()
    lev = Levenshtein.ratio(st1, st2)
    sor = 1 - distance.sorensen(st1, st2)
    jac = 1 - distance.jaccard(st1, st2)
    return diffl, lev, sor, jac
def sorensen_similarity_ratio(actual_content, expected_content):
    return 1 - distance.sorensen(actual_content, expected_content)
def sorensen_word(self, ng1, ng2):
    #ng1 = [ngrams(a, i) for i in range(1, min(len(a), len(b)))]
    #ng2 = [ngrams(b, i) for i in range(1, min(len(a), len(b)))]
    N = min(len(ng1), len(ng2))
    return 1 - sum(
        distance.sorensen(ng1[i], ng2[i]) for i in range(N)) / N
affprop = sklearn.cluster.AffinityPropagation(affinity="precomputed", damping=0.5)
affprop.fit(lev_similarity)
for cluster_id in np.unique(affprop.labels_):
    exemplar = words[affprop.cluster_centers_indices_[cluster_id]]
    cluster = np.unique(words[np.nonzero(affprop.labels_ == cluster_id)])
    cluster_str = ", ".join(cluster)
    print(" - *%s:* %s" % (exemplar, cluster_str))

t1 = ("de", "ci", "si", "ve")
t2 = ("de", "ri", "si", "ve")
distance.levenshtein(t1, t2)

sent1 = ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
sent2 = ['the', 'lazy', 'fox', 'jumps', 'over', 'the', 'crazy', 'dog']
distance.levenshtein(sent1, sent2)

distance.hamming("fat", "cat", normalized=True)
#0.3333333333333333
distance.nlevenshtein("abc", "acd", method=1)  # shortest alignment
#0.6666666666666666
distance.nlevenshtein("abc", "acd", method=2)  # longest alignment
#0.5
distance.sorensen("decide", "resize")
#0.5555555555555556
distance.jaccard("decide", "resize")
#0.7142857142857143
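# The clustering demo above starts from two variables it never defines. A
# hypothetical setup: `words` is an array of strings and `lev_similarity` the
# negated pairwise Levenshtein matrix (affinity="precomputed" expects
# similarities, so distances are negated).
import numpy as np
import sklearn.cluster
import distance

words = np.asarray(["book", "back", "hack", "hacker", "cook"])
lev_similarity = -1 * np.array(
    [[distance.levenshtein(w1, w2) for w1 in words] for w2 in words])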
def stats(tfidf_matrix_train, dictTrain, tfidf_matrix_trainBigrams,
          dictTrainBigrams, lenGram, delete=[], plotX=False):
    with open("./data/stats.csv") as infile:
        for i, line in enumerate(infile):
            pass
    dimMatrix = 16
    predict = np.zeros((i + 1, dimMatrix))
    clf1, clf2 = train(tfidf_matrix_train, dictTrain, tfidf_matrix_trainBigrams,
                       dictTrainBigrams, lenGram, delete=delete)

    with open("./data/stats.csv") as infile:
        for i, line in enumerate(infile):
            a = line.rstrip().split("\t")
            ## create the same vector with more distances
            st1 = a[0].lower()
            st2 = a[1].lower()
            temp = [
                1. - (lev.distance(st1, st2) * 2 / (len(st1) + len(st2))),
                lev.jaro(st1, st2),
                lev.jaro_winkler(st1, st2),
                lev.ratio(st1, st2),
                distance.sorensen(st1, st2),
                jaccard(set(st1), set(st2)),
                1. - distance.nlevenshtein(st1, st2, method=1),
                1. - distance.nlevenshtein(st1, st2, method=2),
                dice_coefficient(st1, st2, lenGram=2),
                dice_coefficient(st1, st2, lenGram=3),
                dice_coefficient(st1, st2, lenGram=4),
                cosineWords(st1, st2),
                cosineBigrams(st1, st2)
            ]
            if len(delete) > 0:
                for elem in delete:
                    temp[elem] = 0.
            predict[i, :-3] = temp
            # decision_function expects a 2-D array of shape (1, n_features)
            predict[i, -3] = clf1.decision_function(
                np.array(temp, dtype=float).reshape(1, -1))
            predict[i, -2] = clf2.decision_function(
                np.array(temp, dtype=float).reshape(1, -1))
            predict[i, -1] = a[-1]

    if plotX:
        labelsM = ["Lev", "Jaro", "Jaro-Winkler", "Ratio", "Sorensen",
                   "Jaccard", "Lev1", "Lev2", "Dice_2", "Dice_3", "Dice_4",
                   "cosineWords", "cosineBigrams", "SVM", "Logit"]
        f1matrix = np.zeros((100, dimMatrix - 1))
        fig = plt.figure()
        fig.set_size_inches(9, 6)
        ax = fig.add_subplot(111)
        iC = -1
        for i in np.linspace(0, 1, 100):
            iC += 1
            for j in range(dimMatrix - 1):
                t = np.array(predict[:, j])
                if j >= dimMatrix - 3:
                    t = (t - np.min(t)) / (np.max(t) - np.min(t))
                f1matrix[iC, j] = f1_score(y_pred=t > i, y_true=predict[:, -1])
        F1scores = []
        for j in range(dimMatrix - 1):
            F1scores.append(np.max(f1matrix[:, j]))
            #ax.plot(np.linspace(0,1,100), f1matrix[:,j], label=labelsM[j], color=tableau20[j])
        ax.bar(range(dimMatrix - 1), F1scores)
        plt.xticks(np.arange(dimMatrix - 1) + 0.5, labelsM, rotation=45)
        ax.set_ylabel("F1 score")
        ax.set_xlabel("Parameter")
        plt.legend(loc=2)
        customaxis(ax)
        plt.savefig("f1_bar.pdf")
        plt.show()

        fig = plt.figure()
        fig.set_size_inches(9, 6)
        ax = fig.add_subplot(111)
        AUCScores = []
        for j in range(dimMatrix - 1):
            # Compute ROC curve and the area under the curve
            fpr, tpr, thresholds = roc_curve(predict[:, -1], predict[:, j])
            AUCScores.append(auc(fpr, tpr))
            # Plot ROC curve
            ax.plot(fpr, tpr, label=labelsM[j], color=tableau20[j])
        ax.plot([0, 1], [0, 1], 'k--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.0])
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        ax.set_title('ROC Curve')
        plt.legend(loc=2)
        customaxis(ax)
        plt.savefig("roc.pdf")
        plt.show()

        fig = plt.figure()
        fig.set_size_inches(9, 6)
        ax = fig.add_subplot(111)
        ax.bar(range(dimMatrix - 1), AUCScores)
        ax.set_ylabel('Area Under Curve')
        plt.xticks(np.arange(dimMatrix - 1) + 0.5, labelsM, rotation=45)
        customaxis(ax)
        plt.savefig("roc_bar.pdf")
        plt.show()
def feature_extraction(features):
    try:
        features["origsent_unigram"] = list(
            features.apply(lambda x: preprocess_token(x["origsent"]), axis=1))
    except Exception:
        features["origsent_unigram"] = list(
            features.apply(lambda x: preprocess_data2(x["origsent"]), axis=1))
    try:
        features["candsent_unigram"] = list(
            features.apply(lambda x: preprocess_token(x["candsent"]), axis=1))
    except Exception:
        features["candsent_unigram"] = list(
            features.apply(lambda x: preprocess_data2(x["candsent"]), axis=1))

    features["origsent_unigram_stem"] = list(
        features.apply(lambda x: preprocess_data(x["origsent"]), axis=1))
    features["candsent_unigram_stem"] = list(
        features.apply(lambda x: preprocess_data(x["candsent"]), axis=1))
    features["origsent_stem"] = list(features["origsent"].apply(preprocess))
    features["candsent_stem"] = list(features["candsent"].apply(preprocess))

    print("generate bigram")
    join_str = "_"
    try:
        features["origsent_bigram"] = list(
            features.apply(
                lambda x: getBigram(x["origsent_unigram"], join_str), axis=1))
    except Exception:
        templist = []
        for _, x in features["origsent_unigram"].items():
            templist.append(getBigram(x, join_str))
        features["origsent_bigram"] = templist
    try:
        features["origsent_bigram_stem"] = list(
            features.apply(
                lambda x: getBigram(x["origsent_unigram_stem"], join_str),
                axis=1))
    except Exception:
        templist = []
        for _, x in features["origsent_unigram_stem"].items():
            templist.append(getBigram(x, join_str))
        features["origsent_bigram_stem"] = templist
    features["candsent_bigram"] = list(
        features.apply(lambda x: getBigram(x["candsent_unigram"], join_str),
                       axis=1))
    features["candsent_bigram_stem"] = list(
        features.apply(
            lambda x: getBigram(x["candsent_unigram_stem"], join_str), axis=1))

    ## trigram
    print("generate trigram")
    join_str = "_"
    features["origsent_trigram"] = list(
        features.apply(lambda x: getTrigram(x["origsent_unigram"], join_str),
                       axis=1))
    features["candsent_trigram"] = list(
        features.apply(lambda x: getTrigram(x["candsent_unigram"], join_str),
                       axis=1))
    features["origsent_trigram_stem"] = list(
        features.apply(
            lambda x: getTrigram(x["origsent_unigram_stem"], join_str), axis=1))
    features["candsent_trigram_stem"] = list(
        features.apply(
            lambda x: getTrigram(x["candsent_unigram_stem"], join_str), axis=1))

    #print("Generate Wordnet Features")
    #features["wordnet-similarity"] = list(features.apply(lambda x: ss.similarity(x["origsent"], x["candsent"], False), axis=1))
    #features["wordnet-similarity-norm"] = list(features.apply(lambda x: ss.similarity(x["origsent"], x["candsent"], True), axis=1))

    print("generate char gram")
    feat_names = ["origsent", "candsent"]
    grams = ["unigram", "bigram", "trigram"]
    for stem in ["", "_stem"]:
        for feat in feat_names:
            for gram in grams:
                try:
                    features["%s_char_%s%s" % (feat, gram, stem)] = list(
                        features.apply(
                            lambda x: word2ngrams(x[feat + stem], gram),
                            axis=1))
                except Exception:
                    continue
                nonnumeric_columns.add("%s_char_%s%s" % (feat, gram, stem))
    features["candsent_char_trigram"] = list(
        features.apply(lambda x: word2ngrams(x["candsent"], "trigram"), axis=1))
    features["origsent_char_bigram_stem"] = list(
        features.apply(lambda x: word2ngrams(x["origsent_stem"], "bigram"),
                       axis=1))
    features["origsent_char_trigram_stem"] = list(
        features.apply(lambda x: word2ngrams(x["origsent_stem"], "trigram"),
                       axis=1))

    print("generate common word features")
    gram_ext = [
        "_unigram", "_bigram", "_trigram",
        "_char_unigram", "_char_bigram", "_char_trigram"
    ]
    for stem in ["", "_stem"]:
        for gram in gram_ext:
            features["common-words_%s%s" % (gram, stem)] = list(
                features.apply(lambda x: len(
                    intersect(x["origsent" + gram + stem],
                              x["candsent" + gram + stem])), axis=1))

    features["origsent_tag"] = list(
        features.apply(lambda x: preprocess_tag(x["origsenttag"]), axis=1))
    features["candsent_tag"] = list(
        features.apply(lambda x: preprocess_tag(x["candsenttag"]), axis=1))
    features["origsent_tag_unigram"] = features["origsent_tag"]
    features["candsent_tag_unigram"] = features["candsent_tag"]
    features["origsent_tag_left"] = list(
        features.apply(
            lambda x: word_left(x["origsent_unigram"], x["origsent_tag"]),
            axis=1))
    features["candsent_tag_left"] = list(
        features.apply(
            lambda x: word_left(x["candsent_unigram"], x["candsent_tag"]),
            axis=1))
    features["origsent_tag_right"] = list(
        features.apply(
            lambda x: word_right(x["origsent_unigram"], x["origsent_tag"]),
            axis=1))
    features["candsent_tag_right"] = list(
        features.apply(
            lambda x: word_right(x["candsent_unigram"], x["candsent_tag"]),
            axis=1))

    features["origsent_NER"] = list(
        features.apply(lambda x: preprocess_NER(x["origsenttag"]), axis=1))
    features["candsent_NER"] = list(
        features.apply(lambda x: preprocess_NER(x["candsenttag"]), axis=1))
    features["origsent_NER_unigram"] = features["origsent_NER"]
    features["candsent_NER_unigram"] = features["candsent_NER"]
    features["origsent_Event"] = list(
        features.apply(lambda x: preprocess_Event(x["origsenttag"]), axis=1))
    features["candsent_Event"] = list(
        features.apply(lambda x: preprocess_Event(x["candsenttag"]), axis=1))
    features["origsent_Event_unigram"] = features["origsent_Event"]
    features["candsent_Event_unigram"] = features["candsent_Event"]

    print("generate bigram for Tags")
    feattag = [
        "origsent_tag", "candsent_tag", "origsent_NER", "candsent_NER",
        "origsent_Event", "candsent_Event"
    ]
    for feat in feattag:
        join_str = "_"
        features["%s_bigram" % (feat)] = list(
            features.apply(
                lambda x: getBigram(x["%s_unigram" % (feat)], join_str),
                axis=1))
        features["%s_trigram" % (feat)] = list(
            features.apply(
                lambda x: getTrigram(x["%s_unigram" % (feat)], join_str),
                axis=1))

    gram_tags = ["_tag_unigram", "_tag_bigram", "_tag_trigram"]
    for gram in gram_tags:
        features["common-words_%s" % (gram)] = list(
            features.apply(lambda x: len(
                intersect(x["origsent" + gram], x["candsent" + gram])),
                axis=1))
        features["levenshtein_%s" % (gram)] = list(
            features.apply(lambda x: distance.nlevenshtein(
                x["origsent" + gram], x["candsent" + gram], method=2), axis=1))
        features["sorensen_%s" % (gram)] = list(
            features.apply(lambda x: distance.sorensen(
                x["origsent" + gram], x["candsent" + gram]), axis=1))
        features["cosine_%s" % (gram)] = list(
            features.apply(
                lambda x: cosine(x["origsent" + gram], x["candsent" + gram]),
                axis=1))
        features["precision_%s" % (gram)] = list(
            features.apply(lambda x: precision_recall(
                x["origsent" + gram], x["candsent" + gram],
                x["origsent" + gram]), axis=1))
        features["recall1gram_%s" % (gram)] = list(
            features.apply(lambda x: precision_recall(
                x["origsent" + gram], x["candsent" + gram],
                x["candsent" + gram]), axis=1))
        features["f1gram_%s" % (gram)] = list(
            features.apply(lambda x: fmeasure(x["precision_%s" % (gram)],
                                              x["recall1gram_%s" % (gram)]),
                           axis=1))

    features["common_Event"] = list(
        features.apply(
            lambda x: len(intersect(x["origsent_Event"], x["candsent_Event"])),
            axis=1))
    features["common_NER"] = list(
        features.apply(
            lambda x: len(intersect(x["origsent_NER"], x["candsent_NER"])),
            axis=1))
    return features
def similar_sorensen(a, b):
    return 1 - distance.sorensen(a, b)
def sorencen(q1, q2):
    return distance.sorensen(q1, q2)
def sorensen_plus(a, b):
    ng1 = [ngrams(a, i) for i in range(1, min(len(a), len(b)) + 1)]
    ng2 = [ngrams(b, i) for i in range(1, min(len(a), len(b)) + 1)]
    N = min(len(ng1), len(ng2))
    return 1 - sum(
        distance.sorensen(ng1[i], ng2[i]) for i in range(N)) / N
if checked == 0:
    if a == b:
        checked += 1
        for v1 in gt[a]:
            partials = []
            levs = []
            jacs = []
            sors = []
            for v2 in pc[b]:
                v2 = str(v2).translate(
                    str.maketrans('', '', string.punctuation))
                v2 = str(v2).replace('\t', ' ')
                try:
                    partials.append(1 - (fuzz.partial_ratio(v1, v2) / 100.0))
                    levs.append(distance.levenshtein(v1, v2, normalized=True))
                    jacs.append(distance.jaccard(v1, v2))
                    sors.append(distance.sorensen(v1, v2))
                except UnicodeDecodeError:
                    partials.append(1)
                    levs.append(1)
                    jacs.append(1)
                    sors.append(1)
            ls_partials.append(partials)
            ls_levs.append(levs)
            ls_jacs.append(jacs)
            ls_sors.append(sors)
    else:
        pass
else:
    pass

# create distance score matrices with row index as hand coded titles and
# column index as parscit coded titles
def sor_tok_distance(q1, q2, t1, t2):
    # only the token sequences t1/t2 are compared; q1/q2 are unused here
    return distance.sorensen(t1, t2)