def get_cosine_distances(self, tokens, ngrams):
    start_time = time.time()
    cos = Cosine(ngrams)
    distances = np.array([[int(100 * cos.distance(w1, w2)) for w1 in tokens]
                          for w2 in tokens])
    end_time = time.time()
    logging.info("Cosine distances computation time: %s seconds",
                 round(end_time - start_time, 2))
    return distances
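# A minimal usage sketch for the method above. `self` is unused, so the
# function can be exercised directly; the tokens here are hypothetical, and
# the module-level imports (time, logging, numpy, similarity.cosine.Cosine)
# are assumed.
tokens = ["apple", "apples", "banana"]
dists = get_cosine_distances(None, tokens, 2)
print(dists)  # symmetric 3x3 matrix of integer-scaled distances, 0 on the diagonal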
def get_similarity_score():
    cosine = Cosine(2)
    f = open('model_res').readlines()
    g = open('real_res').readlines()
    ctr = 0
    for i in range(len(f)):
        f[i] = f[i].replace("\n", "")
        g[i] = g[i].replace("\n", "")
        ctr += cosine.similarity_profiles(cosine.get_profile(f[i]),
                                          cosine.get_profile(g[i]))
    return ctr / len(f)
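# Assumed input format: `model_res` and `real_res` are line-aligned text files
# (one model output and one reference per line); the score is the mean
# per-line cosine similarity over 2-shingle profiles, in [0, 1].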
def triangulate(self, tweet, loc):
    print('Triangulating: ' + tweet)
    cosine = Cosine(2)
    cos_tweet = cosine.get_profile(tweet)
    with open("clean/clean_rss.txt", "r") as clean_rss:
        for rss in clean_rss:
            rss = rss.split('\n')[0]
            cos_rss = cosine.get_profile(rss)
            cos_result = cosine.similarity_profiles(cos_tweet, cos_rss)
            if cos_result > 0.7:
                print('\t[PASS: ' + str(cos_result) + '] ' + rss)
                return True
            else:
                print('\t[FAIL: ' + str(cos_result) + '] ' + rss)
    with open("clean/clean_retweet.txt", "r") as clean_rt:
        for rtweet in clean_rt:
            rt = rtweet.rsplit(' ', 1)[0]
            rt_loc = rtweet.split('\n')[0].rsplit(' ', 1)[1]
            cos_rt = cosine.get_profile(rt)
            if loc == rt_loc:
                cos_result = cosine.similarity_profiles(cos_tweet, cos_rt)
                if cos_result > 0.7:
                    print('\t[PASS: ' + str(cos_result) + '] ' + rt)
                    return True
                else:
                    print('\t[FAIL: ' + str(cos_result) + '] ' + rt)
    with open('clean/clean_tweet.txt', 'r') as clean_tweet:
        for ctweet in clean_tweet:
            ct = ctweet.rsplit(' ', 1)[0]
            ct_loc = ctweet.split('\n')[0].rsplit(' ', 1)[1]
            cos_ct = cosine.get_profile(ct)
            if loc == ct_loc:
                cos_result = cosine.similarity_profiles(cos_tweet, cos_ct)
                if cos_result > 0.7 and cos_result != 1.0:
                    print('\t[PASS: ' + str(cos_result) + '] ' + ct)
                    return True
                else:
                    print('\t[FAIL: ' + str(cos_result) + '] ' + ct)
    print('\tNo matching results found...')
    return False
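# Note (assumption): the extra `cos_result != 1.0` guard in the clean_tweet
# pass appears to be there so the tweet cannot match its own entry in
# clean_tweet.txt; the RSS and retweet passes carry no such self-match risk.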
def choose_the_best_clause(union, question):
    if USE_COSINE:
        cos = Cosine(1)
        article_and_clause_no = []
        max_cosine = 0
        for ref in union:
            article, clause = ref[1]
            clause = corpus[article]['clauses'][clause]['text']
            cur_cos = cos.similarity(clause, question)
            if max_cosine < cur_cos:
                max_cosine = cur_cos
                article_and_clause_no = ref[1]
        return article_and_clause_no
    else:
        union.sort(key=lambda x: x[0], reverse=True)
        return union[0][1]
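# Note: Cosine(1) compares character-unigram profiles, so this ranking is
# insensitive to word order; `union` entries are assumed to be
# (score, (article, clause_index)) pairs.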
def match(num):
    stop_plate = [0, "", "0"]
    suspected = ''
    from similarity.cosine import Cosine
    cosine = Cosine(2)
    s0 = str(num)
    p0 = cosine.get_profile(s0)  # the query plate's profile only needs computing once
    f = open('lic.csv').readlines()
    for s1 in f:
        s1 = s1.replace("\n", "")
        if s1 in stop_plate:  # skip placeholder plate entries
            continue
        print(s1)
        p1 = cosine.get_profile(s1)
        if cosine.similarity_profiles(p0, p1) >= 0.65:
            suspected += s1 + " "
    return suspected
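# A self-contained demo of `match` (hypothetical plate data; writes a
# throwaway lic.csv into the working directory).
with open('lic.csv', 'w') as fh:
    fh.write("KA01AB1234\nKA01AB1284\n0\n")
print(match("KA01AB1234"))  # "KA01AB1234 KA01AB1284 " - both plates share most bigrams; "0" is skipped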
def title_similarity(self, page):
    try:
        s1 = normalize(self.reference.title)
        s2 = normalize(page.title)
        n = 3
        p1_trigrams = Counter(nltk.ngrams(s1, n))
        p2_trigrams = Counter(nltk.ngrams(s2, n))
        p1_grams = Counter(nltk.ngrams(s1, 1))
        p2_grams = Counter(nltk.ngrams(s2, 1))
        cosine = Cosine(1)
        # Average of the trigram- and unigram-profile similarities.
        trigram_sim = cosine.similarity_profiles(p1_trigrams, p2_trigrams)
        unigram_sim = cosine.similarity_profiles(p1_grams, p2_grams)
        similarity = (trigram_sim + unigram_sim) / 2
    except ZeroDivisionError:
        similarity = 0
    return similarity
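# Note: nltk.ngrams over a string yields character n-grams (strings iterate
# per character), and similarity_profiles accepts any sparse count mapping,
# so the Counter objects can be passed in directly; the Cosine(1) shingle
# size is irrelevant when profiles are supplied externally.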
def footer(df, kgram=2, TOP_LINES=5):
    df['isFooter'] = False
    pgs = df['page'].unique()
    cosine = Cosine(kgram)
    for pg in pgs[1:]:
        prev_idx = df.index[df['page'] == (pg - 1)]
        pres_idx = df.index[df['page'] == pg]
        for ln in range(TOP_LINES):
            # Compare the ln-th line from the bottom of consecutive pages.
            prev_ln = prev_idx[-1 * (ln + 1)]
            pres_ln = pres_idx[-1 * (ln + 1)]
            s0 = df.loc[prev_ln, 'text']
            s1 = df.loc[pres_ln, 'text']
            skip = 0
            if s0.isdigit():  # a bare page number counts as a footer
                df.loc[prev_ln, 'isFooter'] = True
                skip = 1
            if s1.isdigit():
                df.loc[pres_ln, 'isFooter'] = True
                skip = 1
            if (skip == 1) | (len(s0) < kgram) | (len(s1) < kgram):
                continue
            p0 = cosine.get_profile(s0)
            p1 = cosine.get_profile(s1)
            sim = cosine.similarity_profiles(p0, p1)
            if sim > 0.9:
                df.loc[prev_ln, 'isFooter'] = True
                df.loc[pres_ln, 'isFooter'] = True
    return df
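# Usage sketch (hypothetical two-page extract; the pages here have only two
# lines each, so TOP_LINES is lowered to match).
import pandas as pd
df = pd.DataFrame({
    'page': [1, 1, 2, 2],
    'text': ['body text', 'Confidential - ACME', 'other text', 'Confidential - ACME'],
})
print(footer(df, TOP_LINES=2)[['page', 'text', 'isFooter']])
# The repeated "Confidential - ACME" line is flagged on both pages.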
def met_cosine(s1, s2, n):
    cosine = Cosine(n)
    p1 = cosine.get_profile(s1)
    p2 = cosine.get_profile(s2)
    val = cosine.similarity_profiles(p1, p2)
    return val
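# Quick check of the helper above: "night" and "nacht" share one bigram
# ("ht") out of four per string, giving 1 / (2 * 2) = 0.25.
print(met_cosine("night", "nacht", 2))   # 0.25
print(met_cosine("hello", "hello", 2))   # 1.0 for identical strings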
def similarity(self, question, answer):
    # Load and clean the stop-word list (one word per line).
    stopword = self.read_from(folder_path + '上证专用停用词.txt')
    stopwords = []
    for sw in stopword:
        sw = sw.strip('\n')
        sw = sw.strip(' ')
        stopwords.append(sw)
    # Segment both texts with jieba and drop stop words.
    meaningful_words1 = [w for w in jieba.cut(str(question)) if w not in stopwords]
    meaningful_words2 = [w for w in jieba.cut(str(answer)) if w not in stopwords]
    s2 = ''.join(meaningful_words1)
    s3 = ''.join(meaningful_words2)
    # One instance of each string-similarity measure.
    a1 = Cosine(1)
    b1 = Damerau()
    c1 = Jaccard(1)
    d1 = JaroWinkler()
    e1 = Levenshtein()
    f1 = LongestCommonSubsequence()
    g1 = MetricLCS()
    h1 = NGram(2)
    i1 = NormalizedLevenshtein()
    j1 = OptimalStringAlignment()
    k1 = QGram(1)
    l1 = SorensenDice(2)
    m1 = WeightedLevenshtein(character_substitution=CharSub())
    # 18-dimensional feature vector, in a fixed order.
    line_sim = [
        a1.similarity(s2, s3),   # cosine similarity
        a1.distance(s2, s3),     # cosine distance
        b1.distance(s2, s3),     # Damerau
        c1.distance(s2, s3),     # Jaccard distance
        c1.similarity(s2, s3),   # Jaccard similarity
        d1.distance(s2, s3),     # Jaro-Winkler distance
        d1.similarity(s2, s3),   # Jaro-Winkler similarity
        e1.distance(s2, s3),     # Levenshtein
        f1.distance(s2, s3),     # longest common subsequence
        g1.distance(s2, s3),     # metric LCS
        h1.distance(s2, s3),     # 2-gram distance
        i1.distance(s2, s3),     # normalized Levenshtein distance
        i1.similarity(s2, s3),   # normalized Levenshtein similarity
        j1.distance(s2, s3),     # optimal string alignment
        k1.distance(s2, s3),     # Q-gram
        l1.distance(s2, s3),     # Sorensen-Dice distance
        l1.similarity(s2, s3),   # Sorensen-Dice similarity
        m1.distance(s2, s3),     # weighted Levenshtein
    ]
    return line_sim
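# Note: the two texts are compared as concatenated strings with no
# separators, so every measure here operates at the character level - a
# reasonable choice for Chinese, where jieba tokens carry no spaces anyway.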
import matplotlib.pyplot as plt
import re
import json
import csv
from similarity.jarowinkler import JaroWinkler
from similarity.qgram import QGram
from similarity.cosine import Cosine
from similarity.ngram import NGram
from similarity.levenshtein import Levenshtein
from similarity.normalized_levenshtein import NormalizedLevenshtein

with open('qgram_human_mobility.txt') as f:
    data = json.load(f)

cosine = Cosine(2)
profiles = []
for i in range(len(data)):
    temp = data[i][0].strip('"')
    profiles.append(cosine.get_profile(temp))

for i in range(len(profiles)):
    print(profiles[i])

profile_sim = []
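# A plausible continuation (assumption - the script ends at the empty list):
# fill profile_sim with the similarity of each consecutive pair of profiles.
for i in range(len(profiles) - 1):
    profile_sim.append(cosine.similarity_profiles(profiles[i], profiles[i + 1]))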
from similarity.levenshtein import Levenshtein
from similarity.normalized_levenshtein import NormalizedLevenshtein
from similarity.cosine import Cosine

lev = Levenshtein()
nolev = NormalizedLevenshtein()
cosine = Cosine(4)

str1 = 'I enjoy playing football'
str2 = 'I love to play soccer'

print('Levenshtein distance:')
print(lev.distance(str1, str2))
print('Normalized Levenshtein similarity:')
print(nolev.similarity(str1, str2))
print('Cosine similarity:')
print(cosine.similarity(str1, str2))
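# For the Cosine measure, distance is defined as 1 - similarity, so the two
# always sum to 1:
assert abs(cosine.distance(str1, str2) - (1 - cosine.similarity(str1, str2))) < 1e-9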
def dashboard_carrer():
    with open('.data/test.csv', 'rb') as f:
        result = chardet.detect(f.read())
    p_test = pd.read_csv(".data/test.csv", encoding=result['encoding'])

    # Encode the Likert-scale answers as 1-5.
    p_test = p_test.replace('Enjoy', 5)
    p_test = p_test.replace('Slightly Enjoy', 4)
    p_test = p_test.replace('Neutral', 3)
    p_test = p_test.replace('Slightly Disagree', 2)
    p_test = p_test.replace('Strongly Disagree', 1)

    # Realistic questions
    realistic = p_test[[
        'I like to work on cars', 'I like to build things',
        'I like to take care of animals',
        'I like putting things together or assembling things',
        'I like to cook', 'I am a practical person', 'I like working outdoors'
    ]]
    # Investigative questions
    investigative = p_test[[
        'I like to do puzzles', 'I like to do experiments', 'I enjoy science',
        'I enjoy trying to figure out how things work',
        'I like to analyze things (problems/situations)',
        'I like working with numbers or charts', 'I am good at math'
    ]]
    # Artistic questions
    artistic = p_test[[
        'I am good at working independently',
        'I like to read about art and music', 'I enjoy creative writing',
        'I am a creative person', 'I like to play instruments or sing',
        'I like acting in plays', 'I like to draw'
    ]]
    # Social questions
    social = p_test[[
        'I like to work in teams', 'I like to teach or train people',
        'I like trying to help people solve their problems',
        'I am interested in healing people',
        'I enjoy learning about other cultures',
        'I like to get into discussions about issues around me',
        'I like helping people'
    ]]
    # Enterprising questions
    enterprising = p_test[[
        'I am an ambitious person who set goals for myself',
        'I like to try to influence or persuade people',
        'I like selling things', 'I am quick to take on new responsibilities',
        'I would like to start my own business', 'I like to give speeches',
        'I like to lead'
    ]]
    # Conventional questions
    conventional = p_test[[
        'I like to organize things',
        'I wouldn’t mind working 8 hours per day in an office',
        'I pay attention to details', 'I like to do filing or typing',
        'I am good at keeping records of my work',
        'I would like to work in an office'
    ]]

    # Sum each RIASEC dimension.
    realistic['R'] = realistic.sum(axis=1)
    investigative['I'] = investigative.sum(axis=1)
    artistic['A'] = artistic.sum(axis=1)
    social['S'] = social.sum(axis=1)
    enterprising['E'] = enterprising.sum(axis=1)
    conventional['C'] = conventional.sum(axis=1)

    code = realistic['R'].to_frame()
    code['I'] = investigative['I']
    code['A'] = artistic['A']
    code['S'] = social['S']
    code['E'] = enterprising['E']
    code['C'] = conventional['C']

    # Keep each student's top three dimensions; `it` is assumed to be
    # operator.itemgetter, imported elsewhere in the module.
    n = 3
    new_d = [
        list(map(it(0), row[1:].sort_values(ascending=False)[:n].items()))
        for _, row in code.iterrows()
    ]
    std = pd.DataFrame(new_d)
    std['code'] = std[0] + std[1] + std[2]  # std now holds the test code
    std = std.drop([0, 1, 2], axis=1)

    # Read the course data.
    course = pd.read_csv(".data/course.csv")
    df = pd.MultiIndex.from_product(
        [std["code"], course["course_code"], course["Course_short"]],
        names=["code", "course_code", "course"]).to_frame(index=False)
    df = df.dropna()

    # Cosine similarity between student codes and course codes.
    cosine = Cosine(2)
    df["p0"] = df["code"].apply(lambda s: cosine.get_profile(s))
    df["p1"] = df["course_code"].apply(lambda s: cosine.get_profile(s))
    df["cosine_sim"] = [
        cosine.similarity_profiles(p0, p1)
        for p0, p1 in zip(df["p0"], df["p1"])
    ]
    df.drop(["p0", "p1"], axis=1, inplace=True)

    # Keep the top three courses per code.
    top_n = df.sort_values(['cosine_sim'],
                           ascending=False).groupby(df['code'].values).head(3)
    options = top_n["course"].to_numpy()

    # Select the matching course rows and render.
    rec = course.loc[course['Course_short'].isin(options)]
    recommendations = json.loads(rec.to_json(orient='records'))
    return render_template('./dashboard_carrer.html',
                           title='Dashboard - Career',
                           std=std,
                           recommendations=recommendations)
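# Note (assumption): course codes in course.csv are three-letter RIASEC
# strings like the student codes, so the 2-shingle cosine match rewards
# shared letter pairs - e.g. "RIA" vs "RIS" share "RI" (similarity 0.5),
# while "RIA" vs "SEC" share nothing (similarity 0).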
def cosine(self, s0, s1):
    cosine = Cosine(15)
    p0 = cosine.get_profile(s0)
    p1 = cosine.get_profile(s1)
    # print('Cosine similarity "%s" vs "%s"' % (s0, s1))
    return cosine.similarity_profiles(p0, p1)
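# Caveat (assumption about the shingle size): with Cosine(15), any input
# shorter than 15 characters produces an empty profile, and
# similarity_profiles then divides by a zero norm. A defensive companion
# method (hypothetical) could short-circuit such inputs:
def safe_cosine(self, s0, s1):
    if min(len(s0), len(s1)) < 15:
        return 0.0  # treat too-short strings as dissimilar by convention
    return self.cosine(s0, s1)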
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
import numpy as np
from similarity.levenshtein import Levenshtein
from similarity.normalized_levenshtein import NormalizedLevenshtein
from similarity.damerau import Damerau
from similarity.optimal_string_alignment import OptimalStringAlignment
from similarity.jarowinkler import JaroWinkler
from similarity.longest_common_subsequence import LongestCommonSubsequence
from similarity.metric_lcs import MetricLCS
from similarity.ngram import NGram
from similarity.qgram import QGram
from similarity.sorensen_dice import SorensenDice
from similarity.cosine import Cosine
from similarity.jaccard import Jaccard

# Initialize once at import time.
levenshtein = Levenshtein()
norm_levenshtein = NormalizedLevenshtein()
damerau = Damerau()
optimal_string_alignment = OptimalStringAlignment()
jarowinkler = JaroWinkler()
lcs = LongestCommonSubsequence()
metric_lcs = MetricLCS()
ngram = NGram()
qgram = QGram()
dice = SorensenDice()
cos = Cosine(5)
jaccard = Jaccard(5)

similarity_functions = [
    norm_levenshtein.similarity,
    lambda a, b: 1 - metric_lcs.distance(a, b),
    lambda a, b: 1 - ngram.distance(a, b),
    cos.similarity,
    dice.similarity,
]


def mono_vector0(tup1, tup2):
    str1 = ' '.join(tup1).lower()
    str2 = ' '.join(tup2).lower()
    simv = list(map(lambda x: x(str1, str2), similarity_functions))
    return simv
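# Usage sketch (hypothetical record tuples): each entry of the result lines
# up with one entry of similarity_functions, i.e.
# [normalized Levenshtein, metric LCS, n-gram, cosine, Sorensen-Dice].
print(mono_vector0(("John", "Smith"), ("Jon", "Smith")))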