def summarize(self, content: str, title: str = None,
              splitChar='(。|!|\!|?|\?|\n|\t)', proportion=0.3):
    contents = self._splitText(content, splitChar=splitChar)
    # Prepend the title so its embedding becomes the first sentence vector.
    if title is not None:
        title = title.strip()
        if splitChar.find(title[-1]) == -1:
            title += '。'
        contents.insert(0, title)
    # Append the full article so its embedding is computed alongside the sentences.
    contents.append(content)
    if len(contents) <= 4:
        return contents
    sentencesVec = self.sif.getSentencesEmbedding(contents)
    sentencesVec = list(sentencesVec)
    contentVec = sentencesVec.pop()
    # Similarity of each sentence to the whole article.
    similarities = [(similarity.cosine_similarity(senVec, contentVec), index)
                    for index, senVec in enumerate(sentencesVec)]
    # Similarity of each sentence to the first vector (the title, if given).
    similarities2 = [(similarity.cosine_similarity(senVec, sentencesVec[0]), index)
                     for index, senVec in enumerate(sentencesVec)]
    # Weighted combination: 0.382 article similarity + 0.618 title similarity.
    similarities = [((sim1[0] * 0.382 + sim2[0] * 0.618), sim1[1])
                    for sim1, sim2 in zip(similarities, similarities2)]
    # Smooth the similarities with KNN.
    similarities = self._knnSmooth(similarities)
    # Sort by score, descending, and keep the top `proportion` of sentences.
    similarities.sort(reverse=True)
    summarySentenceIndexes = similarities[0:int(len(similarities) * proportion)]
    summarySentences = [(index, contents[index])
                        for (cos, index) in summarySentenceIndexes]
    # Restore the original sentence order.
    summarySentences.sort()
    return [sentence for (index, sentence) in summarySentences]
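# Nearly every snippet in this section calls a cosine_similarity helper
# (often via a `similarity` module) whose implementation is not shown.
# A minimal sketch, assuming two dense numeric vectors of equal length
# (the TF-IDF copyright snippet further down uses a different, sparse
# four-argument signature):
import math

def cosine_similarity(x, y):
    # Dot product over the product of the norms; 0.0 for a zero vector.
    dot = sum(p * q for p, q in zip(x, y))
    norm_x = math.sqrt(sum(p * p for p in x))
    norm_y = math.sqrt(sum(q * q for q in y))
    if norm_x == 0 or norm_y == 0:
        return 0.0
    return dot / (norm_x * norm_y)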
def knn_classify(test_tf, train_tf, train_class, k):
    # (1) Compute the cosine similarity between each training document's
    # term-frequency vector and the input vector (higher = closer).
    tf_distance = {}
    for place in train_tf.keys():
        tf_distance[place] = cosine_similarity(train_tf.get(place), test_tf)
    # (2) Sort by similarity and count the classes of the k nearest neighbours.
    class_count = {}
    for i, place in enumerate(sorted(tf_distance, key=tf_distance.get, reverse=True)):
        current_class = train_class.get(place)
        class_count[current_class] = class_count.get(current_class, 0) + 1
        if (i + 1) >= k:
            break
    print('(3) Use the most frequent class among the K nearest neighbours as the final class')
    input_class = ''
    for i, c in enumerate(sorted(class_count, key=class_count.get, reverse=True)):
        if i == 0:
            input_class = c
        print('\t%s, %d' % (c, class_count.get(c)))
    print('(4) Classification result = %s' % input_class)
    return str(input_class)
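# A toy invocation of knn_classify, assuming the cosine_similarity sketch
# above; the document names, vectors, and classes here are hypothetical:
train_tf = {'doc_a': [1, 0, 2], 'doc_b': [0, 3, 1], 'doc_c': [1, 1, 0]}
train_class = {'doc_a': 'sports', 'doc_b': 'tech', 'doc_c': 'sports'}
test_tf = [1, 0, 1]
predicted = knn_classify(test_tf, train_tf, train_class, k=2)  # -> 'sports'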
def score_batch_cosine(users, user, user_id, movie_ids):
    # Similarity of the target user to every user in the training set.
    weights = [cosine_similarity(user, u) for u in users]
    ratings = []
    for movie_id in movie_ids:
        sum_w = 0
        rating = 0
        # Weighted average of the ratings of users who rated this movie.
        for w, u_other in zip(weights, users):
            u_rating = u_other[movie_id]
            if u_rating == 0:
                continue
            sum_w += w
            rating += w * u_rating
        if sum_w != 0:
            rating /= sum_w
        else:
            # If no relevant info was found, guess a score of 3.
            rating = 3
        rating = int(np.rint(rating))
        ratings.append(rating)
    return clean_ratings(ratings)
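# A toy call to score_batch_cosine, assuming each row of `users` is a
# rating vector indexed by movie id with 0 meaning "unrated", and a
# hypothetical pass-through stand-in for the clean_ratings helper:
import numpy as np

def clean_ratings(ratings):  # stand-in; the real helper is not shown
    return ratings

users = [[5, 0, 3], [4, 2, 0], [1, 5, 4]]
target = [5, 0, 0]
print(score_batch_cosine(users, target, user_id=0, movie_ids=[1, 2]))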
def quadratic_entropy(example, train_term_dist, word2id, word2vec):
    """Calculates Quadratic Entropy."""
    assert word2vec is not None, ('Error: Word vector representations have to '
                                  'be available for quadratic entropy.')
    summed = 0
    for word_1 in set(example):
        if word_1 not in word2id or word_1 not in word2vec:
            continue  # continue as the product will be 0
        for word_2 in set(example):
            if word_2 not in word2id or word_2 not in word2vec:
                continue  # continue as the product will be 0
            p_1 = train_term_dist[word2id[word_1]]
            p_2 = train_term_dist[word2id[word_2]]
            vec_1 = word2vec[word_1]
            vec_2 = word2vec[word_2]
            sim = similarity.cosine_similarity(vec_1, vec_2)
            summed += sim * p_1 * p_2
    return summed
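# A toy call to quadratic_entropy, with a hypothetical two-word vocabulary,
# (near-)unit word vectors, and a uniform training term distribution; this
# assumes the cosine helper sketched near the top is available as
# similarity.cosine_similarity:
word2id = {'cat': 0, 'dog': 1}
word2vec = {'cat': [1.0, 0.0], 'dog': [0.7071, 0.7071]}
train_term_dist = [0.5, 0.5]
qe = quadratic_entropy(['cat', 'dog'], train_term_dist, word2id, word2vec)
# sim(cat,cat) = sim(dog,dog) = 1 and sim(cat,dog) ≈ 0.707, each term
# weighted by p_1 * p_2 = 0.25, so qe ≈ 0.25 + 0.25 + 2*0.25*0.707 ≈ 0.854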
def main_algo(features, tweetid, lastclusterid):
    fvecs, freqdict = tfidf_all.get_tfidf_freqdict(features)

    # Create random hyperplane vectors for the LSH bit signatures.
    num_randvecs = 13
    random_vectors = randomvecs.getVecs(len(freqdict), num_randvecs)

    # Initialise one prefix tree per signature permutation, with a random
    # linear permutation (a*x + b mod prime) for each.
    a = []
    b = []
    prime = 13
    P = []
    modP = 20  # number of permutations (originally read from input)
    for i in range(modP):
        a.append(random.uniform(1, prime))
        b.append(random.uniform(0, prime))
        P.append(pygtrie.Trie())

    wordindexmap = {}
    for index, key in enumerate(freqdict.keys()):
        wordindexmap[key] = index

    # MAIN TWEET LOOP
    tweetclustermap = {}
    clusterdict = {}
    for fvec in fvecs:
        tweetsign = signature.getSign(fvec, random_vectors, wordindexmap)
        # Insert the permuted signature into each prefix tree and find its
        # nearest neighbour in that tree.
        nearestNeighbours = []
        for i in range(modP):
            signPerm = [None] * len(tweetsign)
            for x in range(len(tweetsign)):
                ind = int(a[i] * x + b[i]) % prime
                signPerm[x] = tweetsign[ind]
            if P[i].has_key(signPerm):
                P[i][signPerm].append(tweetid)
            else:
                P[i][signPerm] = [tweetid]
            neighbor, hdist = nearest_neighbor.getNN(signPerm, P[i])
            if neighbor is None:
                continue
            if hdist == 0:
                neighbor.remove(tweetid)  # drop the tweet itself
                nearestNeighbours.append((neighbor, hdist))
            elif hdist == 1:
                nearestNeighbours.append((neighbor, hdist))
            elif hdist > 1:
                templist = []
                for item in neighbor:
                    templist += item[1]
                nearestNeighbours.append((templist, hdist))

        # Keep only the neighbours at the minimum Hamming distance.
        mindist = len(signPerm) + 10
        closestNeighbors = []
        for pair in nearestNeighbours:
            if pair[1] <= mindist:
                mindist = pair[1]
        for pair in nearestNeighbours:
            if pair[1] == mindist:
                for candidate in pair[0]:
                    if candidate not in closestNeighbors:
                        closestNeighbors.append(candidate)

        T = 0.05  # similarity threshold (originally read from input)
        tweetclustermap[0] = 0
        clusterdict[0] = [0]
        for cneighbor in closestNeighbors:
            if similarity.cosine_similarity(fvec, fvecs[cneighbor]) >= T:
                # Join the neighbour's cluster.
                if tweetid in tweetclustermap:
                    if tweetclustermap[tweetid] != tweetclustermap[cneighbor]:
                        tweetclustermap[tweetid] = tweetclustermap[cneighbor]
                        clusterdict[tweetclustermap[cneighbor]].append(tweetid)
                else:
                    tweetclustermap[tweetid] = tweetclustermap[cneighbor]
                    clusterdict[tweetclustermap[cneighbor]].append(tweetid)
            elif tweetid not in tweetclustermap:
                # No sufficiently similar neighbour yet: start a new cluster.
                tweetclustermap[tweetid] = lastclusterid + 1
                clusterdict[lastclusterid + 1] = [tweetid]
                lastclusterid += 1
        tweetid = tweetid + 1
    return clusterdict, fvecs, freqdict
# Average centre (alternative, unused):
# classe_i = np.argwhere(labels == i)
# cur_centre_item = np.mean(feature[classe_i], axis=0)

# Min-distance centre: pick the member closest to the class mean.
classe_i = np.argwhere(labels == i)
mean_centre = np.mean(feature[classe_i], axis=0)
eudistance = np.array([eucliDist(mean_centre[0], r[0]) for r in feature[classe_i]])
min_centre = classe_i[np.argmin(eudistance)]
min_centre = min_centre[0]

is_old = 0
cur_centre = np.concatenate((cur_centre, [feature[min_centre]]), axis=0)
# A centre counts as "old" if it is close (cosine > 0.8) to any previous centre.
if pre_centre.any():
    for item in pre_centre:
        cos_value = cosine_similarity(item, feature[min_centre])
        if cos_value > 0.8:
            old.append(train_data[min_centre] + [str(labels[min_centre])])
            is_old = 1
            break
if not is_old:
    new.append(train_data[min_centre] + [str(labels[min_centre])])

pre_centre = cur_centre

# Write the cluster result.
write_data = []
print(train_data[0])
for i, item in enumerate(train_data):
    item.append(labels[i])
    write_data.append(item)
write_data.sort(key=lambda a: a[-1])
write_data = ['\t'.join([l[0], l[4], l[1], str(l[5])]) for l in write_data]
def test_similar_cosine_similarity(self):
    with self.assertRaises(NotImplementedError):
        VideoHistory.similar('http://www.inf.puc-rio.br', 5, sim.cosine_similarity())
library_path = os.listdir(copyrighted_works)  # list of text files
files = [file for file in library_path if file.endswith('.txt')]
for file in files:
    # Read each copyrighted work, extract its TF-IDF terms and values, and
    # compute the similarity to the uploaded text file's vector.
    full_path_file = os.path.join(copyrighted_works, file)
    # Try UTF-8 first and fall back to ISO-8859-1, which accepts any byte sequence.
    try:
        with open(full_path_file, "r", encoding='utf-8') as ifile:
            raw_text = ifile.read()  # raw text of one copyrighted work
    except UnicodeDecodeError:
        with open(full_path_file, "r", encoding='iso-8859-1') as ifile:
            raw_text = ifile.read()
    # Dictionary of TF-IDF terms and values: {'tfs_values': ..., 'tfs_term': ...}
    tfs_dict2 = tf_idf(raw_text)
    # Cosine similarity between this copyrighted work and the uploaded text
    # (note the sparse four-argument term/value signature used here).
    similarity = cosine_similarity(tfs_dict1["tfs_term"], tfs_dict1["tfs_values"],
                                   tfs_dict2["tfs_term"], tfs_dict2["tfs_values"])
    print(file, ": ", similarity)
def find_users_similarity(self, user1, user2):
    x = self.user_interface.get_user_vector(user1)
    y = self.user_interface.get_user_vector(user2)
    return similarity.cosine_similarity(x, y)
clusterdict = {}
for i in range(len(rlist)):
    pair = json.loads(rlist[i][0])
    clusterdict[pair[0]] = pair[1]

# For clusterdata2.csv
r = csv.reader(open('clusterdata2.csv', encoding='utf8'))
clusterlist = list(r)
for i in range(len(clusterlist)):
    clusterlist[i] = json.loads(clusterlist[i][0])

# Test the similarity of tweets within one cluster.
r = csv.reader(open('tfidfdata.csv', encoding='utf8'))
fvecs = list(r)
for i in range(len(fvecs)):
    fvecs[i] = json.loads(fvecs[i][0])

clusterid = 23  # originally read from input
simlist = [similarity.cosine_similarity(fvecs[tweetid1], fvecs[tweetid2])
           for tweetid1 in clusterdict[clusterid]
           for tweetid2 in clusterdict[clusterid]
           if tweetid1 != tweetid2]

# Count pairs whose similarity falls below 0.2 as false positives.
false_positives = 0
for sim in simlist:
    if sim < 0.2:
        false_positives += 1
print(false_positives, false_positives / len(simlist))
    else:
        test20_rated_indexes[user_id - 401].append(movie_id)
file.close()

#########################################################################################
test_user_averages = []
threshold = 0.84
k_users = 160
#########################################################################################

# Cosine similarity of every test user to every training user.
cosine_similarities = []
for i in range(100):
    test_user_sims = []
    for j in range(200):
        test_user_sims.append(cosine_similarity(users[j], test5_users[i]))
    cosine_similarities.append(test_user_sims)

# Zero out similarities below the threshold.
for i, row in enumerate(cosine_similarities):
    for j, cell in enumerate(row):
        # TODO: don't use the threshold if only a few users have non-zero weights
        if cell < threshold:
            cosine_similarities[i][j] = 0

all_ratings = []
for k, each_target_user in enumerate(test5_target_indexes):
    final_ratings = []
    for blank_rating in each_target_user:
import csv
import json
import random

import similarity

r = csv.reader(open('tfidfdata.csv', encoding='utf8'))
fvecs = list(r)
for i in range(len(fvecs)):
    fvecs[i] = json.loads(fvecs[i][0])

# For each randomly sampled tweet, store the 50 most similar tweets and the
# (tweetid, i) pairs that produced each similarity.
count = 1
topsims = []
while count > 0:
    tweetid = int(random.uniform(0, len(fvecs)))
    simlist = []
    for i in range(len(fvecs)):
        simlist.append((similarity.cosine_similarity(fvecs[tweetid], fvecs[i]),
                        (tweetid, i)))
    simlist.sort(reverse=True)
    topsims.append(simlist[:50])
    count -= 1

print(topsims)
def test_cosine_similarity(self):
    for vecs, expected in self.cosine_similarity_tests:
        result = cosine_similarity(vecs[0], vecs[1])
        self.assertAlmostEqual(result, expected, places=3)
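# The cosine_similarity_tests fixture is not shown; a plausible shape, with
# hand-checked expected values, would be:
cosine_similarity_tests = [
    (([1, 0], [1, 0]), 1.0),    # identical direction
    (([1, 0], [0, 1]), 0.0),    # orthogonal
    (([1, 1], [1, 0]), 0.707),  # 45 degrees: 1/sqrt(2)
]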
mu, sigma, inf_mu, inf_sigma, z = model.get_distribution(center_vec, context_vec)

for gold_candidate in gold_dict[lst_item.target_word]:
    if gold_candidate not in vocab.index:
        # TODO: print something out and track
        continue
    vec = torch.LongTensor(np.array([vocab[gold_candidate]]))
    if model.use_cuda:
        vec = vec.cuda()  # move the candidate ids to the GPU
    mu_s, sigma_s, inf_mu_s, inf_sigma_s, z_s = model.get_distribution(vec, context_vec)

    # Cosine similarity between the prior means of the target and the candidate.
    score_mu = cosine_similarity(mu.squeeze().detach().cpu().numpy(),
                                 mu_s.squeeze().detach().cpu().numpy())
    scores_mu[lst_item.complete_word, lst_item.sentence_id].append(
        (gold_candidate, score_mu))

    # TODO: not sure if the minus is required
    # posterior (word) (inf) || prior (candidate)
    kl_prior = -1 * kl_div(inf_mu, inf_sigma, mu_s, sigma_s)
    scores_kl_prior[lst_item.complete_word, lst_item.sentence_id].append(
        (gold_candidate, kl_prior.item()))

    kl_post = -1 * kl_div(inf_mu, inf_sigma, inf_mu_s, inf_sigma_s)
    scores_kl_post[lst_item.complete_word, lst_item.sentence_id].append(
        (gold_candidate, kl_post.item()))
preprocessed_git_issues = preprocessing.preprocessing(git_issues)

# Preprocess Stack Overflow posts
print ">>> PREPROCESSING SO POSTS <<<"
preprocessed_so_posts = preprocessing.preprocessing(so_posts)

# Similarity
print ">>> TFIDF POSTS <<<"
tfidf_posts = similarity.tfidf(preprocessed_so_posts)
print ">>> TFIDF ISSUES <<<"
tfidf_issues = similarity.tfidf(preprocessed_git_issues)

for post in tfidf_posts:
    for issue in tfidf_issues:
        print ">>> COSINE SIMILARITY <<<"
        cosine_result = similarity.cosine_similarity(post, issue)
        print ">>> Cosine: ", cosine_result

# TO TEST
# teste1 = [["Buying! fdsfsfsd. of", "Buyed? the bug2", "I bug3 bBg3"],
#           ["BUG bug4", "abs bug5", "bug6 bug6"]]
# teste2 = [["Buying! fdsfsfsd. of", "Buyed? the bug2", "I bug3 bBg3"],
#           ["BUG bug4", "abs bug5", "bug6 bug6"]]
# text1 = preprocessing.preprocessing(teste1)
# text2 = preprocessing.preprocessing(teste2)
# print "preprocessed 1: ", text1
# print "preprocessed 2: ", text2
# # Similarity
# result1 = similarity.tfidf(text1)
# result2 = similarity.tfidf(text2)