def create_coherency_features(sentences_index, ref_doc, input_query, model, key):
    query = input_query + key
    ref_doc_sentences = sentences_index[query][ref_doc]
    for top_doc in sentences_index[query]:
        if top_doc == ref_doc:
            continue
        top_doc_sentences = sentences_index[query][top_doc]
        for i, top_doc_sentence in enumerate(top_doc_sentences, start=1):
            sentence_vec = get_sentence_vector(top_doc_sentence, model)
            for j, ref_sentence in enumerate(ref_doc_sentences):
                row = {}
                comb = top_doc + "_" + str(i) + "_" + str(j + 1)
                window = []
                if j == 0:
                    # no previous sentence: use the next sentence twice
                    window.append(get_sentence_vector(ref_doc_sentences[1], model))
                    window.append(get_sentence_vector(ref_doc_sentences[1], model))
                elif j + 1 == len(ref_doc_sentences):
                    # no next sentence: use the previous sentence twice
                    window.append(get_sentence_vector(ref_doc_sentences[j - 1], model))
                    window.append(get_sentence_vector(ref_doc_sentences[j - 1], model))
                else:
                    window.append(get_sentence_vector(ref_doc_sentences[j - 1], model))
                    window.append(get_sentence_vector(ref_doc_sentences[j + 1], model))
                ref_vector = get_sentence_vector(ref_sentence, model)
                row["docSimilarityToPrev"] = cosine_similarity(sentence_vec, window[0])
                row["docSimilarityToRefSentence"] = cosine_similarity(ref_vector, sentence_vec)
                row["docSimilarityToPred"] = cosine_similarity(sentence_vec, window[1])
                row["docSimilarityToPrevRef"] = cosine_similarity(ref_vector, window[0])
                row["docSimilarityToPredRef"] = cosine_similarity(ref_vector, window[1])
                write_files(row, query, comb)
def feature_values(centroid, s_in, s_out, past_winner_centroid):
    result = {}
    result["docCosineToCentroidInVec"] = cosine_similarity(centroid, s_in)
    result["docCosineToCentroidOutVec"] = cosine_similarity(centroid, s_out)
    result["docCosineToWinnerCentroidInVec"] = cosine_similarity(past_winner_centroid, s_in)
    result["docCosineToWinnerCentroidOutVec"] = cosine_similarity(past_winner_centroid, s_out)
    return result
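# Note: most snippets in this file call a cosine_similarity helper that is
# not defined here. A minimal numpy sketch of the dense-vector variant they
# appear to assume (an assumption, not the original implementation; it
# returns 0.0 for a zero vector instead of dividing by zero):
import numpy as np

def cosine_similarity_sketch(a, b):
    a = np.asarray(a, dtype=np.float64)
    b = np.asarray(b, dtype=np.float64)
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0.0:
        return 0.0
    return float(np.dot(a, b) / denom)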
def get_memory_based_user_recommendation():
    request_data = json.loads(request.data)["user"]
    # The User-Location entity is not implemented yet,
    # so this API reads from a CSV file instead.
    header = ['user_id', 'location_id', 'frequency']
    user_location_table = pd.read_csv('./data/preprocessed_data2.csv', sep='\t', names=header)
    n_users = user_location_table.user_id.unique().shape[0]
    n_locations = user_location_table.location_id.unique().shape[0]
    user_location_table = user_location_table.to_numpy()

    # User x Location matrix
    user_location_frequency_matrix = np.zeros((n_users, n_locations))
    # user_location_frequency_matrix could be replaced by a user-location table JOIN.
    for checkin in user_location_table:
        user_location_frequency_matrix[checkin[0], checkin[1]] = checkin[2]

    # User x User matrix
    user_similarity = cosine_similarity(user_location_frequency_matrix)
    selected_user_similarity = user_similarity[request_data]

    # Return the ids of the 10 most similar users.
    # Ref: https://stackoverflow.com/questions/6910641/how-do-i-get-indices-of-n-maximum-values-in-a-numpy-array
    top10_id = selected_user_similarity.argsort()[-11:][::-1]  # cost: O(N log N)
    top10_id = top10_id[1:11]  # drop the user itself
    top10_sim = selected_user_similarity[top10_id]
    top10 = {'user_ids': top10_id.tolist(), 'user_sims': top10_sim.tolist()}
    return jsonify({'success': True, 'top10': top10}), HTTPStatus.OK
def __init__(self, name, dimensions=128, alpha=0.1, beta=1, times=1, clusters=5,
             decay=0.1, order=3, max_iteration=300):
    print("Model initialization started.\n")
    self.input = u"../data/{}.csv".format(name)
    self.name = name
    self.lambd = math.pow(10, 8)
    self.dimensions = dimensions
    self.alpha = alpha
    self.beta = beta
    self.times = times
    self.clusters = clusters
    self.order = order
    self.decay = decay
    self.max_iteration = max_iteration
    self.converge_threshold = math.pow(10, -3)
    self.lower_control = math.pow(10, -8)
    self.G = graph_reader(self.input)
    self.number_of_nodes = len(nx.nodes(self.G))
    self.S = cosine_similarity(self.G)  # cosine similarity
    self.Adj = np.array(nx.adjacency_matrix(self.G).todense())  # adjacency matrix
    self.P = self.high_order_proximity()  # high-order proximity
    self.current_loss = math.pow(10, 10)
    self.round = 0
    self.V = self.matrix_random_initialization(self.number_of_nodes, self.dimensions)
    self.U = self.matrix_random_initialization(self.number_of_nodes, self.dimensions)
    self.H = self.matrix_random_initialization(self.number_of_nodes, self.clusters)
    self.W = self.matrix_random_initialization(self.clusters, self.dimensions)
def predict(self, Xtest):
    # Compute pairwise distances between training and test points.
    N, D = self.X.shape
    T, D = Xtest.shape
    y_pred = np.zeros((T, self.y.shape[1]))
    if self.method == "L2":
        distance = utils.euclidean_dist_squared(self.X, Xtest)
    elif self.method == "cosine":
        # assumption: utils.cosine_similarity returns a distance
        # (e.g. 1 - similarity), so ascending argsort picks neighbours
        distance = utils.cosine_similarity(self.X, Xtest)
    elif self.method == "pearson":
        distance = utils.pearson_corr(self.X, Xtest)
    for t in range(T):
        sorted_distance_k = np.argsort(distance[:, t])[:self.k]
        for l in range(self.labels):
            # conditional probability P(y_l = 1 | x): the fraction of the
            # k nearest neighbours that have label l set
            p = (1 / self.k) * np.sum(self.y[:, l][sorted_distance_k])
            y_pred[t, l] = 1 if p > 0.5 else 0
    return y_pred
def sentiment_similarity_trg(train_line, trg_idxs, word_vectors, sentiment_vectors,
                             method='cosine', lower=True):
    '''
    Returns a sentiment matrix consisting of the similarity of each word to
    pretrained sentiment vectors.
    TODO: it's possible to add a mask for targets here (exclude and replace
    by '2.0') so we know where the sentiment is in relation to the target.
    '''
    seq = []
    words = train_line.strip().split(' ')
    for w in words:
        if lower:
            w = w.lower()
        if w in word_vectors:
            sim_vec = np.array([
                cosine_similarity(sentiment_vectors[label], word_vectors[w])
                for label in sentiment_vectors
            ])
            seq.append(sim_vec)
        else:
            seq.append(np.array([0.0, 0.0, 0.0]))
    return seq
def _find_similar(self, roll_no, x, threshold, verbose):
    if roll_no != x:
        cs = cosine_similarity(self.df.loc[roll_no], self.df.loc[x])
        if cs < threshold:
            if verbose:
                print(f"{roll_no} and {x} have a cosine similarity of {cs}")
            return True
    return False
def softmax_embedding_loss(self, x, y, proxies):
    # np.int is deprecated in recent numpy; use an explicit integer dtype
    idx = torch.from_numpy(np.arange(len(x), dtype=np.int64)).cuda()
    diff_iZ = cosine_similarity(x, proxies)
    numerator_ip = torch.exp(diff_iZ[idx, y] / self.temperature)
    denominator_ip = torch.exp(diff_iZ / self.temperature).sum(1) + 1e-8
    return -torch.log(numerator_ip / denominator_ip)
def similarity_correlation(pairs, similarity_scores, embeddings, word2index):
    """
    Returns the Pearson and Spearman correlation coefficients between human
    judgement and the ranking defined by cosine similarity of embeddings.

    pairs: list of pairs of words
    similarity_scores: the corresponding human similarity judgement scores of pairs
    embeddings: the embedding matrix
    word2index: a dictionary mapping words to (row) indices of the embedding matrix
    """
    not_in_embedds_count = 0
    cosine_sims = []
    scores = []
    # Collect the kept scores alongside the similarities instead of popping
    # from the original list, which would shift the indices of later pairs.
    for pair, score in zip(pairs, similarity_scores):
        w1, w2 = pair
        try:
            cosine_sim = cosine_similarity(embeddings[word2index[w1]],
                                           embeddings[word2index[w2]])
        except KeyError:
            not_in_embedds_count += 1
            continue
        cosine_sims.append(cosine_sim)
        scores.append(score)

    # correlations
    pearson_r, p_pearson = pearsonr(cosine_sims, scores)
    spearman_r, p_spearman = spearmanr(cosine_sims, scores)
    return pearson_r, spearman_r, p_pearson, p_spearman, not_in_embedds_count
def subgraph_to_leaf_vector(self, pred_subgraph_vector, strategy, redundancy_removal=False):
    weights = self.get_node_weights(redundancy_removal=redundancy_removal)
    leaf_node_vec = np.zeros(shape=[self.num_leafs])
    subgraph_mat = np.zeros(shape=[self.num_leafs, len(weights)])

    for subgraph in self.subgraphs.values():
        subgraph_idx = self.get_subgraph_vector_index(subgraph["wd_id"],
                                                      redundancy_removal=redundancy_removal)
        subgraph_vector = self.get_subgraph_vector(subgraph["wd_id"],
                                                   redundancy_removal=redundancy_removal)
        leaf_node_vec[subgraph["class_idx"]] = pred_subgraph_vector[subgraph_idx]
        subgraph_mat[subgraph["class_idx"], :] = subgraph_vector

    if strategy == "leafprob":
        return leaf_node_vec

    if "cossim" in strategy:
        cos_sim = cosine_similarity(gt=subgraph_mat,
                                    prediction=pred_subgraph_vector,
                                    weights=weights)
        if strategy == "cossim":
            return cos_sim
        elif strategy == "leafprob*cossim":
            return leaf_node_vec * cos_sim

    logging.error("Unknown ont2cls strategy. Exiting!")
    return None
def get_country(city1, country1, city2, embeddings):
    # store city1, country1, and city2 in a set called group
    group = set((city1, country1, city2))

    # get the embeddings of city 1, country 1, and city 2
    city1_emb = embeddings[city1]
    country1_emb = embeddings[country1]
    city2_emb = embeddings[city2]

    # get the embedding of country 2: a combination of the embeddings of
    # country 1, city 1 and city 2 (remember: King - Man + Woman = Queen)
    vec = city2_emb - city1_emb + country1_emb

    # initialize the similarity to -1 (it will be replaced by similarities closer to +1)
    similarity = -1
    # initialize country to an empty string
    country = ''

    # loop through all words in the embeddings dictionary
    for word in embeddings.keys():
        # skip words that are already in the group
        if word not in group:
            word_emb = embeddings[word]
            # cosine similarity between the analogy vector and this word's embedding
            cur_similarity = cosine_similarity(word_emb, vec)
            # keep the word with the highest similarity seen so far
            if cur_similarity > similarity:
                similarity = cur_similarity
                # store the country as a (word, similarity) tuple
                country = (word, similarity)

    return country
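# Hypothetical usage sketch for get_country. The toy embeddings below are
# illustrative only; real inputs would be pretrained word vectors.
import numpy as np

toy_embeddings = {
    'Athens': np.array([1.0, 0.0]),
    'Greece': np.array([1.0, 1.0]),
    'Cairo': np.array([0.0, 1.0]),
    'Egypt': np.array([0.0, 2.0]),
}
# vec = Cairo - Athens + Greece = [0, 2]; the closest word outside the
# group is 'Egypt', so this would return ('Egypt', 1.0):
# print(get_country('Athens', 'Greece', 'Cairo', toy_embeddings))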
def HeadUpdate(self, ControlState, PrevWeights, Memory, IsWrite=False):
    """
    For one head, takes the control state, previous weights and memory, and
    outputs the new weights, and for write heads, the erase and add vectors
    as well.
    """
    KeyVector = tf.tanh(Linear(ControlState, self.Params.MemoryDepth, 'KeyVector'))
    KeyStrength = tf.nn.softplus(Linear(ControlState, 1, 'KeyStrength'))
    Gate = tf.sigmoid(Linear(ControlState, 1, 'Gate'))
    ShiftWeights = tf.nn.softmax(
        Linear(ControlState, len(self.Params.ShiftOffsets), 'ShiftWeights'))
    Sharpen = tf.nn.softplus(Linear(ControlState, 1, 'Sharpen')) + 1.

    # Content addressing, interpolation, shifting, and sharpening.
    # keepdims=True keeps the sums broadcastable against the
    # (batch, memory) weight matrices.
    Weights = tf.exp(KeyStrength * cosine_similarity(KeyVector, Memory))
    Weights /= tf.reduce_sum(Weights, 1, keepdims=True)
    Weights = Gate * Weights + (1.0 - Gate) * PrevWeights
    Weights = circular_convolution(Weights, ShiftWeights, self.Params.ShiftOffsets)
    Weights = tf.pow(Weights, Sharpen)
    Weights /= tf.reduce_sum(Weights, 1, keepdims=True)

    if IsWrite:
        Erase = tf.sigmoid(Linear(ControlState, self.Params.MemoryDepth, 'Erase'))
        Add = tf.tanh(Linear(ControlState, self.Params.MemoryDepth, 'Add'))
        return Weights, Erase, Add
    return Weights
def retrieve(K, query, tfidf_docs, text_docs):
    distances = {}
    for doc in tfidf_docs:
        distances[doc] = cosine_similarity(query, tfidf_docs[doc])
    for k in range(K):
        key = max(distances, key=distances.get)
        print(f"{k + 1}. {key} - {text_docs[key]} Score: {distances[key]}")
        distances.pop(key)
def classify(self, x, proxies):
    # np.int is deprecated in recent numpy; use an explicit integer dtype
    idx = torch.from_numpy(np.arange(len(x), dtype=np.int64)).cuda()
    diff_iZ = cosine_similarity(x, proxies)
    numerator_ip = torch.exp(diff_iZ[idx, :] / self.temperature)
    denominator_ip = torch.exp(diff_iZ / self.temperature).sum(1) + 1e-8
    probs = numerator_ip / denominator_ip[:, None]
    return probs
def get_similarity(self, data):
    print("Computing Item similarity...")
    similarity = utils.cosine_similarity(
        data, alpha=self.alpha, asym=self.asym, h=self.h, dtype=np.float32)
    # ARTIST
    similarity += utils.cosine_similarity(
        self.artists_mat, alpha=0.5, asym=True, h=0, dtype=np.float32) * self.artist_w
    # ALBUM
    similarity += utils.cosine_similarity(
        self.albums_mat, alpha=0.5, asym=True, h=0, dtype=np.float32) * self.album_w
    similarity = utils.knn(similarity, self.knn)
    return similarity
def sentence_similarity(target, pred, similarity_model):
    """Calculates the cosine similarity between the sentence embeddings of a
    target and predicted pair using the specified embedding model."""
    try:
        cosine_sim = utils.cosine_similarity(
            similarity_model.sentence_embedding(target).view(1, -1),
            similarity_model.sentence_embedding(pred).view(1, -1))
        return np.around(cosine_sim.item(), 4)
    except Exception:
        print('similarity rejected sentence: ', pred)
        return 0.01
def get_diversity(self, tweet, index, ranking):
    is_in = tweet.id in {tid for _, tid in ranking}
    count = len(ranking) - (1 if is_in else 0)
    if count == 0:
        return 0
    v1 = index.get_tf_idf_vector(tweet.id)
    total = 0
    for _, t in ranking:
        total += cosine_similarity(v1, index.get_tf_idf_vector(t))
    # If the tweet is in the ranking, we take the mean over len(ranking) - 1
    # items; otherwise, over all len(ranking) items.
    return 1 - (total / count)
def find_similar_items(fv, threshold_similar=0.6, num_similar_vectors=3):
    if not _labels:
        return []
    results = []
    for k, v in _labels.items():
        score = utils.cosine_similarity(
            fv, np.array(v['featureVectors'], dtype='float32'))
        if score >= threshold_similar:
            results.append((k, score))
    return sorted(results, key=lambda x: x[1], reverse=True)[:num_similar_vectors]
def score(self, tweet: Tweet, query, normalize=True):
    tweet_v = []
    query_v = []
    tweet_tf = self.index.get_tf(query, tweet.id)
    query_tf = query.get_tf()  # assume get_tf normalizes
    idfs = self.index.get_idf(query)
    for i in range(len(query_tf)):
        tweet_v.append(tweet_tf[i] * idfs[i])
        query_v.append(query_tf[i] * idfs[i])
    return cosine_similarity(tweet_v, query_v,
                             norm_a=normalize, norm_b=normalize)
def predictSentence(sentence, vectorSentimentValues):
    # build a vector representing the sentiment weights
    inp = vectorSentimentValues.values.reshape((len(dbFeatures), ))
    G = inp
    P = (G > 0) * G  # keep entries > 0 (positive words)
    N = (G < 0) * G  # keep entries < 0 (negative words)
    if np.count_nonzero(G) == 0:
        return "unknown sentence"
    elif np.count_nonzero(P) == 0:
        return "negative sentence"
    elif np.count_nonzero(N) == 0:
        return "positive sentence"
    else:
        positiveCos = ut.cosine_similarity(P, G)
        negativeCos = ut.cosine_similarity(N, G)
        if positiveCos > negativeCos:
            return "positive sentence"
        elif positiveCos == negativeCos:
            return "neutral sentence"
        else:
            return "negative sentence"
def get_top_k_suggestions(data, intent_utils, search_postings, k=2):
    query = data["message"]
    query_tokens = utils.lemmatize_text(query.lower())
    query_tokens_set = set(query_tokens)
    stemmed_query_tokens = utils.stem_text(query.lower())
    for token in stemmed_query_tokens:
        if token not in query_tokens_set:
            query_tokens = query_tokens + [token]
    print("lemmatized, stemmed and stripped query tokens: " + json.dumps(query_tokens))

    # remove stop words
    # query_tokens = utils.remove_stop_words(query_tokens, input_type="list")

    results = []
    # The trigger controls whether to use cosine similarity or just a sum of scores.
    trigger = True
    if trigger:
        # initializations
        unique_q_tokens_with_frequencies = dict()
        postings_vocab = search_postings.get_vocabulary()
        postings_word_mapping = search_postings.get_vocabulary(return_type="dict")
        query_vector = [0] * len(postings_vocab)
        doc_set = set()

        # get term frequencies in the query, and collect the set of candidate docs
        for q_token in query_tokens:
            freq = unique_q_tokens_with_frequencies.get(q_token, 0)
            unique_q_tokens_with_frequencies[q_token] = freq + 1
            if search_postings.get_token(q_token):
                doc_set = doc_set.union(
                    set(map(lambda x: x["id"], search_postings.get_token(q_token).doc_list)))

        for q_token in query_tokens:
            # for this token, get the idf
            token_obj = search_postings.get_token(q_token)
            if token_obj:
                # compute tf-idf and store it in the query vector
                idf = token_obj.features["idf"]
                q_tf_idf = unique_q_tokens_with_frequencies[q_token] * idf
                query_vector[postings_word_mapping[q_token]] = q_tf_idf

        # compute the cosine similarity for each doc
        for doc_id in list(doc_set):
            results.append([doc_id,
                            utils.cosine_similarity(search_postings.doc_term_tf_idf[doc_id],
                                                    query_vector)])

    # return the top k results
    sorted_results = sorted(results, key=lambda x: x[1], reverse=True)[:k]
    return [x[0] for x in sorted_results]
def get_test_results(sample_predictions, OntReader):
    node_results = {}
    for sample in sample_predictions.values():
        # get sample ground truth
        gt_class_idx = sample["gt_leaf_class_idx"]
        gt_subgraph_vector = OntReader.get_subgraph_vector(sample["gt_leaf_wd_id"])
        gt_subgraph_nodes = OntReader.get_subgraph_nodes(sample["gt_leaf_wd_id"])

        # get sample prediction
        pred_leaf_node_vector = sample["leaf_node_vector"]
        pred_subgraph_vector = sample["subgraph_vector"]

        # calculate metrics
        accuracy = top_k_accuracy(gt_class_idx, pred_leaf_node_vector, kvals=[1, 3, 5])
        jaccard = jaccard_similarity(gt_subgraph_vector, pred_subgraph_vector)
        cosine = cosine_similarity(gt_subgraph_vector, pred_subgraph_vector)

        # set results for each node in the subgraph of the gt leaf event node
        for node in gt_subgraph_nodes:
            if node["wd_id"] not in node_results:
                node_results[node["wd_id"]] = {
                    "wd_id": node["wd_id"],
                    "wd_label": node["wd_label"],
                    "num_test_images": 0,
                    "metrics": {
                        "accuracy-top1": 0,
                        "accuracy-top3": 0,
                        "accuracy-top5": 0,
                        "jaccard": 0,
                        "cosine": 0,
                    },
                }
            node_results[node["wd_id"]]["num_test_images"] += 1
            metrics = node_results[node["wd_id"]]["metrics"]
            metrics["accuracy-top1"] += accuracy[0]
            metrics["accuracy-top3"] += accuracy[1]
            metrics["accuracy-top5"] += accuracy[2]
            metrics["jaccard"] += jaccard
            metrics["cosine"] += cosine
    return node_results
def sentiment_similarity(train_line, word_vectors, sentiment_vectors, method='cosine', lower=True):
    '''
    Returns a sentiment matrix consisting of the similarity of each word to
    pretrained sentiment vectors.
    '''
    seq = []
    words = train_line.strip().split(' ')
    for w in words:
        if lower:
            w = w.lower()
        if w in word_vectors:
            sim_vec = np.array([cosine_similarity(sentiment_vectors[label], word_vectors[w])
                                for label in sentiment_vectors])
            seq.append(sim_vec)
        else:
            seq.append(np.array([0.0, 0.0, 0.0]))
    return seq
def compute_men_spearman(dm_dict, annotation_file):
    pairs, humans = readMEN(annotation_file)
    system_actual = []
    human_actual = []
    count = 0
    for i in range(len(pairs)):
        human = humans[i]
        a, b = pairs[i]
        if a in dm_dict and b in dm_dict:
            cos = utils.cosine_similarity(dm_dict[a], dm_dict[b])
            system_actual.append(cos)
            human_actual.append(human)
            count += 1
    sp = spearman(human_actual, system_actual)
    return sp, count
def _init_similarity(self, item, another_item):
    """
    Computes and returns the similarity between a pair of items.

    :param item: The first item.
    :type item: int
    :param another_item: The second item.
    :type another_item: int
    """
    return cosine_similarity(
        self.intersections_between(item, another_item),
        self.l1_norm_of(item),
        self.l1_norm_of(another_item))
def compile_similarity_lists(dm_dict, annotation_file):
    pairs, humans = readMEN(annotation_file)
    system_actual = []
    human_actual = []
    eval_pairs = []
    for i in range(len(pairs)):
        human = humans[i]
        a, b = pairs[i]
        if a in dm_dict and b in dm_dict:
            cos = utils.cosine_similarity(dm_dict[a], dm_dict[b])
            system_actual.append(cos)
            human_actual.append(human)
            eval_pairs.append(pairs[i])
    return eval_pairs, human_actual, system_actual
def push(self, tweet: Tweet):
    tid = tweet.id
    vector = self.scorer.get_tweet_vector(tweet)
    self.vectors[tid] = vector
    score = self.scorer.score(tweet, self.query)
    self.scores[tid] = score
    # record pairwise similarities against every tweet already ranked
    total = 0
    div = {}
    for t, tdiv in self.div.items():
        d = cosine_similarity(vector, self.vectors[t])
        div[t] = d
        tdiv[tid] = d
        total += d
    self.div[tid] = div
    self.total_div[tid] = total
    # heapq maintains a min-heap, so the lowest-scored tweet sits at the root
    heappush(self.ranking, (score, tid))
def runScript(query_dist, dm_dict, pears_ids):
    best_pears = []
    #############################################################
    # Calculate a score for each pear in relation to the user query
    #############################################################
    if len(query_dist) > 0:
        pears_scores = {}
        for pear_name, v in pears_ids.items():
            score = cosine_similarity(np.array(v), query_dist)
            if not isnan(score):
                pears_scores[pear_name] = score
                print(pear_name, score)
        best_pears = outputBestPears(pears_scores)
    return best_pears
def test_cosine_similarity(self):
    sim = cosine_similarity(0, 5, 5)
    sim1 = cosine_similarity(1, 5, 5)
    sim2 = cosine_similarity(2, 5, 5)
    sim3 = cosine_similarity(3, 5, 5)
    sim4 = cosine_similarity(4, 5, 5)
    sim5 = cosine_similarity(5, 5, 5)
    self.assertAlmostEqual(sim, 0, delta=0.01)
    self.assertAlmostEqual(sim1, 0.2, delta=0.01)
    self.assertAlmostEqual(sim2, 0.4, delta=0.01)
    self.assertAlmostEqual(sim3, 0.6, delta=0.01)
    self.assertAlmostEqual(sim4, 0.8, delta=0.01)
    self.assertAlmostEqual(sim5, 1.0, delta=0.01)
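# The expected values above pin down the signature of this cosine_similarity
# variant: it takes a dot product and two squared norms (or a set-intersection
# count and two set sizes, as in _init_similarity above) rather than raw
# vectors. A sketch consistent with the assertions (an assumption, not the
# tested implementation):
import math

def cosine_similarity_from_counts(dot, sq_norm_a, sq_norm_b):
    # sim = dot / (sqrt(sq_norm_a) * sqrt(sq_norm_b))
    if sq_norm_a == 0 or sq_norm_b == 0:
        return 0.0
    return dot / math.sqrt(sq_norm_a * sq_norm_b)

# cosine_similarity_from_counts(3, 5, 5) == 3 / 5 == 0.6, matching sim3.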
def evaluate_auc(self, X_test, Y_test):
    test_size = len(X_test)
    Y_true = [int(i) for i in Y_test]
    Y_score = []
    for i in range(test_size):
        start_node_emb = np.array(self.embeddings[X_test[i][0]]).reshape(-1, 1)
        end_node_emb = np.array(self.embeddings[X_test[i][1]]).reshape(-1, 1)
        score = cosine_similarity(start_node_emb, end_node_emb)  # ranging over [-1, +1]
        Y_score.append(score)
    if len(Y_true) == 0:
        print('------- NOTE: the two graphs have no changes -> no testing data -> setting result to 1......')
        auc = 1.0
    else:
        auc = auc_score(y_true=Y_true, y_score=Y_score)
    print("cos sim; auc=", "{:.9f}".format(auc))
def scoreDS(query_dist, url_dict):
    DS_scores = {}
    for url, doc_dist in url_dict.items():
        score = cosine_similarity(doc_dist, query_dist)
        DS_scores[url] = score
    return DS_scores
def test_cosine_similarity(self):
    self.assertEqual(1.0, ut.cosine_similarity([0, 1], [0, 1]))
    self.assertEqual(0.0, ut.cosine_similarity([1, 0], [0, 1]))
    self.should_raise_for_diff_len_args(ut.cosine_similarity, [0, 1], [0])
    self.should_raise_for_diff_len_args(ut.cosine_similarity, [0], [])
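# These assertions imply a vector-based cosine_similarity that validates its
# arguments. A sketch consistent with them (an assumption: the tested helper
# raises for mismatched-length or empty inputs):
import math

def cosine_similarity_strict(a, b):
    if len(a) != len(b) or not a:
        raise ValueError("vectors must be non-empty and of equal length")
    dot = sum(x * y for x, y in zip(a, b))
    norm = math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(y * y for y in b))
    return dot / norm if norm else 0.0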