def sim_bow(self, pred, pred_lens, ref, ref_lens):
    """
    :param pred: ndarray [batch_size x seqlen]
    :param pred_lens: list of integers
    :param ref: ndarray [batch_size x seqlen]
    :param ref_lens: list of integers
    """
    # look up word embeddings for prediction and reference
    emb_pred = self.embedding(pred)  # [batch_sz x seqlen1 x emb_sz]
    emb_ref = self.embedding(ref)  # [batch_sz x seqlen2 x emb_sz]

    ext_emb_pred = self.extrema(emb_pred, pred_lens)
    ext_emb_ref = self.extrema(emb_ref, ref_lens)
    bow_extrema = cosine(ext_emb_pred, ext_emb_ref)  # [batch_sz_pred x batch_sz_ref]

    avg_emb_pred = self.mean(emb_pred, pred_lens)  # calculate mean over seq
    avg_emb_ref = self.mean(emb_ref, ref_lens)
    bow_avg = cosine(avg_emb_pred, avg_emb_ref)  # [batch_sz_pred x batch_sz_ref]

    batch_pred, seqlen_pred, emb_size = emb_pred.shape
    batch_ref, seqlen_ref, emb_size = emb_ref.shape
    cos_sim = cosine(emb_pred.reshape((-1, emb_size)),
                     emb_ref.reshape((-1, emb_size)))  # [(batch_sz*seqlen1) x (batch_sz*seqlen2)]
    cos_sim = cos_sim.reshape((batch_pred, seqlen_pred, batch_ref, seqlen_ref))

    # find words with max cosine similarity (greedy matching in both directions)
    max12 = cos_sim.max(1).mean(2)  # max over seqlen_pred
    max21 = cos_sim.max(3).mean(1)  # max over seqlen_ref
    bow_greedy = (max12 + max21) / 2  # [batch_pred x batch_ref(1)]
    return np.max(bow_extrema), np.max(bow_avg), np.max(bow_greedy)
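# Note: throughout these examples `cosine` is sklearn.metrics.pairwise.cosine_similarity
# (one snippet below imports it under that alias explicitly). It takes 2-D arrays of
# shape (n_samples, n_features) and returns the full pairwise similarity matrix.
# A minimal sketch of that shape convention, on made-up arrays:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity as cosine

pred = np.random.rand(3, 5)  # 3 "prediction" vectors of dimension 5
ref = np.random.rand(2, 5)   # 2 "reference" vectors of dimension 5
sims = cosine(pred, ref)     # one row per prediction, one column per reference
print(sims.shape)            # (3, 2)
print(sims.max())            # best prediction/reference match, as np.max(...) above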
def build_wgraph(alpha=2):
    # with the default alpha the dense similarity matrix is returned;
    # any other alpha thresholds it into a 0/1 adjacency matrix
    if alpha != 2:
        return [[int(cosine(H[i], H[j])[0][0] > alpha) for i in range(len(H))]
                for j in range(len(H))]
    return [[cosine(H[i], H[j])[0][0] for i in range(len(H))]
            for j in range(len(H))]
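# `H` is a module-level global in build_wgraph; a runnable usage sketch under the
# assumption that each H[i] is a (1, d) row vector (consistent with the [0][0]
# indexing above):
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity as cosine

H = [np.random.rand(1, 8) for _ in range(4)]  # hypothetical embeddings
print(build_wgraph())           # dense cosine-similarity matrix
print(build_wgraph(alpha=0.9))  # 0/1 adjacency matrix thresholded at 0.9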
def parallelSimilarity(paramList):
    protein_embedding_dataframe = representation_dataframe
    i = paramList[0]
    j = paramList[1]
    aspect = paramList[2]
    if j > i:
        protein1 = proteinListNew[i]
        protein2 = proteinListNew[j]
        if protein1 in protein_names and protein2 in protein_names:
            prot1vec = np.asarray(
                protein_embedding_dataframe.query("Entry == @protein1")
                ['Vector'].item())
            prot2vec = np.asarray(
                protein_embedding_dataframe.query("Entry == @protein2")
                ['Vector'].item())
            # cosine returns a matrix shaped by the first dimension of its inputs
            cos = cosine(prot1vec.reshape(1, -1),
                         prot2vec.reshape(1, -1)).item()
            manhattanDist = cdist(prot1vec.reshape(1, -1),
                                  prot2vec.reshape(1, -1), 'cityblock')
            manhattanDistNorm = manhattanDist / (norm(prot1vec, 1) +
                                                 norm(prot2vec, 1))
            euclideanDist = cdist(prot1vec.reshape(1, -1),
                                  prot2vec.reshape(1, -1), 'euclidean')
            euclideanDistNorm = euclideanDist / (norm(prot1vec, 2) +
                                                 norm(prot2vec, 2))
            real = paramList[3]
            # To ensure the real and calculated values are appended to the same
            # position, they are saved simultaneously and then decoupled.
            similarity_list.append((real, cos, 1 - manhattanDistNorm.item(),
                                    1 - euclideanDistNorm.item()))
    return similarity_list
def infer_images():
    '''
    This function uses the trained model to predict the similarity
    between two test samples
    '''
    trained_model = load_model('custom_model.h5')
    '''
    create a Keras Model instance with our desired input and output,
    mainly for usability
    '''
    similarity_model = Model(inputs=trained_model.input,
                             outputs=trained_model.get_layer(
                                 trained_model.layers[-3].name).output)
    # load the dataset
    (x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()
    # reshape the inputs and embed the two test samples
    one = similarity_model.predict(x_test[0].reshape((-1, 28, 28, 1)))
    two = similarity_model.predict(x_test[1].reshape((-1, 28, 28, 1)))
    sim = cosine(one, two)
    print("Similarity between the above: {}".format(round(sim.item(0), 2)))
    return round(sim.item(0), 2)
def instance_predict(instance, collection, labels, vectors, vect, doc_term_mtx, ks):
    """
    - predicts the label of a new instance ('instance') with a KNN approach
      w.r.t. a collection ('collection'), in terms of both the WMD and cosine
      similarity with TFIDF vectors, for various values of K (stored in the
      'ks' list)
    - returns a dictionary with two keys (names of the two methods) and lists
      of length len(ks) as values. Each list contains the predictions of the
      method for each value of K
    """
    #### COMPUTE COSINE SIMILARITY WITH TFIDF VECTORS ####
    # get the tfidf vector of the instance (using the vocab from the training
    # set only - hence 'transform' alone, no fit)
    instance_tfidf = vect.transform([instance[0]])
    # compute cosine similarity between the new instance and all elements in the collection
    sims = cosine(doc_term_mtx, Y=instance_tfidf, dense_output=True).tolist()
    sims = [elt[0] for elt in sims]
    # get indexes of elements sorted in DECREASING order (the greater the better
    # for cosine sim) - one plausible completion of the original placeholder
    idx_st_cos = sorted(range(len(sims)), key=lambda x: sims[x], reverse=True)

    #### COMPUTE WMD ####
    # wmdistance accepts lists of tokens (2nd entry of each tuple)
    dists = [vectors.wmdistance(instance[1], tup[1]) for tup in collection]
    # get indexes of elements sorted in INCREASING order (the smaller the better
    # for the WMD) - one plausible completion of the original placeholder
    idx_st_wmd = sorted(range(len(dists)), key=lambda x: dists[x])

    #### GENERATE PREDICTIONS ####
    # use the 'majority_voting' function; this assumes it takes the labels of
    # the K nearest neighbors
    predictions = {}
    predictions['tfidf'] = [majority_voting([labels[i] for i in idx_st_cos[:k]])
                            for k in ks]
    predictions['wmd'] = [majority_voting([labels[i] for i in idx_st_wmd[:k]])
                          for k in ks]
    return predictions
def compute_context_sis_score(source_word, sis_context, substitution_selection,
                              fasttext_dico, fasttext_emb):
    context_sis = []
    word_context = []
    # keep only context words that have an embedding and differ from the source word
    for con in sis_context:
        if con == source_word or (con not in fasttext_dico):
            continue
        word_context.append(con)

    if len(word_context) != 0:
        # score each substitution by its average similarity to the context words
        for sub in substitution_selection:
            sub_emb = fasttext_emb[fasttext_dico.index(sub)].reshape(1, -1)
            all_sis = 0
            for con in word_context:
                token_index_fast = fasttext_dico.index(con)
                all_sis += cosine(sub_emb,
                                  fasttext_emb[token_index_fast].reshape(1, -1))
            context_sis.append(all_sis / len(word_context))
    else:
        # no usable context words: fall back to rank-based scores
        for i in range(len(substitution_selection)):
            context_sis.append(len(substitution_selection) - i)
    return context_sis
def my_cos_similarity(word1, word2, wv):
    ## fill the gap
    ## hint: use the functions my_vector_getter and cosine
    sim = cosine(my_vector_getter(word1, wv), my_vector_getter(word2, wv))
    return round(float(sim), 4)
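# For reference, a minimal sketch of the helper the snippet above assumes:
# `my_vector_getter` fetches a word's embedding as a 2-D row so that
# cosine_similarity accepts it (the dict-like `wv` layout is an assumption).
import numpy as np

def my_vector_getter(word, wv):
    # hypothetical helper: 'wv' maps a word to its 1-D embedding
    return np.array(wv[word]).reshape(1, -1)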
def similarity(x, y):
    k = split_coord(x)
    rt = []
    for i in k:
        rt.append(cosine(np.array(i).reshape(1, -1),
                         np.array(y).reshape(1, -1))[0][0])
    return rt
def cosseno(matriz):
    # fill the missing entries of the matrix with its global mean,
    # then compute the row-wise pairwise cosine similarity
    mu = mediaMatriz(matriz)
    copia = full(matriz.shape, mu)
    linhas, colunas = matriz.nonzero()
    for l, c in zip(linhas, colunas):
        copia[l, c] = matriz[l, c]
    return cosine(copia)
def compute_similarities(vector, corpus, top_n=10):
    """Given an embedding and a corpus, returns the top_n closest embeddings."""
    similarities = {
        k: cosine(vector.reshape(1, -1), v.reshape(1, -1))[0, 0]
        for k, v in corpus.items()
    }
    similarities = dict(
        sorted(similarities.items(), key=itemgetter(1), reverse=True)[:top_n])
    return similarities
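# A quick usage sketch for compute_similarities (the corpus and query vectors
# here are made up; itemgetter comes from the operator module):
import numpy as np
from operator import itemgetter
from sklearn.metrics.pairwise import cosine_similarity as cosine

corpus = {w: np.random.rand(50) for w in ("cat", "dog", "car")}
query = np.random.rand(50)
print(compute_similarities(query, corpus, top_n=2))  # the two closest keys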
def main():
    # load the dataset
    dataset = dataloader()
    dicl = list(dataset.keys())
    for dic in dicl:
        print(dic, ': ', dataset[dic].shape)
    test_x = dataset[dicl[1]]  # (2933, 1024)
    test_y = dataset[dicl[2]]  # (2933, 1)
    x_train, x_test, y_train, y_test = train_test_split(test_x, test_y,
                                                        test_size=0.3)

    # preprocessing
    # standard scaling
    scaler = preprocessing.StandardScaler().fit(dataset[dicl[-1]])
    train_att = scaler.transform(dataset[dicl[-1]])
    scaler = preprocessing.StandardScaler().fit(dataset[dicl[0]])
    test_att = scaler.transform(dataset[dicl[0]])
    # MinMaxScaler
    # train_att = preprocessing.MinMaxScaler().fit_transform(dataset[dicl[-1]])
    # test_att = preprocessing.MinMaxScaler().fit_transform(dataset[dicl[0]])
    # Normalization
    # train_att = preprocessing.normalize(dataset[dicl[-1]], norm="l1")
    # test_att = preprocessing.normalize(dataset[dicl[0]], norm="l1")
    # train_att = preprocessing.normalize(dataset[dicl[-1]], norm="l2")
    # test_att = preprocessing.normalize(dataset[dicl[0]], norm="l2")

    # compute W
    W = sae(dataset[dicl[-3]].T, train_att.T, lambda_)
    # print(W.shape)

    # reconstruct s
    s_ = s2f(W, dataset[dicl[1]])
    print(s_.shape)

    # compute cosine similarity between s_ and test_att
    dist = cosine(s_.T, test_att)
    # print(dist.shape)

    # get the index of the most similar label
    y_ = np.argmax(dist, axis=1)
    # print(y_.shape)
    # print(y_)

    # compute the accuracy on the test set
    print("The accuracy is : ",
          (np.equal(dataset[dicl[2]], dataset[dicl[-2]][y_])).mean())
    x_train = s2f(W, test_x)
    score = nlpway(x_train.T, test_y)
    print('Score : ', score)
def build_weight(rate, adj, user_to_idx, idx_to_user, ratings):
    '''Do some sanity-check prints, and compute cosine weights'''
    print("user_to_idx[1] = ", user_to_idx[1])
    print("idx_to_user[0] = ", idx_to_user[0])
    print("Rating list of user 1, rate[user_to_idx[1]] = ", rate[user_to_idx[1]])
    print("Rating list of user 18157, rate[user_to_idx[18157]] = ",
          rate[user_to_idx[18157]])
    print("Rating list of user 48524, rate[user_to_idx[48524]] = ",
          rate[user_to_idx[48524]])

    rate_set = [0 for i in range(len(rate))]
    for i in range(len(rate)):  # iterate over indices, not the rating lists
        rate_set[i] = set(rate[i])
    print("Rating set of user 1, rate_set[user_to_idx[1]] = ",
          rate_set[user_to_idx[1]])
    print("Rating set of user 18157, rate_set[user_to_idx[18157]] = ",
          rate_set[user_to_idx[18157]])
    print("Rating set of user 48524, rate_set[user_to_idx[48524]] = ",
          rate_set[user_to_idx[48524]])

    weight = [{} for _ in range(len(adj))]
    for u in range(len(adj)):
        if u % 1000 == 0:
            print("User {0}/{1}".format(u, len(adj)))
        for v in adj[u]:
            if v < u:
                continue
            # build rating vectors over the items both users rated
            mutual_set = rate_set[u].intersection(rate_set[v])
            vector_u = np.zeros(len(mutual_set))
            vector_v = np.zeros(len(mutual_set))
            for i, ele in enumerate(mutual_set):
                # print(u, v, ele)
                vector_u[i] = ratings[(u, ele)]
                vector_v[i] = ratings[(v, ele)]
            weight[u][v] = cosine(vector_u, vector_v)
            weight[v][u] = weight[u][v]
            if (u == 1788 and v == 6897) or (u == 1162 and v == 37593):
                print(u, v, vector_u, vector_v)
            # if len(mutual_set) > 10 and random.random() < 0.0001:
            #     print(u, v, idx_to_user[u], idx_to_user[v], mutual_set)
    print(adj[1788][6897])
    print(weight[6897][1788])

    # save the weight matrix to file, to save computation time
    # (~8 mins using the NumPy cosine, ~20 mins using the scipy cosine)
    with open("weight.txt", "w+") as f:
        for i in range(len(adj)):
            print("User {0}/{1}".format(i, len(adj)))
            for v in adj[i]:
                if v < i:
                    f.writelines(str(i) + " " + str(v) + " " +
                                 str(weight[i][v]) + "\n")
    return weight
def compute_scores_and_similarities(dataset):
    score_list = []
    similarity_list = []
    for i in tqdm(range(len(dataset))):
        query_pair = dataset[i]
        question1_vec = (query_pair["id_1_title_vec"] +
                         query_pair["id_1_body_vec"]) / 2.0
        question2_vec = (query_pair["id_2_title_vec"] +
                         query_pair["id_2_body_vec"]) / 2.0
        score = (1 - cosine(question1_vec.numpy(), question2_vec.numpy()))[0]
        similarity = query_pair["similarity"]
        score_list.append(score)
        similarity_list.append(similarity)
    return score_list, similarity_list
def preprocess_SR(source_word, substitution_selection, fasttext_dico,
                  fasttext_emb, word_count):
    ss = []
    # ss_score=[]
    sis_scores = []
    count_scores = []
    # source_count = 10
    # if source_word in word_count:
    #     source_count = word_count[source_word]

    isFast = True
    if source_word not in fasttext_dico:
        isFast = False
    else:
        source_emb = fasttext_emb[fasttext_dico.index(source_word)].reshape(1, -1)
    # fall back to the lowercased source word if needed
    if not isFast and source_word.lower() in fasttext_dico:
        isFast = True
        source_emb = fasttext_emb[fasttext_dico.index(
            source_word.lower())].reshape(1, -1)

    # ss.append(source_word)
    for sub in substitution_selection:
        if sub.lower() not in word_count:
            continue
        else:
            sub_count = word_count[sub.lower()]
        # if sub_count < source_count:
        #     continue
        if isFast:
            if sub not in fasttext_dico:
                if sub.lower() not in fasttext_dico:
                    continue
                else:
                    sub_emb = fasttext_emb[fasttext_dico.index(
                        sub.lower())].reshape(1, -1)
            else:
                sub_emb = fasttext_emb[fasttext_dico.index(sub)].reshape(1, -1)
            sis = cosine(source_emb, sub_emb)[0][0]
            # if sis < 0.35:
            #     continue
            sis_scores.append(sis)
        ss.append(sub)
        count_scores.append(sub_count)
    return ss, sis_scores, count_scores
def neighbors_predict(instance, collection, my_labels, wmd_or_tfidf):
    # compute WMD or cosine similarity between the new (never seen) instance
    # and each instance in the collection
    if wmd_or_tfidf == 'wmd':
        sims = []
        for doc in collection:
            # ! wmdistance works on lists of strings (idx 1 in the tuples)
            sims.append(my_model.wmdistance(' '.join(instance[1]).lower().split(),
                                            ' '.join(doc[1]).lower().split()))
        # get indexes of elements sorted in INCREASING order (it is a distance!)
        sorted_idx = sorted(range(len(sims)), key=lambda x: sims[x])
    elif wmd_or_tfidf == 'tfidf':
        # ! tfidf_vectorizer works on raw text (idx 0 in the tuples)
        doc_term_matrix = tfidf_vectorizer.fit_transform(
            [elt[0] for elt in collection])
        # note that we just transform: fitting has been done on the collection
        instance_vector = tfidf_vectorizer.transform([instance[0]])
        # compute cosine similarity between the new instance and all elements
        # in the collection
        sims = cosine(doc_term_matrix, Y=instance_vector,
                      dense_output=True).tolist()
        sims = [elt[0] for elt in sims]
        # get indexes of elements sorted in DECREASING order
        sorted_idx = sorted(range(len(sims)), key=lambda x: sims[x],
                            reverse=True)

    predictions = []
    # we use odd numbers to break ties
    for k_nn in [3, 7, 11, 17]:
        # get the labels of the k_nn nearest neighbors
        nn_labels = [my_labels[i] for i in sorted_idx][:k_nn]
        # get the most represented label (.items(), not the Python 2 .iteritems())
        counts = dict(Counter(nn_labels))
        max_counts = max(counts.values())
        prediction = [k for k, v in counts.items() if v == max_counts][0]
        predictions.append(prediction)
    return predictions
def pre_SR(source_word, substitution_selection, fasttext_dico, fasttext_emb,
           word_count):
    ss = []
    ##ss_score=[]
    sis_scores = []
    count_scores = []
    can = {}
    score = []

    isFast = True
    if source_word not in fasttext_dico:
        isFast = False
    else:
        source_emb = fasttext_emb[fasttext_dico.index(source_word)].reshape(1, -1)

    #ss.append(source_word)
    for sub in substitution_selection:
        if sub not in word_count:
            continue
        else:
            sub_count = word_count[sub]
            if sub_count <= 3:
                continue
        #if sub_count<source_count:
        #    continue
        if isFast:
            if sub not in fasttext_dico:
                continue
            token_index_fast = fasttext_dico.index(sub)
            sis = cosine(source_emb,
                         fasttext_emb[token_index_fast].reshape(1, -1))
            sis1 = sis[0, 0]
            score.append(sis1)
            #if sis<0.35:
            #    continue
            sis_scores.append(sis)
        ss.append(sub)
        count_scores.append(sub_count)

    # rank candidates by similarity, best first
    can = dict(zip(ss, score))
    can = sorted(can.items(), key=lambda d: d[1], reverse=True)
    return can
def test_auc_step(nn_model, batch):
    title1 = batch[ID1_TITLE_VEC]
    body1 = batch[ID1_BODY_VEC]
    title2 = batch[ID2_TITLE_VEC]
    body2 = batch[ID2_BODY_VEC]
    question1_vec = nn_model.evaluate(title1, body1).data.cpu().numpy()[:, :, 0]
    question2_vec = nn_model.evaluate(title2, body2).data.cpu().numpy()[:, :, 0]
    assert question1_vec.shape == question2_vec.shape
    scores = 1 - cosine(question1_vec, question2_vec)
    similarities = batch[SIMILARITY].cpu().numpy().flatten()
    return torch.FloatTensor(scores), torch.LongTensor(similarities)
def evaluate_tfidf(data, tfidf_vectors, query_to_index, eval_func):
    rrs = []
    for entry_id, eval_query_result in data.items():
        similar_ids = eval_query_result.similar_ids
        candidate_ids = eval_query_result.candidate_ids
        entry_encoding = tfidf_vectors[query_to_index[entry_id]]
        candidate_similarities = []
        for candidate_id in candidate_ids:
            candidate_encoding = tfidf_vectors[query_to_index[candidate_id]]
            similarity = cosine(entry_encoding, candidate_encoding)
            candidate_similarities.append((candidate_id, similarity))
        ranked_candidates = sorted(candidate_similarities,
                                   key=lambda x: x[1], reverse=True)
        ranked_candidates = [x[0] for x in ranked_candidates]
        rrs.append(eval_func(similar_ids, ranked_candidates))
    return np.mean(rrs)
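# `eval_func` above is applied to (similar_ids, ranked_candidates) and the
# results are averaged, which suggests mean reciprocal rank; a plausible sketch
# of such a function (an assumption, not the project's actual implementation):
def reciprocal_rank(similar_ids, ranked_candidates):
    # reciprocal rank of the first relevant candidate, 0 if none is found
    relevant = set(similar_ids)
    for rank, candidate in enumerate(ranked_candidates, start=1):
        if candidate in relevant:
            return 1.0 / rank
    return 0.0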
def preprocess_SR(source_word, substitution_selection, fasttext_dico,
                  fasttext_emb, word_count):
    ss = []
    ##ss_score=[]
    sis_scores = []
    count_scores = []

    source_count = 10
    if source_word in word_count:
        source_count = word_count[source_word]

    isFast = True
    if source_word not in fasttext_dico:
        isFast = False
    else:
        source_emb = fasttext_emb[fasttext_dico.index(source_word)].reshape(1, -1)

    #ss.append(source_word)
    for sub in substitution_selection:
        if sub not in word_count:
            continue
        else:
            sub_count = word_count[sub]
        #if sub_count<source_count:
        #    continue
        if isFast:
            if sub not in fasttext_dico:
                continue
            token_index_fast = fasttext_dico.index(sub)
            sis = cosine(source_emb,
                         fasttext_emb[token_index_fast].reshape(1, -1))
            #if sis<0.35:
            #    continue
            sis_scores.append(sis)
        ss.append(sub)
        count_scores.append(sub_count)
    return ss, sis_scores, count_scores
def evaluate_tfidf_auc(data, tfidf_vectors, query_to_index):
    auc = AUCMeter()
    for entry_id, eval_query_result in data.items():
        similar_ids = eval_query_result.similar_ids
        positives = set(similar_ids)
        candidate_ids = eval_query_result.candidate_ids
        entry_encoding = tfidf_vectors[query_to_index[entry_id]]
        candidate_similarities = []
        targets = []
        for candidate_id in candidate_ids:
            candidate_encoding = tfidf_vectors[query_to_index[candidate_id]]
            similarity = cosine(entry_encoding, candidate_encoding)
            candidate_similarities.append(similarity.item(0))
            targets.append(IS_SIMMILAR_LABEL if candidate_id in positives
                           else NOT_SIMMILAR_LABEL)
        similarities = torch.Tensor(candidate_similarities)
        auc.add(similarities, torch.Tensor(targets))
    return auc.value(MAXIMUM_FALSE_POSITIVE_RATIO)
def predict_similarity(image1, image2):
    '''
    This function calculates the similarity of two images.
    The model used is VGG pretrained on the ImageNet dataset;
    we drop the fully-connected layers, take the output of the last
    convolution block and flatten it to calculate similarity.
    '''
    pretrained_model = VGG16(include_top=False, weights="imagenet",
                             input_shape=(224, 224, 3))
    data1 = np.array(Image.open(image1).convert('RGB').resize((224, 224)))
    data2 = np.array(Image.open(image2).convert('RGB').resize((224, 224)))
    data1 = (data1.reshape(-1, 224, 224, 3) / 255).astype(np.float32)
    data2 = (data2.reshape(-1, 224, 224, 3) / 255).astype(np.float32)
    pred1 = pretrained_model.predict(data1).reshape(1, -1)
    pred2 = pretrained_model.predict(data2).reshape(1, -1)
    sim = cosine(pred1, pred2)
    print("similarity between images: {}".format(round(sim.item(0), 2)))
    return round(sim.item(0), 2)
def get_similarity(self, feat, stat, cls):
    max_id = -1
    max_cos = -1
    # both branches currently resolve to the same value
    if stat:
        nID = self.id_count
    else:
        nID = self.id_count

    a = feat[None, :]
    b = self.embedding_bank[:nID, :]
    if len(b) > 0:
        alive = np.array(self.alive, dtype=int) - 1  # np.int is deprecated
        cosim = cosine(a, b)
        cosim = np.reshape(cosim, newshape=(-1))
        # mask out alive tracks, the current id, and other categories
        cosim[alive] = -2
        cosim[nID - 1] = -2
        cosim[np.where(self.cat_bank[:nID] != cls)[0]] = -2
        max_id = int(np.argmax(cosim) + 1)
        max_cos = np.max(cosim)
    return max_id, max_cos
def raw_score_substitutions(source_word, substitution_selection, wv_dict,
                            wv_emb, word_count):
    """
    Scores substitutions according to the cosine similarity of their word
    vectors to the replaced word's vector, and according to the counts
    """
    filtered_substitutions = []
    cosine_distance_scores = []
    count_scores = []
    is_fast = True
    source_emb = None

    # computing the source word's word vector
    if source_word not in wv_dict:
        is_fast = False
        print("NOT FAST!")
    else:
        source_emb = wv_emb[wv_dict.index(source_word)].reshape(1, -1)

    for sub in substitution_selection:
        # skipping substitutions not in the word stats dictionary
        if sub in word_count:
            sub_count = word_count[sub]
            if is_fast:
                if sub not in wv_dict:
                    continue
                # computing the substitution's word vector's similarity
                # to the source word's
                sub_embedding = wv_emb[wv_dict.index(sub)].reshape(1, -1)
                cosine_distance_scores.append(cosine(source_emb, sub_embedding))
            filtered_substitutions.append(sub)
            count_scores.append(sub_count)
    return filtered_substitutions, cosine_distance_scores, count_scores
def compute_baselines_part2():
    android_database = AndroidDatabase(use_count_vectorizer=True)
    validation_set = android_database.get_validation_dataset()
    testing_set = android_database.get_testing_dataset()
    metrics_list = []
    for dataset in (validation_set, testing_set):
        meter = metrics.AUCMeter()
        for query_pair in tqdm(dataset):
            question1_vec = (query_pair["id_1_title_vec"] +
                             query_pair["id_1_body_vec"]) / 2.0
            question2_vec = (query_pair["id_2_title_vec"] +
                             query_pair["id_2_body_vec"]) / 2.0
            score = (1 - cosine(question1_vec.numpy(),
                                question2_vec.numpy()))[0]
            similarity = query_pair["similarity"]
            meter.add(torch.FloatTensor([score]),
                      torch.LongTensor([similarity]))
        metrics_list.append(meter.value(0.05))
    return {"validation": metrics_list[0], "testing": metrics_list[1]}
def test_step(nn_model, batch):
    questions_title_batch = batch[TITLE_VEC]
    questions_body_batch = batch[BODY_VEC]
    candidate_questions_title_batch = batch[CAND_TITLE_VECS]
    candidate_questions_body_batch = batch[CAND_BODY_VECS]
    similarity_vector_batch = batch[SIMILARITY_VEC].numpy()
    question_vector_batch = nn_model.evaluate(
        questions_title_batch, questions_body_batch).data.cpu().numpy()
    candidate_vector_batch = evaluate_multi_questions(
        nn_model, candidate_questions_title_batch,
        candidate_questions_body_batch).data.cpu().numpy()
    candidate_questions_vec = candidate_vector_batch[0]
    similarity_vector = similarity_vector_batch[0]
    question_vec = question_vector_batch[0].repeat(
        len(candidate_questions_vec[0]), axis=1).swapaxes(1, 0)
    cosines = 1 - cosine(question_vec, candidate_questions_vec.swapaxes(1, 0))
    return cosines, similarity_vector
def __getSumVal(self, query, answer):
    terms1 = self.nlp(query)
    terms2 = self.nlp(answer)
    # sum the 300-d word vectors of every token that has one
    vector1 = [0.0] * 300
    vector2 = [0.0] * 300
    for term in terms1:
        if term.has_vector:
            vector1 = [term.vector[i] + vector1[i]
                       for i in range(len(term.vector))]
    for term in terms2:
        if term.has_vector:
            vector2 = [term.vector[i] + vector2[i]
                       for i in range(len(term.vector))]
    vector1 = array(vector1).reshape(1, -1)
    vector2 = array(vector2).reshape(1, -1)
    return cosine(vector1, vector2)[0][0]
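# The element-wise list arithmetic above can be written more directly with
# numpy; a behavior-equivalent sketch, assuming spaCy-style tokens as in
# __getSumVal:
import numpy as np

def sum_token_vectors(doc, dim=300):
    # sum the word vectors of every token that has one
    vecs = [t.vector for t in doc if t.has_vector]
    return np.sum(vecs, axis=0).reshape(1, -1) if vecs else np.zeros((1, dim))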
def get_excel_graph_model(datafeatures, opts):
    # datafeatures = np.loadtxt(opts['output']+'features_for_{}.txt'.format(opts['tradeday']), delimiter=',')
    stock_num = np.shape(datafeatures)[0]
    with open(opts['output'] + 'index_stocks_for_{}.txt'.format(opts['tradeday']),
              'r') as f:
        stock_index = f.readlines()
    stock_name = []
    for i in range(len(stock_index)):
        find_code = False
        sl = re.split(r'[\/\\]+', stock_index[i])
        for j in range(len(sl)):
            if re.match(r'\d{6}', sl[j]):
                stock_name.append(sl[j])
                find_code = True
        assert find_code
    assert len(stock_name) == stock_num

    # pairwise cosine similarity between all stock feature vectors
    cos_similarity = cosine(datafeatures)
    # np.savetxt('./output/cosine_similarity_for_{}.txt'.format(opts['tradeday']), cos_similarity, delimiter=',')

    # flatten the upper triangle of the similarity matrix into an edge list
    feature_ungraph = pd.DataFrame(columns=['vertex1', 'vertex2', 'weights'])
    vertex1 = []
    vertex2 = []
    weights = []
    for i in list(range(stock_num))[0:-1]:
        for j in list(range(stock_num))[i + 1:]:
            vertex1.append(stock_name[i])
            vertex2.append(stock_name[j])
            weights.append(cos_similarity[i, j])
    feature_ungraph['vertex1'] = vertex1
    feature_ungraph['vertex2'] = vertex2
    feature_ungraph['weights'] = weights
    feature_ungraph.to_excel(
        opts['output'] + 'feature_graph_for_{}.xls'.format(opts['tradeday']),
        sheet_name='sheet1')
    return feature_ungraph
def generate_clusters(self, to_cluster):
    instruments = list(to_cluster.keys())
    # key is a pair of instruments, value is their cosine similarity
    similarity_measures = dict()
    for i in range(0, len(instruments) - 1):
        for j in range(i + 1, len(instruments)):
            similarity_measures[(instruments[i], instruments[j])] = cosine(
                to_cluster[instruments[i]], to_cluster[instruments[j]])[0][0]

    G = nx.Graph()
    G.add_nodes_from(instruments)
    for edge in similarity_measures.keys():
        edge_tuple = list(edge)
        # edge_tuple.append({'weight': similarity_measures[edge]})
        G.add_edge(edge_tuple[0], edge_tuple[1],
                   weight=similarity_measures[edge])
    print(G)
    communities = community.best_partition(G)
    return communities
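# `community.best_partition` above is the Louvain method from the python-louvain
# package; a minimal end-to-end sketch on made-up (1, d) feature vectors:
import numpy as np
import networkx as nx
import community  # python-louvain
from sklearn.metrics.pairwise import cosine_similarity as cosine

features = {name: np.random.rand(1, 16) for name in ("piano", "violin", "drums")}
G = nx.Graph()
G.add_nodes_from(features)
names = list(features)
for i in range(len(names) - 1):
    for j in range(i + 1, len(names)):
        w = cosine(features[names[i]], features[names[j]])[0][0]
        G.add_edge(names[i], names[j], weight=w)
print(community.best_partition(G))  # {node: community id}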
def my_cos_similarity(word1, word2):
    sim = cosine(Wt[vocab[word1], ].reshape(1, -1),
                 Wt[vocab[word2], ].reshape(1, -1))
    return round(float(sim), 4)
import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity as cosine

# defaultdict(int) defaults to 0 (the original lambda returned the type int)
data = defaultdict(lambda: defaultdict(int))
dd = [2, 32, 53, 64]
for a in dd[1:3]:
    print(a)

user1 = np.array([2, 3, 3, 4, 4, 3, 5, 0, 0, 0, 0, 0, 0])
user2 = np.array([0, 0, 0, 0, 0, 0, 2, 3, 3, 4, 4, 3, 5])
# cosine similarity
print(cosine(user1.reshape(1, -1), user2.reshape(1, -1)))
# correlation
# print(np.corrcoef(user1, user2))

user1 = np.array([2, 3, 3, 4, 4, 3, 5, 0, 0, 0, 0, 0, 0])
user2 = np.array([5, 4, 5, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0])
# cosine similarity
print(cosine(user1.reshape(1, -1), user2.reshape(1, -1)))
# correlation
# print(np.corrcoef(user1, user2))

user1 = np.array([
    2, 3, 3, 4, 4, 3, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0
def cosine_similarity(s_1, s_2):
    # s_1 and s_2 are assumed to be already-embedded sentence vectors
    s_1 = np.reshape(s_1, (1, -1))
    s_2 = np.reshape(s_2, (1, -1))
    # cosine returns a (1, 1) matrix, so take the scalar before rounding
    return round(float(cosine(s_1, s_2)[0, 0]), 5)
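# Usage sketch for the wrapper above, on two made-up, already-embedded
# sentence vectors:
import numpy as np

s_a = np.random.rand(300)
s_b = np.random.rand(300)
print(cosine_similarity(s_a, s_b))  # a single similarity score, rounded to 5 digits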