def _analyse_second_cossim(self, queries, normed_embs, nodes, k, pair):
    """
    This function is called in the multiprocessing of the second order
    cosine similarity.
    """
    # Convert the indices of nearest neighbors back into numpy
    indices_0 = np.asarray(queries[pair[0]])
    indices_1 = np.asarray(queries[pair[1]])
    # Convert the embeddings and nodes back into numpy
    norm_emb_0 = np.asarray(normed_embs[pair[0]])
    norm_emb_1 = np.asarray(normed_embs[pair[1]])
    nodes = np.asarray(nodes)
    # Compute the second order cosine similarity
    pair_results = []
    for i in range(len(nodes)):
        # Build the set of nearest neighbors w.r.t. both embeddings.
        # Use indices from 1 to k+1, because the first entry will always
        # be the node itself.
        neighbors_union = np.union1d(indices_0[i, 1:(k + 1)],
                                     indices_1[i, 1:(k + 1)])
        # Vectors of cosine similarity values of nearest neighbors, one per
        # embedding (the second vector previously reused norm_emb_0, which
        # made the comparison trivial); squeeze to 1-D for the dot product
        m0 = np.squeeze(
            cos_sim(norm_emb_0[neighbors_union],
                    norm_emb_0[nodes[i]].reshape(1, -1)))
        m1 = np.squeeze(
            cos_sim(norm_emb_1[neighbors_union],
                    norm_emb_1[nodes[i]].reshape(1, -1)))
        # Cosine similarity between the two similarity vectors
        pair_results.append(
            float(
                np.dot(m0, m1) /
                (np.linalg.norm(m0) * np.linalg.norm(m1))))
    return pair_results
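# A minimal standalone sketch of the same second-order cosine similarity,
# assuming two toy embedding matrices and k-NN indices from sklearn's
# NearestNeighbors; all names here are illustrative, not part of the class
# above.
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity as cos_sim
from sklearn.neighbors import NearestNeighbors

rng = np.random.default_rng(0)
emb_a = rng.normal(size=(50, 16))
emb_b = emb_a + rng.normal(scale=0.1, size=(50, 16))  # perturbed copy
k = 5
# k+1 neighbors because each point is its own nearest neighbor
idx_a = NearestNeighbors(n_neighbors=k + 1).fit(emb_a).kneighbors(emb_a)[1]
idx_b = NearestNeighbors(n_neighbors=k + 1).fit(emb_b).kneighbors(emb_b)[1]

node = 0
union = np.union1d(idx_a[node, 1:], idx_b[node, 1:])
m0 = np.squeeze(cos_sim(emb_a[union], emb_a[node].reshape(1, -1)))
m1 = np.squeeze(cos_sim(emb_b[union], emb_b[node].reshape(1, -1)))
second_order = float(np.dot(m0, m1) / (np.linalg.norm(m0) * np.linalg.norm(m1)))
print(second_order)  # close to 1 for similar embeddings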
def next_prob(h, h_next, window, sim, w2v):
    """Update the running score when hypothesis h is extended to h_next by
    averaging in the windowed pairwise similarities of the new segments."""
    h_next_words = [''.join(w) for w in h_next.seg['SEG']]  # e.g. ['abc', 'de', 'f']
    if h.m == h_next.m:
        return h.prob
    else:
        prob_i = []
        for i in range(1, h_next.m - h.m + 1):
            prob_j = []
            center = h.m - 1 + i
            for j in range(1, min(window + 1, center + 1)):
                pair_key = '|'.join(
                    sorted([h_next_words[center], h_next_words[center - j]]))
                try:
                    # use the precomputed pair similarity if available ...
                    prob_j.append(sim[pair_key])
                except KeyError:
                    # ... otherwise compute it from the word vectors
                    prob_j.append(
                        float(
                            cos_sim(
                                np.asarray(w2v[h_next_words[center]],
                                           dtype='float32').reshape(1, -1),
                                np.asarray(w2v[h_next_words[center - j]],
                                           dtype='float32').reshape(1, -1))))
            prob_j = sum(prob_j) / len(prob_j)
            prob_i.append(prob_j)
        return ((h.m - 1) * h.prob + sum(prob_i)) / (h_next.m - 1)
def similarity(X, Y=None, Slice=None):
    '''
    Pairwise cosine similarity between X and Y.

    Parameters
    ----------
    X : DataFrame
        Rows to compare.
    Y : DataFrame, optional
        Second table; defaults to X (self-similarity).
    Slice : index, optional
        Slice of single table for memory management purposes.
        The default is None.

    Returns
    -------
    DataFrame
        Similarity matrix indexed by X.index, with columns from Y.

    '''
    if Y is None:
        Y = X
    index = X.index
    columns = Y.columns
    if Slice is not None:
        # restrict the comparison to a single column of Y (the original
        # referenced an undefined `text_vec` here; Y is the likely intent)
        Y = [Y.T[Slice]]
        columns = [Slice]
    out = pd.DataFrame(data=cos_sim(X, Y), index=index, columns=columns)
    return out
def get_cos_sim_list(model, patient_word_list, collocation_function=None):
    """
    calculates cosine similarities of consecutive words or collocations in a
    word list

    cosine similarity for a word pair that has a collocation is calculated as
    cosine similarity between a word vector and an average of word vectors
    from the collocation

    :param model: gensim.word2vec model
    :param patient_word_list: list of strings, words produced by the patient
    :param collocation_function: function combining two word vectors in a
        collocation, if None (default) mean is taken
    :return patient_cos_sim_list: list of float, pairwise cosine similarities
    :return not_found: int, number of words missing from the model vocabulary
    """
    patient_cos_sim_list = []
    not_found = 0
    for j, word in enumerate(patient_word_list):
        if j > 0:
            previous_word = patient_word_list[j - 1]
            word_vector, nf = collocation_handler(model, word,
                                                  collocation_function)
            not_found += nf
            previous_word_vector, nf = collocation_handler(
                model, previous_word, collocation_function)
            not_found += nf
            # explicit None checks: truth-testing a numpy array raises
            # ValueError
            if word_vector is not None and previous_word_vector is not None:
                patient_cos_sim_list.append(
                    cos_sim(word_vector, previous_word_vector))
    not_found = math.ceil(not_found / 2)
    return patient_cos_sim_list, not_found
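# A minimal sketch of the consecutive-word similarity idea above, assuming a
# toy dict of word vectors in place of a trained gensim model and no
# collocation handling (collocation_handler is not defined here).
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity as cos_sim

toy_vectors = {
    'cat': np.array([1.0, 0.0, 0.2]),
    'dog': np.array([0.9, 0.1, 0.3]),
    'car': np.array([0.0, 1.0, 0.0]),
}
words = ['cat', 'dog', 'car']
sims = [
    float(cos_sim(toy_vectors[a].reshape(1, -1),
                  toy_vectors[b].reshape(1, -1)))
    for a, b in zip(words, words[1:])
]
print(sims)  # similarity of each consecutive pair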
def evaluateSimilarity(self, corpusFileName, outputFileName):
    print("Evaluating Word Similarity")
    machine_scores = []
    human_scores = []
    not_found = 0
    words_not_found = []
    output = open(outputFileName, 'w')
    output.write("# Word 1\tWord 2\tHuman (mean)\tMachine\n")
    with open(corpusFileName) as corpus_lines:
        for corpus_line in corpus_lines:
            # Skip comment lines
            if corpus_line[0] == "#":
                continue
            # Read one word pair from the corpus
            line = {}
            line['tag'], line['word_1'], line['word_2'], line[
                'human_score'] = corpus_line.rstrip().split('\t')
            # Retrieve the vectors of the words; skip pairs with
            # out-of-vocabulary words
            if line['word_1'] not in self.glove:
                not_found += 1
                words_not_found.append(line['word_1'])
                continue
            if line['word_2'] not in self.glove:
                not_found += 1
                words_not_found.append(line['word_2'])
                continue
            word1_vec = np.array(self.glove[line['word_1']])
            word2_vec = np.array(self.glove[line['word_2']])
            # Compute the score from the two vectors, scaled to the 0-10
            # range of the human judgments
            machine_score = cos_sim(word1_vec.reshape(1, -1),
                                    word2_vec.reshape(1, -1))[0][0] * 10
            machine_scores.append(machine_score)
            # Human score
            human_scores.append(float(line['human_score']))
            # Write the pair, the human score, and the embedding-based score
            o = '\t'.join([
                line['tag'], line['word_1'], line['word_2'],
                line['human_score'],
                str(round(machine_score, 4))
            ])
            output.write(o + '\n')
    # Evaluate: correlation between the human and machine scores
    evaluation = correlation(human_scores, machine_scores)
    evaluation = round(evaluation[0], 4)
    output.write("# Correlation = " + str(evaluation) + "\n")
    output.close()
    print("Evaluation complete.")
    return evaluation
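# A minimal sketch of the final correlation step, assuming `correlation`
# above is scipy's spearmanr (or pearsonr); both return (statistic, p-value),
# which matches the evaluation[0] indexing used above.
from scipy.stats import spearmanr

human_scores = [7.5, 3.2, 9.0, 1.1]
machine_scores = [6.8, 2.9, 8.1, 2.0]
rho, p_value = spearmanr(human_scores, machine_scores)
print(round(rho, 4))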
def chapter10(in_data, out_path):
    model = word2vec.load('out90.bin')
    with open(out_path, 'w') as f_out:
        for a, b, _ in in_data:
            try:
                cs = cos_sim([model[a]], [model[b]])[0][0]
            except Exception:
                # out-of-vocabulary word: mark the pair with -1
                cs = -1
            print(f'{a} {b} {_} {cs:f}', file=f_out)
def similarite_offsets(list_offsets):
    """For each group of offset vectors, compute the cosine similarity of
    every unordered pair within the group."""
    sim_offsets = []
    for offsets in list_offsets:
        sim_offsets.append([])
        list_tuples = list(offsets)
        for j in range(len(list_tuples)):
            for k in range(j + 1, len(list_tuples)):
                sim_offsets[-1].append(
                    cos_sim([list_tuples[j]], [list_tuples[k]])[0][0])
    return np.array(sim_offsets)
def mutual_information_similarity(file_name):
    """
    Calculates MI between all pairs of short_genre based on their word's MI.
    Writes the similarity scores to file.
    :return:
    """
    from sklearn.metrics.pairwise import cosine_similarity as cos_sim
    import math

    SimilarityScore = collections.namedtuple(
        "SimilarityScore", ("g1", "g2", "score"))  # a type

    # fetch all short genres
    mi_coll = MutualInformation()
    genres = []

    # calculate cosine similarity b/w pairs
    dv = DictVectorizer()

    def extract_bow_add_to_genres(genre, bow):
        if genre not in genres:
            genres.append(genre)
        new_bow = {}
        for k in bow.keys():
            curr = bow[k]
            if math.isnan(curr) or math.isinf(curr):
                # replace non-finite MI values with 0 (the original no-op
                # `new_bow == 0 and print(...)` presumably meant to report
                # these eliminations)
                print("Eliminated element")
                new_bow[k] = 0
            else:
                new_bow[k] = curr
        return new_bow

    bow_matrix = dv.fit_transform(
        extract_bow_add_to_genres(mi_obj.short_genre, mi_obj.bow)
        for mi_obj in mi_coll.iterable())
    print("Done with making vector")

    similarity_matrix = cos_sim(bow_matrix)
    print("Done with similarity calculation")

    # all possible pairs of genres with no repeats, sorted by the cosine
    # similarity score
    sorted_list = []
    for x, y in itertools.combinations(range(0, len(genres)), 2):
        sorted_list.append(
            SimilarityScore(genres[x], genres[y], similarity_matrix[x][y]))
    sorted_list = sorted(sorted_list, key=operator.itemgetter(2), reverse=True)

    print("printing file")
    with open(file_name, mode="a", errors="ignore",
              encoding="latin-1") as file:
        for l in sorted_list:
            file.write("{}, {} value: {}\n".format(l[0], l[1], l[2]))
def store_sim(w2v, coo, sim_path):
    print("Storing cosine similarity of co-occurring word pairs ...")
    sim_dic = {}
    for keys in tqdm(coo):
        w_1, w_2 = keys.split('|')
        # sklearn expects 2-D inputs, so reshape the word vectors
        x_1 = np.asarray(w2v[w_1]).reshape(1, -1)
        x_2 = np.asarray(w2v[w_2]).reshape(1, -1)
        sim_dic[keys] = float(cos_sim(x_1, x_2))
    with open(sim_path, 'w') as f:
        json.dump(sim_dic, f)
    print("Cosine similarity stored in {}".format(sim_path))
    return sim_dic
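# A minimal sketch of the key format assumed above: each co-occurrence key is
# the two words sorted and joined by '|', matching the
# `sim['|'.join(sorted(...))]` lookups elsewhere in this file; the toy
# vectors are illustrative.
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity as cos_sim

toy_w2v = {'data': np.ones(8), 'set': np.full(8, 0.5)}
key = '|'.join(sorted(['set', 'data']))  # 'data|set'
x_1 = toy_w2v['data'].reshape(1, -1)
x_2 = toy_w2v['set'].reshape(1, -1)
print(key, float(cos_sim(x_1, x_2)))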
def chapter09(in_data, out_path):
    ft = load('ft')
    t2i = {token: i for i, token in enumerate(ft)}
    vec = sio.loadmat('../chapter09/pickles/X_300.mat')['X_300']
    with open(out_path, 'w') as f_out:
        for a, b, _ in in_data:
            try:
                cs = cos_sim([vec[t2i[a]]], [vec[t2i[b]]])[0][0]
            except KeyError:
                # out-of-vocabulary token: mark the pair with -1
                cs = -1
            print(f'{a} {b} {_} {cs:f}', file=f_out)
def _analyse_angle_divergence(self, queries, normed_embs, nodes, k, pair):
    """
    This function is called in the multiprocessing of the k-NN angle
    divergence.
    """
    # Convert the indices of nearest neighbors back into numpy
    indices_0 = np.asarray(queries[pair[0]])
    indices_1 = np.asarray(queries[pair[1]])
    # Convert the embeddings and nodes back into numpy
    norm_emb_0 = np.asarray(normed_embs[pair[0]])
    norm_emb_1 = np.asarray(normed_embs[pair[1]])
    nodes = np.asarray(nodes)
    # Compute the k-NN angle divergence
    pair_results = []
    for i in range(len(nodes)):
        # Build the set of nearest neighbors w.r.t. both embeddings.
        # Use indices from 1 to k+1, because the first entry will always
        # be the node itself.
        neighbors_union = np.union1d(indices_0[i, 1:(k + 1)],
                                     indices_1[i, 1:(k + 1)])
        # Vectors of cosine similarity values of nearest neighbors
        cossim_vec0 = np.squeeze(
            cos_sim(norm_emb_0[neighbors_union],
                    norm_emb_0[nodes[i]].reshape(1, -1)))
        cossim_vec1 = np.squeeze(
            cos_sim(norm_emb_1[neighbors_union],
                    norm_emb_1[nodes[i]].reshape(1, -1)))
        # Clip cossim values to the feasible interval, which they might
        # leave due to numerical issues
        cossim_vec0 = np.clip(cossim_vec0, a_min=-1, a_max=1)
        cossim_vec1 = np.clip(cossim_vec1, a_min=-1, a_max=1)
        # Convert to degrees
        m0 = np.degrees(np.arccos(cossim_vec0))
        m1 = np.degrees(np.arccos(cossim_vec1))
        # Mean absolute angle difference between the two neighborhoods
        pair_results.append(np.mean(np.abs(m0 - m1)))
    return pair_results
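# A minimal numeric sketch of the angle-divergence step above: clip cosine
# values into [-1, 1], convert to degrees, and average the absolute
# differences; the two similarity vectors are illustrative.
import numpy as np

cossim_vec0 = np.clip(np.array([1.0000001, 0.5, 0.0]), a_min=-1, a_max=1)
cossim_vec1 = np.clip(np.array([0.9, 0.4, -0.1]), a_min=-1, a_max=1)
m0 = np.degrees(np.arccos(cossim_vec0))  # [0., 60., 90.]
m1 = np.degrees(np.arccos(cossim_vec1))
print(np.mean(np.abs(m0 - m1)))  # mean absolute angle difference in degrees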
def calc_sim(dic, smiles_0, smiles_1, func, pickle_dic, conf_type, fp_kwargs):
    """
    Calculate the similarity between conformers of two different species.
    Args:
        dic (dict): prediction dictionary
        smiles_0 (str): first SMILES string
        smiles_1 (str): second SMILES string
        func (callable): external fingerprinting function, or None to use
            the precomputed conformer fingerprints in `dic`
        pickle_dic (dict): dictionary of the form {smiles: full_pickle_path}
            for each smiles
        conf_type (str): whether you're comparing conformers picked randomly
            for each species ("random") or based on their attention weight
            ("att")
        fp_kwargs (dict): any keyword arguments you may need for your
            fingerprinting function
    Returns:
        sim (float): cosine similarity between two conformers, one from
            each species.
    """
    sub_dic_0 = dic[smiles_0]
    sub_dic_1 = dic[smiles_1]
    if func is not None:
        paths = [pickle_dic[smiles_0], pickle_dic[smiles_1]]
        fp_0_choices, fp_1_choices = choices_from_pickle(paths)
    else:
        fp_0_choices = sub_dic_0["conf_fps"]
        fp_1_choices = sub_dic_1["conf_fps"]
    if conf_type == "att":
        conf_0_idx = sub_dic_0["max_weight_conf"]
        conf_1_idx = sub_dic_1["max_weight_conf"]
        fp_0 = fp_0_choices[conf_0_idx]
        fp_1 = fp_1_choices[conf_1_idx]
    elif conf_type == "random":
        fp_0 = random.choice(fp_0_choices)
        fp_1 = random.choice(fp_1_choices)
    fps = [fp_0, fp_1]
    for j, fp in enumerate(fps):
        if fp_kwargs is None:
            fp_kwargs = {}
        # fingerprints loaded from pickles may still be RDKit molecules
        if isinstance(fp, Chem.rdchem.Mol):
            fps[j] = func(fp, **fp_kwargs)
    sim = cos_sim(fps[0].reshape(1, -1), fps[1].reshape(1, -1)).item()
    return sim
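# A minimal sketch of the final similarity step in calc_sim, assuming two toy
# conformer fingerprints already in vector form (no RDKit molecules involved).
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity as cos_sim

fp_0 = np.array([0.0, 1.0, 1.0, 0.0])
fp_1 = np.array([0.0, 1.0, 0.0, 1.0])
sim = cos_sim(fp_0.reshape(1, -1), fp_1.reshape(1, -1)).item()
print(sim)  # 0.5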
def plot_sims(
        W,  # (n_samples, n_features)
        points,
        labels,
        title=None):
    im = plt.imshow(cos_sim(W), vmin=-1.0, vmax=1.0, cmap='seismic')
    im.axes.xaxis.tick_top()
    plt.colorbar()
    plt.xticks(points, labels, rotation='vertical',
               verticalalignment='bottom')
    plt.yticks(points, labels)
    if title:
        plt.xlabel(title)
    plt.show()
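# A minimal usage sketch of plot_sims, assuming a small random feature matrix
# and relying on the function above (and its module's matplotlib import);
# the labels are illustrative.
import numpy as np

W = np.random.default_rng(0).normal(size=(4, 10))
plot_sims(W, points=range(4), labels=['a', 'b', 'c', 'd'],
          title='pairwise cosine similarity')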
def get_cosine(input_list, y_train):
    X_train, X_test = input_list
    if sparse.issparse(X_train):
        X_train = X_train.toarray()
        X_test = X_test.toarray()
    n_samples = X_train.shape[0]
    n_categs = len(np.unique(y_train))
    # legacy (pre-0.18) sklearn StratifiedKFold API
    kfolds = StratifiedKFold(y_train, 4)
    X_train_features = np.zeros([n_samples, n_categs])
    for train, test in kfolds:
        X1 = X_train[train, :]
        y1 = y_train[train]
        X2 = X_train[test, :]
        # per-class mean vectors from the training fold
        temp = pd.DataFrame(np.c_[y1.reshape(-1, 1), X1])
        m = np.array(temp.groupby(0).mean())
        X_train_features[test, :] = cos_sim(X2, m)
    temp = pd.DataFrame(np.c_[y_train, X_train])
    m = np.array(temp.groupby(0).mean())
    features_cos = [X_train_features, cos_sim(X_test, m)]
    return features_cos
def thread(self, analogy):
    a, b, c = analogy
    a_, b_, c_ = self.__getVectors(a), self.__getVectors(
        b), self.__getVectors(c)
    # analogy vector; ensure 2-D input for sklearn
    d_ = (b_ - a_ + c_).reshape(1, -1)
    d = ""
    max_score = 0
    for i in self.glove:
        # skip the query words themselves
        if i == a or i == b or i == c:
            continue
        score = cos_sim(d_,
                        np.array(self.glove[i]).reshape(1, -1))[0][0] * 10
        if score > max_score:
            max_score = score
            d = i
    return d, max_score
def rating_recommender(self, user):
    similarity_matrix = cos_sim(self.ratings_matrix)
    prediction_matrix = np.zeros(self.ratings_matrix.shape)
    # indices of the 30 most similar users, skipping the user itself
    # (kept as a plain 1-D array so the dot product below yields a scalar)
    index_top30 = np.argsort(similarity_matrix[:, user])[-2:-30 - 2:-1]
    for item in range(self.ratings_matrix.shape[1]):
        if self.ratings_matrix[user][item] == 0:
            # Denominator: sum of the user's similarity to its top-30 users
            denom = np.sum(similarity_matrix[user, :][index_top30])
            # Numerator: similarity-weighted ratings of the top-30 users
            numer = similarity_matrix[user, :][index_top30].dot(
                self.ratings_matrix[:, item][index_top30])
            prediction_matrix[user, item] = numer / denom
    movie_ids = [i for i in np.argsort(prediction_matrix[user, :])[-30:]]
    return movie_ids
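# A minimal self-contained sketch of the same user-based prediction rule:
# predict an unseen rating as the similarity-weighted average of the most
# similar users' ratings; the toy matrix and k=2 are illustrative.
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity as cos_sim

ratings = np.array([[5, 3, 0],
                    [4, 0, 4],
                    [5, 4, 5]], dtype=float)
user, item, k = 0, 2, 2
sim = cos_sim(ratings)
top_k = np.argsort(sim[:, user])[-2:-k - 2:-1]  # skip the user itself
pred = sim[user, top_k].dot(ratings[top_k, item]) / np.sum(sim[user, top_k])
print(pred)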
def get_graph(df):
    use_module = hub.Module(USE_MODEL_PATH)
    df['text'] = df['title'] + ' ' + df['summary']
    df = df[df['text'].apply(
        lambda x: isinstance(x, str) and len(x) >= MINIMUM_CHARACTER_THRESHOLD)]
    df['text'] = df['text'].apply(lambda x: x[:MAXIMUM_CHARACTER_THRESHOLD])
    df = df.reset_index(drop=True)
    with tf.Session() as sess:
        sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
        embeddings_tf = use_module(df['text'].values)
        embeddings = sess.run(embeddings_tf)
    similarities = cos_sim(embeddings)
    # use the upper triangle so self-pairs and mirrored duplicates are
    # excluded from the edge list
    edges = np.argwhere(np.triu(similarities, k=1) >= SIMILARITY_THRESHOLD)
    weights = [(u, v, similarities[u, v]) for u, v in edges]
    weights.sort(key=itemgetter(2), reverse=True)
    return weights
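# A minimal sketch of the edge-building step above with a toy similarity
# matrix; SIMILARITY_THRESHOLD here is illustrative.
import numpy as np
from operator import itemgetter

similarities = np.array([[1.0, 0.9, 0.2],
                         [0.9, 1.0, 0.8],
                         [0.2, 0.8, 1.0]])
SIMILARITY_THRESHOLD = 0.75
edges = np.argwhere(np.triu(similarities, k=1) >= SIMILARITY_THRESHOLD)
weights = [(u, v, similarities[u, v]) for u, v in edges]
weights.sort(key=itemgetter(2), reverse=True)
print(weights)  # [(0, 1, 0.9), (1, 2, 0.8)]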
def fetch_list(movie_user_likes):
    ## Step 1: Read CSV File
    #print(df.head())
    #print(df.columns)

    ## Step 2: Select Features
    features = ['keywords', 'cast', 'genres', 'director']

    ## Step 3: Create a column in DF which combines all selected features
    for feature in features:
        df[feature] = df[feature].fillna('')
    df["combined"] = df.apply(combine_features, axis=1)
    #print(df["combined"].head())

    ## Step 4: Create count matrix from this new combined column
    cv = CountVectorizer()
    count = cv.fit_transform(df["combined"])

    ## Step 5: Compute the Cosine Similarity based on the count matrix
    similarity_score = cos_sim(count)
    #print(similarity_score)

    ## Step 6: Get index of this movie from its title
    index = get_index_from_title(movie_user_likes)

    ## Step 7: Get a list of similar movies in descending order of similarity score
    movies_to_recommend_scores = list(similarity_score[index])
    numbers = list(range(len(movies_to_recommend_scores)))
    result = dict(zip(numbers, movies_to_recommend_scores))
    sorted_keys = sorted(result, key=result.get, reverse=True)

    ## Step 8: Return titles of the top 9 matches (index 0 is the movie itself)
    movies_to_recommend_list = sorted_keys[1:10]
    movies_to_recommend = []
    for i in movies_to_recommend_list:
        movies_to_recommend.append(get_title_from_index(i))
    return movies_to_recommend
def sentence_similarity_with_all_sentence(sentence_bow, document_bow):
    """
    Attribute 5: similarity between each sentence and all the other sentences
    in the document.

    Computes the mean cosine similarity between the current sentence and every
    other sentence on the page, each already converted to a TF-IDF bag of
    words.
    :param sentence_bow: bag of words of the current sentence under consideration
    :param document_bow: bags of words of all other sentences in the document
    :return: mean cosine similarity
    """
    s_cosine_sim = functools.reduce(
        lambda mean, curr_other_sentence: mean + cos_sim(
            sentence_bow, curr_other_sentence) / len(document_bow),
        document_bow, 0)
    return s_cosine_sim
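# A minimal sketch of the running-mean reduce above with toy TF-IDF rows; the
# accumulation is equivalent to averaging the pairwise similarities directly.
import functools
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity as cos_sim

sentence = np.array([[0.2, 0.8, 0.0]])
others = [np.array([[0.1, 0.9, 0.0]]), np.array([[0.0, 0.0, 1.0]])]
mean_sim = functools.reduce(
    lambda mean, other: mean + cos_sim(sentence, other) / len(others),
    others, 0)
print(mean_sim, np.mean([cos_sim(sentence, o) for o in others]))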
def search(vectorizer, index_matrix, query=''):
    """
    :param vectorizer: CountVectorizer or TfIdfVectorizer
    :param index_matrix: tf-idf array
    :param query: string
    :return: an array of document paths, sorted by relevance
    """
    if query != '':
        clean_req = [preproc_req(query)]
        with open('files_index.txt', 'r') as f_idx:
            paths = f_idx.read().strip('\n').split('\n')
        q = vectorizer.transform(clean_req).toarray().reshape(1, -1)
        rel_dict = {}
        for i in range(len(index_matrix)):
            # store a scalar so the ranking sort compares plain floats
            rel_dict[paths[i]] = float(
                cos_sim(index_matrix[i].reshape(1, -1), q))
        result = sorted(rel_dict, key=rel_dict.get, reverse=True)
        print(result)
        return search(
            vectorizer, index_matrix,
            input('Enter a query or press Enter to finish'))
    else:
        return None
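# A minimal self-contained sketch of the same ranking step, assuming a tiny
# in-memory corpus instead of the files_index.txt paths used above.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity as cos_sim

docs = ['the cat sat on the mat', 'dogs chase cats', 'stock markets fell']
vectorizer = TfidfVectorizer()
index_matrix = vectorizer.fit_transform(docs).toarray()
q = vectorizer.transform(['cat and dog']).toarray().reshape(1, -1)
ranked = sorted(
    range(len(docs)),
    key=lambda i: float(cos_sim(index_matrix[i].reshape(1, -1), q)),
    reverse=True)
print([docs[i] for i in ranked])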
def embed_drinks(corpus_path):
    """
    Embed the recipe instruction corpus with an unsupervised fasttext model.

    Params
    ----
    corpus_path: str
        filepath to recipe instruction corpus to train and embed

    Returns
    ----
    pd.DataFrame
        pandas dataframe that contains cosine similarity of embedded drinks
    """
    df = pd.read_csv("../data/recipe_cleaned_v1.csv", index_col=0, dtype=str)
    df = df.fillna("0")
    # train on the corpus passed in (the original hard-coded the path and
    # ignored corpus_path)
    model = fasttext.train_unsupervised(corpus_path)
    embedded_drinks = [model.get_word_vector(x) for x in df.columns]

    # compute cosine similarity between drinks
    sim_matrix = pd.DataFrame(cos_sim(embedded_drinks),
                              columns=phrase_to_word(list(df.columns)),
                              index=phrase_to_word(list(df.columns)))
    return sim_matrix
def get_features(df_train, df_test): n_dep = len(np.unique(np.concatenate( [df_train['Department_'], df_test['Department_']]))) n_fn = len(np.unique(np.concatenate( [df_train['FinelineNumber_'], df_test['FinelineNumber_']]))) n_upc = len(np.unique(np.concatenate( [df_train['Upc_'], df_test['Upc_']]))) # labels y_train = df_train.groupby(['VisitNumber_']).first()['TripType_'] Y_train = pd.get_dummies(y_train).as_matrix() eps = 2**-52 tfidf = TfidfTransformer(norm='l2', sublinear_tf=True, use_idf=True) # tfidf = TfidfTransformer(norm='l2', sublinear_tf=False, use_idf=True) n_br_fn = [] for df in [df_train, df_test]: g = df.groupby( ['VisitNumber_', 'FinelineNumber_']).sum().reset_index() g['br'] = np.logical_and( g['ScanCount_binary'] > 0, g['ScanCount_binary_neg'] > 0) n_br_fn.append( g.groupby(['VisitNumber_']).sum().reset_index()['br']) n_br_upc = [] for df in [df_train, df_test]: g = df.groupby( ['VisitNumber_', 'Upc_', 'ScanCount_binary']).sum().reset_index() g['br'] = np.logical_and( g['ScanCount_binary'] > 0, g['ScanCount_binary_neg'] > 0) n_br_upc.append( g.groupby(['VisitNumber_']).sum().reset_index()['br']) b_bought = [] n_bought = [] for df in [df_train, df_test]: g = df.groupby( ['VisitNumber_']).sum().reset_index() b_bought.append(g['ScanCount_binary'] > 0) n_bought.append(g['ScanCount_rect']) b_returned = [] n_returned = [] for df in [df_train, df_test]: g = df.groupby( ['VisitNumber_']).sum().reset_index() b_returned.append(g['ScanCount_binary_neg'] > 0) n_returned.append(g['ScanCount_rect_neg']) # fn raw and tfidf fn = [] for df in [df_train, df_test]: g = df.groupby( ['VisitNumber_', 'FinelineNumber_', 'ScanCount_binary']).sum().reset_index() n = len(np.unique(df['VisitNumber_'])) g = g[g['ScanCount_binary'] == 1] s = sparse.csr_matrix( (g['ScanCount_rect'], (g['VisitNumber_'], g['FinelineNumber_'])), shape=(n, n_fn), dtype='float64') fn.append(s) # upc raw and tfidf upc = [] for df in [df_train, df_test]: g = df.groupby( ['VisitNumber_', 'Upc_', 'ScanCount_binary']).sum().reset_index() n = len(np.unique(df['VisitNumber_'])) g = g[g['ScanCount_binary'] == 1] s = sparse.csr_matrix( (g['ScanCount_rect'], (g['VisitNumber_'], g['Upc_'])), shape=(n, n_upc), dtype='float64') upc.append(s) tfidf.fit(fn[0]) fn_tfidf = [] for sm in fn: fn_tfidf.append(tfidf.transform(sm)) print('Getting dot product between mean fn and datasets') fn_dot = get_dot(fn, y_train) print('Getting dot product between mean fn_tfidf and datasets') fn_tfidf_dot = get_dot(fn_tfidf, y_train) tfidf.fit(upc[0]) upc_tfidf = [] for sm in upc: upc_tfidf.append(tfidf.transform(sm)) print('Doing SVD on Fineline ScanCounts...') svd = TruncatedSVD(n_components=100) svd.fit(sparse.hstack([fn[0], upc[0]])) fnupc_red = [] for sm1, sm2 in zip(fn, upc): fnupc_red.append(svd.transform(sparse.hstack([sm1, sm2]))) print('Doing SVD on Fineline/UPC TFIDF ScanCounts...') svd = TruncatedSVD(n_components=1500) svd.fit(sparse.hstack([fn_tfidf[0], upc_tfidf[0]])) fnupc_tfidf_red = [] for sm1, sm2 in zip(fn_tfidf, upc_tfidf): fnupc_tfidf_red.append(svd.transform(sparse.hstack([sm1, sm2]))) print('Doing SVD on Fineline TFIDF ScanCounts...\n') svd = TruncatedSVD(n_components=100) svd.fit(fn_tfidf[0]) fn_tfidf_red = [] for sm in fn_tfidf: fn_tfidf_red.append(svd.transform(sm)) fn_r = [] for df in [df_train, df_test]: g = df.groupby( ['VisitNumber_', 'FinelineNumber_', 'ScanCount_binary']).sum().reset_index() n = len(np.unique(df['VisitNumber_'])) g = g[g['ScanCount_binary'] == 0] s = sparse.csr_matrix( (g['ScanCount_rect_neg'], 
(g['VisitNumber_'], g['FinelineNumber_'])), shape=(n, n_fn), dtype='float64') fn_r.append(s) tfidf.fit(fn_r[0]) fn_r_tfidf = [] for sm in fn_r: fn_r_tfidf.append(tfidf.transform(sm)) print('Getting dot product between mean fn_r and datasets') fn_r_dot = get_dot(fn_r, y_train) print('Getting dot product between mean fn_r_tfidf and datasets') fn_r_tfidf_dot = get_dot(fn_r_tfidf, y_train) print('Doing SVD on Fineline Return TFIDF ScanCounts...') svd = TruncatedSVD(n_components=50) svd.fit(fn_r_tfidf[0]) fn_r_tfidf_red = [] for sm in fn_r_tfidf: fn_r_tfidf_red.append(svd.transform(sm)) # ######################################### print('Doing SVD on Fineline Difference ScanCounts...\n') diff_br = [] diff_br.append(fn[0] - fn_r[0]) diff_br.append(fn[1] - fn_r[1]) svd = TruncatedSVD(n_components=100) svd.fit(diff_br[0]) diff_br_red = [] for sm in diff_br: diff_br_red.append(svd.transform(sm)) # department total scan counts dep = [] dep_p = [] dep_entropy = [] for df in [df_train, df_test]: g = df.groupby( ['VisitNumber_', 'Department_', 'ScanCount_binary']).sum().reset_index() n = len(np.unique(df['VisitNumber_'])) g = g[g['ScanCount_binary'] == 1] s = sparse.csr_matrix( (g['ScanCount_rect'], (g['VisitNumber_'], g['Department_'])), shape=(n, n_dep), dtype='float64') dep.append(s.toarray()) m = s.toarray() p = m / np.sum(m, axis=1)[:, np.newaxis] p[np.isnan(p)] = 0 entropy = -np.sum(p * np.log(p + eps), axis=1) dep_p.append(p) dep_entropy.append(entropy) tfidf.fit(dep[0]) dep_tfidf = [] for sm in dep: dep_tfidf.append(tfidf.transform(sm).toarray()) sim_matrix = cos_sim(dep[0].T, dep[0].T) sim_matrix /= np.sum(sim_matrix, axis=0) dep = [d.dot(sim_matrix) for d in dep] # dep = [np.log(i + 1) for i in dep] print('Getting dot product between mean dep and datasets') dep_dot = get_dot(dep, y_train) print('Getting dot product between mean dep_p and datasets') dep_p_dot = get_dot(dep_p, y_train) print('Getting dot product between mean dep_tfidf and datasets') dep_tfidf_dot = get_dot(dep_tfidf, y_train) print('Getting distances between mean dep and datasets') # dep_maha = get_mahalanobis(dep, y_train) dep_manh = get_manhattan(dep, y_train) print('Getting distances between mean dep_p and datasets') # dep_p_maha = get_mahalanobis(dep_p, y_train) dep_p_manh = get_manhattan(dep_p, y_train) print('Getting distances between mean dep_tfidf and datasets') # dep_tfidf_maha = get_mahalanobis(dep_tfidf, y_train) dep_tfidf_manh = get_manhattan(dep_tfidf, y_train) print('Getting euclidean for dep') dep_euclidean = get_euclidean(dep, y_train) print('Getting euclidean for dep_p') dep_p_euclidean = get_euclidean(dep_p, y_train) print('Getting euclidean for dep_tfidf\n') dep_tfidf_euclidean = get_euclidean(dep_tfidf, y_train) print('Getting cosine for dep') dep_cosine = get_cosine(dep, y_train) print('Getting cosine for dep_p') dep_p_cosine = get_cosine(dep_p, y_train) print('Getting cosine for dep_tfidf\n') dep_tfidf_cosine = get_cosine(dep_tfidf, y_train) enc = OneHotEncoder(n_values=n_dep) enc.fit(np.argmax(dep_p[0], axis=1).reshape(-1, 1)) top_dep = [] for m in dep_p: onehot = enc.transform(np.argmax(m, axis=1).reshape(-1, 1)).toarray() no_buy = m.sum(axis=1) == 0 onehot[no_buy, :] = 0 top_dep.append(onehot) dep_sorted = [] dep_p_sorted = [] for m1, m2 in zip(dep, dep_p): dep_sorted.append(np.sort(m1, axis=1)[:, -20:]) dep_p_sorted.append(np.sort(m2, axis=1)[:, -20:]) dep_sorted = [np.log(i + 1) for i in dep_sorted] print('Getting dot product between mean dep_sorted and datasets') dep_sorted_dot = get_dot(dep_sorted, 
y_train) print('Getting dot product between mean dep_p_sorted and datasets') dep_p_sorted_dot = get_dot(dep_p_sorted, y_train) print('Getting distances between mean dep_sorted and datasets') # dep_sorted_maha = get_mahalanobis(dep_sorted, y_train) dep_sorted_manh = get_manhattan(dep_sorted, y_train) print('Getting distances between mean dep_p_sorted and datasets') # dep_p_sorted_maha = get_mahalanobis(dep_p_sorted, y_train) dep_p_sorted_manh = get_manhattan(dep_p_sorted, y_train) print('Getting euclidean for dep_sorted') dep_sorted_euclidean = get_euclidean(dep_sorted, y_train) print('Getting euclidean for dep_p_sorted\n') dep_p_sorted_euclidean = get_euclidean(dep_p_sorted, y_train) print('Getting cosine for dep_sorted') dep_sorted_cosine = get_cosine(dep_sorted, y_train) print('Getting cosine for dep_p_sorted\n') dep_p_sorted_cosine = get_cosine(dep_p_sorted, y_train) # department unique UPCs dep_uniq = [] dep_uniq_p = [] dep_uniq_entropy = [] for df in [df_train, df_test]: g = df.groupby( ['VisitNumber_', 'Department_', 'ScanCount_binary']).size().reset_index() g.rename(columns={0: 'n_unique'}, inplace=True) n = len(np.unique(df['VisitNumber_'])) g = g[g['ScanCount_binary'] == 1] s = sparse.csr_matrix( (g['n_unique'], (g['VisitNumber_'], g['Department_'])), shape=(n, n_dep), dtype='float64') dep_uniq.append(s.toarray()) m = s.toarray() p = m / np.sum(m, axis=1)[:, np.newaxis] p[np.isnan(p)] = 0 entropy = -np.sum(p * np.log(p + eps), axis=1) dep_uniq_p.append(p) dep_uniq_entropy.append(entropy) tfidf.fit(dep_uniq[0]) dep_uniq_tfidf = [] for sm in dep_uniq: dep_uniq_tfidf.append(tfidf.transform(sm).toarray()) sim_matrix = cos_sim(dep_uniq[0].T, dep_uniq[0].T) sim_matrix /= np.sum(sim_matrix, axis=0) dep_uniq = [d.dot(sim_matrix) for d in dep_uniq] # dep_uniq = [np.log(i + 1) for i in dep_uniq] print('Getting dot product between mean dep_uniq and datasets') dep_uniq_dot = get_dot(dep_uniq, y_train) print('Getting dot product between mean dep_uniq_p and datasets') dep_uniq_p_dot = get_dot(dep_uniq_p, y_train) print('Getting dot product between mean dep_uniq_tfidf and datasets') dep_uniq_tfidf_dot = get_dot(dep_uniq_tfidf, y_train) print('Getting distances between mean dep_uniq and datasets') # dep_uniq_maha = get_mahalanobis(dep_uniq, y_train) dep_uniq_manh = get_manhattan(dep_uniq, y_train) print('Getting distances between mean dep_uniq_p and datasets') # dep_uniq_p_maha = get_mahalanobis(dep_uniq_p, y_train) dep_uniq_p_manh = get_manhattan(dep_uniq_p, y_train) print('Getting distances between mean dep_uniq_tfidf and datasets') # dep_uniq_tfidf_maha = get_mahalanobis(dep_uniq_tfidf, y_train) dep_uniq_tfidf_manh = get_manhattan(dep_uniq_tfidf, y_train) print('Getting euclidean for dep_uniq') dep_uniq_euclidean = get_euclidean(dep_uniq, y_train) print('Getting euclidean dep_uniq_p') dep_uniq_p_euclidean = get_euclidean(dep_uniq_p, y_train) print('Getting euclidean for mean dep_uniq_tfidf\n') dep_uniq_tfidf_euclidean = get_euclidean(dep_uniq_tfidf, y_train) print('Getting cosine for dep_uniq') dep_uniq_cosine = get_cosine(dep_uniq, y_train) print('Getting cosine dep_uniq_p') dep_uniq_p_cosine = get_cosine(dep_uniq_p, y_train) print('Getting cosine for mean dep_uniq_tfidf\n') dep_uniq_tfidf_cosine = get_cosine(dep_uniq_tfidf, y_train) # department unique Finelines dep_uniq_fn = [] dep_uniq_fn_p = [] dep_uniq_fn_entropy = [] for df in [df_train, df_test]: g = df.groupby( ['VisitNumber_', 'Department_', 'FinelineNumber_', 'ScanCount_binary']).size().reset_index() g.rename(columns={0: 
'n_unique'}, inplace=True) g['n_unique'][g['n_unique'] > 1] = 1 n = len(np.unique(df['VisitNumber_'])) g = g[g['ScanCount_binary'] == 1] s = sparse.csr_matrix( (g['n_unique'], (g['VisitNumber_'], g['Department_'])), shape=(n, n_dep), dtype='float64') dep_uniq_fn.append(s.toarray()) m = s.toarray() p = m / np.sum(m, axis=1)[:, np.newaxis] p[np.isnan(p)] = 0 entropy = -np.sum(p * np.log(p + eps), axis=1) dep_uniq_fn_p.append(p) dep_uniq_fn_entropy.append(entropy) tfidf.fit(dep_uniq_fn[0]) dep_uniq_fn_tfidf = [] for sm in dep_uniq_fn: dep_uniq_fn_tfidf.append(tfidf.transform(sm).toarray()) sim_matrix = cos_sim(dep_uniq_fn[0].T, dep_uniq_fn[0].T) sim_matrix /= np.sum(sim_matrix, axis=0) dep_uniq_fn = [d.dot(sim_matrix) for d in dep_uniq_fn] # dep_uniq_fn = [np.log(i + 1) for i in dep_uniq_fn] print('Getting dot product between mean dep_uniq_fn and datasets') dep_uniq_fn_dot = get_dot(dep_uniq_fn, y_train) print('Getting dot product between mean dep_uniq_fn_tfidf and datasets') dep_uniq_fn_tfidf_dot = get_dot(dep_uniq_fn_tfidf, y_train) print('Getting dot product between mean dep_uniq_fn_p and datasets') dep_uniq_fn_p_dot = get_dot(dep_uniq_fn_p, y_train) print('Getting distances between mean dep_uniq_fn and datasets') # dep_uniq_fn_maha = get_mahalanobis(dep_uniq_fn, y_train) dep_uniq_fn_manh = get_manhattan(dep_uniq_fn, y_train) print('Getting distances between mean dep_uniq_fn_tfidf and datasets') # dep_uniq_fn_tfidf_maha = get_mahalanobis(dep_uniq_fn_tfidf, y_train) dep_uniq_fn_tfidf_manh = get_manhattan(dep_uniq_fn_tfidf, y_train) print('Getting distances between mean dep_uniq_fn_p and datasets') # dep_uniq_fn_p_maha = get_mahalanobis(dep_uniq_fn_p, y_train) dep_uniq_fn_p_manh = get_manhattan(dep_uniq_fn_p, y_train) print('Getting euclidean for dep_uniq_fn') dep_uniq_fn_euclidean = get_euclidean(dep_uniq_fn, y_train) print('Getting euclidean for dep_uniq_fn_tfidf') dep_uniq_fn_tfidf_euclidean = get_euclidean(dep_uniq_fn_tfidf, y_train) print('Getting euclidean for mean dep_uniq_fn_p\n') dep_uniq_fn_p_euclidean = get_euclidean(dep_uniq_fn_p, y_train) print('Getting cosine for dep_uniq_fn') dep_uniq_fn_cosine = get_cosine(dep_uniq_fn, y_train) print('Getting cosine for dep_uniq_fn_tfidf') dep_uniq_fn_tfidf_cosine = get_cosine(dep_uniq_fn_tfidf, y_train) print('Getting cosine for mean dep_uniq_fn_p\n') dep_uniq_fn_p_cosine = get_cosine(dep_uniq_fn_p, y_train) dep_uniq_fn_sorted = [] dep_uniq_fn_p_sorted = [] for m1, m2 in zip(dep_uniq_fn, dep_uniq_fn_p): dep_uniq_fn_sorted.append(np.sort(m1, axis=1)[:, -20:]) dep_uniq_fn_p_sorted.append(np.sort(m2, axis=1)[:, -20:]) dep_uniq_fn_sorted = [np.log(i + 1) for i in dep_uniq_fn_sorted] print('Getting dot product between mean dep_uniq_fn_sorted and datasets') dep_uniq_fn_sorted_dot = get_dot(dep_uniq_fn_sorted, y_train) print('Getting dot product between mean dep_uniq_fn_p_sorted and datasets') dep_uniq_fn_p_sorted_dot = get_dot(dep_uniq_fn_p_sorted, y_train) print('Getting distances between mean dep_uniq_fn_sorted and datasets') # dep_uniq_fn_sorted_maha = get_mahalanobis(dep_uniq_fn_sorted, y_train) dep_uniq_fn_sorted_manh = get_manhattan(dep_uniq_fn_sorted, y_train) print('Getting distances between mean dep_uniq_fn_p_sorted and datasets') dep_uniq_fn_p_sorted_manh = get_manhattan(dep_uniq_fn_p_sorted, y_train) print('Getting euclidean for dep_uniq_fn_sorted') dep_uniq_fn_sorted_euclidean = get_euclidean(dep_uniq_fn_sorted, y_train) print('Getting for dep_uniq_fn_p_sorted\n') dep_uniq_fn_p_sorted_euclidean = get_euclidean(dep_uniq_fn_p_sorted, 
y_train) print('Getting cosine for dep_uniq_fn_sorted') dep_uniq_fn_sorted_cosine = get_cosine(dep_uniq_fn_sorted, y_train) print('Getting for dep_uniq_fn_p_sorted\n') dep_uniq_fn_p_sorted_cosine = get_cosine(dep_uniq_fn_p_sorted, y_train) # departments scan binaries dep_bin = [] for df in [df_train, df_test]: g = df.groupby( ['VisitNumber_', 'Department_', 'ScanCount_binary']).sum().reset_index() n = len(np.unique(df['VisitNumber_'])) g = g[g['ScanCount_binary'] == 1] s = sparse.csr_matrix( (g['ScanCount_binary'], (g['VisitNumber_'], g['Department_'])), shape=(n, n_dep), dtype='float64') dep_bin.append(s.toarray()) tfidf.fit(dep_bin[0]) dep_bin_tfidf = [] for sm in dep_bin: dep_bin_tfidf.append(tfidf.transform(sm).toarray()) sim_matrix = cos_sim(dep_bin[0].T, dep_bin[0].T) sim_matrix /= np.sum(sim_matrix, axis=0) dep_bin = [d.dot(sim_matrix) for d in dep_bin] print('Getting dot product between mean dep_bin and datasets') dep_bin_dot = get_dot(dep_bin, y_train) print('Getting dot product between mean dep_bin_tfidf and datasets') dep_bin_tfidf_dot = get_dot(dep_bin_tfidf, y_train) print('Getting distances between mean dep_bin and datasets') # dep_bin_maha = get_mahalanobis(dep_bin, y_train) dep_bin_manh = get_manhattan(dep_bin, y_train) print('Getting distances between mean dep_bin_tfidf and datasets') # dep_bin_tfidf_maha = get_mahalanobis(dep_bin_tfidf, y_train) dep_bin_tfidf_manh = get_manhattan(dep_bin_tfidf, y_train) print('Getting euclidean for dep_bin') dep_bin_euclidean = get_euclidean(dep_bin, y_train) print('Getting euclidean for dep_bin_tfidf\n') dep_bin_tfidf_euclidean = get_euclidean(dep_bin_tfidf, y_train) print('Getting cosine for dep_bin') dep_bin_cosine = get_cosine(dep_bin, y_train) print('Getting cosine for dep_bin_tfidf\n') dep_bin_tfidf_cosine = get_cosine(dep_bin_tfidf, y_train) # departments returns dep_r = [] dep_r_p = [] dep_r_entropy = [] for df in [df_train, df_test]: g = df.groupby( ['VisitNumber_', 'Department_', 'ScanCount_binary']).sum().reset_index() n = len(np.unique(df['VisitNumber_'])) g = g[g['ScanCount_binary'] == 0] s = sparse.csr_matrix( (g['ScanCount_rect_neg'], (g['VisitNumber_'], g['Department_'])), shape=(n, n_dep), dtype='float64') dep_r.append(s.toarray()) m = s.toarray() p = m / np.sum(m, axis=1)[:, np.newaxis] p[np.isnan(p)] = 0 entropy = -np.sum(p * np.log(p + eps), axis=1) dep_r_p.append(p) dep_r_entropy.append(entropy) tfidf.fit(dep_r[0]) dep_r_tfidf = [] for sm in dep_r: dep_r_tfidf.append(tfidf.transform(sm).toarray()) sim_matrix = cos_sim(dep_r[0].T, dep_r[0].T) sim_matrix[np.diag_indices(sim_matrix.shape[0])] = 1 sim_matrix /= np.sum(sim_matrix, axis=0) dep_r = [d.dot(sim_matrix) for d in dep_r] # dep_r = [np.log(i + 1) for i in dep_r] print('Getting dot product between mean dep_r and datasets') dep_r_dot = get_dot(dep_r, y_train) print('Getting dot product between mean dep_r_tfidf and datasets') dep_r_tfidf_dot = get_dot(dep_r_tfidf, y_train) print('Getting distances between mean dep_r and datasets') dep_r_manh = get_manhattan(dep_r, y_train) print('Getting distances between mean dep_r_tfidf and datasets') dep_r_tfidf_manh = get_manhattan(dep_r_tfidf, y_train) print('Getting euclidean for dep_r') dep_r_euclidean = get_euclidean(dep_r, y_train) print('Getting euclidean for dep_r_tfidf\n') dep_r_tfidf_euclidean = get_euclidean(dep_r_tfidf, y_train) print('Getting cosine for dep_r') dep_r_cosine = get_cosine(dep_r, y_train) print('Getting cosine for dep_r_tfidf\n') dep_r_tfidf_cosine = get_cosine(dep_r_tfidf, y_train) dep_bought_mr = [] 
dep_r_sorted = [] dep_r_p_sorted = [] for i, (m1, m2) in enumerate(zip(dep_r, dep_r_p)): n = dep[i].shape[0] no_buy = dep_p[i].sum(axis=1) == 0 temp = dep[i][np.arange(n), np.argmax(m1, axis=1)] temp[no_buy] = 0 dep_bought_mr.append(temp) dep_r_sorted.append(np.sort(m1, axis=1)[:, -5:]) dep_r_p_sorted.append(np.sort(m2, axis=1)[:, -5:]) # departments uniques return dep_r_uniq = [] dep_r_uniq_p = [] dep_r_uniq_entropy = [] for df in [df_train, df_test]: g = df.groupby( ['VisitNumber_', 'Department_', 'ScanCount_binary']).size().reset_index() g.rename(columns={0: 'n_unique'}, inplace=True) n = len(np.unique(df['VisitNumber_'])) g = g[g['ScanCount_binary'] == 0] s = sparse.csr_matrix( (g['n_unique'], (g['VisitNumber_'], g['Department_'])), shape=(n, n_dep), dtype='float64') dep_r_uniq.append(s.toarray()) m = s.toarray() p = m / np.sum(m, axis=1)[:, np.newaxis] p[np.isnan(p)] = 0 entropy = -np.sum(p * np.log(p + eps), axis=1) dep_r_uniq_p.append(p) dep_r_uniq_entropy.append(entropy) tfidf.fit(dep_r_uniq[0]) dep_r_uniq_tfidf = [] for sm in dep_r_uniq: dep_r_uniq_tfidf.append(tfidf.transform(sm).toarray()) sim_matrix = cos_sim(dep_r_uniq[0].T, dep_r_uniq[0].T) sim_matrix[np.diag_indices(sim_matrix.shape[0])] = 1 sim_matrix /= np.sum(sim_matrix, axis=0) dep_r_uniq = [d.dot(sim_matrix) for d in dep_r_uniq] # dep_r_uniq = [np.log(i + 1) for i in dep_r_uniq] # departments scan binaries returned dep_r_bin = [] for df in [df_train, df_test]: g = df.groupby( ['VisitNumber_', 'Department_', 'ScanCount_binary']).sum().reset_index() n = len(np.unique(df['VisitNumber_'])) g = g[g['ScanCount_binary'] == 0] s = sparse.csr_matrix( (g['ScanCount_binary_neg'], (g['VisitNumber_'], g['Department_'])), shape=(n, n_dep), dtype='float64') dep_r_bin.append(s.toarray()) tfidf.fit(dep_r_bin[0]) dep_r_bin_tfidf = [] for sm in dep_r_bin: dep_r_bin_tfidf.append(tfidf.transform(sm).toarray()) sim_matrix = cos_sim(dep_r_bin[0].T, dep_r_bin[0].T) sim_matrix[np.diag_indices(sim_matrix.shape[0])] = 1 sim_matrix /= np.sum(sim_matrix, axis=0) dep_r_bin = [d.dot(sim_matrix) for d in dep_r_bin] print('Getting dot product between mean dep_r_bin and datasets') dep_r_bin_dot = get_dot(dep_r_bin, y_train) print('Getting dot product between mean dep_r_bin_tfidf and datasets') dep_r_bin_tfidf_dot = get_dot(dep_r_bin_tfidf, y_train) print('Getting distances between mean dep_r_bin and datasets') dep_r_bin_manh = get_manhattan(dep_r_bin, y_train) print('Getting distances between mean dep_r_bin_tfidf and datasets') dep_r_bin_tfidf_manh = get_manhattan(dep_r_bin_tfidf, y_train) print('Getting euclidean for dep_r_bin\n') dep_r_bin_euclidean = get_euclidean(dep_r_bin, y_train) print('Getting euclidean for dep_r_bin_tfidf\n') dep_r_bin_tfidf_euclidean = get_euclidean(dep_r_bin_tfidf, y_train) print('Getting cosine for dep_r_bin\n') dep_r_bin_cosine = get_cosine(dep_r_bin, y_train) print('Getting cosine for dep_r_bin_tfidf\n') dep_r_bin_tfidf_cosine = get_cosine(dep_r_bin_tfidf, y_train) n_unique_dep = [] for df in [df_train, df_test]: n_unique_dep.append( df.groupby(['VisitNumber_', 'Department_']).size().reset_index(). groupby(['VisitNumber_']).size().as_matrix()) n_unique_fn = [] for df in [df_train, df_test]: n_unique_fn.append( df.groupby(['VisitNumber_', 'FinelineNumber_']).size().reset_index(). groupby(['VisitNumber_']).size().as_matrix()) n_unique_upc = [] for df in [df_train, df_test]: n_unique_upc.append( df.groupby(['VisitNumber_', 'Upc_']).size().reset_index(). 
groupby(['VisitNumber_']).size().as_matrix()) max_scan_count = [] for df in [df_train, df_test]: max_scan_count.append(df.groupby(['VisitNumber_'])['ScanCount'].max()) min_scan_count = [] for df in [df_train, df_test]: min_scan_count.append(df.groupby(['VisitNumber_'])['ScanCount'].min()) mean_scan_count_per_dep = [] for i, df in enumerate([df_train, df_test]): mean_scan_count_per_dep.append( 1. * df.groupby(['VisitNumber_'])['ScanCount'].sum() / n_unique_dep[i]) # Weekday onehot = OneHotEncoder() day_train = onehot.fit_transform( df_train.groupby(['VisitNumber_']) .first()['Weekday_'][:, np.newaxis]).toarray() day_test = onehot.fit_transform( df_test.groupby(['VisitNumber_']) .first()['Weekday_'][:, np.newaxis]).toarray() X_train = np.c_[ fnupc_red[0], fnupc_tfidf_red[0], # fn_tfidf_red[0], fn_r_tfidf_red[0], diff_br_red[0], dep[0], dep_tfidf[0], dep_p[0], dep_entropy[0], dep_uniq[0], dep_uniq_tfidf[0], dep_uniq_p[0], dep_uniq_entropy[0], dep_uniq_fn[0], dep_uniq_fn_tfidf[0], dep_uniq_fn_p[0], dep_uniq_fn_entropy[0], dep_bin[0], dep_bin_tfidf[0], dep_sorted[0], dep_p_sorted[0], dep_uniq_fn_sorted[0], dep_uniq_fn_p_sorted[0], dep_r[0], dep_r_tfidf[0], # dep_r_uniq_tfidf[0], dep_r_bin[0], dep_r_bin_tfidf[0], dep_r_sorted[0], dep_r_p_sorted[0], dep_bought_mr[0], top_dep[0], n_br_fn[0], # n_br_upc[0], b_bought[0], n_bought[0], b_returned[0], n_returned[0], n_unique_dep[0], n_unique_fn[0], n_unique_upc[0], max_scan_count[0], min_scan_count[0], mean_scan_count_per_dep[0], day_train, fn_dot[0], fn_tfidf_dot[0], dep_dot[0], dep_tfidf_dot[0], dep_p_dot[0], dep_uniq_dot[0], dep_uniq_tfidf_dot[0], dep_uniq_p_dot[0], dep_uniq_fn_dot[0], dep_uniq_fn_tfidf_dot[0], dep_uniq_fn_p_dot[0], dep_bin_dot[0], dep_bin_tfidf_dot[0], dep_sorted_dot[0], dep_p_sorted_dot[0], dep_uniq_fn_sorted_dot[0], dep_uniq_fn_p_sorted_dot[0], dep_manh[0], dep_tfidf_manh[0], dep_p_manh[0], dep_uniq_manh[0], dep_uniq_tfidf_manh[0], dep_uniq_p_manh[0], dep_uniq_fn_manh[0], dep_uniq_fn_tfidf_manh[0], dep_uniq_fn_p_manh[0], dep_bin_manh[0], dep_bin_tfidf_manh[0], dep_sorted_manh[0], dep_p_sorted_manh[0], dep_uniq_fn_sorted_manh[0], dep_uniq_fn_p_sorted_manh[0], dep_euclidean[0], dep_tfidf_euclidean[0], dep_p_euclidean[0], dep_uniq_euclidean[0], dep_uniq_tfidf_euclidean[0], dep_uniq_p_euclidean[0], dep_uniq_fn_euclidean[0], dep_uniq_fn_tfidf_euclidean[0], dep_uniq_fn_p_euclidean[0], dep_bin_euclidean[0], dep_bin_tfidf_euclidean[0], dep_sorted_euclidean[0], dep_p_sorted_euclidean[0], dep_uniq_fn_sorted_euclidean[0], dep_uniq_fn_p_sorted_euclidean[0], dep_cosine[0], dep_tfidf_cosine[0], dep_p_cosine[0], dep_uniq_cosine[0], dep_uniq_tfidf_cosine[0], dep_uniq_p_cosine[0], dep_uniq_fn_cosine[0], dep_uniq_fn_tfidf_cosine[0], dep_uniq_fn_p_cosine[0], dep_bin_cosine[0], dep_bin_tfidf_cosine[0], dep_sorted_cosine[0], dep_p_sorted_cosine[0], dep_uniq_fn_sorted_cosine[0], dep_uniq_fn_p_sorted_cosine[0], fn_r_dot[0], fn_r_tfidf_dot[0], dep_r_dot[0], dep_r_bin_dot[0], dep_r_tfidf_dot[0], dep_r_bin_tfidf_dot[0], dep_r_manh[0], dep_r_bin_manh[0], dep_r_tfidf_manh[0], dep_r_bin_tfidf_manh[0], dep_r_euclidean[0], dep_r_bin_euclidean[0], dep_r_tfidf_euclidean[0], dep_r_bin_tfidf_euclidean[0], dep_r_cosine[0], dep_r_bin_cosine[0], dep_r_tfidf_cosine[0], dep_r_bin_tfidf_cosine[0], ] X_test = np.c_[ fnupc_red[1], fnupc_tfidf_red[1], # fn_tfidf_red[1], fn_r_tfidf_red[1], diff_br_red[1], dep[1], dep_tfidf[1], dep_p[1], dep_entropy[1], dep_uniq[1], dep_uniq_tfidf[1], dep_uniq_p[1], dep_uniq_entropy[1], dep_uniq_fn[1], dep_uniq_fn_tfidf[1], dep_uniq_fn_p[1], 
dep_uniq_fn_entropy[1], dep_bin[1], dep_bin_tfidf[1], dep_sorted[1], dep_p_sorted[1], dep_uniq_fn_sorted[1], dep_uniq_fn_p_sorted[1], dep_r[1], dep_r_tfidf[1], # dep_r_uniq_tfidf[1], dep_r_bin[1], dep_r_bin_tfidf[1], dep_r_sorted[1], dep_r_p_sorted[1], dep_bought_mr[1], top_dep[1], n_br_fn[1], # n_br_upc[1], b_bought[1], n_bought[1], b_returned[1], n_returned[1], n_unique_dep[1], n_unique_fn[1], n_unique_upc[1], max_scan_count[1], min_scan_count[1], mean_scan_count_per_dep[1], day_test, fn_dot[1], fn_tfidf_dot[1], dep_dot[1], dep_tfidf_dot[1], dep_p_dot[1], dep_uniq_dot[1], dep_uniq_tfidf_dot[1], dep_uniq_p_dot[1], dep_uniq_fn_dot[1], dep_uniq_fn_tfidf_dot[1], dep_uniq_fn_p_dot[1], dep_bin_dot[1], dep_bin_tfidf_dot[1], dep_sorted_dot[1], dep_p_sorted_dot[1], dep_uniq_fn_sorted_dot[1], dep_uniq_fn_p_sorted_dot[1], dep_manh[1], dep_tfidf_manh[1], dep_p_manh[1], dep_uniq_manh[1], dep_uniq_tfidf_manh[1], dep_uniq_p_manh[1], dep_uniq_fn_manh[1], dep_uniq_fn_tfidf_manh[1], dep_uniq_fn_p_manh[1], dep_bin_manh[1], dep_bin_tfidf_manh[1], dep_sorted_manh[1], dep_p_sorted_manh[1], dep_uniq_fn_sorted_manh[1], dep_uniq_fn_p_sorted_manh[1], dep_euclidean[1], dep_tfidf_euclidean[1], dep_p_euclidean[1], dep_uniq_euclidean[1], dep_uniq_tfidf_euclidean[1], dep_uniq_p_euclidean[1], dep_uniq_fn_euclidean[1], dep_uniq_fn_tfidf_euclidean[1], dep_uniq_fn_p_euclidean[1], dep_bin_euclidean[1], dep_bin_tfidf_euclidean[1], dep_sorted_euclidean[1], dep_p_sorted_euclidean[1], dep_uniq_fn_sorted_euclidean[1], dep_uniq_fn_p_sorted_euclidean[1], dep_cosine[1], dep_tfidf_cosine[1], dep_p_cosine[1], dep_uniq_cosine[1], dep_uniq_tfidf_cosine[1], dep_uniq_p_cosine[1], dep_uniq_fn_cosine[1], dep_uniq_fn_tfidf_cosine[1], dep_uniq_fn_p_cosine[1], dep_bin_cosine[1], dep_bin_tfidf_cosine[1], dep_sorted_cosine[1], dep_p_sorted_cosine[1], dep_uniq_fn_sorted_cosine[1], dep_uniq_fn_p_sorted_cosine[1], fn_r_dot[1], fn_r_tfidf_dot[1], dep_r_dot[1], dep_r_bin_dot[1], dep_r_tfidf_dot[1], dep_r_bin_tfidf_dot[1], dep_r_manh[1], dep_r_bin_manh[1], dep_r_tfidf_manh[1], dep_r_bin_tfidf_manh[1], dep_r_euclidean[1], dep_r_bin_euclidean[1], dep_r_tfidf_euclidean[1], dep_r_bin_tfidf_euclidean[1], dep_r_cosine[1], dep_r_bin_cosine[1], dep_r_tfidf_cosine[1], dep_r_bin_tfidf_cosine[1], ] print('Scaling...') scl = StandardScaler() for i in range(X_train.shape[1]): if len(np.unique(X_train[:, i])) > 2: scl.fit(X_train[:, i].reshape(-1, 1)) xtrain = scl.transform(X_train[:, i].reshape(-1, 1)).flatten() xtest = scl.transform(X_test[:, i].reshape(-1, 1)).flatten() X_train[:, i] = np.clip(xtrain, -25, 25) X_test[:, i] = np.clip(xtest, -25, 25) else: continue return X_train, X_test, Y_train, y_train
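# A minimal sketch of the final scaling step above: standardize each
# non-binary column with statistics fit on the training split only, then clip
# extreme values; the toy matrices are illustrative.
import numpy as np
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
X_train = rng.normal(size=(100, 3))
X_test = rng.normal(size=(20, 3))
scl = StandardScaler()
for i in range(X_train.shape[1]):
    if len(np.unique(X_train[:, i])) > 2:  # skip binary indicator columns
        scl.fit(X_train[:, i].reshape(-1, 1))
        X_train[:, i] = np.clip(
            scl.transform(X_train[:, i].reshape(-1, 1)).flatten(), -25, 25)
        X_test[:, i] = np.clip(
            scl.transform(X_test[:, i].reshape(-1, 1)).flatten(), -25, 25)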
def calc_similarity(self): self.user_similarity = cos_sim(self.training_set) print('User based similarity matrix built...')
from sklearn.feature_extraction.text import CountVectorizer from sklearn.metrics.pairwise import cosine_similarity as cos_sim text = ["London Paris London", "Paris Paris London"] cv = CountVectorizer() count = cv.fit_transform(text) # print(count.toarray()) similarity_score = cos_sim(count) print(similarity_score)
axis=1, inplace=True)

#%%
from sklearn.metrics.pairwise import cosine_similarity as cos_sim

handles = merged.value_counts("Handle").index
# sim_df = pd.DataFrame(columns = ["Handle", "Similarity"])
sim_df = merged.value_counts("Handle").to_frame('Counts')
sim_df.reset_index(inplace=True)
for i, handles_temp in enumerate(handles):
    df_temp = merged.loc[merged["Handle"].astype("str") == handles_temp]
    X = df_temp.loc[:, ["dif"]].values
    Y = df_temp.loc[:, ["sent_score"]].values
    # cosine similarity between the two column vectors (transposed to rows)
    sim_score = cos_sim(np.transpose(X), np.transpose(Y))
    sim_df.loc[i, 'Handle'] = handles_temp
    sim_df.loc[i, 'Similarity'] = sim_score[0][0]
# print(sim_df)
sim_df

# %%
plt.plot(sim_df.index, sim_df.Similarity.values)

# %%
sim_df["Sim_abs"] = abs(sim_df.Similarity.values)
sim_df.drop(sim_df.loc[sim_df.Counts < 100].index, inplace=True)
sim_df.sort_values("Sim_abs", inplace=True, ascending=False)
sim_df

# %%
sim_df.Counts.values

# %%
plt.plot(sim_df.Counts.values[10:])
def similarity(x, y): return cos_sim(x, y)
def cosine_eval(trainer, target, features):
    # trainer is unused here; extract the scalar from each 1x1 similarity
    # matrix and return (similarity, index) pairs so callers can rank the
    # features against the target
    return [(cos_sim([target], [f])[0][0], i)
            for i, f in enumerate(features)]
out_path_ch09 = 'out94_ch09.txt'
out_path_ch10 = 'out94_ch10.txt'

word2vec_model = word2vec.load('out90.bin')
ft = load('ft')
t2i = {token: i for i, token in enumerate(ft)}
X_300 = sio.loadmat('../chapter09/pickles/X_300.mat')['X_300']

with zipfile.PyZipFile(in_path, "r") as myzip, open(out_path_ch09, "w") as f_out_ch09, open(
        out_path_ch10, "w") as f_out_ch10:
    with myzip.open('combined.tab') as f_in:
        for line in map(lambda x: x.decode().rstrip(), f_in):
            words = line.split('\t')
            try:
                cs_09 = cos_sim([X_300[t2i[words[0]]]],
                                [X_300[t2i[words[1]]]])[0][0]
            except Exception:
                # header line or out-of-vocabulary word
                cs_09 = -1
            try:
                cs_10 = cos_sim([word2vec_model[words[0]]],
                                [word2vec_model[words[1]]])[0][0]
            except Exception:
                cs_10 = -1
            print(f"{line}\t{cs_09:f}", file=f_out_ch09)
            print(f"{line}\t{cs_10:f}", file=f_out_ch10)

end = time.time()
print(f"elapsed time = {end - start} s")
        data = pickle.load(f_in)
    return data


in_path = 'out91.txt'
out_path = 'out93.txt'

ft = load('ft')
t2i = {token: i for i, token in enumerate(ft)}
vec = sio.loadmat('../chapter09/pickles/X_300.mat')['X_300']

cnt = [0, 0]
with open(out_path, 'w') as f_out:
    for line in open(in_path):
        a, b, x, y = line.split()
        # a - b = x - y  <=>  y = b - a + x
        try:
            tgt = [vec[t2i[b]] - vec[t2i[a]] + vec[t2i[x]]]
            ranking = [(cos_sim([vec[t2i[key]]], tgt)[0][0], key)
                       for key in ft]
            cs, word = max(ranking)
        except Exception:
            word = '***'
            cs = -1
        cnt[y == word] += 1
        print(f'{a} {b} {x} {y} {word} {cs:f}', file=f_out)

message(f'ok = {cnt[True]}, ng = {cnt[False]}')
# => ok = ???, ng = ???
'''
# TODO: execution time
'''
def pair_texts_similarity(self, ori_sentences, adv_sentences): cls_token = self.tokenizer_embed_lm.cls_token_id sep_token = self.tokenizer_embed_lm.sep_token_id ori_sentences = self.tokenizer_embed_lm(ori_sentences)['input_ids'] adv_sentences = self.tokenizer_embed_lm(adv_sentences)['input_ids'] ori_exclude_ids = [] adv_exclude_ids = [] for ori in range(len(ori_sentences)): for adv in range(len(adv_sentences)): if ori not in ori_exclude_ids and adv not in adv_exclude_ids: distance, operations = edit_distance( ori_sentences[ori], adv_sentences[adv]) if distance == 0: ori_exclude_ids.append(ori) adv_exclude_ids.append(adv) break ori_input = [] for i in range(len(ori_sentences)): if i not in ori_exclude_ids: ori_input += ori_sentences[i][1:-1] ori_input = [cls_token] + ori_input + [sep_token] adv_input = [] for i in range(len(adv_sentences)): if i not in adv_exclude_ids: adv_input += adv_sentences[i][1:-1] adv_input = [cls_token] + adv_input + [sep_token] distance, operations = edit_distance(ori_input, adv_input) if distance == 0: return 1.0 operations = operations[1:].split(',') operations_o = [int(o.split(';')[0].split()[1]) for o in operations] operations_a = [int(o.split(';')[1].split()[1]) for o in operations] partial_ids_o = [[max(operations_o[0] - 2, 0), operations_o[0] + 2]] partial_ids_a = [[max(operations_a[0] - 2, 0), operations_a[0] + 2]] for o, a in zip(operations_o[1:], operations_a[1:]): if o - 2 < partial_ids_o[-1][1]: partial_ids_o[-1][1] = o + 2 else: partial_ids_o.append([o - 2, o + 2]) if a - 2 < partial_ids_a[-1][1]: partial_ids_a[-1][1] = a + 2 else: partial_ids_a.append([a - 2, a + 2]) partial_ori = [] partial_adv = [] for o, a in zip(partial_ids_o, partial_ids_a): partial_o = ori_input[o[0]:o[1]] if partial_o[0] != cls_token: partial_o = [cls_token] + partial_o if partial_o[-1] != sep_token: partial_o = partial_o + [sep_token] partial_a = adv_input[a[0]:a[1]] if partial_a[0] != cls_token: partial_a = [cls_token] + partial_a if partial_a[-1] != sep_token: partial_a = partial_a + [sep_token] partial_ori.append(partial_o) partial_adv.append(partial_a) if self.verbose: for i in range(len(partial_ori)): print(get_time() + '[INFO] Modification number: %d' % i) print( self.tokenizer_embed_lm.convert_ids_to_tokens( partial_ori[i])) print( self.tokenizer_embed_lm.convert_ids_to_tokens( partial_adv[i])) ori_inputs = [ori_input] + partial_ori adv_inputs = [adv_input] + partial_adv with torch.no_grad(): ori_sentence_emb = [] for i in range(len(ori_inputs)): output = self.model_embed_lm( torch.tensor(ori_inputs[i]).unsqueeze(0).to(self.device)) ori_sentence_emb.append(output.pooler_output if self.pooler == 'cls' else output.last_hidden_state[:, 0]) ori_sentence_emb = torch.cat(ori_sentence_emb, axis=0).cpu() adv_sentence_emb = [] for i in range(len(adv_inputs)): output = self.model_embed_lm( torch.tensor(adv_inputs[i]).unsqueeze(0).to(self.device)) adv_sentence_emb.append(output.pooler_output if self.pooler == 'cls' else output.last_hidden_state[:, 0]) adv_sentence_emb = torch.cat(adv_sentence_emb, axis=0).cpu() similarity = np.array([cos_sim(o.reshape(1, -1), a.reshape(1, -1))[0][0] \ for o, a in zip(ori_sentence_emb.numpy(), adv_sentence_emb.numpy())]) if len(similarity) > 1: if self.verbose: print(get_time() + '[INFO] Original similarity score: %f' % similarity[0]) print(get_time() + '[INFO] Average partial similarity score: %f' % np.average(similarity[1:])) print(get_time() + '[INFO] Minimum partial similarity score: %f' % similarity[1:].min()) print(similarity[1:]) all_sim, 
avg_sim, min_sim = similarity[0], np.average( similarity[1:]), similarity[1:].min() else: all_sim, avg_sim, min_sim = similarity[0], similarity[ 0], similarity[0] similarity = self.lambda1 * min_sim + self.lambda2 * avg_sim + ( 1 - self.lambda1 - self.lambda2) * all_sim return similarity
def get_corr(args): #classes, class_to_idx, idx_to_class = utils.get_classes(dataset) f = open(os.getcwd() + '/results/files/' + args.run_name + '/encoding_dict.json', 'r') for line in f: reps = json.loads(line) f = open(os.getcwd() + '/data/files/sketchy_classes.json', 'r') for line in f: class_splits = json.loads(line) f = open(os.getcwd() + '/data/files/class_to_idx.json', 'r') for line in f: class_to_idx = json.loads(line) classes = class_splits['train'] #attr_dict, n_attrs = get_attrs(class_to_idx) for name in reps: print(name) print(len(reps[name])) return ''' f = open('/Users/romapatel/Desktop/avg_vgg128_nouns.csv', 'r') lines = f.readlines() vgg_dict = {} for line in lines: items = line.strip().split(',') #print(items[0]) if items[0] in classes: vgg_dict[items[0]] = [float(item) for item in items[1:]] ''' # finally run this using the function in utils print(len(classes)) f = open(os.getcwd() + '/data/files/sem-vis-sketchy.tsv', 'r') lines = [line.strip().split('\t') for line in f.readlines()] # evaluate only the first class_rep_dict, sims, true = {}, [], [] for key in reps: val = class_to_idx[int(key)] if val not in classes: continue class_name = classes[int(key)] # evaluate only the first class_rep_dict[class_name] = reps[key] encoding_dict = class_rep_dict for key in encoding_dict: val = class_to_idx[key] if key not in encoding_dict.keys(): continue print(len(encoding_dict[key])) #print(encoding_dict[key]) sims = [] for rep1 in encoding_dict[key]: for rep2 in encoding_dict[key]: sims.append(cos_sim(np.array(rep1).reshape(1, -1), \ np.array(rep2).reshape(1, -1))) #print(sims) print(np.mean(sims)) return ''' f = open(os.getcwd() + '/data/files/wvecs.json', 'r') for line in f: wvecs = json.loads(line) print(wvecs) ''' #class_rep_dict = attr_dict for line in lines: word1, word2 = line[0], line[1] if word1 not in class_rep_dict.keys(): continue if word2 not in class_rep_dict.keys(): continue print(len(class_rep_dict[word1])) rep1 = np.array(class_rep_dict[word1]).reshape(1, -1) rep2 = np.array(class_rep_dict[word2]).reshape(1, -1) sim = cos_sim(rep1, rep2)[0][0] sims.append(sim) true.append(float(line[2])) s = word1 + '-' + word2 print(s) print(cos_sim(rep1, rep2)) pearson = pearsonr(sims, true) spearman = spearmanr(sims, true) print(pearson) print(spearman)
def getDistance(self, x1, x2):
    # note: this returns a summed cosine *similarity*, not a true distance
    return np.sum(cos_sim(x1, x2))
def cluster_sentences(self, enc, seq_len, words_conf): # ------------------------------------------------------------ # dynamic clustering depending on num low confidence samples # ------------------------------------------------------------ n_clusters = int(len(seq_len) / config.num_clusters) print('\nSimilarity metric is {}\n'.format(config.similarity)) if config.similarity == 'siamese': print("\nReloading the sentence similarity model...\n") graph = tf.Graph() with graph.as_default(): sess = tf.Session() siamese = Siamese_Model(sess) #--------------------------------------------------- # take all possible pairwise combinations of confused # samples to obtain the similarity scores pairwise. # The similarity matrix is symmetric. #--------------------------------------------------- split1, split2 = np.array_split(np.arange(len(seq_len)), 2) max_len = max(seq_len) seq_len1, seq_len2 = \ [seq_len[i] for i in split1], [seq_len[i] for i in split2] if not config.model_aware: sent1, sent2 = [ np.array(enc[i][0][0]).tolist() for i in split1 ], [np.array(enc[i][0][0]).tolist() for i in split2] else: sent1, sent2 = [enc[i] for i in split1], [enc[i] for i in split2] if config.model.split()[1] == 'LSTM' or not config.model_aware: dim = config.hidden_size_lstm else: dim = 2 * config.hidden_size_lstm for i, row in enumerate(sent1): if len(row) <= max_len: sent1[i] += [np.zeros(dim).tolist()] * (max_len - len(row)) try: sent2[i] += [np.zeros(dim).tolist() ] * (max_len - len(sent2[i])) except IndexError: sent2 += [[np.zeros(dim).tolist()] * len(sent1[i])] seq_len2 += [1] siamese_enc = np.concatenate( siamese.run(sent1, sent2, seq_len1, seq_len2, max_len, len(split1), len(split2))) def similarity_scores(enc): shape = np.array(enc).shape out = np.reshape(np.repeat(enc, [shape[0]], axis=0), (-1, shape[0], shape[1])) X = np.exp(-1 * np.sqrt( np.sum(np.square(out - np.transpose(out, (1, 0, 2))), 2, keepdims=False))) return X X = similarity_scores(siamese_enc) clustering = self.spectral_clustering(X, n_clusters) elif config.similarity == 'cosine': enc1 = [emb[-1] for emb in enc] X = np.exp(cos_sim(enc1, enc1)) clustering = self.spectral_clustering(X, n_clusters) elif config.similarity == 'skipthoughts': model = skipthoughts.load_model() encoder = skipthoughts.Encoder(model) vectors = encoder.encode([' '.join(list) for list in words_conf]) X = np.exp(cos_sim(vectors, vectors)) #vectors = vectors / np.linalg.norm(vectors) #X = np.cos(np.dot(vectors, np.transpose(vectors))) clustering = self.spectral_clustering(X, n_clusters) return clustering
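# A minimal sketch of the cosine branch above: build an affinity matrix with
# exp(cosine similarity) and spectrally cluster it; sklearn's
# SpectralClustering with a precomputed affinity stands in for the class's
# own spectral_clustering helper, and the toy encodings are illustrative.
import numpy as np
from sklearn.cluster import SpectralClustering
from sklearn.metrics.pairwise import cosine_similarity as cos_sim

rng = np.random.default_rng(0)
enc1 = np.vstack([rng.normal(0, 1, (5, 8)), rng.normal(5, 1, (5, 8))])
X = np.exp(cos_sim(enc1, enc1))
labels = SpectralClustering(n_clusters=2,
                            affinity='precomputed').fit_predict(X)
print(labels)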
def get_graph_d3(old1, new1, csim, cstars, cenr, chours):
    """
    determines optimal path (shortest path)

    Parameters
    ----------
    old1 : int
        index of old topic
    new1 : int
        index of new topic
    csim : int or float
        weight for course similarity
    cstars : int or float
        weight for course rating
    cenr : int or float
        weight for course enrollment
    chours : int or float
        weight for course length

    Returns
    -------
    shortpath : array
        shortest path in the course graph
    """
    # load graph, positions, node values and titles
    with open('networkx_graph.pkl', 'rb') as f:
        G = pickle.load(f)
    with open('networkx_pos.pkl', 'rb') as f:
        pos = pickle.load(f)
    with open('networkx_values.pkl', 'rb') as f:
        values = pickle.load(f)
    with open('course_titles.pkl', 'rb') as f:
        titles = pickle.load(f)

    # topic scores
    mat = loadmat('scoremat.mat')
    scoremat = mat['scoremat']
    scorecorrs = cos_sim(scoremat)
    for d in range(len(scorecorrs)):
        scorecorrs[d, d] = 0
    print('corr test 1:', scorecorrs[old1, new1])

    # numeric course info
    mat = loadmat('course_numeric_info.mat')
    stars = mat['stars']
    hours = mat['hours']
    enrollment = mat['enrollment']

    Gdir = nx.DiGraph(G)
    list_edges = list(Gdir.edges)

    # add weighted costs to edges
    stars_norm = normalize_cost(stars, 1)
    enrollment_norm = normalize_cost(np.log10(enrollment), 1)
    hours_norm = normalize_cost(hours)
    weighted_costs = cstars * stars_norm + cenr * enrollment_norm + chours * hours_norm
    if np.shape(weighted_costs)[0] == 1:
        weighted_costs = weighted_costs.T
    list_weighted_costs = []
    list_weights = []
    for edge in Gdir.edges:
        sim = scorecorrs[edge[0], edge[1]]
        dissim = 1 - sim
        edge_cost = weighted_costs[edge[1]] + csim * dissim
        if edge_cost < 0:
            print(edge)
        Gdir.edges[edge[0], edge[1]]['weighted_cost'] = edge_cost
        Gdir.edges[edge[0], edge[1]]['weight'] = 1 - edge_cost
        list_weighted_costs.append(edge_cost)
        list_weights.append(1 - edge_cost)
    print(np.min(np.array(list_weighted_costs)))
    print('corr:', scorecorrs[old1, new1])
    #edge_weights = [Gdir[u][v]['weight']-.4 for u,v in G.edges()] # min is .5; -.4 so that min is .1

    # shortest path
    shortpath = shortest_path(Gdir, old1, new1, weight='weighted_cost')
    print('shortpath:', shortpath)
    mytuples = []
    mytuples_directed = []
    for i in range(len(shortpath) - 1):
        # undirected edge key: smaller node index first
        if shortpath[i] < shortpath[i + 1]:
            newlink = (shortpath[i], shortpath[i + 1])
        else:
            newlink = (shortpath[i + 1], shortpath[i])
        mytuples.append(newlink)
        mytuples_directed.append((shortpath[i], shortpath[i + 1]))

    # write nodes_output.csv (only nodes on the shortest path are written)
    with open('static/nodes_output.csv', mode='w') as fp:
        fwriter = csv.writer(fp, delimiter=',', quotechar='"',
                             quoting=csv.QUOTE_MINIMAL)
        fwriter.writerow(['x', 'y', 'strength', 'radius', 'title'])
        for i in range(len(pos)):
            if i in shortpath:
                fwriter.writerow(
                    [pos[i][0], pos[i][1], int(values[i]), 4, titles[i]])

    # write edges_output.csv
    with open('static/edges_output.csv', mode='w') as fp:
        fwriter = csv.writer(fp, delimiter=',', quotechar='"',
                             quoting=csv.QUOTE_MINIMAL)
        fwriter.writerow(['x1', 'x2', 'y1', 'y2', 'width', 'color'])
        for i in range(len(list_edges)):
            if list_edges[i] in mytuples:
                # orient the drawn edge along the path direction
                if list_edges[i] in mytuples_directed:
                    x1 = pos[list_edges[i][0]][0]
                    x2 = pos[list_edges[i][1]][0]
                    y1 = pos[list_edges[i][0]][1]
                    y2 = pos[list_edges[i][1]][1]
                else:
                    x1 = pos[list_edges[i][1]][0]
                    x2 = pos[list_edges[i][0]][0]
                    y1 = pos[list_edges[i][1]][1]
                    y2 = pos[list_edges[i][0]][1]
                fwriter.writerow([x1, x2, y1, y2, 2, '#ff0000'])

    return shortpath
def prototype_model(pixel_type, num):
    coarse, fine, cat_dict = get_categories()
    f = open(path + 'data/tu-berlin/train_1.json', 'r')
    for line in f:
        train = json.loads(line)
    f = open(path + 'data/tu-berlin/test_1.json', 'r')
    for line in f:
        test = json.loads(line)

    print('Inside prototype model\n')
    prototypes = {}
    f = open('/Users/romapatel/Desktop/prototypes_20.json', 'r')
    for line in f.readlines()[:num]:
        temp = json.loads(line)
        prototypes[temp['category']] = temp

    all_cats = sorted(prototypes.keys())
    # use a separate name for the category count so the `num` argument is not
    # shadowed (the original reused `num`, which broke the output filename)
    n_cats = len(all_cats)
    cos_matrix, sp_matrix = np.zeros((n_cats, n_cats)), np.zeros((n_cats, n_cats))
    abs_matrix, ce_matrix = np.zeros((n_cats, n_cats)), np.zeros((n_cats, n_cats))
    for i in range(len(all_cats)):
        category = all_cats[i]
        print(category)
        cat_path = path + 'data/tu-berlin/sketches_png/' + category + '/'
        if os.path.isdir(cat_path) is False:
            continue
        filenames = test[category]
        for filename in filenames:
            if '.DS' in filename:
                continue
            print(filename)
            a = Image(cat_path + filename)
            if pixel_type == 'bin':
                pixels = a.get_pixel_features()
            else:
                pixels = a.get_pixels()
            flat_pixels = [val for sublist in pixels for val in sublist]
            for j in range(len(all_cats)):
                cat = all_cats[j]
                prototype = prototypes[cat]['prototype_arr']
                print(len(prototype))
                flat_prototype = [
                    val for sublist in prototype for val in sublist
                ]
                cos_matrix[i][j] += np.mean(cos_sim(pixels, prototype))
                sp_matrix[i][j] += spearmanr(flat_pixels, flat_prototype)[0]
                # mean absolute pixel difference (as arrays, so the
                # subtraction is element-wise)
                abs_matrix[i][j] += np.mean(
                    np.abs(np.asarray(pixels) - np.asarray(prototype)))
                ce_matrix[i][j] += np.mean(log_loss(pixels, prototype))
            # only the first test file per category is evaluated
            break
    cos_matrix = [list(item) for item in cos_matrix]
    sp_matrix = [list(item) for item in sp_matrix]
    abs_matrix = [list(item) for item in abs_matrix]
    ce_matrix = [list(item) for item in ce_matrix]
    f = open(
        path + 'results/tu-berlin/prototype/' + pixel_type + '_' + str(num) +
        '.json', 'w+')
    results = {
        'cos_sim': list(cos_matrix),
        'spearman': list(sp_matrix),
        'abs': list(abs_matrix),
        'ce': list(ce_matrix),
        'indices': all_cats
    }
    f.write(json.dumps(results))