def test_cosine_distances(): # Check the pairwise Cosine distances computation rng = np.random.RandomState(1337) x = np.abs(rng.rand(910)) XA = np.vstack([x, x]) D = cosine_distances(XA) assert_array_almost_equal(D, [[0., 0.], [0., 0.]]) # check that all elements are in [0, 2] assert np.all(D >= 0.) assert np.all(D <= 2.) # check that diagonal elements are equal to 0 assert_array_almost_equal(D[np.diag_indices_from(D)], [0., 0.]) XB = np.vstack([x, -x]) D2 = cosine_distances(XB) # check that all elements are in [0, 2] assert np.all(D2 >= 0.) assert np.all(D2 <= 2.) # check that diagonal elements are equal to 0 and non diagonal to 2 assert_array_almost_equal(D2, [[0., 2.], [2., 0.]]) # check large random matrix X = np.abs(rng.rand(1000, 5000)) D = cosine_distances(X) # check that diagonal elements are equal to 0 assert_array_almost_equal(D[np.diag_indices_from(D)], [0.] * D.shape[0]) assert np.all(D >= 0.) assert np.all(D <= 2.)
def sumACluster(dist, vecsIn, topK_t, sameTweetThred): if dist == "cosine": distMatrix = pairwise.cosine_distances(vecsIn) elif dist == "eu": distMatrix = pairwise.euclidean_distances(vecsIn, vecsIn) sameTweetClusters = [[0]] for seqid, text in enumerate(vecsIn[1:], start=1): added = None for stcid, stc in enumerate(sameTweetClusters): sameFlag = False if distMatrix[seqid][stc[0]] <= sameTweetThred: sameFlag = True if sameFlag: stc.append(seqid) added = (stcid, stc) break if added is None: sameTweetClusters.append([seqid]) else: sameTweetClusters[added[0]] = added[1] sameTweetClusterNum = [(stcid, len(stc)) for stcid, stc in enumerate(sameTweetClusters)] numIn = len(sameTweetClusterNum) top = sorted(sameTweetClusterNum, key = lambda a:a[1], reverse=True)[:min(topK_t, numIn)] top = [(sameTweetClusters[item[0]][0], item[1]) for item in top] return top
def test_linkage_misc():
    # Misc tests on linkage
    X = np.ones((5, 5))
    assert_raises(ValueError, AgglomerativeClustering(linkage='foobar').fit, X)
    assert_raises(ValueError, linkage_tree, X, linkage='foobar')
    assert_raises(ValueError, linkage_tree, X, connectivity=np.ones((4, 4)))

    # Smoke test FeatureAgglomeration
    FeatureAgglomeration().fit(X)

    with warnings.catch_warnings(record=True) as warning_list:
        warnings.simplefilter("always", DeprecationWarning)
        # Use the copy argument, to raise a warning
        Ward(copy=True).fit(X)
    # We should be getting 2 warnings: one for using Ward that is
    # deprecated, one for using the copy argument
    assert_equal(len(warning_list), 2)

    # test hierarchical clustering on a precomputed distances matrix
    dis = cosine_distances(X)
    res = linkage_tree(dis, affinity="precomputed")
    assert_array_equal(res[0], linkage_tree(X, affinity="cosine")[0])

    # test hierarchical clustering with a callable affinity
    res = linkage_tree(X, affinity=manhattan_distances)
    assert_array_equal(res[0], linkage_tree(X, affinity="manhattan")[0])
def test_linkage_misc():
    # Misc tests on linkage
    rnd = np.random.RandomState(42)
    X = rnd.normal(size=(5, 5))
    assert_raises(ValueError, AgglomerativeClustering(linkage='foo').fit, X)
    assert_raises(ValueError, linkage_tree, X, linkage='foo')
    assert_raises(ValueError, linkage_tree, X, connectivity=np.ones((4, 4)))

    # Smoke test FeatureAgglomeration
    FeatureAgglomeration().fit(X)

    # Deprecation of Ward class
    with warnings.catch_warnings(record=True) as warning_list:
        warnings.simplefilter("always", DeprecationWarning)
        Ward().fit(X)
    assert_equal(len(warning_list), 1)

    # test hierarchical clustering on a precomputed distances matrix
    dis = cosine_distances(X)
    res = linkage_tree(dis, affinity="precomputed")
    assert_array_equal(res[0], linkage_tree(X, affinity="cosine")[0])

    # test hierarchical clustering with a callable affinity
    res = linkage_tree(X, affinity=manhattan_distances)
    assert_array_equal(res[0], linkage_tree(X, affinity="manhattan")[0])
def get_features(vocab):
    vectorizer_head = TfidfVectorizer(vocabulary=vocab, use_idf=False, norm='l2')
    X_train_head = vectorizer_head.fit_transform(headlines)

    vectorizer_body = TfidfVectorizer(vocabulary=vocab, use_idf=False, norm='l2')
    X_train_body = vectorizer_body.fit_transform(bodies)

    # calculates the n most important topics of the bodies. Each topic contains all words,
    # ordered by importance. The more words of a given topic a body contains, the higher
    # its value for that topic.
    lda_body = LatentDirichletAllocation(n_topics=n_topics, learning_method='online', random_state=0, n_jobs=3)

    print("latent_dirichlet_allocation_cos: fit and transform body")
    t0 = time()
    lda_body_matrix = lda_body.fit_transform(X_train_body)
    print("done in %0.3fs." % (time() - t0))

    print("latent_dirichlet_allocation_cos: transform head")
    # use the LDA trained on body topics for the headlines => if the headlines and bodies
    # share topics, their vectors should be similar
    lda_head_matrix = lda_body.transform(X_train_head)

    # print_top_words(lda_body, vectorizer_body.get_feature_names(), 100)

    print('latent_dirichlet_allocation_cos: calculating cosine distance between head and body')
    # calculate cosine distance between the body and head
    X = []
    for i in range(len(lda_head_matrix)):
        X_head_vector = np.array(lda_head_matrix[i]).reshape((1, -1))  # 1d array is deprecated
        X_body_vector = np.array(lda_body_matrix[i]).reshape((1, -1))
        cos_dist = cosine_distances(X_head_vector, X_body_vector).flatten()
        X.append(cos_dist.tolist())
    return X
def getModelInfo(model, features): print("Shape of the transformed features = {}".format(features.shape)) # Uncomment to info: # vocab = model.get_feature_names() # dist = np.sum(features, axis=0) # for tag, count in izip(vocab, dist): # print("word = {}, frequency = {}".format(tag, count)) return cosine_distances(features)
def calcurate_centroid_Matrix(veclist, word2vecdic, DimentionN):
    centroid_Matrix = np.zeros((DimentionN, 200))
    distance_arrays = np.zeros(DimentionN)
    for word in veclist:
        label = word2vecdic[word]
        centroid_Matrix[label] += veclist[word]
    for word in veclist:
        label = word2vecdic[word]
        # cosine_distances expects 2-D inputs, so reshape the vectors and take the scalar result
        distance_arrays[label] += cosine_distances(np.asarray(veclist[word]).reshape(1, -1),
                                                   centroid_Matrix[label].reshape(1, -1))[0][0]
    return centroid_Matrix, distance_arrays
def _build_metastore(self): medians = np.median(self.X, axis=0).reshape(1, self.dim) # how far each data point is from the global median dists = cosine_distances(self.X, Y=medians).reshape(-1) sorted_index = [self.index[i] for i in dists.argsort()] return {'sorted_index': sorted_index}
def memory_cf(users, movies, k, similarity_measure, weight_schema, repr_matrix=rating_matrix_orig, rating_matrix=rating_matrix_orig): """ Memory-based collaborative filtering. :param users: a user list. :param movies: a movie list. :param k: number of nearest users :param similarity_measure: 'cosine' or 'dot_product' :param weight_schema: 'mean' or 'weighted_mean' :param repr_matrix: data point representation :param rating_matrix: ratings based on user-movie or cluster centroids :return: recommended ratings for the queries """ # construct mapping between input users and unique users ratings, user_unique = [], list(set(users)) user_index_map = dict((u, i) for i, u in enumerate(user_unique)) users = [(u, user_index_map[u]) for u in users] # find k nearest neighbor for each user if similarity_measure == 'cosine': dist = cosine_distances(repr_matrix[user_unique, :], repr_matrix) sims = 1 - dist elif similarity_measure == 'dot_product': sims = repr_matrix[user_unique, :].dot(repr_matrix.T) if issparse(sims): sims = sims.toarray() dist = -sims sorted_neighbors = np.argsort(dist, axis=1) # make rating matrix dense for fast access rating_matrix = rating_matrix.toarray() weight_method = mean if weight_schema == 'mean' else weighted_mean for (user_index, neighbor_index), movie in zip(users, movies): neighbors = list(islice(ifilter(lambda u: (u, movie) in entry_set, sorted_neighbors[neighbor_index]), k + 1)) # no neighbors, regarded as 3 if not neighbors: ratings.append(3) continue # exclude itself if user_index in neighbors: neighbors.remove(user_index) rating = weight_method(rating_matrix[neighbors, movie], sims[neighbor_index, neighbors]) ratings.append(rating) return ratings
def get_features(head_and_body):
    filename = "NMF_topics" + str(n_topics) + "topics"

    if include_holdout == True:
        filename += "_holdout"

    if include_unlbled_test == True:
        filename += "unlbled_test"

    if not (os.path.exists(features_dir + "/" + filename + ".pkl")):
        X_all, vocab = get_all_data(head_and_body, filename)

        # calculates the n most important topics of the bodies. Each topic contains all words,
        # ordered by importance. The more words of a given topic a body contains, the higher
        # its value for that topic.
        nfm = NMF(n_components=n_topics, random_state=1, alpha=.1)

        print("NMF_topics: fit and transform body")
        t0 = time()
        nfm.fit_transform(X_all)
        print("done in %0.3fs." % (time() - t0))

        with open(features_dir + "/" + filename + ".pkl", 'wb') as handle:
            joblib.dump(nfm, handle, protocol=pickle.HIGHEST_PROTOCOL)
    else:
        vocab = get_vocab(head_and_body, filename)
        with open(features_dir + "/" + filename + ".pkl", 'rb') as handle:
            nfm = joblib.load(handle)

    vectorizer_head = TfidfVectorizer(vocabulary=vocab, norm='l2')
    X_train_head = vectorizer_head.fit_transform(headlines)

    vectorizer_body = TfidfVectorizer(vocabulary=vocab, norm='l2')
    X_train_body = vectorizer_body.fit_transform(bodies)

    print("NMF_topics: transform head and body")
    # use the NMF model trained on body topics for the headlines => if the headlines and bodies
    # share topics, their vectors should be similar
    nfm_head_matrix = nfm.transform(X_train_head)
    nfm_body_matrix = nfm.transform(X_train_body)

    if cosinus_dist == False:
        return np.concatenate([nfm_head_matrix, nfm_body_matrix], axis=1)
    else:
        # calculate cosine distance between the body and head
        X = []
        for i in range(len(nfm_head_matrix)):
            X_head_vector = np.array(nfm_head_matrix[i]).reshape((1, -1))  # 1d array is deprecated
            X_body_vector = np.array(nfm_body_matrix[i]).reshape((1, -1))
            cos_dist = cosine_distances(X_head_vector, X_body_vector).flatten()
            X.append(cos_dist.tolist())
        return X
def get_sparse_dist_matrix(tweets_tfidf_matrix, eps):
    """Get the sparse distance matrix from the pairwise cosine distance computations
    from the given tfidf vectors. Only distances less than or equal to eps are put
    into the matrix"""
    rows = []
    cols = []
    data = []
    for ndx, tweet in enumerate(tweets_tfidf_matrix):
        rows.append(len(cols))
        distances = cosine_distances(tweet, tweets_tfidf_matrix)[0]
        for other_ndx, dist in enumerate(distances):
            if ndx != other_ndx and dist <= eps:
                cols.append(other_ndx)
                data.append(dist)
    # the CSR index pointer needs one final entry, the matrix should be square,
    # and the stored distances are floats
    rows.append(len(cols))
    n = tweets_tfidf_matrix.shape[0]
    return csr_matrix((data, cols, rows), shape=(n, n), dtype=float)
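A minimal usage sketch of the helper above (the corpus, eps value, and min_samples here are hypothetical): the sparse matrix it returns can be fed to DBSCAN with metric="precomputed", where only the stored pairs (distance <= eps) are considered as potential neighbors.

import numpy as np
from scipy.sparse import csr_matrix
from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances

# hypothetical mini-corpus standing in for the real tweet collection
tweets = ["storm hits the coast", "coast hit by a storm", "new phone released today"]
tweets_tfidf_matrix = TfidfVectorizer().fit_transform(tweets)

eps = 0.5
sparse_dist = get_sparse_dist_matrix(tweets_tfidf_matrix, eps)

# cluster on the precomputed sparse distances
labels = DBSCAN(eps=eps, min_samples=2, metric="precomputed").fit_predict(sparse_dist)
print(labels)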
def get_features(n_topics):
    features_dir = "%s/data/fnc-1/features" % (path.dirname(path.dirname(path.dirname(path.dirname(path.abspath(__file__))))))
    filename = "lda_gensim_cos_" + str(n_topics) + "topics"
    if (os.path.exists(features_dir + "/" + filename + ".pkl")):
        lda = models.LdaMulticore.load(features_dir + "/" + filename + ".pkl")
        dictionary = corpora.Dictionary.load(features_dir + "/" + filename + ".dict")
        print("latent_dirichlet_allocation_gensim_cos model found and loaded")
    else:
        print("Creating new latent_dirichlet_allocation_gensim_cos model")
        h, b = word_ngrams.get_head_body_tuples()
        head_and_body = combine_and_tokenize_head_and_body(h, b)
        dictionary = corpora.Dictionary(head_and_body)
        dictionary.save(features_dir + "/" + filename + ".dict")
        corpus = [dictionary.doc2bow(text) for text in head_and_body]
        print(dictionary)
        lda = models.LdaMulticore(corpus, id2word=dictionary, num_topics=n_topics, workers=1)
        lda.save(features_dir + "/" + filename + ".pkl")

    X = []
    for i in range(len(headlines)):
        X_head_vector = lda[dictionary.doc2bow(nltk.word_tokenize(headlines[i]))]
        X_body_vector = lda[dictionary.doc2bow(nltk.word_tokenize(bodies[i]))]

        # calculate zero-padded vectors for the cosine distance
        X_head_vector_filled = np.zeros(n_topics, dtype=np.double)
        for id, prob in X_head_vector:
            X_head_vector_filled[id] = prob

        X_body_vector_filled = np.zeros(n_topics, dtype=np.double)
        for id, prob in X_body_vector:
            X_body_vector_filled[id] = prob

        # reshape for sklearn
        X_head_vector_filled_reshaped = np.array(X_head_vector_filled).reshape((1, -1))  # 1d array is deprecated
        X_body_vector_filled_reshaped = np.array(X_body_vector_filled).reshape((1, -1))

        cos_dist = cosine_distances(X_head_vector_filled_reshaped, X_body_vector_filled_reshaped).flatten()
        X.append(cos_dist.tolist())
    return X
def test_linkage_misc():
    # Misc tests on linkage
    rng = np.random.RandomState(42)
    X = rng.normal(size=(5, 5))
    assert_raises(ValueError, AgglomerativeClustering(linkage='foo').fit, X)
    assert_raises(ValueError, linkage_tree, X, linkage='foo')
    assert_raises(ValueError, linkage_tree, X, connectivity=np.ones((4, 4)))

    # Smoke test FeatureAgglomeration
    FeatureAgglomeration().fit(X)

    # test hierarchical clustering on a precomputed distances matrix
    dis = cosine_distances(X)
    res = linkage_tree(dis, affinity="precomputed")
    assert_array_equal(res[0], linkage_tree(X, affinity="cosine")[0])

    # test hierarchical clustering with a callable affinity
    res = linkage_tree(X, affinity=manhattan_distances)
    assert_array_equal(res[0], linkage_tree(X, affinity="manhattan")[0])
def cluster_cf_memory(): """ Cluster-based memory CF. """ rating_matrix_cluster = np.empty([k_user, rating_matrix_orig.shape[1]], dtype=np.float64) # build rating matrix for each user cluster, on each movie for i in range(k_user): cluster_indicator = np.where(user_belonging == i)[0] rating_cluster = rating_matrix_orig[cluster_indicator, :] rating_sum = rating_cluster.sum(axis=0) # take average by dividing count rating_cluster.data = np.ones(len(rating_cluster.data)) mu = rating_sum / rating_cluster.sum(axis=0) # fill 0 for nan mu[np.isnan(mu)] = 0 rating_matrix_cluster[i, :] = mu # construct mapping between input users and unique users ratings, user_unique = [], list(set(users)) user_index_map = dict((u, i) for i, u in enumerate(user_unique)) users_neighbors = [user_index_map[u] for u in users] if similarity_measure == 'cosine': dist = cosine_distances(rating_matrix_orig[user_unique, :], m2uc.T) sims = 1 - dist else: sims = rating_matrix_orig[user_unique, :].dot(m2uc).toarray() dist = -sims nearest_neighbors = np.argpartition(dist, k, axis=1)[:, :k] weight_method = mean if weight_schema == 'mean' else weighted_mean for neighbor_index, movie in zip(users_neighbors, movies): neighbors = nearest_neighbors[neighbor_index] rating = weight_method(rating_matrix_cluster[neighbors, movie], sims[neighbor_index, neighbors]) ratings.append(rating) return ratings
def test_fp16_cosine_metric(self): arr = numpy.empty((10000, 2), dtype=numpy.float16) angs = numpy.random.rand(10000) * 2 * numpy.pi for i in range(10000): arr[i] = numpy.sin(angs[i]), numpy.cos(angs[i]) with self.stdout: centroids, assignments = kmeans_cuda( arr, 4, init="kmeans++", metric="cos", device=1, verbosity=2, seed=3) self.assertEqual(self._get_iters_number(self.stdout), 5) self.assertEqual(len(centroids), 4) for c in centroids: norm = numpy.linalg.norm(c) self.assertTrue(0.9995 < norm < 1.0005) dists = numpy.round(cosine_distances(centroids)).astype(int) self.assertTrue((dists == [ [0, 2, 1, 1], [2, 0, 1, 1], [1, 1, 0, 2], [1, 1, 2, 0], ]).all()) self.assertEqual(numpy.min(assignments), 0) self.assertEqual(numpy.max(assignments), 3)
def plot_mds(points, genres, n_points=500): ''' Plots a set of documents in MDS space Args: points: dense array with coordinates of each document genres: list of genres for each entry in points Returns: None ''' genres = np.array(genres) genre_sel = np.not_equal(genres, None) X, y = points[genre_sel], genres[genre_sel] X_train, X_test, y_train, y_test = train_test_split( X, y, stratify=y, train_size=n_points) distances = cosine_distances(X_train, X_train) mds = MDS(n_components=2, dissimilarity='precomputed') mds.fit(distances) plot_embedding(mds.embedding_, y_train)
""" Testing the change in embeddings over time. Assumes that we've already generated embeddings in output/. """ import pandas as pd import numpy as np import os, codecs from sklearn.metrics.pairwise import cosine_similarity, cosine_distances if __name__ == '__main__': out_dir = 'output' embedding_files = [os.path.join(out_dir, f) for f in os.listdir(out_dir)] # test 0: do the embeddings make semantic sense? end_embedding = pd.read_csv(embedding_files[-1], sep='\t', index_col=0) test_words = ['you', 'go', 'road', 'give', 'cold'] for test_word in test_words: sims = end_embedding.apply(lambda r: cosine_similarity(r.reshape(1,-1), end_embedding.loc[test_word].reshape(1,-1))[0][0], axis=1) print('test word %s has top 10 similarities \n%s'% (test_word, sims.sort_values(ascending=False)[:10])) # TL;DR the embeddings aren't perfect but they work for more common words # test 1: how much have embeddings changed from start to end of data? start_embedding = pd.read_csv(embedding_files[1], sep='\t', index_col=0) embedding_deltas = abs(cosine_distances(end_embedding, start_embedding)) embedding_deltas = pd.Series(np.diagonal(embedding_deltas), index=end_embedding.index).sort_values(ascending=True) print('got embedding deltas %s'%(embedding_deltas))
def get_cos_sim(dset, n_cats, dtype, dset_name, version, sim_type, IPC_dict=None):
    """
    This will take a dataset and calculate the cosine similarity within and between classes,
    producing a csv with results and updating a main doc.

    :param dset: data to be tested, csv, (pd or np array?)
    :param n_cats: number of classes (items per-class calculated as items/classes)
    :param dtype: binary, chan_dist or chanProp. only needed for labelling
    :param dset_name: of dataset eg HBHW, HBLW, LBHW, LBLW
    :param version: number with 2 versions of each type
    :param sim_type: Describe the similarity e.g., HBHW or vary etc
    :param IPC_dict: default = None. if the number of items per class is not equal, enter a dict
    """
    print("\nrunning ** get_cos_sim()**")

    file_path = "/home/nm13850/Documents/PhD/python_v2/experiments/" \
                "within_between_dist_july2020/New_data/"
    if running_on_laptop():
        file_path = '/Users/nickmartin/Library/Mobile Documents/com~apple~CloudDocs/' \
                    'Documents/PhD/python_v2/experiments/' \
                    'within_between_dist_july2020/New_data/'

    save_path = os.path.join(file_path, 'similarity_details')

    # # enter either 'cos_sim', 'cos_dist' or 'taxi'
    distance = 'cos_sim'

    dataset = np.asarray(dset)
    items, features = np.shape(dataset)
    print(f'\ndataset: {dataset}')
    print(f'items, features: {items}, {features}')

    # add IPC dict here if class_sizes are not equal
    if IPC_dict is None:
        cat_size = int(items / n_cats)
        IPC_dict = {i: cat_size for i in range(n_cats)}
        print(f'\nequal size IPC dict\n{IPC_dict}')
    else:
        print("using IPC dict")

    # separate out the individual classes
    # start with class indices list containing zero, index of the first class
    class_indices = [0]
    IPC_vals = list(IPC_dict.values())
    print(f'\nIPC_vals: {IPC_vals}')
    for i in range(n_cats):
        next_val = class_indices[-1] + IPC_vals[i]
        class_indices.append(next_val)

    # list of item numbers that start each class
    start_indices = class_indices[:n_cats]
    # print(f'\nstart_indices: {start_indices}')

    # list of indices to end each class
    end_indices = class_indices[1:]
    # print(f'end_indices: {end_indices}')

    # 1. define classes as slices of dataset array
    class_list = []
    names_list = []
    for cat in range(n_cats):
        this_name = f'class_{cat}'
        names_list.append(this_name)
        this_class = dataset[start_indices[cat]:end_indices[cat], :]
        class_list.append(this_class)
        # print(f'\n{this_name}\n{this_class}\n')

    # within class similarities
    # 3. make empty list to store results.
    within_list = []
    for index, this_cat in enumerate(class_list):
        # print(f'\ngetting within class cos_sim for {names_list[index]}')

        # will do all pairwise comparisons within the given category
        if distance in ['cos_sim', 'cosine_similarity', 'cosine_sim', 'cos_similarity']:
            within_cat = cosine_similarity(this_cat)
            # the SIMILARITY between two identical vectors will be 1
        elif distance in ['cos_dist', 'cosine_distance', 'cosine_dist', 'cos_distance']:
            within_cat = cosine_distances(this_cat)
            # the DISTANCE between two identical vectors will be 0
            # cosine_distance = 1 - cosine_similarity
        elif distance in ['manhattan', 'taxi']:
            within_cat = manhattan_distances(this_cat)
        else:
            raise ValueError('must input a valid distance name')
        # print(within_cat)

        # just take the upper triangle, since the full matrix compares items with themselves
        # on the diagonal and counts every pair twice
        triangle_indices = np.triu_indices(IPC_dict[index], 1)
        values_for_descriptives = (within_cat[triangle_indices])
        # print(values_for_descriptives)

        data_similarity_descriptives = scipy.stats.describe(values_for_descriptives, axis=None)
        mean_sim = str(np.round(data_similarity_descriptives.mean, decimals=2))
        print(f"\nWithin group mean {distance} for {names_list[index]}: {mean_sim}")
        within_list.append(mean_sim)

    print(f'\nwithin_list ({distance}): {within_list}\n')

    # between class similarities.
    print('\nbetween class similarities')
    '''
    For each pair of classes
    - get the similarities of each item in one class to each item in the other class.
    - take the average of the whole matrix (not just the triangle) to get the mean similarity
      between these two classes.
    These mean between class similarities go into an n_cats x (n_cats-1) matrix
    (n_cats-1 because there are no diagonals comparing classes with themselves).
    Each row shows a class's similarity to all other classes.
    - Take the average of each row to get a class's mean between class similarity.

    Example below shows 4 classes (rows) and the values show which other class is being compared.
    e.g., class1 is compared with classes 2, 3, 4.  Class2 is compared with classes 1, 3, 4.
            compA   compB   compC
    class1:   2       3       4
    class2:   1       3       4
    class3:   1       2       4
    class4:   1       2       3
    '''

    class_pairs_list = list(combinations(class_list, 2))
    class_names_list = list(combinations(names_list, 2))
    class_index_list = list(combinations(range(n_cats), 2))
    print(f'running {len(class_index_list)} between class comparisons.\n{class_index_list}')

    between_array = np.zeros(shape=(n_cats, n_cats - 1))

    for index, cat_pair in enumerate(class_pairs_list):
        cat_a = cat_pair[0]
        cat_name_a = class_names_list[index][0]
        cat_b = cat_pair[1]
        cat_name_b = class_names_list[index][1]
        print(f'\nbetween class {distance} for: {cat_name_a} and {cat_name_b}')

        # # do all pairwise comparisons between the classes
        if distance in ['cos_sim', 'cosine_similarity', 'cosine_sim', 'cos_similarity']:
            between_pairs_matrix = cosine_similarity(X=cat_a, Y=cat_b)
        elif distance in ['cos_dist', 'cosine_distance', 'cosine_dist', 'cos_distance']:
            between_pairs_matrix = cosine_distances(X=cat_a, Y=cat_b)
        elif distance in ['manhattan', 'taxi']:
            between_pairs_matrix = manhattan_distances(X=cat_a, Y=cat_b)
        else:
            raise ValueError('must input a valid distance name')
        print(f'{between_pairs_matrix}')

        mean_between_pair = np.mean(between_pairs_matrix)
        print(f'mean_between_pair: {mean_between_pair}')

        # append to between array in both (offset) diagonals
        idxA, idxB = class_index_list[index]
        print(f'add to matrix position: {idxA}, {idxB}')
        between_array[idxA, idxB - 1] = mean_between_pair
        between_array[idxB, idxA] = mean_between_pair

    print(f"\nbetween_array:\n{between_array}")

    print(f'\nmean between class {distance}')
    between_list = []
    for index in range(n_cats):
        this_row = between_array[index]
        this_mean = np.mean(this_row)
        between_list.append(this_mean)
        print(index, this_mean)

    print("I want to get the mean of the between list and the within list")
    dset_between_mean = np.mean(between_list)
    dset_between_sd = np.std(between_list)
    print(f"dataset mean between class distance: {dset_between_mean} std.dev: {dset_between_sd}")

    print(f"check within list:\n{within_list}")
    within_list_num = [float(i) for i in within_list]
    print(f"check within_list_num:\n{within_list_num}")
    dset_within_mean = np.mean(within_list_num)
    dset_within_sd = np.std(within_list_num)
    print(f"dataset mean within class distance: {dset_within_mean} std.dev: {dset_within_sd}")

    # # save output.
    '''for each class:
    mean within
    mean between
    paired between
    '''
    names_list.append('Dset_means')
    names_list.append('Dset_sd')
    within_list.append(dset_within_mean)
    within_list.append(dset_within_sd)
    between_list.append(dset_between_mean)
    between_list.append(dset_between_sd)

    class_sim_dict = {'class': names_list, 'between': between_list, 'within': within_list}
    class_sim_df = pd.DataFrame(class_sim_dict)
    print(class_sim_df)
    csv_name = f'{dset_name}_{distance}.csv'
    csv_path = os.path.join(save_path, csv_name)
    class_sim_df.to_csv(csv_path, index_label='class')

    # check if similarity summary exists
    similarity_info = [dtype, dset_name, sim_type, version, n_cats,
                       dset_between_mean, dset_between_sd, dset_within_mean, dset_within_sd]
    print(f"similarity_info:\n{similarity_info}")

    # check if similarity_summary.csv exists; write headers on first use,
    # then always append this dataset's row
    summary_name = 'similarity_summary.csv'
    print(f"\nlooking for file:\n{os.path.join(save_path, summary_name)}")

    if not os.path.isfile(os.path.join(save_path, summary_name)):
        print("making summary page")
        headers = ["dtype", "dset_name", 'sim_type', "version", "n_cats",
                   "mean_b", "sd_b", "mean_w", "sd_w"]
        similarity_overview = open(os.path.join(save_path, summary_name), 'w')
        mywriter = csv.writer(similarity_overview)
        mywriter.writerow(headers)
    else:
        print("appending to summary page")
        similarity_overview = open(os.path.join(save_path, summary_name), 'a')
        mywriter = csv.writer(similarity_overview)

    mywriter.writerow(similarity_info)
    similarity_overview.close()

    return_dict = {"dtype": dtype, "dset_name": dset_name, 'sim_type': sim_type,
                   "version": version, "n_cats": n_cats,
                   "dset_between_mean": dset_between_mean, "dset_between_sd": dset_between_sd,
                   "dset_within_mean": dset_within_mean, "dset_within_sd": dset_within_sd}

    return return_dict
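A toy illustration of the within/between logic in get_cos_sim above, on made-up data with two equally sized classes (the class sizes, offsets, and printed labels here are hypothetical):

import numpy as np
from itertools import combinations
from sklearn.metrics.pairwise import cosine_similarity

rng = np.random.RandomState(0)
classes = [rng.rand(5, 10) + offset for offset in (1.0, 3.0)]  # two classes of 5 items each

# within-class: mean of the upper triangle of each class's similarity matrix
for idx, this_cat in enumerate(classes):
    sims = cosine_similarity(this_cat)
    triangle = sims[np.triu_indices(len(this_cat), 1)]
    print(f"class_{idx} mean within-class cos_sim: {triangle.mean():.3f}")

# between-class: mean over the full cross-class similarity matrix for each pair
for (i, cat_a), (j, cat_b) in combinations(enumerate(classes), 2):
    print(f"class_{i} vs class_{j} mean between-class cos_sim: "
          f"{cosine_similarity(cat_a, cat_b).mean():.3f}")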
def cosine_similarity(vector_a, vector_b):
    return 1 - cosine_distances(vector_a, vector_b)
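A quick sanity check of the wrapper above: sklearn itself defines cosine_distances as 1 minus the cosine similarity, so the wrapper agrees with sklearn's cosine_similarity up to floating-point error (inputs must be 2-D; the sample vectors below are made up):

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity as sk_cosine_similarity

a = np.array([[1.0, 0.0, 2.0]])
b = np.array([[0.5, 1.0, 0.0]])

# the wrapper defined above recovers sklearn's cosine similarity
assert np.allclose(cosine_similarity(a, b), sk_cosine_similarity(a, b))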
print(name + " failed to read") dist = cosine_distances(matrix) link = hier.linkage(dist, method="average") clust = hier.fcluster(link, t=0.1) #print(clust) """ print("starting predictions") for name in os.listdir(input_test): streng = input_test + "/" + name + "/docs" vocabulary = [] #print(name) for site in os.listdir(streng): nyStreng = input_test + "/" + name + "/docs/" + site voc = handle_html(nyStreng) vocabulary.append(voc) try: matrix = pre_process(vocabulary) except ValueError: print(name + " failed to predict") dist = cosine_distances(matrix) link = hier.linkage(dist, method="centroid", metric="euclidean") clust = hier.fcluster(link, t=0.08) d = sortClusters(clust) printout(name,d) #print(pred) print("done")
def getFeaturesFromVectors_pair(vectorData, seg1, seg2, segData): X = myX() segDilated = segData.segDilated segBoundSize = segData.segBoundSize seggSizes = segData.segSizes EdgeMapList = vectorData.EdgeMapList vectors = vectorData.segVectors clustersL2 = vectorData.segClustersL2 ratios = vectorData.ratios regFeatures = [] boundLine = np.logical_and(segDilated[seg1], segDilated[seg2]) for e in EdgeMapList: bound = e[boundLine] bound = bound[bound > 0] regFeatures.append(np.mean(bound)) #Feature mean bound dist. boundOverlap1 = bound.size / segBoundSize[seg1] boundOverlap2 = bound.size / segBoundSize[seg2] regFeatures.append(max(boundOverlap1, boundOverlap2)) #Feature max overlap with bound regFeatures.append(bound.size) #Feature bound size size1 = seggSizes[seg1] size2 = seggSizes[seg2] regFeatures.append(size1 + size2) #Feature New seg area X.regFeatures = regFeatures for ratioIdx, ratio in enumerate(ratios): cnnFeatures = [] for numLayer in range(0, len(vectors[seg1][ratioIdx])): layerFeatures = [] pair_dist = scipy.spatial.distance.cdist( vectors[seg1][ratioIdx][numLayer], vectors[seg2][ratioIdx][numLayer]) layerFeatures.append( np.min(pair_dist)) #Feature L2 min dist in vec rep. layerFeatures.append( np.max(pair_dist)) #Feature L2 max dist in vec rep. layerFeatures.append( np.mean(pair_dist)) #Feature L2 average dist in vec rep. layerFeatures.append( bn.median(pair_dist)) #Feature L2 median dist in vec rep. layerFeatures.append( np.sqrt( np.sum((clustersL2[seg1][ratioIdx][numLayer] - clustersL2[seg2][ratioIdx][numLayer] )**2))) #Feature L2 dist between L2 clusters pair_dist = scipy.spatial.distance.cdist( vectors[seg1][ratioIdx][numLayer], vectors[seg2][ratioIdx][numLayer], metric='cosine') layerFeatures.append( np.min(pair_dist)) # Feature cosine min dist in vec rep. layerFeatures.append( np.max(pair_dist)) # Feature cosine max dist in vec rep. layerFeatures.append( np.mean(pair_dist)) # Feature cosine average dist in vec rep. layerFeatures.append( bn.median(pair_dist)) # Feature cosine median dist in vec rep. layerFeatures.append( cosine_distances( np.array([clustersL2[seg1][ratioIdx][numLayer]]), np.array([ clustersL2[seg2][ratioIdx][numLayer] ]))[0][0]) #Feature cosine dist between L2 clusters cnnFeatures.append(layerFeatures) X.cnnFeatures[ratio] = cnnFeatures ImageFeatures = [] ImageFeatures.append( np.sqrt((vectorData.segL[seg1] - vectorData.segL[seg2])**2)) #Feature L channel dist ImageFeatures.append( np.sqrt((vectorData.segA[seg1] - vectorData.segA[seg2])**2)) #Feature A channel dist ImageFeatures.append( np.sqrt((vectorData.segB[seg1] - vectorData.segB[seg2])**2)) #Feature B channel dist X.ImageFeatures = ImageFeatures return X
def local_measure(word, knn, bins): # the matrix of size len(bins) x len(bins) # where the cosine distance of each pair of bins is computed S = [] print(word, knn) for xx in range(len(bins) - 2): bin1 = bins[xx] bin2 = bins[xx + 1] time1 = str(bin1) time2 = str(bin2) # path to the embeddings path = base_all + '3EMB-' + name + '-' + 'win_' + str( win) + '-size_' + str(size) + '-min_count_' + str( min_count) + '-iter_' + str(time1) + '-' + str(time2) # load the embeddings at time t0 using gensim KeyedVectors embed = KeyedVectors.load(path) # check if the word is in the embedding wocabulary if word not in embed.wv.vocab: print(word + ' not in base_embed\'s vocabulary') continue else: knn_t = embed.most_similar(word, topn=knn) knn_t_words = [k[0] for k in knn_t] knn_t_sims = [k[1] for k in knn_t] if xx > 0: S.append([0] * xx) else: S.append([]) knn_t0 = knn_t knn_t0_words = knn_t_words time0 = str(bin1) + '_' + str(bin2) # only the values of S above the main diagonal are non-zero # this because cosine distance is simmetric and the values on the diagonal are 0 for x in range(xx + 1, len(bins) - 1): time11 = str(bins[x]) time22 = str(bins[x + 1]) time = time1 + '-' + time2 + '_' + time11 + '-' + time22 time00 = time0 + '_' + time11 + '-' + time22 print(time) path_t1 = base_all + '3EMB-' + name + '-' + 'win_' + str( win) + '-size_' + str(size) + '-min_count_' + str( min_count) + '-iter_' + time11 + '-' + time22 # load the embeddings at time t1 using gensim KeyedVectors embed_t1 = KeyedVectors.load(path_t1) if word not in embed_t1.wv.vocab: print(word + ' not in embed\'s vocabulary') continue else: knn_t1 = embed_t1.most_similar(word, topn=knn) knn_t1_words = [k[0] for k in knn_t1] knn_t1_sims = [k[1] for k in knn_t1] # create the second order vector as in: # Hamilton, William L., Jure Leskovec, and Dan Jurafsky. "Cultural shift or linguistic drift? comparing two computational measures of semantic change." Proceedings of the Conference on Empirical Methods in Natural Language Processing. Conference on Empirical Methods in Natural Language Processing. Vol. 2016. NIH Public Access, 2016. # Equation 2 s_t = getSim(embed, word, knn_t_words + knn_t1_words) s_t1 = getSim(embed_t1, word, knn_t_words + knn_t1_words) dist = cosine_distances([s_t, s_t1]).tolist()[0][1] new_words, lost_words, diffs = comp_changes(knn_t, knn_t1) new_words0, lost_words0, diffs0 = comp_changes(knn_t0, knn_t1) writeMat(base, 'new_words_' + time + '_' + word + '_' + str(knn), new_words) writeMat(base, 'lost_words_' + time + '_' + word + '_' + str(knn), lost_words) writeMat(base, 'changes_' + time + '_' + word + '_' + str(knn), diffs) writeMat(base, 'new_words_' + time00 + '_' + word + '_' + str(knn), new_words0) writeMat(base, 'lost_words_' + time00 + '_' + word + '_' + str(knn), lost_words0) writeMat(base, 'changes_' + time00 + '_' + word + '_' + str(knn), diffs0) S[xx].append(dist) writeMat(base, 'Local_Similarity_' + name + '_' + word + '_' + str(knn), S)
def wmdo(wvvecs, ref, cand, ref_lang='en', cand_lang='en', delta=0.18, alpha=0.1): ''' wvvecs: word vectors -- retrieved from load_wv method ref: reference translation cand: candidate translation missing: missing word dictionary -- initialise as {} dim: word vector dimension delta: weight of fragmentation penalty alpha: weight of missing word penalty ''' ref_list = get_input_words(ref) cand_list = get_input_words(cand) ref = ' '.join(ref_list) cand = ' '.join(cand_list) common_vectorizer = CountVectorizer().fit(ref_list + cand_list) ref_count_vector, cand_count_vector = common_vectorizer.transform( [ref, cand]) ref_count_vector = ref_count_vector.toarray().ravel() cand_count_vector = cand_count_vector.toarray().ravel() dim = wvvecs[ref_lang].vector_size wvoc, missing = create_vocabulary(common_vectorizer, wvvecs, dim, ref_list, cand_list, ref_lang, cand_lang) distance_matrix = cosine_distances(wvoc) vocab_words = common_vectorizer.get_feature_names() for cand_word_idx, count in enumerate(cand_count_vector): if count > 0: most_similar_ref_indexes = np.argsort( distance_matrix[cand_word_idx]) for ref_word_index in most_similar_ref_indexes[1:]: if ref_count_vector[ref_word_index] > 0: print('{}: {}'.format(vocab_words[cand_word_idx], vocab_words[ref_word_index])) break if np.sum(distance_matrix) == 0.0: return 0., {} #return float('inf') ref_count_vector = ref_count_vector.astype(np.double) cand_count_vector = cand_count_vector.astype(np.double) ref_count_vector /= ref_count_vector.sum() cand_count_vector /= cand_count_vector.sum() distance_matrix = distance_matrix.astype(np.double) (wmd, flow) = emd_with_flow(ref_count_vector, cand_count_vector, distance_matrix) return wmd, {} # adding penalty ratio = fragmentation(ref_list, cand_list, common_vectorizer, flow) if ratio > 1: ratio = 1 penalty = delta * ratio # missing words penalty missingwords = 0 for w in cand_list: if w not in wvvecs: missingwords += 1 missingratio = missingwords / len(cand_list) missing_penalty = alpha * missingratio penalty += missing_penalty wmd += penalty return wmd, missing
def test_pairwise_distances(): """ Test the pairwise_distance helper function. """ rng = np.random.RandomState(0) # Euclidean distance should be equivalent to calling the function. X = rng.random_sample((5, 4)) S = pairwise_distances(X, metric="euclidean") S2 = euclidean_distances(X) assert_array_almost_equal(S, S2) # Euclidean distance, with Y != X. Y = rng.random_sample((2, 4)) S = pairwise_distances(X, Y, metric="euclidean") S2 = euclidean_distances(X, Y) assert_array_almost_equal(S, S2) # Test with tuples as X and Y X_tuples = tuple([tuple([v for v in row]) for row in X]) Y_tuples = tuple([tuple([v for v in row]) for row in Y]) S2 = pairwise_distances(X_tuples, Y_tuples, metric="euclidean") assert_array_almost_equal(S, S2) # "cityblock" uses sklearn metric, cityblock (function) is scipy.spatial. S = pairwise_distances(X, metric="cityblock") S2 = pairwise_distances(X, metric=cityblock) assert_equal(S.shape[0], S.shape[1]) assert_equal(S.shape[0], X.shape[0]) assert_array_almost_equal(S, S2) # The manhattan metric should be equivalent to cityblock. S = pairwise_distances(X, Y, metric="manhattan") S2 = pairwise_distances(X, Y, metric=cityblock) assert_equal(S.shape[0], X.shape[0]) assert_equal(S.shape[1], Y.shape[0]) assert_array_almost_equal(S, S2) # manhattan does not support sparse matrices atm. assert_raises(ValueError, pairwise_distances, csr_matrix(X), metric="manhattan") # Low-level function for manhattan can divide in blocks to avoid # using too much memory during the broadcasting S3 = manhattan_distances(X, Y, size_threshold=10) assert_array_almost_equal(S, S3) # Test cosine as a string metric versus cosine callable # "cosine" uses sklearn metric, cosine (function) is scipy.spatial S = pairwise_distances(X, Y, metric="cosine") S2 = pairwise_distances(X, Y, metric=cosine) assert_equal(S.shape[0], X.shape[0]) assert_equal(S.shape[1], Y.shape[0]) assert_array_almost_equal(S, S2) # Tests that precomputed metric returns pointer to, and not copy of, X. S = np.dot(X, X.T) S2 = pairwise_distances(S, metric="precomputed") assert_true(S is S2) # Test with sparse X and Y, # currently only supported for euclidean and cosine X_sparse = csr_matrix(X) Y_sparse = csr_matrix(Y) S = pairwise_distances(X_sparse, Y_sparse, metric="euclidean") S2 = euclidean_distances(X_sparse, Y_sparse) assert_array_almost_equal(S, S2) S = pairwise_distances(X_sparse, Y_sparse, metric="cosine") S2 = cosine_distances(X_sparse, Y_sparse) assert_array_almost_equal(S, S2) # Test with scipy.spatial.distance metric, with a kwd kwds = {"p": 2.0} S = pairwise_distances(X, Y, metric="minkowski", **kwds) S2 = pairwise_distances(X, Y, metric=minkowski, **kwds) assert_array_almost_equal(S, S2) # same with Y = None kwds = {"p": 2.0} S = pairwise_distances(X, metric="minkowski", **kwds) S2 = pairwise_distances(X, metric=minkowski, **kwds) assert_array_almost_equal(S, S2) # Test that scipy distance metrics throw an error if sparse matrix given assert_raises(TypeError, pairwise_distances, X_sparse, metric="minkowski") assert_raises(TypeError, pairwise_distances, X, Y_sparse, metric="minkowski")
tfidf_matrix = tfidf_matrix.T.round(3)
tfidf_matrix.columns = data['Princesa']
tfidf_matrix

# # Part 3: Cosine distance
#
# - Compute the cosine distance between each pair of princesses
# - Which princesses are the most alike?
# - Which princesses are the most different?

# In[356]:

from sklearn.metrics.pairwise import cosine_distances

dist_cos = cosine_distances(tfidf_matrix.T.values)
dist_cos = pd.DataFrame(dist_cos, columns=tfidf_matrix.columns, index=tfidf_matrix.columns)
dist_cos

# In[480]:

dist_cos.max()

# In[519]:

for col in dist_cos:
    print(col)
    print(max(dist_cos[col]))
    print(" ")
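One way to answer the two questions above directly from dist_cos (a sketch; it assumes the DataFrame built in the cell above and masks the zero diagonal):

import numpy as np

d = dist_cos.values.copy()
np.fill_diagonal(d, np.nan)  # each princess has distance 0 to herself

i, j = np.unravel_index(np.nanargmin(d), d.shape)
print("Most alike:    ", dist_cos.index[i], "-", dist_cos.columns[j])

i, j = np.unravel_index(np.nanargmax(d), d.shape)
print("Most different:", dist_cos.index[i], "-", dist_cos.columns[j])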
    with warnings.catch_warnings(record=True) as warning_list:
        warnings.simplefilter("always", UserWarning)
        # Use the copy argument, to raise a warning
        ward_tree(X, copy=True)
    # We should be getting 1 warning: for using the copy argument
    assert_equal(len(warning_list), 1)

    # Let's test a hierarchical clustering on a precomputed distances matrix
    dis = cosine_distances(X)
    res = linkage_tree(dis, affinity="precomputed")
    assert_array_equal(res[0], linkage_tree(X, affinity="cosine")[0])


def test_structured_linkage_tree():
    """
    Check that we obtain the correct solution for structured linkage trees.
    """
    rnd = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=np.bool)
    # Avoiding a mask with only 'True' entries
    mask[4:7, 4:7] = 0
    X = rnd.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    for tree_builder in _TREE_BUILDERS.values():
def discr_stat(X, Y, dissimilarity="euclidean", remove_isolates=True, return_rdfs=True):
    """
    Computes the discriminability statistic.

    Parameters
    ----------
    X : array, shape (n_samples, n_features) or (n_samples, n_samples)
        Input data. If dissimilarity=='precomputed', the input should be the dissimilarity matrix.
    Y : 1d-array, shape (n_samples)
        Input labels.
    dissimilarity : str, {"euclidean" (default), "precomputed"}
        Dissimilarity measure to use:
        - 'euclidean': Pairwise Euclidean distances between points in the dataset.
        - 'precomputed': Pre-computed dissimilarities.
    remove_isolates : bool, optional, default=True
        Whether to remove data that have single label.
    return_rdfs : bool, optional, default=True
        Whether to return rdf for all data points.

    Returns
    -------
    stat : float
        Discriminability statistic.
    rdfs : array, shape (n_samples, max{len(id)})
        Rdfs for each sample. Only returned if ``return_rdfs==True``.
    """
    check_X_y(X, Y, accept_sparse=True)

    uniques, counts = np.unique(Y, return_counts=True)
    if (counts != 1).sum() <= 1:
        msg = "You have passed a vector containing only a single unique sample id."
        raise ValueError(msg)
    if remove_isolates:
        idx = np.isin(Y, uniques[counts != 1])
        labels = Y[idx]

        if dissimilarity == "euclidean" or dissimilarity == "cosine" or dissimilarity == "haversine" or \
                dissimilarity == "manhattan" or dissimilarity == "mahalanobis":
            X = X[idx]
        else:
            X = X[np.ix_(idx, idx)]
    else:
        labels = Y

    if dissimilarity == "euclidean":
        dissimilarities = nan_euclidean_distances(X)
    elif dissimilarity == "cosine":
        dissimilarities = cosine_distances(X)
    elif dissimilarity == "haversine":
        dissimilarities = haversine_distances(X)
    elif dissimilarity == "manhattan":
        dissimilarities = manhattan_distances(X)
    else:
        dissimilarities = X

    rdfs = _discr_rdf(dissimilarities, labels)
    rdfs[rdfs < 0.5] = np.nan
    stat = np.nanmean(rdfs)

    if return_rdfs:
        return stat, rdfs
    else:
        return stat
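A minimal call sketch for discr_stat above (the sample counts and label layout are made up for illustration):

import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(20, 5)              # 20 samples, 5 features
Y = np.repeat(np.arange(4), 5)   # 4 subject ids, 5 repeated measurements each

# cosine dissimilarities are computed internally via cosine_distances
stat, rdfs = discr_stat(X, Y, dissimilarity="cosine")
print(stat, rdfs.shape)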
def __cal__(self, a: np.ndarray, b: np.ndarray): from sklearn.metrics.pairwise import cosine_distances return cosine_distances(a, b)
for date in s_text.keys(): data += 1 bar.update(data) sample = s_text[date] sample_doc = s_all_doc[date] if (len(sample) < 20): continue tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2) tf = tf_vectorizer.fit_transform(sample) feature_names = tf_vectorizer.get_feature_names() cosine_sim = cosine_similarity(tf) cosine_dis = cosine_distances(tf) # get labels_true topic_labels, minor_class_labels, major_class_labels, topic_dense = get_labels_true( sample_doc) # DBSCAN db_group, db_labels, db_n_clusters = dbscan(cosine_dis) centroid_list, centroid_word_list, score_list, centroid_name_list, size_list = find_centroid_score( tf, db_group, feature_names, topic_labels) # print type(centroid_list),centroid_list tweet_id_list, author_list = get_tweet_id(sample_doc, db_group) write_event(date, 'DBSCAN', feature_names, centroid_list, centroid_word_list, score_list, centroid_name_list, size_list, tweet_id_list, author_list)
def search_in_gallery(self, embedding): ''' Takes input embedding vector and searches it in the gallery. ''' distances = cosine_distances(embedding, self.embeddings).reshape([-1]) sorted_indexes = np.argsort(distances) return sorted_indexes, distances
def cosine_dist(): return cosine_distances(paths)
17 + 47 * test_ind:47 * (test_ind + 1), :][sorted_mouse_bool, :, :], axis=1) missing = np.isnan(train_traces[:, 0]) | np.isnan(test_traces[:, 0]) # train_traces = train_traces[~missing & resorted_drop, :] # test_traces = test_traces[~missing & resorted_drop, :] train_traces = train_traces[~missing, :] test_traces = test_traces[~missing, :] if test_traces.shape[0] > 0: logger.info(f'{m} {train_stage}-{test_stage}: {test_traces.shape[0]} cells remain after matching') else: logger.warning(f'{m} {train_stage}-{test_stage}: {test_traces.shape[0]} cells matched, skipping') continue # take cosine distance for c, cue in enumerate(cues): cos[train_ind, test_ind, c, mc] = cosine_distances(train_traces[:, c][None, :], test_traces[:, c][None, :])[0][0] norms[train_ind, test_ind, c, mc] = (np.linalg.norm(train_traces[:, c]) - np.linalg.norm(test_traces[:, c])) / np.linalg.norm(train_traces[:, c]) mdiff[train_ind, test_ind, c, mc] = (np.mean(train_traces[:, c]) - np.mean(test_traces[:, c])) / np.mean(train_traces[:, c]) mdiff2[train_ind, test_ind, c, mc] = np.mean(train_traces[:, c] - test_traces[:, c]) mdiff3[train_ind, test_ind, c, mc] = np.mean(np.abs(train_traces[:, c] - test_traces[:, c])) mdiff_cells[train_ind, test_ind, c, mc] = np.mean(train_traces[:, c]) - np.mean(test_traces[:, c]) logger.info(f'Finished model: {mod}\n') # plot heatmap ax = [] fig = plt.figure(figsize=(30, 15)) gs = fig.add_gridspec(100, 110) ax.append(fig.add_subplot(gs[:, 3:5]))
def fit_joint_all(self, vectors, orig_groups, article_ids, xy_embeddings, sparse_matrix, filtered_matrix, loss_weight, low_weight=0.05): N, D = vectors.shape K = self.k embeddings = xy_embeddings.iloc[:, 1:].values best_group = orig_groups # initialize the first 'k' elements in the dataset to be the initial centroids high_centroid = np.stack(vectors[:K]) assert high_centroid.shape == (K, D) low_centroid = np.stack(embeddings[:K]) # low dimensional clustering for i in range(self.max_iterations): # get cosine distance betw each point and the cenroids, N x k high_dim_dist = cosine_distances(vectors, high_centroid) assert high_dim_dist.shape == (N, K) # Calculate normalized euclidean distance in low dimensional space low_dim_dist = euclidean_distances(embeddings, low_centroid) xy_range = (np.max(embeddings) - np.min(embeddings)) max_dist = np.sqrt(xy_range * xy_range + xy_range * xy_range) low_dim_dist /= max_dist # Calculate the label distance country_matrix = generate_country_matrix(best_group, article_ids) country_label = country_matrix.dot(filtered_matrix) country_label = normalize(country_label, axis=1) label_scores = cosine_distances(filtered_matrix, country_label) # Calculate loss dis_mat = high_dim_dist * ( 1 - low_weight - loss_weight ) + low_dim_dist * low_weight - label_scores * loss_weight best_group = assign_best_groups(dis_mat, article_ids) assert best_group.shape == (N, 2) # calculate the # of articles per group points_per_group = np.zeros(K) + 1e-6 np.add.at(points_per_group, best_group['country'], 1) # calculate the new centroid by averaging the new centroid at each cluster in both high and low dim high_centroid_new = np.zeros((K, D)) np.add.at(high_centroid_new, best_group['country'], vectors) high_centroid_new /= points_per_group.repeat(D).reshape(K, D) low_centroid_new = np.zeros((K, 2)) np.add.at(low_centroid_new, best_group['country'], embeddings) low_centroid_new /= points_per_group.repeat(2).reshape(K, 2) # break out of the main loop if the results are optimal, ie. the centroids don't change their positions # much(more than our tolerance) centroid_changes = np.sum(np.abs(high_centroid_new - high_centroid), axis=1) assert centroid_changes.shape == (K, ) max_centroid_change = np.max(centroid_changes) high_centroid = high_centroid_new low_centroid = low_centroid_new if max_centroid_change < self.tolerance: break mean_distance = get_mean_centroid_distance(vectors, high_centroid, best_group['country']) return best_group, mean_distance
def columns_tfidf_cosine_distances(A: pd.DataFrame, B: pd.DataFrame, vectorizer: TfidfVectorizer): return cosine_distances(*[ vectorizer.transform(df.columns) for df in (A, B) ])
ngram_vectorizer = TfidfVectorizer(analyzer='char',ngram_range=(1, 3), min_df=1,sublinear_tf=True,lowercase=False) tf = ngram_vectorizer.fit_transform(job_docs) fnames = ngram_vectorizer.get_feature_names() dense = tf.todense() Cpp = [i for i,k in enumerate(fnames) if "C++" == k] new=np.reshape(np.array(dense[:,Cpp]),num_docs) ngramMat['C++'] = pd.Series(new,index=ngramMat.index) R = [i for i,k in enumerate(fnames) if " R" == k] new=np.reshape(np.array(dense[:,R]),num_docs) ngramMat['R'] = pd.Series(new,index=ngramMat.index) #%% remove duplicate docs DM_docs = cosine_distances(ngramMat) duplicates = np.zeros(num_docs) for n in range(0,num_docs): doc_dupes = np.sort(np.where(DM_docs[n,:] == 0))[0][1:] duplicates[doc_dupes] = 1 docs_for_removal = np.where(duplicates.astype('int')==1)[0] ngramMat.drop(docs_for_removal,inplace=True,axis=0) #%%setup some constants used later word_occurences=np.sum(ngramMat) num_words = word_occurences.shape[0]
def findSimilarBourbons(BourbonID): ''' This recommendation is finding similar bourbons not from the same distillery ''' bourbons = pd.DataFrame(list(Bourbon.objects.all().values())) # Remove all bourbons with the same distillery (keep BourbonID) distillery = bourbons.loc[bourbons['BourbonID'] == BourbonID]['Distillery'].values[0] bourbons = bourbons.loc[(bourbons['BourbonID'] == BourbonID) | (bourbons['Distillery'] != distillery)] # Remove items not being used for similarities (Bourbon Name, Distillery(we have location), Website, Description, id(from sqlite)) final = bourbons.drop( ['Bourbon', 'Distillery', 'Website', 'Description', 'id'], axis=1).set_index('BourbonID') # Encode the categorical data values le = preprocessing.LabelEncoder() final['Style'] = le.fit_transform(final['Style']) final['Type'] = le.fit_transform(final['Type']) final['Location'] = le.fit_transform(final['Location']) # Calculate the different distance similarity calculations (rank by average of each rank) cosine = cosine_distances(final.values) cosine = pd.DataFrame(cosine, columns=final.index.values, index=final.index)[BourbonID] euclidean = euclidean_distances(final.values) euclidean = pd.DataFrame(euclidean, columns=final.index.values, index=final.index)[BourbonID] manhattan = manhattan_distances(final.values) manhattan = pd.DataFrame(manhattan, columns=final.index.values, index=final.index)[BourbonID] allSimilarities = pd.concat([cosine, euclidean, manhattan], axis=1) allSimilarities.columns = [ 'Cosine Distance', 'Euclidean Distance', 'Manhattan Distance' ] allRank = allSimilarities.rank(axis=0) finalRank = allRank.mean(axis=1) bourbons['SimilaritiesRank'] = list(finalRank.values) bourbons = bourbons.sort_values(by="SimilaritiesRank") # For each recommendation, remove recurring distilleries (so I dont get bourbons from the same distillery) bourbons = bourbons[bourbons['BourbonID'] != BourbonID] for index, row in bourbons.iterrows(): if row['BourbonID'] in bourbons['BourbonID'].tolist(): distillery = bourbons.loc[bourbons['BourbonID'] == row['BourbonID']]['Distillery'].values[0] bourbons = bourbons.loc[(bourbons['BourbonID'] == row['BourbonID']) | (bourbons['Distillery'] != distillery)] topBourbons = bourbons.head() topBourbons.apply(saveSimilarities, axis=1)
import sys import numpy as np from sklearn.cluster import DBSCAN from sklearn.metrics.pairwise import cosine_distances from facedata import FaceData, Spotting face_data = FaceData() items = face_data.find_by_location('entrance') embs = np.stack([item.vector for item in items]) print('faces found: %d ' % embs.shape[0]) db = DBSCAN(0.45, metric='precomputed') dist = cosine_distances(embs) clusters = db.fit_predict(dist) print('clustered: %d' % (len(set(clusters)) - 1)) groups = [[] for i in range(len(set(clusters)) - 1)] for i, label in enumerate(clusters): if label == -1: continue groups[label].append(i) def triage(group): keyed_by_hour = {} for item in group: key = item.spotted_at.split(' ')[1].split(':')[0] if keyed_by_hour.get(key, None) is None: keyed_by_hour[key] = [] keyed_by_hour[key].append(item) keep = []
# extract the terms-by-documents matrix # in scipy compressed sparse column format sparse_movies_tdm = tdm_method.fit_transform(parsed_text) # convert sparse matrix into regular terms-by-documents matrix movies_tdm = sparse_movies_tdm.todense() # define the documents-by-terms matrix movies_dtm = movies_tdm.transpose() # dissimilarity measures and multidimensional scaling # consider alternative pairwise distance metrics from sklearn modules # euclidean_distances, cosine_distances, manhattan_distances (city-block) # note that different metrics provide different solutions # movies_distance_matrix = euclidean_distances(movies_tdm) # movies_distance_matrix = manhattan_distances(movies_tdm) movies_distance_matrix = cosine_distances(movies_tdm) mds_method = manifold.MDS(n_components = 2, random_state = 9999,\ dissimilarity = 'precomputed') mds_fit = mds_method.fit(movies_distance_matrix) mds_coordinates = mds_method.fit_transform(movies_distance_matrix) # plot tagline text for years in two dimensions # defined by multidimensional scaling plt.figure() plt.scatter(mds_coordinates[:,0],mds_coordinates[:,1],\ facecolors = 'none', edgecolors = 'none') # plots points in white (invisible) labels = [] for iyear in range(1974, 2014): labels.append(str(iyear)) for label, x, y in zip(labels, mds_coordinates[:, 0], mds_coordinates[:, 1]):
def cosine(X, Y=None, Y_norm_squared=None, squared=False): return cosine_distances(X, Y)
# extract the terms-by-documents matrix # in scipy compressed sparse column format sparse_movies_tdm = tdm_method.fit_transform(parsed_text) # convert sparse matrix into regular terms-by-documents matrix movies_tdm = sparse_movies_tdm.todense() # define the documents-by-terms matrix movies_dtm = movies_tdm.transpose() # dissimilarity measures and multidimensional scaling # consider alternative pairwise distance metrics from sklearn modules # euclidean_distances, cosine_distances, manhattan_distances (city-block) # note that different metrics provide different solutions # movies_distance_matrix = euclidean_distances(movies_tdm) # movies_distance_matrix = manhattan_distances(movies_tdm) movies_distance_matrix = cosine_distances(movies_tdm) mds_method = manifold.MDS(n_components = 2, random_state = 9999,\ dissimilarity = 'precomputed') mds_fit = mds_method.fit(movies_distance_matrix) mds_coordinates = mds_method.fit_transform(movies_distance_matrix) # plot tagline text for years in two dimensions # defined by multidimensional scaling plt.figure() plt.scatter(mds_coordinates[:,0],mds_coordinates[:,1],\ facecolors = 'none', edgecolors = 'none') # plots points in white (invisible) labels = [] for iyear in range(1974,2014): labels.append(str(iyear)) for label, x, y in zip(labels, mds_coordinates[:,0], mds_coordinates[:,1]):
# using the word2vec model word2idx = tokenizer.word_index idx2word = {v:k for k, v in word2idx.items()} # retrieve the weights from the first dense layer. This will convert # the input vector from a one-hot sum of two words to a dense 300 # dimensional representation W, b = model.layers[0].get_weights() idx2emb = {} for word in word2idx.keys(): wid = word2idx[word] vec_in = ohe.fit_transform(np.array(wid)).todense() vec_emb = np.dot(vec_in, W) idx2emb[wid] = vec_emb for word in ["stupid", "alice", "succeeded"]: wid = word2idx[word] source_emb = idx2emb[wid] distances = [] for i in range(1, vocab_size): if i == wid: continue target_emb = idx2emb[i] distances.append(((wid, i), cosine_distances(source_emb, target_emb))) sorted_distances = sorted(distances, key=operator.itemgetter(1))[0:10] predictions = [idx2word[x[0][1]] for x in sorted_distances] print("{:s} => {:s}".format(word, ", ".join(predictions)))
def analyse(self, test_data, mode):
    # Vectorise
    cluster_test = self.dv.transform(test_data)
    # Scale
    cluster_test = self.scaler.transform(cluster_test)

    cluster_results = []

    if self.number_of_authors is None:
        for i in range(len(cluster_test)):
            closest_center = None
            closest_distance = None
            for j in range(len(self.centers)):
                # cosine distance between this sample and centre j
                distance = cosine_distances([cluster_test[i], self.centers[j]])[0][1]
                if closest_distance is None:
                    closest_distance = distance
                    closest_center = j
                elif distance < closest_distance:
                    closest_distance = distance
                    closest_center = j
            self.predictions.append(closest_center)
    else:
        self.predictions = self.kmeans.predict(cluster_test)

    feature_significance_level = 2
    mode_code = 1

    if mode == 'diverge':
        mode_code = 2
        feature_significance_level = 0.2

    for i in range(len(self.predictions)):
        prediction = self.predictions[i]
        result_set = {}
        result_set["significant_features"] = []
        result_set["cluster"] = int(prediction)
        result_set['dist'] = cosine_distances([cluster_test[i], self.centers[prediction]])[0][1] * 100

        for j in range(len(self.centers[prediction])):
            feature_name = self.feats[j]
            if feature_name == 'capital_count':
                print(feature_name)
            unscaled_test_value = 0
            try:
                unscaled_test_value = test_data[i][feature_name]
            except:
                pass
            centroid_value = self.centers[prediction][j]
            if centroid_value != 0 and unscaled_test_value > 0:
                test_value = cluster_test[i][j]
                feature_difference = abs(test_value - centroid_value)
                if mode_code == 2:
                    if feature_significance_level > feature_difference and unscaled_test_value > 0:
                        result_set["significant_features"].append(
                            (feature_name, feature_difference, test_value, centroid_value))
                else:
                    if feature_difference > feature_significance_level and unscaled_test_value > 0:
                        result_set["significant_features"].append(
                            (feature_name, feature_difference, test_value, centroid_value))
        cluster_results.append(result_set)
    return cluster_results
def cosine_distance(v1, v2): #As cosine similarity interval is [-1.0, 1.0], the cosine distance interval is [0.0, 2.0]. #This normalizes the cosine distance to interval [0.0, 1.0] return pairwise.cosine_distances(v1, v2) / 2.0
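A quick check of the normalized range described in the comments above (a sketch; `pairwise` is assumed to have been imported from sklearn.metrics in the surrounding module, and the vector is made up):

import numpy as np
from sklearn.metrics import pairwise

v = np.array([[1.0, 2.0, 3.0]])

print(cosine_distance(v, v))    # identical vectors -> ~0.0 after dividing by 2
print(cosine_distance(v, -v))   # opposite vectors  -> ~1.0 after dividing by 2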
'control': 3.721765211295327, 'democratic': 3.1026721743330414, 'governments': 4.167571323949673, 'in': 0.0009654063501214492, 'law': 2.4538226269605703, 'popular': 2.764478952022998, 'response': 4.261461747058352, 'to': 0.04694493768179923} word_indices = [map_index_to_word[word] for word in tweet.keys()] tweet_tf_idf = scipy.sparse.csr_matrix((list(tweet.values()), ([0] * len(word_indices), word_indices)), shape=(1, tf_idf.shape[1])) obama_tf_idf = tf_idf[obama_id] print("The cosine distance between Obama's article and the tweet is {:.6e}." .format(cosine_distances(obama_tf_idf, tweet_tf_idf)[0, 0])) print(''' With cosine distances, the tweet is "nearer" to Barack Obama. Ignoring article lengths completely resulted in nonsensical results. In practice, it is common to enforce maximum or minimum document lengths. ''') # QUIZ QUESTIONS: print("Quiz Questions:") # 1. Among the words that appear in both Barack Obama and Francisco Barrio, # take the 5 that appear most frequently in Obama. # How many of the articles in the Wikipedia dataset contain all of those 5 words? print("1. Among the words that appear in both Barack Obama and Francisco Barrio, ") print(" take the 5 that appear most frequently in Obama.") print(" There are {:d} articles in the Wikipedia dataset contain all of those 5 words.\n" .format(has_top_words_count[True]))
def computeSimilarities(answers, response, vectorizer): response_transformed = vectorizer.transform([response]) answers_transformed = vectorizer.transform(answers) distances = cosine_distances(response_transformed, answers_transformed) return [1 - x for x in distances[0]]
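# Hypothetical usage sketch (not from the original project) for the helper
# above; the TfidfVectorizer is an assumed stand-in for whatever vectorizer
# the caller fits elsewhere -- any object with a .transform method works.
from sklearn.feature_extraction.text import TfidfVectorizer

answers = ["the cat sat on the mat", "dogs bark loudly at night"]
response = "a cat was sitting on a mat"
vectorizer = TfidfVectorizer().fit(answers + [response])
print(computeSimilarities(answers, response, vectorizer))
# one cosine similarity per answer; higher means more similar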
print("Test score: {:.3f}, accuracy: {:.3f}".format(score[0], score[1])) # using the word2vec model word2idx = tokenizer.word_index idx2word = {v: k for k, v in word2idx.items()} # retrieve the weights from the first dense layer. This will convert # the input vector from a one-hot sum of two words to a dense 300 # dimensional representation W, b = model.layers[0].get_weights() idx2emb = {} for word in word2idx.keys(): wid = word2idx[word] vec_in = ohe.fit_transform(np.array(wid)).todense() vec_emb = np.dot(vec_in, W) idx2emb[wid] = vec_emb for word in ["stupid", "alice", "succeeded"]: wid = word2idx[word] source_emb = idx2emb[wid] distances = [] for i in range(1, vocab_size): if i == wid: continue target_emb = idx2emb[i] distances.append(((wid, i), cosine_distances(source_emb, target_emb))) sorted_distances = sorted(distances, key=operator.itemgetter(1))[0:10] predictions = [idx2word[x[0][1]] for x in sorted_distances] print("{:s} => {:s}".format(word, ", ".join(predictions)))
def test_pairwise_distances(): # Test the pairwise_distance helper function. rng = np.random.RandomState(0) # Euclidean distance should be equivalent to calling the function. X = rng.random_sample((5, 4)) S = pairwise_distances(X, metric="euclidean") S2 = euclidean_distances(X) assert_array_almost_equal(S, S2) # Euclidean distance, with Y != X. Y = rng.random_sample((2, 4)) S = pairwise_distances(X, Y, metric="euclidean") S2 = euclidean_distances(X, Y) assert_array_almost_equal(S, S2) # Test with tuples as X and Y X_tuples = tuple([tuple([v for v in row]) for row in X]) Y_tuples = tuple([tuple([v for v in row]) for row in Y]) S2 = pairwise_distances(X_tuples, Y_tuples, metric="euclidean") assert_array_almost_equal(S, S2) # "cityblock" uses sklearn metric, cityblock (function) is scipy.spatial. S = pairwise_distances(X, metric="cityblock") S2 = pairwise_distances(X, metric=cityblock) assert_equal(S.shape[0], S.shape[1]) assert_equal(S.shape[0], X.shape[0]) assert_array_almost_equal(S, S2) # The manhattan metric should be equivalent to cityblock. S = pairwise_distances(X, Y, metric="manhattan") S2 = pairwise_distances(X, Y, metric=cityblock) assert_equal(S.shape[0], X.shape[0]) assert_equal(S.shape[1], Y.shape[0]) assert_array_almost_equal(S, S2) # Low-level function for manhattan can divide in blocks to avoid # using too much memory during the broadcasting S3 = manhattan_distances(X, Y, size_threshold=10) assert_array_almost_equal(S, S3) # Test cosine as a string metric versus cosine callable # "cosine" uses sklearn metric, cosine (function) is scipy.spatial S = pairwise_distances(X, Y, metric="cosine") S2 = pairwise_distances(X, Y, metric=cosine) assert_equal(S.shape[0], X.shape[0]) assert_equal(S.shape[1], Y.shape[0]) assert_array_almost_equal(S, S2) # Test with sparse X and Y, # currently only supported for Euclidean, L1 and cosine. X_sparse = csr_matrix(X) Y_sparse = csr_matrix(Y) S = pairwise_distances(X_sparse, Y_sparse, metric="euclidean") S2 = euclidean_distances(X_sparse, Y_sparse) assert_array_almost_equal(S, S2) S = pairwise_distances(X_sparse, Y_sparse, metric="cosine") S2 = cosine_distances(X_sparse, Y_sparse) assert_array_almost_equal(S, S2) S = pairwise_distances(X_sparse, Y_sparse.tocsc(), metric="manhattan") S2 = manhattan_distances(X_sparse.tobsr(), Y_sparse.tocoo()) assert_array_almost_equal(S, S2) S2 = manhattan_distances(X, Y) assert_array_almost_equal(S, S2) # Test with scipy.spatial.distance metric, with a kwd kwds = {"p": 2.0} S = pairwise_distances(X, Y, metric="minkowski", **kwds) S2 = pairwise_distances(X, Y, metric=minkowski, **kwds) assert_array_almost_equal(S, S2) # same with Y = None kwds = {"p": 2.0} S = pairwise_distances(X, metric="minkowski", **kwds) S2 = pairwise_distances(X, metric=minkowski, **kwds) assert_array_almost_equal(S, S2) # Test that scipy distance metrics throw an error if sparse matrix given assert_raises(TypeError, pairwise_distances, X_sparse, metric="minkowski") assert_raises(TypeError, pairwise_distances, X, Y_sparse, metric="minkowski") # Test that a value error is raised if the metric is unknown assert_raises(ValueError, pairwise_distances, X, Y, metric="blah")
plt.annotate(label, (x_value, y_value), xytext=(0, space), textcoords='offset points', ha='center', va=va) fig = plt.gcf() plt.show() fig.savefig(os.path.join("Outputs", "lsk_sparse_kmeans_user_counts_6.png")) print kmeans.inertia_ #################################### # Try mds on sparse counts user_counts.drop(['kcluster'], axis=1, inplace=True) user_affinity = cosine_distances(user_counts) clf = manifold.MDS(n_components=2, dissimilarity='precomputed') ny_mds = clf.fit_transform(user_affinity) plt.scatter(ny_mds[:, 0], ny_mds[:, 1]) plt.xlabel("First Component") plt.ylabel("Second Component") fig = plt.gcf() plt.show() fig.savefig(os.path.join("Outputs", "lsmk_sparse_mds.png")) # Check silhouette scores range_n_clusters = [2, 3, 4, 5, 6, 7, 8, 9, 10] for n_clusters in range_n_clusters: kmeans = KMeans(n_clusters=n_clusters).fit(ny_mds) cluster_labels = kmeans.fit_predict(ny_mds)
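# Hypothetical standalone sketch (not part of the original script) of the
# silhouette check the loop above is setting up: cluster a stand-in 2-D
# embedding for several values of k and report silhouette_score for each.
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

rng = np.random.RandomState(0)
embedding = rng.rand(60, 2)          # stand-in for the MDS output ny_mds
for n_clusters in range(2, 6):
    labels = KMeans(n_clusters=n_clusters, n_init=10,
                    random_state=0).fit_predict(embedding)
    print(n_clusters, silhouette_score(embedding, labels))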
def test_pairwise_distances(): # Test the pairwise_distance helper function. rng = np.random.RandomState(0) # Euclidean distance should be equivalent to calling the function. X = rng.random_sample((5, 4)) S = pairwise_distances(X, metric="euclidean") S2 = euclidean_distances(X) assert_array_almost_equal(S, S2) # Euclidean distance, with Y != X. Y = rng.random_sample((2, 4)) S = pairwise_distances(X, Y, metric="euclidean") S2 = euclidean_distances(X, Y) assert_array_almost_equal(S, S2) # Test with tuples as X and Y X_tuples = tuple([tuple([v for v in row]) for row in X]) Y_tuples = tuple([tuple([v for v in row]) for row in Y]) S2 = pairwise_distances(X_tuples, Y_tuples, metric="euclidean") assert_array_almost_equal(S, S2) # Test haversine distance # The data should be valid latitude and longitude X = rng.random_sample((5, 2)) X[:, 0] = (X[:, 0] - 0.5) * 2 * np.pi/2 X[:, 1] = (X[:, 1] - 0.5) * 2 * np.pi S = pairwise_distances(X, metric="haversine") S2 = haversine_distances(X) assert_array_almost_equal(S, S2) # Test haversine distance, with Y != X Y = rng.random_sample((2, 2)) Y[:, 0] = (Y[:, 0] - 0.5)*2*np.pi/2 Y[:, 1] = (Y[:, 1] - 0.5)*2*np.pi S = pairwise_distances(X, Y, metric="haversine") S2 = haversine_distances(X, Y) assert_array_almost_equal(S, S2) # "cityblock" uses scikit-learn metric, cityblock (function) is # scipy.spatial. S = pairwise_distances(X, metric="cityblock") S2 = pairwise_distances(X, metric=cityblock) assert_equal(S.shape[0], S.shape[1]) assert_equal(S.shape[0], X.shape[0]) assert_array_almost_equal(S, S2) # The manhattan metric should be equivalent to cityblock. S = pairwise_distances(X, Y, metric="manhattan") S2 = pairwise_distances(X, Y, metric=cityblock) assert_equal(S.shape[0], X.shape[0]) assert_equal(S.shape[1], Y.shape[0]) assert_array_almost_equal(S, S2) # Test cosine as a string metric versus cosine callable # The string "cosine" uses sklearn.metric, # while the function cosine is scipy.spatial S = pairwise_distances(X, Y, metric="cosine") S2 = pairwise_distances(X, Y, metric=cosine) assert_equal(S.shape[0], X.shape[0]) assert_equal(S.shape[1], Y.shape[0]) assert_array_almost_equal(S, S2) # Test with sparse X and Y, # currently only supported for Euclidean, L1 and cosine. X_sparse = csr_matrix(X) Y_sparse = csr_matrix(Y) S = pairwise_distances(X_sparse, Y_sparse, metric="euclidean") S2 = euclidean_distances(X_sparse, Y_sparse) assert_array_almost_equal(S, S2) S = pairwise_distances(X_sparse, Y_sparse, metric="cosine") S2 = cosine_distances(X_sparse, Y_sparse) assert_array_almost_equal(S, S2) S = pairwise_distances(X_sparse, Y_sparse.tocsc(), metric="manhattan") S2 = manhattan_distances(X_sparse.tobsr(), Y_sparse.tocoo()) assert_array_almost_equal(S, S2) S2 = manhattan_distances(X, Y) assert_array_almost_equal(S, S2) # Test with scipy.spatial.distance metric, with a kwd kwds = {"p": 2.0} S = pairwise_distances(X, Y, metric="minkowski", **kwds) S2 = pairwise_distances(X, Y, metric=minkowski, **kwds) assert_array_almost_equal(S, S2) # same with Y = None kwds = {"p": 2.0} S = pairwise_distances(X, metric="minkowski", **kwds) S2 = pairwise_distances(X, metric=minkowski, **kwds) assert_array_almost_equal(S, S2) # Test that scipy distance metrics throw an error if sparse matrix given assert_raises(TypeError, pairwise_distances, X_sparse, metric="minkowski") assert_raises(TypeError, pairwise_distances, X, Y_sparse, metric="minkowski") # Test that a value error is raised if the metric is unknown assert_raises(ValueError, pairwise_distances, X, Y, metric="blah")
def Similarity(filename, resumename): array_main = filename #df_main = pd.read_csv(filename) #df_main = df_main.drop(["Unnamed: 0"], axis = 1) #array_main = df_main.values #------------------------- dataset = "resume_dataset.csv" df = table_preprocess(dataset) # Create a list of job titles, descriptions, and companies jd = df['Resume'].tolist() categories = df['Category'].tolist() print(len(categories)) #------------------------- # Resume vector data = Word2Vec_Vectorize(str(resumename)) data_array = np.array(data) data_array_reshaped = data_array.reshape(1, -1) #------------------------- cos_dist = [] for vec in array_main: vec = np.array(vec) vec = vec.reshape(1, -1) cos_dist.append(float(cosine_distances(vec, data_array_reshaped))) #----------------------- ps = PorterStemmer() key_list = [] for j in jd: key = '' w = set() for word in keywords(j).split('\n'): w.add(ps.stem(word)) for x in w: key += '{} '.format(x) key_list.append(key) print(len(cos_dist)) summary = pd.DataFrame({ 'Cosine Distances': cos_dist, "Category": categories, 'Resume': jd }) z = summary.sort_values('Cosine Distances', ascending=False) z.to_csv('Summary_res_vec.csv', encoding="utf-8") #-------------------------------- # Plot graphs # array_main = df_main.values # array_list = array_main.tolist() #data_list = data[0] #array_list.append(data_list) #mean_vec = array_list #plot_pca(array_list) #plot_pca(array_list) return z.head()
def test_pairwise_distances(): # Test the pairwise_distance helper function. rng = np.random.RandomState(0) # Euclidean distance should be equivalent to calling the function. X = rng.random_sample((5, 4)) S = pairwise_distances(X, metric="euclidean") S2 = euclidean_distances(X) assert_array_almost_equal(S, S2) # Euclidean distance, with Y != X. Y = rng.random_sample((2, 4)) S = pairwise_distances(X, Y, metric="euclidean") S2 = euclidean_distances(X, Y) assert_array_almost_equal(S, S2) # Check to ensure NaNs work with pairwise_distances. X_masked = rng.random_sample((5, 4)) Y_masked = rng.random_sample((2, 4)) X_masked[0, 0] = np.nan Y_masked[0, 0] = np.nan S_masked = pairwise_distances(X_masked, Y_masked, metric="nan_euclidean") S2_masked = nan_euclidean_distances(X_masked, Y_masked) assert_array_almost_equal(S_masked, S2_masked) # Test with tuples as X and Y X_tuples = tuple([tuple([v for v in row]) for row in X]) Y_tuples = tuple([tuple([v for v in row]) for row in Y]) S2 = pairwise_distances(X_tuples, Y_tuples, metric="euclidean") assert_array_almost_equal(S, S2) # Test haversine distance # The data should be valid latitude and longitude X = rng.random_sample((5, 2)) X[:, 0] = (X[:, 0] - 0.5) * 2 * np.pi / 2 X[:, 1] = (X[:, 1] - 0.5) * 2 * np.pi S = pairwise_distances(X, metric="haversine") S2 = haversine_distances(X) assert_array_almost_equal(S, S2) # Test haversine distance, with Y != X Y = rng.random_sample((2, 2)) Y[:, 0] = (Y[:, 0] - 0.5) * 2 * np.pi / 2 Y[:, 1] = (Y[:, 1] - 0.5) * 2 * np.pi S = pairwise_distances(X, Y, metric="haversine") S2 = haversine_distances(X, Y) assert_array_almost_equal(S, S2) # "cityblock" uses scikit-learn metric, cityblock (function) is # scipy.spatial. S = pairwise_distances(X, metric="cityblock") S2 = pairwise_distances(X, metric=cityblock) assert S.shape[0] == S.shape[1] assert S.shape[0] == X.shape[0] assert_array_almost_equal(S, S2) # The manhattan metric should be equivalent to cityblock. S = pairwise_distances(X, Y, metric="manhattan") S2 = pairwise_distances(X, Y, metric=cityblock) assert S.shape[0] == X.shape[0] assert S.shape[1] == Y.shape[0] assert_array_almost_equal(S, S2) # Test cosine as a string metric versus cosine callable # The string "cosine" uses sklearn.metric, # while the function cosine is scipy.spatial S = pairwise_distances(X, Y, metric="cosine") S2 = pairwise_distances(X, Y, metric=cosine) assert S.shape[0] == X.shape[0] assert S.shape[1] == Y.shape[0] assert_array_almost_equal(S, S2) # Test with sparse X and Y, # currently only supported for Euclidean, L1 and cosine. 
X_sparse = csr_matrix(X) Y_sparse = csr_matrix(Y) S = pairwise_distances(X_sparse, Y_sparse, metric="euclidean") S2 = euclidean_distances(X_sparse, Y_sparse) assert_array_almost_equal(S, S2) S = pairwise_distances(X_sparse, Y_sparse, metric="cosine") S2 = cosine_distances(X_sparse, Y_sparse) assert_array_almost_equal(S, S2) S = pairwise_distances(X_sparse, Y_sparse.tocsc(), metric="manhattan") S2 = manhattan_distances(X_sparse.tobsr(), Y_sparse.tocoo()) assert_array_almost_equal(S, S2) S2 = manhattan_distances(X, Y) assert_array_almost_equal(S, S2) # Test with scipy.spatial.distance metric, with a kwd kwds = {"p": 2.0} S = pairwise_distances(X, Y, metric="minkowski", **kwds) S2 = pairwise_distances(X, Y, metric=minkowski, **kwds) assert_array_almost_equal(S, S2) # same with Y = None kwds = {"p": 2.0} S = pairwise_distances(X, metric="minkowski", **kwds) S2 = pairwise_distances(X, metric=minkowski, **kwds) assert_array_almost_equal(S, S2) # Test that scipy distance metrics throw an error if sparse matrix given with pytest.raises(TypeError): pairwise_distances(X_sparse, metric="minkowski") with pytest.raises(TypeError): pairwise_distances(X, Y_sparse, metric="minkowski") # Test that a value error is raised if the metric is unknown with pytest.raises(ValueError): pairwise_distances(X, Y, metric="blah")
""" # extra work : eu-distance of X from origin eu_origin_normX = euclidean_distances(norm_X, [[0,0,0,0]]) eu_origin_X = euclidean_distances(X, [[0,0,0,0]]) print(eu_origin_normX) print(eu_origin_X) """ """ c.ii Cosine distance = 1-Cosine_similarity """ cosine_dist = cosine_distances(norm_X,norm_X) #print(cosine_dist) print("Cosine matrix calculated for normalized X") plt.matshow(cosine_dist) plt.title("Cosine distance") plt.show() """ from sklearn.neighbors import DistanceMetric maha1 = DistanceMetric.get_metric('mahalanobis') maha = maha1.pairwise(norm_X, 'mahalanobis', 'V') print("mahahaaaaa")
def cosine_metric(w, X, y): return cosine_distances([X.dot(w)], [y])[0, 0]
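# Hypothetical standalone check (not from the original source): cosine_metric
# returns ~0.0 whenever the linear prediction X.dot(w) points in the same
# direction as y, regardless of magnitude, since cosine distance ignores scale.
import numpy as np
from sklearn.metrics.pairwise import cosine_distances

X = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]])
w = np.array([2.0, 3.0])
y = 10.0 * X.dot(w)   # same direction as the prediction, different scale
print(cosine_distances([X.dot(w)], [y])[0, 0])   # ~0.0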
plt.title('Distribution of document length') plt.xlabel('# of words') plt.ylabel('Percentage') plt.rcParams.update({'font.size': 16}) plt.tight_layout() plt.show() # drawback of cosine metric: ignores text length completely tweet = {'act': 3.4597778278724887, 'control': 3.721765211295327, 'democratic': 3.1026721743330414, 'governments': 4.167571323949673, 'in': 0.0009654063501214492, 'law': 2.4538226269605703, 'popular': 2.764478952022998, 'response': 4.261461747058352, 'to': 0.04694493768179923} word_indices = [map_index_to_word[word] for word in tweet.keys()] # data row id's col id's tweet_tf_idf = csr_matrix((tweet.values(), ([0]*len(word_indices), word_indices)), shape=(1, tf_idf.shape[1])) obama_tf_idf = tf_idf[35817] print cosine_distances(obama_tf_idf, tweet_tf_idf) distances, indices = model2_tf_idf.kneighbors(obama_tf_idf, n_neighbors=10) print distances
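# Standalone illustration (toy numbers, not from the original notebook) of the
# drawback flagged above: cosine distance ignores document length, so a tf-idf
# row and a scaled-up copy of it are at distance ~0 even though one "document"
# is far longer.
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_distances

doc = csr_matrix(np.array([[0.0, 3.1, 0.0, 2.4, 4.2]]))        # toy tf-idf row
longer_doc = 25 * doc                                          # same direction, 25x the weight
other_doc = csr_matrix(np.array([[5.0, 0.0, 1.0, 0.0, 0.0]]))  # different direction
print(cosine_distances(doc, longer_doc)[0, 0])   # ~0.0
print(cosine_distances(doc, other_doc)[0, 0])    # clearly > 0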