def visualize_class_separation(X, labels):
    _, (ax1, ax2) = pyplot.subplots(ncols=2)
    label_order = np.argsort(labels)
    ax1.imshow(pairwise_distances(X[label_order]), interpolation='nearest')
    ax2.imshow(pairwise_distances(labels[label_order, None]),
               interpolation='nearest')
    pyplot.show()
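# A minimal usage sketch of visualize_class_separation (illustrative only; the
# blob parameters below are assumptions, not part of the original snippet).
# With well-separated classes, the left panel shows a block-diagonal pattern
# that lines up with the sorted-label image on the right.
import numpy as np
from matplotlib import pyplot
from sklearn.datasets import make_blobs
from sklearn.metrics import pairwise_distances

X_demo, labels_demo = make_blobs(n_samples=150, centers=3, cluster_std=1.0,
                                 random_state=0)
visualize_class_separation(X_demo, labels_demo)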
def _hdbscan_generic(X, min_samples=5, alpha=1.0, metric='minkowski', p=2, leaf_size=None, gen_min_span_tree=False): if metric == 'minkowski': if p is None: raise TypeError('Minkowski metric given but no p value supplied!') if p < 0: raise ValueError('Minkowski metric with negative p value is not defined!') distance_matrix = pairwise_distances(X, metric=metric, p=p) else: distance_matrix = pairwise_distances(X, metric=metric) mutual_reachability_ = mutual_reachability(distance_matrix, min_samples, alpha) min_spanning_tree = mst_linkage_core(mutual_reachability_) if gen_min_span_tree: result_min_span_tree = min_spanning_tree.copy() for index, row in enumerate(result_min_span_tree[1:], 1): candidates = np.where(np.isclose(mutual_reachability_[row[1]], row[2]))[0] candidates = np.intersect1d(candidates, min_spanning_tree[:index, :2].astype(int)) candidates = candidates[candidates != row[1]] assert (len(candidates) > 0) row[0] = candidates[0] else: result_min_span_tree = None min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :] single_linkage_tree = label(min_spanning_tree) return single_linkage_tree, result_min_span_tree
def test_silhouette():
    # Tests the Silhouette Coefficient.
    dataset = datasets.load_iris()
    X = dataset.data
    y = dataset.target
    D = pairwise_distances(X, metric='euclidean')
    # Given that the actual labels are used, we can assume that S would be
    # positive.
    silhouette = silhouette_score(D, y, metric='precomputed')
    assert(silhouette > 0)
    # Test without calculating D
    silhouette_metric = silhouette_score(X, y, metric='euclidean')
    assert_almost_equal(silhouette, silhouette_metric)
    # Test with sampling
    silhouette = silhouette_score(D, y, metric='precomputed',
                                  sample_size=int(X.shape[0] / 2),
                                  random_state=0)
    silhouette_metric = silhouette_score(X, y, metric='euclidean',
                                         sample_size=int(X.shape[0] / 2),
                                         random_state=0)
    assert(silhouette > 0)
    assert(silhouette_metric > 0)
    assert_almost_equal(silhouette_metric, silhouette)
    # Test with sparse X
    X_sparse = csr_matrix(X)
    D = pairwise_distances(X_sparse, metric='euclidean')
    silhouette = silhouette_score(D, y, metric='precomputed')
    assert(silhouette > 0)
def smart_initialize(data, k, seed=None):
    """
    Use k-means++ to initialize a good set of centroids
    :param data: whole dataset (sparse matrix)
    :param k: number of centroids
    :param seed: random seed
    :return: initial centroids
    """
    if seed is not None:  # useful for obtaining consistent results
        np.random.seed(seed)

    centroids = np.zeros((k, data.shape[1]))

    # Randomly choose the first centroid.
    # Since we have no prior knowledge, choose uniformly at random
    idx = np.random.randint(data.shape[0])
    centroids[0] = data[idx, :].toarray()
    # Compute distances from the first centroid chosen to all the other data points
    distances = pairwise_distances(data, centroids[0:1], metric='euclidean').flatten()

    for i in range(1, k):
        # Choose the next centroid randomly, so that the probability for each data point
        # to be chosen is directly proportional to its squared distance from the nearest
        # centroid. Roughly speaking, a new centroid should be as far from the existing
        # centroids as possible.
        idx = np.random.choice(data.shape[0], 1, p=distances ** 2 / sum(distances ** 2))
        centroids[i] = data[idx, :].toarray()
        # Now compute distances from the centroids to all data points
        distances = np.min(pairwise_distances(data, centroids[0:i + 1], metric='euclidean'),
                           axis=1)

    return centroids
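# A small usage sketch for smart_initialize (hypothetical toy data): the
# function indexes rows with .toarray(), so it expects a SciPy sparse matrix.
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics import pairwise_distances

toy = csr_matrix(np.array([[0.0, 0.0], [0.1, 0.2], [0.2, 0.1],
                           [5.0, 5.0], [5.1, 4.9], [4.9, 5.2]]))
init_centroids = smart_initialize(toy, k=2, seed=0)
print(init_centroids)  # with high probability, one centroid from each group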
def _rsl_small_kdtree(X, cut, k=5, alpha=1.4142135623730951, gamma=5, metric='minkowski', p=2): if metric == 'minkowski': if p is None: raise TypeError('Minkowski metric given but no p value supplied!') if p < 0: raise ValueError('Minkowski metric with negative p value is not defined!') distance_matrix = pairwise_distances(X, metric=metric, p=p) else: distance_matrix = pairwise_distances(X, metric=metric) mutual_reachability_ = kdtree_mutual_reachability(X, distance_matrix, metric, p=p, min_points=k, alpha=alpha) min_spanning_tree = mst_linkage_core(mutual_reachability_) min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :] single_linkage_tree = label(min_spanning_tree) single_linkage_tree = SingleLinkageTree(single_linkage_tree) labels = single_linkage_tree.get_clusters(cut, gamma) return labels, single_linkage_tree
def _hdbscan_small_kdtree(X, min_cluster_size=5, min_samples=None, metric='minkowski', p=2): if metric == 'minkowski': if p is None: raise TypeError('Minkowski metric given but no p value supplied!') if p < 0: raise ValueError('Minkowski metric with negative p value is not defined!') distance_matrix = pairwise_distances(X, metric=metric, p=p) else: distance_matrix = pairwise_distances(X, metric=metric) mutual_reachability_ = kdtree_mutual_reachability(X, distance_matrix, metric, p=p, min_points=min_samples) min_spanning_tree = mst_linkage_core(mutual_reachability_) min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :] single_linkage_tree = label(min_spanning_tree) condensed_tree = condense_tree(single_linkage_tree, min_cluster_size) stability_dict = compute_stability(condensed_tree) cluster_list = get_clusters(condensed_tree, stability_dict) labels = -1 * np.ones(X.shape[0], dtype=int) for index, cluster in enumerate(cluster_list): labels[cluster] = index return labels, condensed_tree, single_linkage_tree, min_spanning_tree
def class_separation(X, labels):
    unique_labels, label_inds = np.unique(labels, return_inverse=True)
    ratio = 0
    for li in xrange(len(unique_labels)):
        Xc = X[label_inds == li]
        Xnc = X[label_inds != li]
        ratio += pairwise_distances(Xc).mean() / pairwise_distances(Xc, Xnc).mean()
    return ratio / len(unique_labels)
def eval(self, X): """Evaluate the kernel density estimation Parameters ---------- X : array_like array of points at which to evaluate the KDE. Shape is (n_points, n_dim), where n_dim matches the dimension of the training points. Returns ------- dens : ndarray array of shape (n_points,) giving the density at each point. The density will be normalized for metric='gaussian' or metric='tophat', and will be unnormalized otherwise. """ X = np.atleast_2d(X) if X.ndim != 2: raise ValueError('X must be two-dimensional') if X.shape[1] != self.X_.shape[1]: raise ValueError('dimensions of X do not match training dimension') if self.metric == 'gaussian': # wrangle gaussian into scikit-learn's 'rbf' kernel gamma = 0.5 / self.h / self.h D = pairwise_kernels(X, self.X_, metric='rbf', gamma=gamma) D /= np.sqrt(2 * np.pi * self.h ** (2 * X.shape[1])) dens = D.sum(1) elif self.metric == 'tophat': # use Ball Tree to efficiently count neighbors bt = BallTree(self.X_) counts = bt.query_radius(X, self.h, count_only=True) dens = counts / n_volume(self.h, X.shape[1]) elif self.metric == 'exponential': D = pairwise_distances(X, self.X_) dens = np.exp(-abs(D) / self.h) dens = dens.sum(1) dens /= n_volume(self.h, X.shape[1]) * special.gamma(X.shape[1]) elif self.metric == 'quadratic': D = pairwise_distances(X, self.X_) dens = (1 - (D / self.h) ** 2) dens[D > self.h] = 0 dens = dens.sum(1) dens /= 2. * n_volume(self.h, X.shape[1]) / (X.shape[1] + 2) else: D = pairwise_kernels(X, self.X_, metric=self.metric, **self.kwargs) dens = D.sum(1) return dens
def __call__(self, X_train, X_test, y_train, y_test):
    X = np.vstack([X_train, X_test])
    y = np.hstack([y_train, y_test])
    unique_labels, label_inds = np.unique(y, return_inverse=True)
    ratio = 0
    for li in range(len(unique_labels)):
        Xc = X[label_inds == li]
        Xnc = X[label_inds != li]
        ratio += pairwise_distances(Xc).mean() / pairwise_distances(Xc, Xnc).mean()
    return -ratio / len(unique_labels)
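# The two helpers above compute the same within/between-class distance ratio
# (the second negates it so that larger is better, scorer-style). A
# self-contained Python 3 restatement on iris, purely illustrative:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.metrics import pairwise_distances

def class_separation_ratio(X, labels):
    # mean within-class distance divided by mean between-class distance,
    # averaged over classes; smaller means better-separated classes
    unique_labels, label_inds = np.unique(labels, return_inverse=True)
    ratio = 0.0
    for li in range(len(unique_labels)):
        Xc, Xnc = X[label_inds == li], X[label_inds != li]
        ratio += pairwise_distances(Xc).mean() / pairwise_distances(Xc, Xnc).mean()
    return ratio / len(unique_labels)

X_iris, y_iris = load_iris(return_X_y=True)
print(class_separation_ratio(X_iris, y_iris))  # well below 1 for iris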
def outlier_clusters_ward(x, y, skill=None, memory=None): # TODO: incorporate skill data = np.vstack((x, y)).T if len(data) == 0: # uh. print 'clustering: NO cluster members!' cluster_centers = np.array([[-1, -1]]) cluster_labels = [] labels = [] n_clusters = 0 dist_within = np.array([]) elif len(data) == 1: print 'clustering: only 1 data point!' cluster_centers = data cluster_labels = [0] labels = np.array([0]) n_clusters = 1 dist_within = np.array([0]) else: dist_within = 1000 dist_max = 75 n_clusters = 0 n_clusters_max = 10 clusterer = AgglomerativeClustering(n_clusters=n_clusters, memory=memory) # while dist_within > dist_max, keep adding clusters while (dist_within > dist_max) * (n_clusters < n_clusters_max): # iterate n_clusters n_clusters += 1 clusterer.set_params(n_clusters=n_clusters) # cluster labels = clusterer.fit_predict(data) # get cluster_centers cluster_labels = range(n_clusters) cluster_centers = np.array([np.mean(data[labels == i], axis=0) for i in cluster_labels]) # find dist_within: the maximum pairwise distance inside a cluster dist_within = np.max([np.max(pairwise_distances( data[labels == i])) for i in cluster_labels]) dist_within_final = np.array([np.max(pairwise_distances( data[labels == i])) for i in cluster_labels]) return cluster_centers, cluster_labels, labels, n_clusters, dist_within_final
def test_precomputed(random_state=42): """Tests unsupervised NearestNeighbors with a distance matrix.""" # Note: smaller samples may result in spurious test success rng = np.random.RandomState(random_state) X = rng.random_sample((10, 4)) Y = rng.random_sample((3, 4)) DXX = metrics.pairwise_distances(X, metric='euclidean') DYX = metrics.pairwise_distances(Y, X, metric='euclidean') for method in ['kneighbors']: # TODO: also test radius_neighbors, but requires different assertion # As a feature matrix (n_samples by n_features) nbrs_X = neighbors.NearestNeighbors(n_neighbors=3) nbrs_X.fit(X) dist_X, ind_X = getattr(nbrs_X, method)(Y) # As a dense distance matrix (n_samples by n_samples) nbrs_D = neighbors.NearestNeighbors(n_neighbors=3, algorithm='brute', metric='precomputed') nbrs_D.fit(DXX) dist_D, ind_D = getattr(nbrs_D, method)(DYX) assert_array_almost_equal(dist_X, dist_D) assert_array_almost_equal(ind_X, ind_D) # Check auto works too nbrs_D = neighbors.NearestNeighbors(n_neighbors=3, algorithm='auto', metric='precomputed') nbrs_D.fit(DXX) dist_D, ind_D = getattr(nbrs_D, method)(DYX) assert_array_almost_equal(dist_X, dist_D) assert_array_almost_equal(ind_X, ind_D) # Check X=None in prediction dist_X, ind_X = getattr(nbrs_X, method)(None) dist_D, ind_D = getattr(nbrs_D, method)(None) assert_array_almost_equal(dist_X, dist_D) assert_array_almost_equal(ind_X, ind_D) # Must raise a ValueError if the matrix is not of correct shape assert_raises(ValueError, getattr(nbrs_D, method), X) target = np.arange(X.shape[0]) for Est in (neighbors.KNeighborsClassifier, neighbors.RadiusNeighborsClassifier, neighbors.KNeighborsRegressor, neighbors.RadiusNeighborsRegressor): print(Est) est = Est(metric='euclidean') est.radius = est.n_neighbors = 1 pred_X = est.fit(X, target).predict(Y) est.metric = 'precomputed' pred_D = est.fit(DXX, target).predict(DYX) assert_array_almost_equal(pred_X, pred_D)
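# The core pattern exercised by the test above, in isolation: fit on a square
# train-vs-train distance matrix and query with a rectangular test-vs-train
# matrix. The array sizes here are illustrative.
import numpy as np
from sklearn import neighbors
from sklearn.metrics import pairwise_distances

rng = np.random.RandomState(0)
X_tr = rng.random_sample((20, 5))
X_te = rng.random_sample((4, 5))

nn_pre = neighbors.NearestNeighbors(n_neighbors=3, metric='precomputed')
nn_pre.fit(pairwise_distances(X_tr))                                # (n_train, n_train)
dist_p, ind_p = nn_pre.kneighbors(pairwise_distances(X_te, X_tr))   # (n_test, n_train)

nn_raw = neighbors.NearestNeighbors(n_neighbors=3).fit(X_tr)
dist_r, ind_r = nn_raw.kneighbors(X_te)
assert np.allclose(dist_p, dist_r) and np.array_equal(ind_p, ind_r)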
def find_distance_matrix(self, metric='cosine'):
    '''
    Compute the distance matrix between topics using cosine or euclidean
    distance (default = cosine distance).
    '''
    if metric == 'cosine':
        self.distance_matrix = pairwise_distances(self.topics, metric='cosine')
        # diagonals should be exactly zero, so remove rounding errors
        numpy.fill_diagonal(self.distance_matrix, 0)
    elif metric == 'euclidean':
        self.distance_matrix = pairwise_distances(self.topics, metric='euclidean')
def update_clfs_M(self, clfs, M): self.clfs = clfs self.M = M self.knn_test_dist, self.knn_test = NearestNeighbors(self.k, algorithm='brute', metric='mahalanobis', VI=self.M).fit(self.X_train).kneighbors(self.X_test) self.preds_train = np.array([e.predict(self.X_train) for e in clfs]).T self.preds_proba_train = np.array([e.predict_proba(self.X_train) for e in clfs]).swapaxes(0,1) self.preds_proba_train_smoothed = self.preds_proba_train + 0.01 self.preds_test = np.array([e.predict(self.X_test) for e in clfs]).T self.preds_proba_test = np.array([e.predict_proba(self.X_test) for e in clfs]).swapaxes(0,1) self.pp_train = np.array([pt==yt for pt,yt in itertools.izip(self.preds_train, self.y_train)]) self.pp_test = np.array([pt==yt for pt,yt in itertools.izip(self.preds_test, self.y_test)]) self.pd_pp_test = pairwise_distances(self.pp_test, self.pp_train, metric='hamming') self.pd_preds_test = pairwise_distances(self.preds_test, self.preds_train, metric='hamming')
def update_input(self, clf): preds_train = np.array([e.predict(self.X_train) for e in clf.estimators_]).T self.pp_train = np.array([pt==yt for pt,yt in itertools.izip(preds_train, self.y_train)]) preds_test = np.array([e.predict(self.X_test) for e in clf.estimators_]).T self.pp_test = np.array([pt==yt for pt,yt in itertools.izip(preds_test, self.y_test)]) self.G = np.zeros(self.M.shape) self.active_set = None self.ij = [] self.ijl = [] self.loss = np.inf self.pd_pp = pairwise_distances(self.pp_train, metric='hamming') np.fill_diagonal(self.pd_pp, np.inf) self.pd_pp_test = pairwise_distances(self.pp_test, self.pp_train, metric='hamming') self.step_size = self.alpha self.step_size_break = False
def ds_clustering(clusters,support_vectors, f_values, new_element): ''' clustering the new element Efficient Out-of-Sample extension of Dominant set clusters Massimiliano et. al., NIPS 2004 for all h in S: if sum(a(h,i)*x(h) > f(x) then i is assigned to S) ''' if clusters ==None or support_vectors==None or new_element == None: return None sum_axs = [] for i in np.arange(len(clusters)): S = clusters[i] S_old = S.copy() x = support_vectors[i] #print 'len S ', len(S), 'len x', len(x) from sklearn.metrics import euclidean_distances , pairwise_distances #euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False): new_arr = [new_element] dis = pairwise_distances(new_arr,S, metric='sqeuclidean') sigma2 = np.median(dis) a_hj = np.exp(-dis / sigma2) #print dis, a_hj sum_ax = 0. for h in np.arange(len(S_old)): sum_ax = sum_ax + a_hj[0][h]*x[h] #print 'i =',i,' sum_ax', sum_ax, 'f_values ', f_values[i] sum_axs.append(sum_ax) #print np.argmax(sum_axs), ' ', np.max(sum_axs) if np.max(sum_axs) >= 0.5*f_values[np.argmax(sum_axs)]: return np.argmax(sum_axs) return None
def display_single_tf_idf_cluster(cluster, df_map):
    '''df_map: DataFrame specifying the mapping between words and column indices'''

    wiki_subset = cluster['dataframe']
    tf_idf_subset = cluster['matrix']
    centroid = cluster['centroid']

    # Print top 5 words with largest TF-IDF weights in the cluster
    idx = centroid.argsort()[::-1]
    for i in range(5):
        print('{0:s}:{1:.3f}'.format(df_map[df_map['idx'] == idx[i]]['word'].values[0],
                                     centroid[idx[i]]))
    print('')

    # Compute distances from the centroid to all data points in the cluster.
    distances = pairwise_distances(tf_idf_subset, [centroid], metric='euclidean').flatten()
    # Compute nearest neighbors of the centroid within the cluster.
    nearest_neighbors = distances.argsort()
    # For the 8 nearest neighbors, print the title as well as the first 180
    # characters of text, wrapped at the 90-character mark.
    for i in range(8):
        text = ' '.join(wiki_subset.iloc[nearest_neighbors[i]]['text'].split(None, 25)[0:25])
        print('* {0:50s} {1:.5f}\n {2:s}\n {3:s}'.format(
            wiki_subset.iloc[nearest_neighbors[i]]['name'],
            distances[nearest_neighbors[i]],
            text[:90],
            text[90:180] if len(text) > 90 else ''))
        print(text)
        print()
def test_l1_precomputed(self):
    dist = pairwise_distances(self.pts, metric='l1')
    k_range = range(1, 5)
    incr_gen = incremental_neighbor_graph(dist, precomputed=True, k=k_range)
    for k, G in zip_longest(k_range, incr_gen):
        expected = ngraph(dist, precomputed=True, k=k)
        assert_array_almost_equal(G.matrix(dense=True), expected)
def test_spectral_amg_mode(): # Test the amg mode of SpectralClustering centers = np.array([ [0., 0., 0.], [10., 10., 10.], [20., 20., 20.], ]) X, true_labels = make_blobs(n_samples=100, centers=centers, cluster_std=1., random_state=42) D = pairwise_distances(X) # Distance matrix S = np.max(D) - D # Similarity matrix S = sparse.coo_matrix(S) try: from pyamg import smoothed_aggregation_solver amg_loaded = True except ImportError: amg_loaded = False if amg_loaded: labels = spectral_clustering(S, n_clusters=len(centers), random_state=0, mode="amg") # We don't care too much that it's good, just that it *worked*. # There does have to be some lower limit on the performance though. assert_greater(np.mean(labels == true_labels), .3) else: assert_raises(ValueError, spectral_embedding, S, n_components=len(centers), random_state=0, mode="amg")
def visualize_document_clusters(wiki, tf_idf, centroids, cluster_assignment, k, map_index_to_word, display_content=True): '''wiki: original dataframe tf_idf: data matrix, sparse matrix format map_index_to_word: SFrame specifying the mapping betweeen words and column indices display_content: if True, display 8 nearest neighbors of each centroid''' print('==========================================================') # Visualize each cluster c for c in xrange(k): # Cluster heading print('Cluster {0:d} '.format(c)), # Print top 5 words with largest TF-IDF weights in the cluster idx = centroids[c].argsort()[::-1] for i in xrange(5): # Print each word along with the TF-IDF weight print('{0:s}:{1:.3f}'.format(map_index_to_word['category'][idx[i]], centroids[c,idx[i]])), print('') if display_content: # Compute distances from the centroid to all data points in the cluster, # and compute nearest neighbors of the centroids within the cluster. distances = pairwise_distances(tf_idf, [centroids[c]], metric='euclidean').flatten() distances[cluster_assignment!=c] = float('inf') # remove non-members from consideration nearest_neighbors = distances.argsort() # For 8 nearest neighbors, print the title as well as first 180 characters of text. # Wrap the text at 80-character mark. for i in xrange(8): text = ' '.join(wiki[nearest_neighbors[i]]['text'].split(None, 25)[0:25]) print('\n* {0:50s} {1:.5f}\n {2:s}\n {3:s}'.format(wiki[nearest_neighbors[i]]['name'], distances[nearest_neighbors[i]], text[:90], text[90:180] if len(text) > 90 else '')) print('==========================================================')
def precompute_gaussian_kernels(XX, YY, verbose=False):
    """For each unit, precompute the Gaussian kernel between the trials of the
    two samples XX and YY. Estimate each sigma2 parameter as the median
    distance between the trials of each sample.
    """
    if verbose:
        print("Pre-computing the kernel matrix for each unit.")

    n_units = XX.shape[1]  # or YY.shape[1]
    Ks = []  # here we store all the kernel matrices
    sigma2s = np.zeros(n_units)  # here we store all the sigma2s, one per unit
    m = XX.shape[0]
    n = YY.shape[0]
    for i in range(n_units):
        if verbose:
            print("Unit %s" % i),
        X = XX[:, i, :].copy()
        Y = YY[:, i, :].copy()
        if verbose:
            print("Computing Gaussian kernel."),
        dm = pairwise_distances(np.vstack([X, Y]), metric='sqeuclidean')
        # Heuristic: sigma2 is the median value among all pairwise
        # distances between X and Y. Note: should we use just
        # dm[:m,m:] or all dm?
        sigma2 = np.median(dm[:m, m:])**2
        sigma2s[i] = sigma2
        if verbose:
            print("sigma2 = %s" % sigma2)
        K = np.exp(-dm / sigma2)
        Ks.append(K)

    return Ks, sigma2s
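# A self-contained sketch of the kernel construction above on synthetic trials
# for a single unit (the shapes are illustrative, not from the original data).
import numpy as np
from sklearn.metrics import pairwise_distances

rng = np.random.RandomState(0)
X_tr1 = rng.normal(size=(30, 10))   # trials of sample 1 for one unit
Y_tr2 = rng.normal(size=(40, 10))   # trials of sample 2 for the same unit
m = X_tr1.shape[0]

dm = pairwise_distances(np.vstack([X_tr1, Y_tr2]), metric='sqeuclidean')
sigma2 = np.median(dm[:m, m:]) ** 2        # same median heuristic as above
K = np.exp(-dm / sigma2)
print(K.shape, sigma2)                     # (70, 70) kernel over all trials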
def assign_clusters(data, centroids):
    # Compute distances between each data point and the set of centroids:
    distances_from_centroids = pairwise_distances(data, centroids, metric='euclidean')
    # Compute cluster assignments for each data point:
    cluster_assignment = np.apply_along_axis(np.argmin, axis=1, arr=distances_from_centroids)
    return cluster_assignment
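# Note: np.apply_along_axis over rows is equivalent to calling argmin with
# axis=1 on the distance matrix directly (and the latter is vectorised).
# A quick check on random data (illustrative sizes):
import numpy as np
from sklearn.metrics import pairwise_distances

rng = np.random.RandomState(0)
pts = rng.random_sample((100, 3))
ctrs = rng.random_sample((4, 3))

d = pairwise_distances(pts, ctrs, metric='euclidean')
assert np.array_equal(np.apply_along_axis(np.argmin, axis=1, arr=d),
                      d.argmin(axis=1))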
def sim_calc(self): nt = self.corpora[0] self.scores = {} for corp in self.corpora: i_nt = [] i_c2 = [] rows = self.ekk_rows[corp[0]] for i, word in enumerate(self.ekk_rows['NT']): if word in rows: i_nt.append(i) i_c2.append(self.ekk_rows[corp[0]].index(word)) d_c2 = np.memmap( '{0}{1}/{4}/{2}/{5}_{2}_lems=False_{4}_min_occ={3}_{6}no_stops=False_NORMED.dat'.format( self.base, corp[0], corp[1], corp[2], self.english, self.prefix, self.svd), dtype='float32', shape=(len(rows), len(rows)))[i_c2] d_c2 = d_c2[:, i_c2] d_nt = np.memmap( '{0}{1}/{4}/{2}/{5}_{2}_lems=False_{4}_min_occ={3}_{6}no_stops=False_NORMED.dat'.format( self.base, nt[0], nt[1], nt[2], self.english, self.prefix, self.svd), dtype='float32', shape=(len(self.ekk_rows['NT']), len(self.ekk_rows['NT'])))[ i_nt] d_nt = d_nt[:, i_nt] self.scores['{0}_{1}'.format('NT', corp[0])] = np.average(np.diag( 1 - pairwise_distances(d_nt, d_c2, metric='cosine', n_jobs=12)))
def logpdf_diagonal_gaussian(x, mean, cov):
    """
    Compute the log-pdf of a multivariate Gaussian distribution with diagonal
    covariance at a given point x. A multivariate Gaussian distribution with a
    diagonal covariance is equivalent to a collection of independent Gaussian
    random variables.

    The log-pdf is computed for each row of x. mean and cov should be given as
    1D numpy arrays.
    :param x: a sparse (or dense) matrix of points, one per row
    :param mean: means of the variables
    :param cov: variances of the variables
    :return: log-pdf of the multivariate Gaussian for each row of x
    """
    n = x.shape[0]
    dim = x.shape[1]
    assert(dim == len(mean) and dim == len(cov))

    # multiply each i-th column of x by 1/(sqrt(2)*sigma_i), where sigma_i is
    # the square root of the variance of the i-th variable
    scaled_x = x.dot(np.diag(1. / np.sqrt(2 * cov)))
    # multiply each i-th entry of mean by 1/(sqrt(2)*sigma_i)
    scaled_mean = mean / np.sqrt(2 * cov)

    # the squared Euclidean distance between the scaled vectors gives
    # SUM[(x_i - mean_i)^2 / (2*sigma_i^2)]
    dist_sqr = pairwise_distances(scaled_x, [scaled_mean], 'euclidean').flatten() ** 2
    return -np.sum(np.log(np.sqrt(2 * np.pi * cov))) - dist_sqr
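# A quick self-check of the diagonal-Gaussian log-pdf against a sum of
# univariate normal log-densities (requires SciPy; dense input also works
# because .dot() is available on ndarrays). Toy values are hypothetical.
import numpy as np
from scipy.stats import norm
from sklearn.metrics import pairwise_distances

rng = np.random.RandomState(0)
x_dense = rng.normal(size=(5, 3))
mu = np.array([0.5, -1.0, 2.0])
var = np.array([0.5, 2.0, 1.5])            # per-dimension variances

lp = logpdf_diagonal_gaussian(x_dense, mu, var)
lp_ref = norm.logpdf(x_dense, loc=mu, scale=np.sqrt(var)).sum(axis=1)
assert np.allclose(lp, lp_ref)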
def main(): X, theta = swiss_roll(8, 500, return_theta=True) D = pairwise_distances(X) graph_info = [ _c('5-NN', neighbor_graph, D, k=6, precomputed=True), _c('b-matching', b_matching, D, 6), _c('gabriel', gabriel_graph, X), _c('rel. neighborhood', relative_neighborhood_graph,D,metric='precomputed'), _c('manifold spanning', manifold_spanning_graph, X, 2), _c('L1', sparse_regularized_graph, X, kmax=10, sparsity_param=0.0005), _c('SMCE', _smce_symm_dist, X, kmax=25, sparsity_param=5), _c('SAFFRON', saffron, X, q=15, k=5, tangent_dim=2), _c('MST', mst, D, metric='precomputed'), _c('dMST', disjoint_mst, D, metric='precomputed'), ] print('Plotting graphs & embeddings') fig1, axes1 = plt.subplots(nrows=3, ncols=3, subplot_kw=dict(projection='3d')) fig2, axes2 = plt.subplots(nrows=3, ncols=3) fig1.suptitle('Original Coordinates') fig2.suptitle('Isomap Embeddings') for ax1, ax2, info in zip(axes1.flat, axes2.flat, graph_info): label, G, gg, emb, mask = info G.plot(X, ax=ax1, title=label, vertex_style=dict(c=theta)) gg.plot(emb, ax=ax2, title=label, vertex_style=dict(c=theta[mask])) ax1.view_init(elev=5, azim=70) ax1.set_axis_off() ax2.set_axis_off() plt.show()
def discriminate_activation_distance(self): testset = A([ self.get_input_item(t) for t in self.data.discrimination_stimuli ]) f = self.data.nT activation = np.array([np.linalg.norm(self.map[:,:,f:] - t[f:], ord = 2, axis = 2).reshape(1,-1)[0] for t in testset]) distances = pairwise_distances(activation, metric = 'euclidean') dn, tn = self.data.dirname, self.data.discrimination_data tz = self.predict_terms(self.data.discrimination_stimuli).argmax(1) terms = self.data.terms[tz] d_fn = '%s/AD_discrimination_terms_%s.csv' % (dn, tn) dc_fn = '%s/AD_discrimination_confusability_%s.csv' % (dn, tn) with open(d_fn, 'a') as o: if os.path.getsize(d_fn) == 0: o.write('simulation,time,stimulus,term\n') for i,t in enumerate(terms): o.write('%d,%d,%d,%s\n' % (self.simulation, self.time, i, t)) with open(dc_fn, 'a') as o: if os.path.getsize(dc_fn) == 0: o.write('simulation,time,stimulus.1,stimulus.2,') o.write('term.1,term.2,distance\n') for i in range(distances.shape[0]): for j in range(i+1, distances.shape[0]): o.write('%d,%d,%d,%d,%s,%s,%.3f\n' % (self.simulation, self.time, i, j, terms[i], terms[j], distances[i,j])) return
def rankInCluster(self,labels,centers_features,K,X,tweets=None): clusters = dict((clusId,{'all':[],'best':"",'first':"",'words':"","n":0,'sentiment':0}) for clusId in range(K)) if not tweets: tweets = self.tweets # In each cluster, do the following : # 1) sort tweets by created time in descending order # 2) get the first tweet (in terms of time) # 3) find the tweet that is closet to the cluster centroid (best tweet) for i,label in enumerate(labels): clusters[label]['all'].append(tweets[i]) clusters[label]['n'] += 1 for label in labels: clusters[label]['all'] = sorted(clusters[label]['all'], key=lambda x:x.time, reverse=True) clusters[label]['first'] = clusters[label]['all'][-1].printTweet() # Find the best tweet and avg sentiment in each cluster for clusId in xrange(K): print "{} tweets in cluster {}".format(len(clusters[clusId]['all']), clusId) tweetIdxInClus = np.where(labels == clusId) clusters[clusId]['sentiment'] = np.mean(X[tweetIdxInClus,-1]) if not clusters[clusId]["n"]: break #print tweetIdxInClus centerCoord = centers_features[clusId].reshape(1,-1) distToCtr = pairwise_distances(X[tweetIdxInClus], centerCoord) # dimension: (n_tweets, 1) # Calculate tweet popularity/quality feature popularity = [] for i,t in enumerate(tweets): if i in tweetIdxInClus[0]: popularity.append([t.retweetCnt,t.favCnt,t.isRetweet,t.followers]) popularity = np.array(popularity) # n_tweet X 5 coef = np.array([.5,.5,-.8,.2]) # hard-coded coefficient #print "popularity:{}".format(popularity.dot(coef).shape) norm_popularity = normalize(popularity).dot(coef).reshape(-1,1) #print norm_popularity #print norm_popularity.shape feat = np.add(distToCtr, norm_popularity) bestTweetId = np.argmax(feat) clusters[clusId]['best'] = tweets[tweetIdxInClus[0][bestTweetId]].printTweet() # Get the top words in each cluster sorted_centers_features = centers_features.argsort()[:, ::-1] for ctr in xrange(K): top3words = [] found = 0 for field in sorted_centers_features[ctr]: # Get the top 3 common words try: top3words.append(self.tfidfDict[field].encode('utf-8', 'ignore')) if found == 2: break found +=1 except IndexError: continue clusters[ctr]['words'] = "/".join(top3words) return clusters
def binned(): #reader, pointer, dialect = csvreader("german.embeddings",verbose=True) #dialect.quoting = csv.QUOTE_NONE global featureMatrix global wordVector wordVector = [] features = [] for record in reader("german.embeddings", verbose=True, delimeter="\t", nrows=2000): word, featurelst = record[0], [float(f) for f in record[1:]] if VERBOSE: print >> sys.stderr, word features.append(featurelst) wordVector.append(word) featureMatrix = np.array(features) #featureMatrix = StandardScaler().fit_transform(featureMatrix) #print featureMatrix #print featureMatrix.shape euclideandist = pairwise_distances(featureMatrix) #print max(euclideandist.flatten()) #print euclideandist if VERBOSE: print >> sys.stderr, wordVector[1:10] print >> sys.stderr, featureMatrix print >> sys.stderr, featureMatrix.shape from multiprocessing import Pool #p = Pool(3) #p.map(kmeans, range(8, 100, 2), 2) def asymmetric_n(min, max): next = min while next < max: yield next next += int(math.log(next, 2)) - 1 #for n in range(16,23): # docluster(n,algorithm='kmeans') from itertools import product results = [] for eps, MinPts in product([5.5], [8]): sil, n = docluster(algorithm='AffinityPropagation', showClusters=True, eps_in=eps, MinPts_in=MinPts) results.append((sil, n, eps, MinPts)) print sorted(results, key=lambda x: x[0], reverse=True)
def discriminate(self): # calculates the between-cell map distance for the BMUs of an array of # stimuli used in Beekhuizen & Stevenson 2016. # only uses sinij/goluboj or blue t_set = { '111' : ['BU'], '112' : ['sinij', 'goluboj'] } lg = self.parameters['target language'] t_ix = [i for i,t in enumerate(self.data.terms) if t in t_set[lg]] testset = A([ self.get_input_item(t) for t in self.data.discrimination_stimuli ]) positions = A([ A(self.get_bmu_ix(t)) for t in testset ]) distances = pairwise_distances(positions, metric = 'euclidean') dn, tn = self.data.dirname, self.data.discrimination_data pt = self.predict_terms(self.data.discrimination_stimuli) terms = [t_set[lg][t] for t in pt[:,t_ix].argmax(1)] #print(terms) d_fn = '%s/discrimination_terms_%s_bo.csv' % (dn, tn) dc_fn = '%s/discrimination_confusability_%s_bo.csv' % (dn, tn) with open(d_fn, 'a') as o: if os.path.getsize(d_fn) == 0: o.write('simulation,time,stimulus,term\n') for i,t in enumerate(terms): o.write('%d,%d,%d,%s\n' % (self.simulation, self.time, i, t)) with open(dc_fn, 'a') as o: if os.path.getsize(dc_fn) == 0: o.write('simulation,time,stimulus.1,stimulus.2,') o.write('term.1,term.2,distance\n') for i in range(distances.shape[0]): for j in range(i+1, distances.shape[0]): o.write('%d,%d,%d,%d,%s,%s,%.3f\n' % (self.simulation, self.time, i, j, terms[i], terms[j], distances[i,j])) return
def train(self, reactions, predictor_headers, response_headers, filename): print "Preparing arrays" data, labels = self._prepareArrays(reactions, predictor_headers, response_headers) old_settings = np.seterr(divide='raise') # we don't want division by zero to pass # This is how metric learn determines bounds internally # but the lower bound can be zero this way (especially for low-dimensional data) # which causes divide by zero errors print "Calculating bounds" pair_dists = pairwise_distances(data) bounds = np.percentile(pair_dists, (5, 95)) # the extra check ensures against divide-by-zero errors later if bounds[0] == 0: bounds[0] = min(pair_dists[np.nonzero(pair_dists)]) print "Lowerbound was 0. Set to {}".format(bounds[0]) print "Preparing {} constraints with bounds of ({}, {})".format(self.num_constraints, bounds[0], bounds[1]) constraints = self.metric_object.prepare_constraints(labels, data.shape[0], self.num_constraints) print "Fitting" self.metric_object.fit(data, constraints, bounds=bounds) self.save(filename) np.seterr(**old_settings) print "Transforming training set" return self.metric_object.transform()
def __init__(self, X, constraints, bounds=None, A0=None):
    """
    X: (n x d) data matrix - each row corresponds to a single instance
    A0: [optional] (d x d) initial regularization matrix, defaults to identity
    constraints: tuple of arrays: (a,b,c,d) indices into X, such that:
        d(X[a],X[b]) < d(X[c],X[d])
    bounds: (pos,neg) pair of bounds on similarity, such that:
        d(X[a],X[b]) < pos and d(X[c],X[d]) > neg
    """
    self.X = X

    # check to make sure that no two constrained vectors are identical
    a, b, c, d = constraints
    ident = _vector_norm(self.X[a] - self.X[b]) > 1e-9
    a, b = a[ident], b[ident]
    ident = _vector_norm(self.X[c] - self.X[d]) > 1e-9
    c, d = c[ident], d[ident]
    self.C = a, b, c, d

    # init bounds
    if bounds is None:
        self.bounds = np.percentile(pairwise_distances(X), (5, 95))
    else:
        assert len(bounds) == 2
        self.bounds = bounds

    # initialize metric
    if A0 is None:
        self.A = np.identity(self.X.shape[1])
    else:
        self.A = A0
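# Both the trainer above and this constructor derive similarity bounds from
# the 5th/95th percentiles of all pairwise distances. A small sketch of that
# computation, including the zero-lower-bound case the trainer guards against
# (duplicate rows push the 5th percentile to zero); the data is illustrative.
import numpy as np
from sklearn.metrics import pairwise_distances

rng = np.random.RandomState(0)
pts = rng.random_sample((20, 4))
pts = np.vstack([pts, pts[:3]])            # a few exact duplicates

pair_dists = pairwise_distances(pts)
bounds = np.percentile(pair_dists, (5, 95))
if bounds[0] == 0:                         # same guard as in train() above
    bounds[0] = np.min(pair_dists[np.nonzero(pair_dists)])
print(bounds)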
def compute_neighbors( self, n_neighbors: int = 30, knn: bool = True, n_pcs: Optional[int] = None, use_rep: Optional[str] = None, method: _Method = 'umap', random_state: AnyRandom = 0, write_knn_indices: bool = False, metric: _Metric = 'euclidean', metric_kwds: Mapping[str, Any] = MappingProxyType({}), ) -> None: """\ Compute distances and connectivities of neighbors. Parameters ---------- n_neighbors Use this number of nearest neighbors. knn Restrict result to `n_neighbors` nearest neighbors. {n_pcs} {use_rep} Returns ------- Writes sparse graph attributes `.distances` and `.connectivities`. Also writes `.knn_indices` and `.knn_distances` if `write_knn_indices==True`. """ from sklearn.metrics import pairwise_distances start_neighbors = logg.debug('computing neighbors') if n_neighbors > self._adata.shape[0]: # very small datasets n_neighbors = 1 + int(0.5 * self._adata.shape[0]) logg.warning( f'n_obs too small: adjusting to `n_neighbors = {n_neighbors}`') if method == 'umap' and not knn: raise ValueError('`method = \'umap\' only with `knn = True`.') if method == 'rapids' and metric != 'euclidean': raise ValueError( "`method` 'rapids' only supports the 'euclidean' `metric`.") if method not in {'umap', 'gauss', 'rapids'}: raise ValueError( "`method` needs to be 'umap', 'gauss', or 'rapids'.") if self._adata.shape[0] >= 10000 and not knn: logg.warning( 'Using high n_obs without `knn=True` takes a lot of memory...') # do not use the cached rp_forest self._rp_forest = None self.n_neighbors = n_neighbors self.knn = knn X = _choose_representation(self._adata, use_rep=use_rep, n_pcs=n_pcs) # neighbor search use_dense_distances = (metric == 'euclidean' and X.shape[0] < 8192) or knn == False if use_dense_distances: _distances = pairwise_distances(X, metric=metric, **metric_kwds) knn_indices, knn_distances = _get_indices_distances_from_dense_matrix( _distances, n_neighbors) if knn: self._distances = _get_sparse_matrix_from_indices_distances_numpy( knn_indices, knn_distances, X.shape[0], n_neighbors) else: self._distances = _distances elif method == 'rapids': knn_indices, knn_distances = compute_neighbors_rapids( X, n_neighbors) else: # non-euclidean case and approx nearest neighbors if X.shape[0] < 4096: X = pairwise_distances(X, metric=metric, **metric_kwds) metric = 'precomputed' knn_indices, knn_distances, forest = compute_neighbors_umap( X, n_neighbors, random_state, metric=metric, metric_kwds=metric_kwds) # very cautious here try: if forest: self._rp_forest = _make_forest_dict(forest) except: pass # write indices as attributes if write_knn_indices: self.knn_indices = knn_indices self.knn_distances = knn_distances start_connect = logg.debug('computed neighbors', time=start_neighbors) if not use_dense_distances or method in {'umap', 'rapids'}: # we need self._distances also for method == 'gauss' if we didn't # use dense distances self._distances, self._connectivities = _compute_connectivities_umap( knn_indices, knn_distances, self._adata.shape[0], self.n_neighbors, ) # overwrite the umap connectivities if method is 'gauss' # self._distances is unaffected by this if method == 'gauss': self._compute_connectivities_diffmap() logg.debug('computed connectivities', time=start_connect) self._number_connected_components = 1 if issparse(self._connectivities): from scipy.sparse.csgraph import connected_components self._connected_components = connected_components( self._connectivities) self._number_connected_components = self._connected_components[0]
x = x.reshape((numSamples, -1)).astype(np.float) / 255.0 x_r = x_r.reshape((numSamples, -1)) x_nr = x_nr.reshape((numSamples, -1)) locations = [np.where(y == i)[0] for i in range(10)] result = [] for i in range(10): sampleMat = x[locations[i], ...] sampleMat_r = x_r[locations[i], ...] sampleMat_nr = x_nr[locations[i], ...] toDelete = [j * j for j in range(sampleMat.shape[0])] dist = np.delete(pairwise_distances(sampleMat).flatten(), toDelete) mean_ = np.mean(dist) std_ = np.std(dist) norm_ = norm(mean_, std_) dist_r = np.delete(pairwise_distances(sampleMat_r).flatten(), toDelete) mean_r = np.mean(dist_r) std_r = np.std(dist_r) norm_r = norm(mean_r, std_r) dist_nr = np.delete(pairwise_distances(sampleMat_nr).flatten(), toDelete) mean_nr = np.mean(dist_nr) std_nr = np.std(dist_nr) norm_nr = norm(mean_nr, std_nr) # show histogram
              str(digInd))

# In[14]:
subset0 = X[y[:] == 0]

# In[15]:
subset1 = X[y[:] == 1]

# #### 1E -- Genuine vs. Imposter Calculations

# In[16]:
# create symmetric matrices representing distances between all genuine zeros,
# where the diagonal is 0 (same value)
gen0 = pairwise_distances(subset0, subset0)
gen1 = pairwise_distances(subset1, subset1)
# create matrix representing the distance between all 1s and all 0s
imp = pairwise_distances(subset0, subset1)

# In[17]:
gen0DiffVals = []
# create an array of the lower triangle of the genuine 0 matrix
for i in range(len(gen0)):
    for j in range(len(gen0)):
        if j >= i:
            continue
        else:
            gen0DiffVals.append(gen0[i][j])
def compute_fitness(self, X):
    if len(X.shape) == 1:
        X = X.reshape(1, -1)
    distances = pairwise_distances(X, self.L, metric=self.distance)
    membership = np.argmin(distances, axis=1)
    return -distances[range(distances.shape[0]), membership].sum()
def get_recommendations_system_collaborative(): # Key to works. try: wine_key = request.args.get('wine_key') except KeyError: return "Wine Key cannot be null" # Get database connection conn = get_database_connection() # Prepare data frames. df = load_database_with_wines_and_ratings_features(conn) wines = load_database_with_wines_features(conn) ratings = load_database_with_ratings_features(conn) # Close connection. close_database_connection(conn) # Average wine ratings wines_df_stats = df.groupby('wine_name').agg( {'rating': [np.size, np.mean]}) # Filters the statistical dataset with wines that have more than x analyzes. min_10 = wines_df_stats['rating']['size'] >= 3 wines_df_stats[min_10].sort_values([('rating', 'mean')], ascending=False) # Pivot Table matrix_df = ratings.pivot_table(index=['wine_id'], columns=['user_id'], values=['rating']).reset_index(drop=True) matrix_df.fillna(0, inplace=True) # Calculate Cosine Similarity. wines_similarity = 1 - pairwise_distances(matrix_df.to_numpy(), metric='cosine') np.fill_diagonal(wines_similarity, 0) # Set Similarities to Matrix data frame. matrix_df = pd.DataFrame(wines_similarity) # Recommendation System try: wine = wines[wines['wine_name'] == wine_key].index.tolist() wine = wine[0] wines['similarity'] = matrix_df.iloc[wine] wines.drop_duplicates('wine_id') wines.drop(columns=[ 'alcohol_content', 'country', 'grape', 'harmonization', 'harvest', 'producer', 'region', 'service', 'type', 'volume' ], axis=1, inplace=True) result = pd.DataFrame( wines.sort_values(['similarity'], ascending=False)) return jsonify(json.loads(result[0:int(10)].to_json(orient='records'))) except: return "Wine not found."
from sklearn.metrics import pairwise_distances
from sklearn.datasets import load_digits
import matplotlib.pyplot as plt

# --- READING DATA ---
digits = load_digits().data

# --- BUILDING SIMILARITY MATRIX ---
SMatrix = pairwise_distances(digits, metric='sqeuclidean')
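# Note: the matrix above holds squared Euclidean *distances*. If a similarity
# matrix is what the downstream step needs (e.g. AffinityPropagation with
# affinity='precomputed' -- an assumption, not shown in the snippet), the usual
# convention is the negated squared distance:
import numpy as np
from sklearn.cluster import AffinityPropagation
from sklearn.datasets import load_digits
from sklearn.metrics import pairwise_distances

digits_data = load_digits().data
S = -pairwise_distances(digits_data, metric='sqeuclidean')   # larger = more similar

ap = AffinityPropagation(affinity='precomputed')
ap_labels = ap.fit_predict(S)
print(len(np.unique(ap_labels)), 'clusters')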
np.random.seed(100)

# set the number of observed data points
n = 200

# draw random points from a log-normal and a Gaussian distribution
Y = np.random.lognormal(mean=2, sigma=0.3, size=n)
X = np.random.normal(loc=np.exp(2.0 + 0.3**2 / 2.0), scale=0.3 * np.exp(2.0), size=n)
XX = X[:, np.newaxis]
YY = Y[:, np.newaxis]

# compute MMD/K-S two-sample tests and their null distributions
sigma2 = np.median(pairwise_distances(XX, YY, metric='euclidean'))**2 * 2.0
mmd2u, mmd2u_null, p_value = two_sample_test(XX, YY, model='MMD',
                                             kernel_function='rbf',
                                             gamma=1.0 / sigma2,
                                             iterations=5000,
                                             verbose=True, n_jobs=1)
ks, ks_null, ks_p_value = two_sample_test(XX, YY, model='KS',
                                          iterations=5000,
                                          verbose=True, n_jobs=1)
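# The gamma handed to the RBF kernel above is 1/sigma2 with
# sigma2 = 2 * median(distance)**2, i.e. exp(-||x-y||^2 / (2*median^2)):
# a Gaussian kernel whose bandwidth equals the median pairwise distance.
# A short, self-contained check of that relationship (two_sample_test itself
# is external to this sketch):
import numpy as np
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import rbf_kernel

rng = np.random.RandomState(0)
A = rng.normal(size=(10, 1))
B = rng.lognormal(size=(12, 1))

med = np.median(pairwise_distances(A, B, metric='euclidean'))
gamma = 1.0 / (2.0 * med ** 2)
K = rbf_kernel(np.vstack([A, B]), gamma=gamma)   # exp(-gamma * squared distance)
print(K.shape, gamma)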
def _test(self, args, epoch, disc, gen, test_loader, test_output_dir): mse_criterion = nn.MSELoss() cls_criterion = nn.CrossEntropyLoss() _loss_g, _loss_cls_gen, _loss_adv_gen = 0., 0., 0. _loss_d, _loss_cls, _loss_adv = 0., 0., 0. _loss_recon, _loss_mse = 0., 0. _loss = 0. ypred, ypred_gen = [], [] ytrue, ytrue_gen = [], [] cls_count = [0] * 10 class_featmaps = np.zeros((10, 1000, 256 * 14 * 14)) class_featmaps_gen = np.zeros((10, 1000, 256 * 14 * 14)) class_idx = [0] * 10 for i, (inputs, featmaps, targets, indexes) in enumerate(test_loader): inputs, featmaps, targets = inputs.to(args.device), featmaps.to( args.device), targets.to(args.device) feats, gen_targets = self._sample_vecs_index(inputs.shape[0]) feats, gen_targets = feats.to(args.device), gen_targets.to( args.device) gen_image = gen(feats.unsqueeze(2).unsqueeze(3).detach()) for j, target in enumerate( targets.detach().cpu().numpy().astype(int)): class_featmaps[target, class_idx[target]] = featmaps[j].view( 256 * 14 * 14).detach().cpu().numpy() class_featmaps_gen[target, class_idx[target]] = gen_image[j].view( 256 * 14 * 14).detach().cpu().numpy() class_idx[target] += 1 print(class_idx) plt.figure(figsize=(12, 12)) plt.imshow( pairwise_distances(np.vstack(class_featmaps[0:10]), metric='l2')) plt.colorbar() plt.savefig("self.png") plt.close() print(class_idx) plt.figure(figsize=(12, 12)) plt.imshow( pairwise_distances(np.vstack(class_featmaps_gen[0:10]), metric='l2')) plt.colorbar() plt.savefig("self_gen.png") plt.close() plt.figure(figsize=(12, 12)) plt.imshow( pairwise_distances(np.vstack(class_featmaps[0:10]), np.vstack(class_featmaps_gen[0:10]), metric='l2')) plt.colorbar() plt.savefig("pair.png") plt.close() plt.figure(figsize=(12, 12)) plt.imshow( pairwise_distances(np.vstack(class_featmaps[0:10]), metric='cosine')) plt.colorbar() plt.savefig("self_cosine.png") plt.close() plt.figure(figsize=(12, 12)) plt.imshow( pairwise_distances(np.vstack(class_featmaps_gen[0:10]), metric='cosine')) plt.colorbar() plt.savefig("self_gen_cosine.png") plt.close() plt.figure(figsize=(12, 12)) plt.imshow( pairwise_distances(np.vstack(class_featmaps[0:10]), np.vstack(class_featmaps_gen[0:10]), metric='cosine')) plt.colorbar() plt.savefig("pair_cosine.png") plt.close() for i, (images, featmaps, targets, indexes) in enumerate(test_loader): loss = 0 images, featmaps, targets = images.to(args.device), featmaps.to( args.device), targets.to(args.device) if args.data == "image": inputs = (images * 2) - 1 else: inputs = featmaps feats, logits_cls, logits_adv = disc(inputs) loss_cls = cls_criterion(logits_cls, targets.long()) loss = loss_cls _loss_cls += loss_cls.item() preds = F.softmax(logits_cls, dim=1).argmax(dim=1).cpu().numpy().tolist() ypred.extend(preds) ytrue.extend(targets) feats, gen_targets = self._sample_vecs_index(inputs.shape[0]) feats, gen_targets = feats.to(args.device), gen_targets.to( args.device) gen_image = gen(feats.unsqueeze(2).unsqueeze(3).detach()) feats_gen, logits_cls_gen, logits_adv_gen = disc(gen_image) loss_cls_gen = cls_criterion(logits_cls_gen, gen_targets.long()) loss += args.cls_w * loss_cls_gen _loss_cls_gen += loss_cls_gen.item() if args.adv: loss_adv = (adversarial_loss(logits_adv, is_real=True, is_disc=True, type_=args.adv_type) + adversarial_loss(logits_adv_gen, is_real=False, is_disc=True, type_=args.adv_type)) _loss_adv += loss_adv.item() loss += args.adv_w * loss_adv.clone() / 2. 
loss_adv_gen = adversarial_loss(logits_adv_gen, is_real=True, is_disc=False, type_=args.adv_type) _loss_adv_gen += loss_adv_gen.item() loss += args.adv_w * loss_adv_gen.clone() if args.recon: loss_recon = (1 - nn.CosineSimilarity(dim=1, eps=1e-6)( feats_gen, feats).mean()) loss += args.adv_r * loss_recon.clone() _loss_recon += loss_recon.item() if args.mse: loss_mse = nn.MSELoss()(gen_image, inputs) loss += args.mse_w * loss_mse.clone() _loss_mse += args.mse_w * loss_mse.item() preds_gen = F.softmax(logits_cls_gen, dim=1).argmax(dim=1).cpu().numpy().tolist() ypred_gen.extend(preds_gen) ytrue_gen.extend(gen_targets) _loss += loss.item() if i % 10 == 0: visualize(inputs[0], gen_image[0], out_dir=test_output_dir + str(epoch) + "_" + str(i) + ".jpg", featmap=(args.data == "featmap")) if sum(cls_count) < 50: cls_count = visualize_classes(inputs, gen_image, gen_targets, cls_count, test_output_dir, args.data == "featmap") acc = round((np.array(ypred) == np.array(ytrue)).sum() / len(ytrue), 4) acc_gen = round((np.array(ypred_gen) == np.array(ytrue_gen)).sum() / len(ytrue_gen), 4) print("Test Set Epoch {}, Training Iteration {}".format(epoch, i)) print("Accuracy: {}, Accuracy gen: {}".format(acc, acc_gen)) print("Loss: {}, Loss_cls: {}, Loss_cls_gen: {}".format( _loss / (i + 1), _loss_cls / (i + 1), _loss_cls_gen / (i + 1))) if args.adv: print("Loss_adv: {}, Loss_adv_gen: {}".format( _loss_adv / (i + 1), _loss_adv_gen / (i + 1))) if args.mse: print("Loss_mse: {}".format(_loss_mse / (i + 1))) return return_statement(i, acc, acc_gen, _loss_cls, _loss_cls_gen, _loss_adv, _loss_adv_gen, _loss_recon, _loss_mse)
def multi_component_layout( data, graph, n_components, component_labels, dim, random_state, metric="euclidean", metric_kwds={}, ): """Specialised layout algorithm for dealing with graphs with many connected components. This will first fid relative positions for the components by spectrally embedding their centroids, then spectrally embed each individual connected component positioning them according to the centroid embeddings. This provides a decent embedding of each component while placing the components in good relative positions to one another. Parameters ---------- data: array of shape (n_samples, n_features) The source data -- required so we can generate centroids for each connected component of the graph. graph: sparse matrix The adjacency matrix of the graph to be emebdded. n_components: int The number of distinct components to be layed out. component_labels: array of shape (n_samples) For each vertex in the graph the label of the component to which the vertex belongs. dim: int The chosen embedding dimension. metric: string or callable (optional, default 'euclidean') The metric used to measure distances among the source data points. metric_kwds: dict (optional, default {}) Keyword arguments to be passed to the metric function. Returns ------- embedding: array of shape (n_samples, dim) The initial embedding of ``graph``. """ result = np.empty((graph.shape[0], dim), dtype=np.float32) if n_components > 2 * dim: meta_embedding = component_layout( data, n_components, component_labels, dim, random_state, metric=metric, metric_kwds=metric_kwds, ) else: k = int(np.ceil(n_components / 2.0)) base = np.hstack([np.eye(k), np.zeros((k, dim - k))]) meta_embedding = np.vstack([base, -base])[:n_components] for label in range(n_components): component_graph = graph.tocsr()[component_labels == label, :].tocsc() component_graph = component_graph[:, component_labels == label].tocoo() distances = pairwise_distances([meta_embedding[label]], meta_embedding) data_range = distances[distances > 0.0].min() / 2.0 if component_graph.shape[0] < 2 * dim: result[component_labels == label] = (random_state.uniform( low=-data_range, high=data_range, size=(component_graph.shape[0], dim), ) + meta_embedding[label]) continue diag_data = np.asarray(component_graph.sum(axis=0)) # standard Laplacian # D = scipy.sparse.spdiags(diag_data, 0, graph.shape[0], graph.shape[0]) # L = D - graph # Normalized Laplacian I = scipy.sparse.identity(component_graph.shape[0], dtype=np.float64) D = scipy.sparse.spdiags( 1.0 / np.sqrt(diag_data), 0, component_graph.shape[0], component_graph.shape[0], ) L = I - D * component_graph * D k = dim + 1 num_lanczos_vectors = max(2 * k + 1, int(np.sqrt(component_graph.shape[0]))) try: eigenvalues, eigenvectors = scipy.sparse.linalg.eigsh( L, k, which="SM", ncv=num_lanczos_vectors, tol=1e-4, v0=np.ones(L.shape[0]), maxiter=graph.shape[0] * 5, ) order = np.argsort(eigenvalues)[1:k] component_embedding = eigenvectors[:, order] expansion = data_range / np.max(np.abs(component_embedding)) component_embedding *= expansion result[component_labels == label] = (component_embedding + meta_embedding[label]) except scipy.sparse.linalg.ArpackError: warn( "WARNING: spectral initialisation failed! The eigenvector solver\n" "failed. This is likely due to too small an eigengap. 
Consider\n" "adding some noise or jitter to your data.\n\n" "Falling back to random initialisation!") result[component_labels == label] = (random_state.uniform( low=-data_range, high=data_range, size=(component_graph.shape[0], dim), ) + meta_embedding[label]) return result
from io import BytesIO
import sframe as sf

multivariate_normal.pdf(1, mean=0, cov=1)
print multivariate_normal.pdf([10, 5], mean=[3, 4], cov=3)
print norm.pdf(3)

data_pts = pd.DataFrame({'X': [10, 2, 3], 'Y': [5, 1, 7]})
data_pts = np.array(data_pts)
clusters = np.array([[3, 4], [6, 3], [4, 6]])
dist = pairwise_distances(data_pts, clusters, metric='euclidean')
np.argmin(dist, axis=1)

l = list()
for i in clusters:
    l.append(multivariate_normal.pdf(data_pts, mean=i, cov=[[3, 0], [0, 3]]))
l = np.array(l)          # shape (3 clusters, 3 data points)

c_wts = [1. / 3, 1. / 3, 1. / 3]
l = l * c_wts
l = l.T
res = normalize(l, norm='l1', axis=1)
c_wts_1 = np.sum(res, axis=0) / np.sum(res)
def _leastsq_patch(ayxyx, pa_thresholds, angles, metric, dist_threshold, solver, tol): """ Helper function for _leastsq_ann. Parameters ---------- axyxy : tuple This tuple contains all per-segment data. pa_thresholds : list of list This is a per-annulus list of thresholds. angles, metric, dist_threshold, solver, tol These parameters are the same for each annulus or segment. """ iann, yy, xx, yy_opt, xx_opt = ayxyx pa_threshold = pa_thresholds[iann] values = ARRAY[:, yy, xx] # n_frames x n_pxs_segment values_opt = ARRAY[:, yy_opt, xx_opt] n_frames = ARRAY.shape[0] if dist_threshold < 100: mat_dists_ann_full = pairwise_distances(values, metric=metric) else: mat_dists_ann_full = np.ones((values.shape[0], values.shape[0])) if pa_threshold > 0: mat_dists_ann = np.zeros_like(mat_dists_ann_full) for i in range(n_frames): ind_fr_i = _find_indices_adi(angles, i, pa_threshold, None, False) mat_dists_ann[i][ind_fr_i] = mat_dists_ann_full[i][ind_fr_i] else: mat_dists_ann = mat_dists_ann_full matrix_res = np.zeros((values.shape[0], yy.shape[0])) for i in range(n_frames): vector = pn.DataFrame(mat_dists_ann[i]) if vector.sum().values > 0: ind_ref = np.where(~np.isnan(vector))[0] A = values_opt[ind_ref] b = values_opt[i] if solver == 'lstsq': coef = sp.linalg.lstsq(A.T, b, cond=tol)[0] # SVD method elif solver == 'nnls': coef = sp.optimize.nnls(A.T, b)[0] elif solver == 'lsq': # TODO coef = sp.optimize.lsq_linear(A.T, b, bounds=(0, 1), method='trf', lsq_solver='lsmr')['x'] else: raise ValueError("`solver` not recognized") else: msg = "No frames left in the reference set. Try increasing " msg += "`dist_threshold` or decreasing `delta_rot`." raise RuntimeError(msg) recon = np.dot(coef, values[ind_ref]) matrix_res[i] = values[i] - recon return matrix_res, yy, xx
def myfunction(): #data from task 1 randomData, stratifiedData, anotherX, targetForStrat, targetForRand, targetForOrg, attributeNames, latitude,longitude, stratLat, stratLong, numberInEachState, avArray = Task1.task1() #additional data stateFinancesData = ReadCSVFiles.readStates() energyData = ReadCSVFiles.statePopulation() medianHouseholdIncomeData = ReadCSVFiles.medianHouseholdIncome() unemploymentData = ReadCSVFiles.unemployment() averageIQData = ReadCSVFiles.averageIQ() educationData = ReadCSVFiles.educationLevel() bigData, avArray = ReadCSVFiles.getAllArray(stateFinancesData, energyData, medianHouseholdIncomeData, unemploymentData,averageIQData, educationData) # print(anotherX) # print(len(anotherX)) # for i in range(0,20): # print(anotherX[i]) superTempArray = [None]*997 #link state with other attributes for i in range(0,999): if(i<742): tempArray = [None]*22 for j in range(0,9): tempArray[j] = anotherX[i][j] #get big Data items associated with state # tempArray = [None]*12 tempNum = int(anotherX[i][9])-1 for j in range(1,14): tempArray[j+8] = bigData[j][tempNum] superTempArray[i] = tempArray elif((i>742)and(i<923)): tempArray = [None]*22 for j in range(0,9): tempArray[j] = anotherX[i][j] #get big Data items associated with state # tempArray = [None]*12 tempNum = int(anotherX[i][9])-1 for j in range(1,14): tempArray[j+8] = bigData[j][tempNum] superTempArray[i-1] = tempArray elif(i>923): tempArray = [None]*22 for j in range(0,9): tempArray[j] = anotherX[i][j] #get big Data items associated with state # tempArray = [None]*12 tempNum = int(anotherX[i][9])-1 for j in range(1,14): tempArray[j+8] = bigData[j][tempNum] superTempArray[i-2] = tempArray anotherX = np.array(superTempArray) orgData_std = StandardScaler().fit_transform(anotherX) # print(orgData_std) # for i in range(0,len(orgData_std)): # for j in range(0, len(orgData_std[i])): # if(math.isnan(orgData_std[i][j])==True): # print("TRUEE") # print(math.isnan(orgData_std)) #pca = decomposition.PCA(n_components=3) pcaOrg = decomposition.PCA() # I transform the data and get respective eigenvalues sklearn_pcaOrg = pcaOrg.fit_transform(orgData_std) orgEigVal = pcaOrg.explained_variance_ sumOfOrgEig =0 contSumOfOrgEig = [None]*22 #calculate sum of all eigenval for i in range(0,22): sumOfOrgEig = orgEigVal[i] + sumOfOrgEig contSumOfOrgEig[i] = sumOfOrgEig orgVarArray = [None]*22 #calculate variance array for i in range(0,22): orgVarArray[i] = orgEigVal[i]/sumOfOrgEig sumOrgVar =0 #get the sum of variances (total variance) for i in range(0,22): sumOrgVar = sumOrgVar + orgVarArray[i] tempOrgVarSum =0 orgIntrDimCount = 0 #get when 75% of the total variance occured for i in range(0,22): tempOrgVarSum = tempOrgVarSum + orgVarArray[i] if((tempOrgVarSum >(sumOrgVar*.75))and(orgIntrDimCount==0)): orgIntrDimCount = i #calculate loading factors pcaOrgNew = decomposition.PCA(n_components=orgIntrDimCount) sklearn_pcaOrg = pcaOrgNew.fit_transform(orgData_std) orgLoadFact = pcaOrgNew.components_.T * np.sqrt(pcaOrgNew.explained_variance_) orgSumOfSquaredLoad = [[0 for i in range(0,2)] for j in range(0,22)] #get attributes with highest PCA loading for i in range(0,22): for j in range(0, orgIntrDimCount): orgSumOfSquaredLoad[i] = orgSumOfSquaredLoad[i] + (orgLoadFact[i][j])**2 orgSumOfSquaredLoad[i][1]= i #I sort the arrays orgSumOfSquaredLoad.sort(key=lambda x: x[0]) #I get the highest 3 attributes orgThreeHighAttr = np.array(orgSumOfSquaredLoad[-3:]) orgThreeHighAttrData = [[0 for i in range(0,3)] for j in range(0,997)] #I get the data associated with 
the three highest attribtues for j in range(0,997): for i in range(0,3): orgThreeHighAttrData[j][i] = orgData_std[j][int(orgThreeHighAttr[i][1])] #names of the three highest attributes orgColumns = [None]*3 attributeNames = ['Change in Rank', 'Revenue', 'Revenue Change', 'Profit', 'Profit Change', 'Assets', 'Market Value', 'Employees', 'Years on Fortune 500 List', 'Total Revenue', 'Federal Revenue','State Revenue','Total Expenditure','Instruction Expenditure','GDP','Census','Median Household Income', 'Unemployment Rate','Average IQ','High School Graduate','Bachelors Graduate','Masters Graduate'] for i in range(0,3): orgColumns[i] = (attributeNames[int(orgThreeHighAttr[i][1])]) org3Data = pd.DataFrame(data = orgThreeHighAttrData, columns = orgColumns) targetForOrg2 = pd.DataFrame(data=targetForOrg, columns = ['Target']) #create the array with data points for 3 attr and cluster associated with that org3DataFinal = pd.concat([org3Data, targetForOrg2[['Target']]], axis=1) #create an array with coordinates for 3 attr scatter plot bigOrg3Array = [[0 for i in range(0,9)] for j in range (0,997)] for m in range(0,997): count =0 for j in range(0,3): for i in range(0,3): bigOrg3Array[m][count]=([org3DataFinal.values[m][i],org3DataFinal.values[m][j]]) count = count +1 # print(bigOrg3Array) #to visualize data on top 2 pcaVectors pcaVisOrg = decomposition.PCA(n_components=2) principalDFOrg = pd.DataFrame(data = pcaVisOrg.fit_transform(orgData_std), columns = ['Principal Component 1', 'Principal Component 2']) # targetForStrat2 = pd.DataFrame(data=targetForStrat, columns = ['Target']) # targetForRand2 = pd.DataFrame(data=targetForRand, columns = ['Target']) targetForOrg2 = pd.DataFrame(data=targetForOrg, columns = ['Target']) #print(targetForStrat) #last row will show the cluster associated w/ each data point finalDFOrg = pd.concat([principalDFOrg, targetForOrg2[['Target']]], axis=1) #mds mds_dataOrg = manifold.MDS(n_components=2, dissimilarity='precomputed') #mds with euclidean orgSimEuc = pairwise_distances(orgData_std, metric= 'euclidean') # print("printing org sim euc") # print(orgSimEuc) orgDEuc = mds_dataOrg.fit_transform(orgSimEuc) orgMDSdatEuc = pd.DataFrame(orgDEuc) finalMDSOrgDataEuc = pd.concat([orgMDSdatEuc, targetForOrg2[['Target']]], axis=1) #mds with corr orgSimCor = pairwise_distances(orgData_std, metric= 'correlation') orgDCor = mds_dataOrg.fit_transform(orgSimCor) orgMDSdatCor = pd.DataFrame(orgDCor) finalMDSOrgDataCor = pd.concat([orgMDSdatCor, targetForOrg2[['Target']]], axis=1) #json data --> to export to front end data = {} data['orgEigVal'] = orgEigVal.tolist() data['orgLoadFact'] = orgLoadFact.tolist() data['orgSigNum'] = orgIntrDimCount data['sumOfOrgEig'] = contSumOfOrgEig finalMDSOrgDataEuc = np.array(finalMDSOrgDataEuc).tolist() finalMDSOrgDataCor= np.array(finalMDSOrgDataCor).tolist() org3DataFinal = np.array(org3DataFinal).tolist() finalDFOrg = np.array(finalDFOrg).tolist() for i in range(0,2): finalMDSOrgDataEuc.pop() finalMDSOrgDataCor.pop() org3DataFinal.pop() finalDFOrg.pop() # print(finalMDSOrgDataCor) # print(org3DataFinal) # print(contSumOfOrgEig) # print("\n\n") # print(bigOrg3Array) data['pca2OrgValues'] = np.array(finalDFOrg).tolist() data['orgMDSDataEuc'] = finalMDSOrgDataEuc data['orgMDSDataCor'] = finalMDSOrgDataCor data['org3LoadData'] = org3DataFinal data['org3AttrNames'] = orgColumns data['bigOrg3Array'] = bigOrg3Array # print(bigOrg3Array) # print(data) json_data2 = json.dumps(data) # print(json_data2) return json_data2
paths = new_paths print(f"Number of paths after subsampling: {len(paths)}") # %% [markdown] # ## embedder = AdjacencySpectralEmbed(n_components=None, n_elbows=2) embed = embedder.fit_transform(pass_to_ranks(adj)) embed = np.concatenate(embed, axis=-1) pairplot(embed, labels=labels) # %% [markdown] # ## Show 2 dimensions of pairwise cosine embedding pdist = pairwise_distances(embed, metric="cosine") # %% [markdown] # ## manifold = TSNE(metric="precomputed") # manifold = ClassicalMDS(n_components=2, dissimilarity="precomputed") cos_embed = manifold.fit_transform(pdist) # %% [markdown] # ## plot_df = pd.DataFrame(data=cos_embed) plot_df["labels"] = labels fig, axs = plt.subplots(1, 2, figsize=(20, 10)) ax = axs[0] sns.scatterplot(
def topic_result(tt): predicted_topics=[np.argsort(each)[::-1][0] for each in tt] if predicted_topics==[0]: return 'Family House' elif predicted_topics==[1]: return 'Tourism and Event' elif predicted_topics==[2]: return 'Peaceful Vacation' elif predicted_topics==[3]: return 'Young Life Style' else: return 'Specialty Needs' similar_indices=pairwise_distances(tt,doc_topic,metric='cosine').argsort()[0][0:num] similar_id = [(listings['id'].iloc[i]) for i in similar_indices] similar_name = [(listings['name'].iloc[i]) for i in similar_indices] similar_desc = [(listings['description'].iloc[i][0:165]) for i in similar_indices] similar_url = [(listings['host_url'].iloc[i]) for i in similar_indices] similar_pic = [(listings['picture_url'].iloc[i]) for i in similar_indices] # def recommend(text,num): # a=('Recommending ' + str(num) + ' Airbnb products for ' + str(text)) # b=('------------------------------------') # for i in list(range(0,num)):
import numpy as np
from sklearn.metrics import pairwise_distances


def magnitude_and_difference(matrix):
    # Row-wise L2 norms of a sparse matrix (np.asarray avoids the deprecated
    # np.matrix type), followed by pairwise absolute differences between them.
    magnitude = np.asarray(np.sqrt(matrix.multiply(matrix).sum(1)))
    magnitude_diff = pairwise_distances(magnitude, metric='manhattan')
    return magnitude, magnitude_diff
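A minimal usage sketch for the helper above, assuming a small scipy CSR matrix as input; the variable names are illustrative only.

from scipy.sparse import csr_matrix

ratings = csr_matrix([[3.0, 0.0, 4.0],
                      [0.0, 5.0, 0.0]])
mag, mag_diff = magnitude_and_difference(ratings)
print(mag.ravel())   # row-wise L2 norms, here [5.0, 5.0]
print(mag_diff)      # pairwise absolute differences between the norms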
def predict(self, X, categorical_mapping=None): # Scale the test set. Select wether to use the reference or # just the train set weights doRefWeightsScaling = False doSymmetricalWeights = True #if self.is_fitted: X_test = X.copy() # Prepare for the Euclidean distance r_test = X_test.shape[0] t_test = self.X_train.shape[0] X_temp = np.concatenate([X_test, self.X_train], axis=0) # Normalisation. If not z-score, normalise from 0 to 1 if self.normalisation_type == 'z-score': x_normed = StandardScaler().fit_transform(X_temp) else: x_normed = (X_temp - X_temp.min(0)) / np.maximum(X_temp.ptp(0), 1) if doSymmetricalWeights: print('...Symmetrical Weights') # scale using Cholesky's so W = Q.T Q V = np.diag(self.x_combined_weights) try: Q = np.linalg.cholesky(V) weights_Q = np.diag(Q) except np.linalg.LinAlgError as err: print(err) doSymmetricalWeights = False if doRefWeightsScaling: X_test_scaled = np.multiply(x_normed[0:r_test, :], np.sqrt(self.x_ref_weights)) elif doSymmetricalWeights: X_test_scaled = np.multiply(x_normed[0:r_test, :], weights_Q) else: X_test_scaled = np.multiply(x_normed[0:r_test, :], np.sqrt(self.x_weights)) # Scale the training set if doSymmetricalWeights: X_train_scaled = np.multiply(x_normed[r_test:(r_test + t_test), :], weights_Q) else: X_train_scaled = np.multiply(x_normed[r_test:(r_test + t_test), :], np.sqrt(self.x_weights)) y_k_all_list = [] y_k_list = [] y_k_weighted_list = [] y_delta_list = [] y_idx_closest_promos = [] y_distances_closest_promos = [] y_weights = [] testSize = X_test.shape[0] for idx_test in range(0, testSize): # Select the closest promotions. Try scaling... current_promo_scaled = X_test_scaled[idx_test].reshape(1, -1) # >> Euclidean distances euclidean = np.squeeze( pairwise_distances(X_train_scaled, current_promo_scaled)) idxSorted = np.argsort(euclidean)[0:self.num_neighbours] x_A = self.X_train[idxSorted] x_B = np.tile(X_test[idx_test], (self.num_neighbours, 1)) X_AB_test = np.concatenate([x_A, x_B], axis=1) # differences regarding the reference promotions xgb_frc = self.regressor.predict(X_AB_test) # Get the average y_delta_list.append(xgb_frc) y_k_hat_all = xgb_frc + self.y_train[idxSorted] y_k_hat = np.mean(y_k_hat_all) # Weighted by the Euclidean distances w_distance = 1.0 / np.maximum(euclidean[idxSorted], 1e-3) y_k_hat_distances = \ w_distance.dot(y_k_hat_all.T)/np.sum(w_distance) y_k_weighted_list.append(y_k_hat_distances) # Append to the list y_k_all_list.append(y_k_hat_all) y_k_list.append(y_k_hat) y_idx_closest_promos.append(idxSorted) y_distances_closest_promos.append(euclidean[idxSorted]) y_weights.append(w_distance) # Arrange the forecast as np-arrays y_hat = np.array(y_k_list) y_hat_weighted = np.array(y_k_weighted_list) # Arrange the outputs self.results = { 'y_idx_closest_promos': y_idx_closest_promos, 'y_hat': y_hat, 'y_hat_weighted': y_hat_weighted, 'y_delta_list': y_delta_list, 'y_k_all_list': y_k_all_list, 'y_distances_closest_promos': y_distances_closest_promos, 'y_weights': y_weights, 'feat_importances': self.feat_importances, 'internal_var_names': self.int_vars } # Sort out feature importances if not doRefWeightsScaling: idx_importances = np.argsort(self.x_combined_weights)[::-1] int_var_names = self.int_vars[0:self.numFeatures] else: idx_importances = np.argsort(self.feat_importances)[::-1] int_var_names = self.int_vars # If provided if categorical_mapping: ''' Get the feature importances as a DF ''' inputVars_plain = [ categorical_mapping.get(iVar, iVar) for iVar in [int_var_names[this_idx] for this_idx in idx_importances] ] 
df_feat_importances = pd.DataFrame( self.feat_importances[idx_importances], index=inputVars_plain) else: inputVars_plain = [ int_var_names[this_idx] for this_idx in idx_importances ] df_feat_importances = pd.DataFrame( self.x_combined_weights[idx_importances], index=inputVars_plain) self.results['df_feat_importances'] = df_feat_importances self.valid_predictions = True return y_hat_weighted
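The weighted forecast above averages the neighbour predictions by inverse Euclidean distance; a standalone sketch of just that step, with made-up arrays, looks like this.

import numpy as np

euclidean = np.array([0.2, 0.5, 1.0])        # distances to the k nearest training promos
y_neighbours = np.array([10.0, 12.0, 20.0])  # per-neighbour predictions

# Inverse-distance weights, floored at 1e-3 to avoid division by zero
w = 1.0 / np.maximum(euclidean, 1e-3)
y_weighted = w.dot(y_neighbours) / w.sum()
print(y_weighted)   # closer neighbours dominate the estimate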
# Load table disp('Loading table...') df = pd.read_csv(inputpath, index_col=0) disp('%s samples and %s features detected' % (df.shape[1], df.shape[0])) samples = df.columns # Compute pairwise distances if 'unifrac' in args.m: features = df.index metric = setup_unifrac(args.d, df.index, args.m) elif (args.m in _METRICS_NAMES) & ( args.m not in PAIRWISE_DISTANCE_FUNCTIONS.keys()): import scipy.spatial.distance as sd metric = getattr(sd, args.m) else: metric = args.m disp('Calculating pairwise distances...') dists = pairwise_distances(df.T, metric=metric, n_jobs=args.t) # Place into dataframe dist_df = pd.DataFrame(dists, samples, samples) # Save disp('Saving pairwise distances to: %s' % outputpath) # Create output directory if it doesn't exist outdir = os.path.dirname(outputpath) if not os.path.exists(outdir): os.makedirs(outdir) dist_df.to_csv(outputpath)
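A hedged follow-up sketch: the saved CSV can be reloaded and fed to any estimator that accepts a precomputed matrix. The file name here is a placeholder for the script's outputpath.

import pandas as pd
from sklearn.manifold import MDS

dist_df = pd.read_csv('distances.csv', index_col=0)   # placeholder output path
embedding = MDS(n_components=2, dissimilarity='precomputed',
                random_state=0).fit_transform(dist_df.values)
print(embedding.shape)   # (n_samples, 2)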
def rotate(X, angle): theta = np.deg2rad(angle) R = [[np.cos(theta), np.sin(theta)], [-np.sin(theta), np.cos(theta)]] return np.dot(X, R) X2 = rotate(X, 20) + 5 plt.scatter(X2[:, 0], X2[:, 1], **colorize) plt.axis('equal') plt.show() # calculated distance matrix from sklearn.metrics import pairwise_distances D = pairwise_distances(X) print(D.shape) plt.imshow(D, zorder=2, cmap='Blues', interpolation='nearest') plt.colorbar() plt.show() # MDS: distance matrix ---- coordinate representation from sklearn.manifold import MDS model = MDS(n_components=2, dissimilarity='precomputed', random_state=1) out = model.fit_transform(D) plt.scatter(out[:, 0], out[:, 1], **colorize) plt.axis('equal') plt.show()
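One way to sanity-check the MDS output above, added here as a hedged aside: the pairwise distances of the recovered coordinates should closely match the original distance matrix D.

D_recovered = pairwise_distances(out)
print(np.abs(D - D_recovered).max())   # small residual if the embedding preserved distances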
def __getitem__(self, item): if self.cache_data: if item in self.data_dict.keys(): return self.data_dict[item] else: pass pdbid, pose, affinity = self.data_list[item] node_feats, coords = None, None with h5py.File(self.data_file, "r") as f: if ( not self.dataset_name in f[ "{}/{}/{}".format( pdbid, self.feature_type, self.preprocessing_type ) ].keys() ): print(pdbid) return None if self.use_docking: # TODO: the next line will cuase runtime error because not selelcting poses data = f[ "{}/{}/{}/{}".format( pdbid, self.feature_type, self.preprocessing_type, self.dataset_name, ) ][pose]["data"] vdw_radii = ( f[ "{}/{}/{}/{}".format( pdbid, self.feature_type, self.preprocessing_type, self.dataset_name, ) ][pose] .attrs["van_der_waals"] .reshape(-1, 1) ) else: data = f[ "{}/{}/{}/{}".format( pdbid, self.feature_type, self.preprocessing_type, self.dataset_name, ) ]["data"] vdw_radii = ( f[ "{}/{}/{}/{}".format( pdbid, self.feature_type, self.preprocessing_type, self.dataset_name, ) ] .attrs["van_der_waals"] .reshape(-1, 1) ) if self.feature_type == "pybel": coords = data[:, 0:3] node_feats = np.concatenate([vdw_radii, data[:, 3:22]], axis=1) else: raise NotImplementedError # account for the vdw radii in distance cacluations (consider each atom as a sphere, distance between spheres) dists = pairwise_distances(coords, metric="euclidean") edge_index, edge_attr = dense_to_sparse(torch.from_numpy(dists).float()) x = torch.from_numpy(node_feats).float() y = torch.FloatTensor(affinity).view(-1, 1) data = Data( x=x, edge_index=edge_index, edge_attr=edge_attr.view(-1, 1), y=y ) if self.cache_data: if self.output_info: self.data_dict[item] = (pdbid, pose, data) else: self.data_dict[item] = data return self.data_dict[item] else: if self.output_info: return (pdbid, pose, data) else: return data
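The dense distance matrix above is turned into a fully connected graph via dense_to_sparse; a tiny standalone illustration of that conversion (zero entries, including the diagonal, produce no edges).

import torch
from torch_geometric.utils import dense_to_sparse

dists = torch.tensor([[0.0, 1.5],
                      [1.5, 0.0]])
edge_index, edge_attr = dense_to_sparse(dists)
print(edge_index)   # tensor([[0, 1], [1, 0]])
print(edge_attr)    # tensor([1.5000, 1.5000])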
def get_distances_centers(centers): return pairwise_distances(centers)
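A small usage sketch, assuming the centers come from a fitted KMeans model; the data here is synthetic.

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances

X = np.random.RandomState(0).rand(100, 3)
centers = KMeans(n_clusters=4, n_init=10, random_state=0).fit(X).cluster_centers_
print(get_distances_centers(centers))   # 4 x 4 matrix of inter-centroid distances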
from sklearn.metrics import pairwise_distances from src.datasets.datasets import Spheres dataset = Spheres() dataset_l, labels_l = dataset.sample(n_samples=8) DD = pairwise_distances(dataset_l, dataset_l) print(DD.max())
def test_dunn(): kmeans = KMeans(n_clusters=2, random_state=0) labels = kmeans.fit_predict(iris) d_val = dunn(pairwise_distances(iris), labels) assert .05 < d_val < .1
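The dunn function in the test above comes from an external validity-index module; a minimal sketch of the underlying ratio (smallest inter-cluster distance over largest intra-cluster diameter), assuming a precomputed distance matrix, looks roughly like this and may differ in details from the tested implementation.

import numpy as np

def dunn_index(dist, labels):
    # Dunn index: min inter-cluster distance / max intra-cluster diameter
    clusters = np.unique(labels)
    inter = np.inf
    intra = 0.0
    for i in clusters:
        intra = max(intra, dist[np.ix_(labels == i, labels == i)].max())
        for j in clusters[clusters > i]:
            inter = min(inter, dist[np.ix_(labels == i, labels == j)].min())
    return inter / intra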
def get_cluster_diameter(cluster):
    # Cluster diameter = largest pairwise distance between any two members
    # (the original scanned only the last column of the distance matrix).
    distance = pairwise_distances(cluster)
    return distance.max()
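For example, on a toy 2-D cluster the diameter is simply the gap between its two most separated points.

import numpy as np

cluster = np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 2.0]])
print(get_cluster_diameter(cluster))   # sqrt(1 + 4) ~ 2.24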
user_book = user_book.replace(np.nan, 0) # In[29]: user_book # In[30]: #Calculating Cosine Similarity between Users from sklearn.metrics import pairwise_distances from scipy.spatial.distance import cosine, correlation # In[31]: user_sim = 1 - pairwise_distances(user_book.values, metric='cosine') # In[32]: user_sim # In[33]: #Store the results in a dataframe user_sim_df = pd.DataFrame(user_sim) # In[34]: #Set the index and column names to user ids user_sim_df.index = book_df["User.ID"].unique() user_sim_df.columns = book_df["User.ID"].unique()
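A hedged sketch of how the similarity frame above might be queried for one user's closest neighbours; some_user is a placeholder id, and zeroing the diagonal is only done here so a user does not match itself.

import numpy as np

np.fill_diagonal(user_sim_df.values, 0)          # ignore self-similarity
some_user = user_sim_df.index[0]                 # placeholder user id
print(user_sim_df.loc[some_user].nlargest(5))    # five most similar users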
def get_prediction(vecs, pics): dists = pairwise_distances(vecs, pics, metric='cosine') return dists.T.argsort(1)
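A toy call of the helper above, with random embeddings standing in for the real vectors.

import numpy as np
from sklearn.metrics import pairwise_distances

rng = np.random.RandomState(0)
vecs = rng.rand(4, 16)    # e.g. caption embeddings
pics = rng.rand(3, 16)    # e.g. image embeddings
ranks = get_prediction(vecs, pics)
print(ranks.shape)        # (3, 4): for each picture, vecs ordered by cosine distance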
# Generate a stratified sample
split = StratifiedShuffleSplit(n_splits=1, test_size=0.45, random_state=42)
for train_index, test_index in split.split(df_users_to_cluster,
                                           df_users_to_cluster['divisionGiro']):
    strat_test_set = df_users_to_cluster.iloc[train_index]
    users_clustered = df_users_to_cluster.iloc[test_index]

print('------------STRATIFIED SAMPLE-----------')
print(users_clustered['divisionGiro'].value_counts() / len(users_clustered))
print('Size of the data to process', len(users_clustered))

# Compute the distance matrix with the Gower metric
start = time.time()
gower_mat = metrics.pairwise_distances(
    gower.gower_matrix(users_clustered), metric="precomputed")
end = time.time()
print('Distance matrix execution time', end - start)

# Compare the distance matrix against the first record
first_row = gower_mat[0]
print('Base record: ', users_clustered[:1], "\n")
print('Most similar record: ',
      users_clustered[first_row == min(first_row[first_row != min(first_row)])])
print('Least similar record: ', users_clustered[first_row == max(first_row)])

# Fit the DBSCAN model with the precomputed distance matrix
start = time.time()
model = DBSCAN(eps=0.11, min_samples=40, metric="precomputed")
model.fit(gower_mat)
end = time.time()
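After fitting, the labels can be attached back to the sampled records to inspect cluster sizes; a hedged sketch, assuming pandas is available.

import pandas as pd

print('DBSCAN execution time', end - start)
labels = pd.Series(model.labels_, index=users_clustered.index, name='cluster')
print(labels.value_counts())   # cluster sizes; label -1 marks noise points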
def compute_neighbors( self, n_neighbors: int = 30, knn: bool = True, n_pcs: Optional[int] = None, use_rep: Optional[str] = None, method: str = 'umap', random_state: Optional[Union[RandomState, int]] = 0, write_knn_indices: bool = False, metric: str = 'euclidean', metric_kwds: Mapping[str, Any] = {} ) -> None: """\ Compute distances and connectivities of neighbors. Parameters ---------- n_neighbors Use this number of nearest neighbors. knn Restrict result to `n_neighbors` nearest neighbors. {n_pcs} {use_rep} Returns ------- Writes sparse graph attributes `.distances` and `.connectivities`. Also writes `.knn_indices` and `.knn_distances` if `write_knn_indices==True`. """ if n_neighbors > self._adata.shape[0]: # very small datasets n_neighbors = 1 + int(0.5*self._adata.shape[0]) logg.warn('n_obs too small: adjusting to `n_neighbors = {}`' .format(n_neighbors)) if method == 'umap' and not knn: raise ValueError('`method = \'umap\' only with `knn = True`.') if method not in {'umap', 'gauss'}: raise ValueError('`method` needs to be \'umap\' or \'gauss\'.') if self._adata.shape[0] >= 10000 and not knn: logg.warn( 'Using high n_obs without `knn=True` takes a lot of memory...') self.n_neighbors = n_neighbors self.knn = knn X = choose_representation(self._adata, use_rep=use_rep, n_pcs=n_pcs) # neighbor search use_dense_distances = (metric == 'euclidean' and X.shape[0] < 8192) or knn == False if use_dense_distances: # standard eulcidean case for relatively small matrices self._distances, knn_indices, knn_distances = compute_neighbors_numpy( X, n_neighbors, knn=knn) else: # non-euclidean case and approx nearest neighbors if X.shape[0] < 4096: X = pairwise_distances(X, metric=metric, **metric_kwds) metric = 'precomputed' knn_indices, knn_distances = compute_neighbors_umap( X, n_neighbors, random_state, metric=metric, metric_kwds=metric_kwds) # write indices as attributes if write_knn_indices: self.knn_indices = knn_indices self.knn_distances = knn_distances logg.msg('computed neighbors', t=True, v=4) if not use_dense_distances or method == 'umap': # we need self._distances also for method == 'gauss' if we didn't # use dense distances self._distances, self._connectivities = compute_connectivities_umap( knn_indices, knn_distances, self._adata.shape[0], self.n_neighbors) # overwrite the umap connectivities if method is 'gauss' # self._distances is unaffected by this if method == 'gauss': self._compute_connectivities_diffmap() logg.msg('computed connectivities', t=True, v=4) self._number_connected_components = 1 if issparse(self._connectivities): from scipy.sparse.csgraph import connected_components self._connected_components = connected_components(self._connectivities) self._number_connected_components = self._connected_components[0]
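For the small-data branch above, the dense k-nearest-neighbour indices and distances can be read straight off a pairwise distance matrix; a minimal standalone sketch of that idea (not the scanpy implementation).

import numpy as np
from sklearn.metrics import pairwise_distances

X = np.random.RandomState(0).rand(50, 5)
n_neighbors = 5

D = pairwise_distances(X, metric='euclidean')
knn_indices = np.argsort(D, axis=1)[:, :n_neighbors]    # column 0 is each point itself
knn_distances = np.take_along_axis(D, knn_indices, axis=1)
print(knn_indices.shape, knn_distances.shape)           # (50, 5) each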
def calculate_distance_matrix(X, Y=None, dist_metric='haversine', n_jobs=0, **kwds): """ Calculate a distance matrix based on a specific distance metric. If only X is given, the pair-wise distances between all elements in X are calculated. If X and Y are given, the distances between all combinations of X and Y are calculated. Distances between elements of X and X, and distances between elements of Y and Y are not calculated. Parameters ---------- X : GeoDataFrame (as trackintel staypoints or triplegs) Y : GeoDataFrame (as trackintel staypoints or triplegs), optional dist_metric: {'haversine', 'euclidean', 'dtw', 'frechet'} The distance metric to be used for calculating the matrix. This function wraps around the ``pairwise_distance`` function from scikit-learn if only `X` is given and wraps around the ``scipy.spatial.distance.cdist`` function if X and Y are given. Therefore the following metrics are also accepted: via ``scikit-learn``: `[‘cityblock’, ‘cosine’, ‘euclidean’, ‘l1’, ‘l2’, ‘manhattan’]` via ``scipy.spatial.distance``: `[‘braycurtis’, ‘canberra’, ‘chebyshev’, ‘correlation’, ‘dice’, ‘hamming’, ‘jaccard’, ‘kulsinski’, ‘mahalanobis’, ‘minkowski’, ‘rogerstanimoto’, ‘russellrao’, ‘seuclidean’, ‘sokalmichener’, ‘sokalsneath’, ‘sqeuclidean’, ‘yule’]` triplegs can only be used in combination with `['dtw', 'frechet']`. n_jobs: int Number of cores to use: 'dtw', 'frechet' and all distance metrics from `pairwise_distance` (only available if only X is given) are parallelized. **kwds: optional keywords passed to the distance functions. Returns ------- np.array matrix of shape (len(X), len(X)) or of shape (len(X), len(Y)) """ geom_type = X.geometry.iat[0].geom_type if Y is None: Y = X assert Y.geometry.iat[0].geom_type == Y.geometry.iat[0].geom_type, "x and y need same geometry type " \ "(only first column checked)" if geom_type == 'Point': x1 = X.geometry.x.values y1 = X.geometry.y.values x2 = Y.geometry.x.values y2 = Y.geometry.y.values if dist_metric == 'haversine': # create point pairs for distance calculation nx = len(X) ny = len(Y) # if y != x they could have different dimensions if ny >= nx: ix_1, ix_2 = np.triu_indices(nx, k=1, m=ny) trilix = np.tril_indices(nx, k=-1, m=ny) else: ix_1, ix_2 = np.tril_indices(nx, k=-1, m=ny) trilix = np.triu_indices(nx, k=1, m=ny) x1 = x1[ix_1] y1 = y1[ix_1] x2 = x2[ix_2] y2 = y2[ix_2] d = haversine_dist(x1, y1, x2, y2) D = np.zeros((nx, ny)) D[(ix_1, ix_2)] = d # mirror triangle matrix to be conform with scikit-learn format and to # allow for non-symmetric distances in the future D[trilix] = D.T[trilix] else: xy1 = np.concatenate((x1.reshape(-1, 1), y1.reshape(-1, 1)), axis=1) if Y is not None: xy2 = np.concatenate((x2.reshape(-1, 1), y2.reshape(-1, 1)), axis=1) D = cdist(xy1, xy2, metric=dist_metric, **kwds) else: D = pairwise_distances(xy1, metric=dist_metric, n_jobs=n_jobs) return D elif geom_type == 'LineString': if dist_metric in ['dtw', 'frechet']: # these are the preparation steps for all distance functions based only on coordinates if dist_metric == 'dtw': d_fun = partial(dtw, **kwds) elif dist_metric == 'frechet': d_fun = partial(frechet_dist, **kwds) # get combinations of distances that have to be calculated nx = len(X) ny = len(Y) if ny >= nx: ix_1, ix_2 = np.triu_indices(nx, k=1, m=ny) trilix = np.tril_indices(nx, k=-1, m=ny) else: ix_1, ix_2 = np.tril_indices(nx, k=-1, m=ny) trilix = np.triu_indices(nx, k=1, m=ny) left = list(X.iloc[ix_1].geometry) right = list(Y.iloc[ix_2].geometry) # map the combinations to the distance function if 
n_jobs == -1 or n_jobs > 1: if n_jobs == -1: n_jobs = multiprocessing.cpu_count() with multiprocessing.Pool(processes=n_jobs) as pool: left_right = list(zip(left, right)) d = np.array(list(pool.starmap(d_fun, left_right))) else: d = np.array(list(map(d_fun, left, right))) # write results to (symmetric) distance matrix D = np.zeros((nx, ny)) D[(ix_1, ix_2)] = d D[trilix] = D.T[trilix] return D else: raise AttributeError( "Metric unknown. We only support ['dtw', 'frechet'] for LineStrings. " f"You passed {dist_metric}") else: raise AttributeError( f"We only support 'Point' and 'LineString'. Your geometry is {geom_type}" )
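A hedged usage sketch for the point branch of calculate_distance_matrix, assuming a small GeoDataFrame of lon/lat points and that geopandas and shapely are available alongside the function's own imports.

import geopandas as gpd
from shapely.geometry import Point

pts = gpd.GeoDataFrame(geometry=[Point(8.55, 47.37),   # lon, lat
                                 Point(8.54, 47.38),
                                 Point(8.60, 47.40)])
D = calculate_distance_matrix(pts, dist_metric='haversine')
print(D.shape)   # (3, 3) symmetric matrix of great-circle distances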
def component_layout( data, n_components, component_labels, dim, random_state, metric="euclidean", metric_kwds={}, ): """Provide a layout relating the separate connected components. This is done by taking the centroid of each component and then performing a spectral embedding of the centroids. Parameters ---------- data: array of shape (n_samples, n_features) The source data -- required so we can generate centroids for each connected component of the graph. n_components: int The number of distinct components to be layed out. component_labels: array of shape (n_samples) For each vertex in the graph the label of the component to which the vertex belongs. dim: int The chosen embedding dimension. metric: string or callable (optional, default 'euclidean') The metric used to measure distances among the source data points. metric_kwds: dict (optional, default {}) Keyword arguments to be passed to the metric function. If metric is 'precomputed', 'linkage' keyword can be used to specify 'average', 'complete', or 'single' linkage. Default is 'average' Returns ------- component_embedding: array of shape (n_components, dim) The ``dim``-dimensional embedding of the ``n_components``-many connected components. """ if data is None: # We don't have data to work with; just guess return np.random.random(size=(n_components, dim)) * 10.0 component_centroids = np.empty((n_components, data.shape[1]), dtype=np.float64) if metric == "precomputed": # cannot compute centroids from precomputed distances # instead, compute centroid distances using linkage distance_matrix = np.zeros((n_components, n_components), dtype=np.float64) linkage = metric_kwds.get("linkage", "average") if linkage == "average": linkage = np.mean elif linkage == "complete": linkage = np.max elif linkage == "single": linkage = np.min else: raise ValueError("Unrecognized linkage '%s'. 
Please choose from " "'average', 'complete', or 'single'" % linkage) for c_i in range(n_components): dm_i = data[component_labels == c_i] for c_j in range(c_i + 1, n_components): dist = linkage(dm_i[:, component_labels == c_j]) distance_matrix[c_i, c_j] = dist distance_matrix[c_j, c_i] = dist else: for label in range(n_components): component_centroids[label] = data[component_labels == label].mean( axis=0) if scipy.sparse.isspmatrix(component_centroids): warn( "Forcing component centroids to dense; if you are running out of " "memory then consider increasing n_neighbors.") component_centroids = component_centroids.toarray() if metric in SPECIAL_METRICS: distance_matrix = pairwise_special_metric( component_centroids, metric=metric, kwds=metric_kwds, ) elif metric in SPARSE_SPECIAL_METRICS: distance_matrix = pairwise_special_metric( component_centroids, metric=SPARSE_SPECIAL_METRICS[metric], kwds=metric_kwds, ) else: if callable(metric) and scipy.sparse.isspmatrix(data): function_to_name_mapping = { v: k for k, v in sparse_named_distances.items() } try: metric_name = function_to_name_mapping[metric] except KeyError: raise NotImplementedError( "Multicomponent layout for custom " "sparse metrics is not implemented at " "this time.") distance_matrix = pairwise_distances(component_centroids, metric=metric_name, **metric_kwds) else: distance_matrix = pairwise_distances(component_centroids, metric=metric, **metric_kwds) affinity_matrix = np.exp(-(distance_matrix**2)) component_embedding = SpectralEmbedding( n_components=dim, affinity="precomputed", random_state=random_state).fit_transform(affinity_matrix) component_embedding /= component_embedding.max() return component_embedding
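The last few lines above convert centroid distances into an affinity matrix and embed the components spectrally; the same transform in isolation, with a toy distance matrix, is roughly as follows.

import numpy as np
from sklearn.manifold import SpectralEmbedding

distance_matrix = np.array([[0.0, 1.0, 4.0],
                            [1.0, 0.0, 3.0],
                            [4.0, 3.0, 0.0]])
affinity = np.exp(-(distance_matrix ** 2))   # closer components -> stronger affinity
layout = SpectralEmbedding(n_components=2,
                           affinity='precomputed',
                           random_state=0).fit_transform(affinity)
print(layout / layout.max())                 # rescaled as in component_layout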
len(book.User_ID.unique())
len(book.Book_Title.unique())

# Convert the data into an n x p user-by-title rating matrix
Book_pivot = book.pivot_table(index='User_ID',
                              columns='Book_Title',
                              values='Book_Rating').reset_index(drop=True)

# Replace the pivot index with the user id values
Book_pivot.index = book.User_ID.unique()

# Impute NaNs with 0 values
Book_pivot.fillna(0, inplace=True)

# Calculate similarity: pairwise_distances returns the cosine distance
# 1 - cos(A, B), so subtracting it from 1 recovers the cosine similarity.
user_distance = 1 - pairwise_distances(Book_pivot.values, metric='cosine')
distance_matrix = pd.DataFrame(user_distance)

# Set the index and column names to user ids
distance_matrix.index = book.User_ID.unique()
distance_matrix.columns = book.User_ID.unique()

# Zero out the diagonal, which otherwise holds each user's self-similarity of 1
np.fill_diagonal(distance_matrix.values, 0)

# Most similar users
distance_matrix.idxmax(axis=1)[0:5]
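A short follow-up sketch: the nearest neighbour of a user can seed simple recommendations from that neighbour's highly rated titles. The rating threshold here is illustrative only.

# Recommend from the closest neighbour's highly rated books (illustrative threshold)
similar_user = distance_matrix.idxmax(axis=1).iloc[0]   # nearest neighbour of the first user
neighbour_books = book[(book.User_ID == similar_user) & (book.Book_Rating >= 8)]
print(neighbour_books.Book_Title.head())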