def test_kneighbors_regressor_sparse(n_samples=40, n_features=5,
                                     n_test_pts=10, n_neighbors=5,
                                     random_state=0):
    # Test k-neighbors regression on sparse matrices
    # Like the above, but with various types of sparse matrices
    rng = np.random.RandomState(random_state)
    X = 2 * rng.rand(n_samples, n_features) - 1
    y = ((X ** 2).sum(axis=1) < .25).astype(int)

    for sparsemat in SPARSE_TYPES:
        knn = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors,
                                            algorithm='auto')
        knn.fit(sparsemat(X), y)

        knn_pre = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors,
                                                metric='precomputed')
        knn_pre.fit(pairwise_distances(X, metric='euclidean'), y)

        for sparsev in SPARSE_OR_DENSE:
            X2 = sparsev(X)
            assert_true(np.mean(knn.predict(X2).round() == y) > 0.95)

            X2_pre = sparsev(pairwise_distances(X, metric='euclidean'))
            if issparse(sparsev(X2_pre)):
                assert_raises(ValueError, knn_pre.predict, X2_pre)
            else:
                assert_true(
                    np.mean(knn_pre.predict(X2_pre).round() == y) > 0.95)
def trustworthiness(X, X_embedded, n_neighbors=5, precomputed=False):
    r"""Expresses to what extent the local structure is retained.

    The trustworthiness is within [0, 1]. It is defined as

    .. math::

        T(k) = 1 - \frac{2}{nk (2n - 3k - 1)} \sum^n_{i=1}
            \sum_{j \in U^{(k)}_i} (r(i, j) - k)

    where :math:`r(i, j)` is the rank of the embedded datapoint j
    according to the pairwise distances between the embedded datapoints,
    :math:`U^{(k)}_i` is the set of points that are in the k nearest
    neighbors in the embedded space but not in the original space.

    * "Neighborhood Preservation in Nonlinear Projection Methods: An
      Experimental Study"
      J. Venna, S. Kaski
    * "Learning a Parametric Embedding by Preserving Local Structure"
      L.J.P. van der Maaten

    Parameters
    ----------
    X : array, shape (n_samples, n_features) or (n_samples, n_samples)
        If the metric is 'precomputed' X must be a square distance
        matrix. Otherwise it contains a sample per row.

    X_embedded : array, shape (n_samples, n_components)
        Embedding of the training data in low-dimensional space.

    n_neighbors : int, optional (default: 5)
        Number of neighbors k that will be considered.

    precomputed : bool, optional (default: False)
        Set this flag if X is a precomputed square distance matrix.

    Returns
    -------
    trustworthiness : float
        Trustworthiness of the low-dimensional embedding.
    """
    if precomputed:
        dist_X = X
    else:
        dist_X = pairwise_distances(X, squared=True)
    dist_X_embedded = pairwise_distances(X_embedded, squared=True)
    ind_X = np.argsort(dist_X, axis=1)
    ind_X_embedded = np.argsort(dist_X_embedded, axis=1)[:, 1:n_neighbors + 1]

    n_samples = X.shape[0]
    t = 0.0
    ranks = np.zeros(n_neighbors)
    for i in range(n_samples):
        for j in range(n_neighbors):
            ranks[j] = np.where(ind_X[i] == ind_X_embedded[i, j])[0][0]
        ranks -= n_neighbors
        t += np.sum(ranks[ranks > 0])
    t = 1.0 - t * (2.0 / (n_samples * n_neighbors *
                          (2.0 * n_samples - 3.0 * n_neighbors - 1.0)))
    return t
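# Usage sketch (an illustration, not part of the original source): score a
# 2-D PCA embedding of random data with the trustworthiness() function
# above. Assumes numpy and scikit-learn are available and that
# pairwise_distances is imported as in the function's module.
import numpy as np
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
X = rng.rand(50, 10)
X_embedded = PCA(n_components=2).fit_transform(X)
print(trustworthiness(X, X_embedded, n_neighbors=5))  # a value in [0, 1]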
def pairwise_distances(X, Y=None, index=None, metric="euclidean"):
    '''
    Compute the distance matrix from a vector array X and optional Y.

    This method takes either a vector array or a distance matrix, and
    returns a distance matrix. If the input is a vector array, the
    distances are computed. If the input is a distances matrix, it is
    returned instead. This method provides a safe way to take a distance
    matrix as input, while preserving compatibility with many other
    algorithms that take a vector array.

    :param X: array [n_samples_a, n_samples_a]
        Array of pairwise distances between samples, or a feature array.
    :param Y: array [n_samples_b, n_features]
        A second feature array, only if X has shape
        [n_samples_a, n_features].
    :param index: int, the index of the element in the X array
    :param metric: The metric to use when calculating distance between
        instances in a feature array. If metric='rmsd', it is computed
        by MDTraj.
    :return: The distances
    '''
    if metric == "rmsd":
        if Y is None:
            distances_ = md.rmsd(X, X, index, parallel=True,
                                 precentered=True)
        else:
            #distances_ = np.empty((len(X), len(Y)), dtype=np.float32)
            # for i in xrange(len(Y)):
            distances_ = md.rmsd(X, Y, index, parallel=True,
                                 precentered=True)
        return distances_
    else:
        # check `index` first so that calling with both Y and index unset
        # falls through to the full X-vs-X computation
        if index is None:
            print("if index is None, pairwise XX")
            return sp.pairwise_distances(X, X, metric=metric)
        if Y is None:
            print("if Y is None")
            return sp.pairwise_distances(X, X[index], metric=metric)
def make_rbf(x, sigma, metric='euclidean', x2=None):
    if x.ndim == 1:
        x = np.expand_dims(x, 1)
    if x2 is None:
        x2 = x
    if metric == 'cosine':
        # This code may be faster for some matrices
        # Code from http://stackoverflow.com/questions/17627219/whats-the-fastest-way-in-python-to-calculate-cosine-similarity-given-sparse-mat
        '''
        tic()
        #x = x.toarray()
        #similarity = np.dot(x, x.T)
        similarity = (x.dot(x.T)).toarray()
        square_mag = np.diag(similarity)
        inv_square_mag = 1 / square_mag
        inv_square_mag[np.isinf(inv_square_mag)] = 0
        inv_mag = np.sqrt(inv_square_mag)
        W = similarity * inv_mag
        W = W.T * inv_mag
        W = 1 - W
        toc()
        tic()
        W2 = pairwise.pairwise_distances(x, x, metric)
        toc()
        '''
        W = pairwise.pairwise_distances(x, x2, metric)
    else:
        # tic()
        W = pairwise.pairwise_distances(x, x2, metric)
        # toc()
    W = np.square(W)
    W = -sigma * W
    W = np.exp(W)
    return W
def generate_dist_stats_feat(metric, X_train, ids_train, X_test, ids_test,
                             indices_dict):
    ## stats parameters
    quantiles_range = np.arange(0, 1.5, 0.5)
    stats_func = [np.mean, np.std]
    stats_feat_num = len(quantiles_range) + len(stats_func)
    n_class_relevance = 13

    if metric == "cosine":
        stats_feat = 0 * np.ones((len(ids_test),
                                  stats_feat_num * n_class_relevance),
                                 dtype=float)
        sim = 1. - pairwise_distances(X_test, X_train, metric=metric,
                                      n_jobs=1)
    elif metric == "euclidean":
        stats_feat = -1 * np.ones((len(ids_test),
                                   stats_feat_num * n_class_relevance),
                                  dtype=float)
        sim = pairwise_distances(X_test, X_train, metric=metric, n_jobs=1)
    print("pairwise_distances generated!")

    for i in range(len(ids_test)):
        id = ids_test[i]
        for j in range(n_class_relevance):
            key = j
            if key in indices_dict:
                inds = indices_dict[key]
                # exclude this sample itself from the list of indices
                inds = [ind for ind in inds if id != ids_train[ind]]
                sim_tmp = sim[i][inds]
                if len(sim_tmp) != 0:
                    feat = [func(sim_tmp) for func in stats_func]
                    ## quantile
                    sim_tmp = pd.Series(sim_tmp)
                    quantiles = sim_tmp.quantile(quantiles_range)
                    feat = np.hstack((feat, quantiles))
                    stats_feat[i, j * stats_feat_num:(j + 1) * stats_feat_num] = feat
    return stats_feat
def dunn(max_nc, all_labels, dataset):
    dunn = []
    print("DUNN (MAX)...")
    for nc in range(2, max_nc + 1):
        dn = 0.0
        max_intra = 0.0
        for cluster_i in range(nc):
            instances_i = dataset[np.where(all_labels[nc - 2] == cluster_i)[0]]
            pairwise_matrix_intra = pairwise_distances(instances_i, n_jobs=1)
            new_max_intra = np.amax(pairwise_matrix_intra)
            if new_max_intra > max_intra:
                max_intra = new_max_intra
        for cluster_i in range(nc):
            instances_i = dataset[np.where(all_labels[nc - 2] == cluster_i)[0]]
            for cluster_j in range(nc):
                if cluster_j > cluster_i:
                    instances_j = dataset[np.where(all_labels[nc - 2] == cluster_j)[0]]
                    pairwise_matrix_inter = pairwise_distances(instances_i,
                                                               instances_j,
                                                               n_jobs=1)
                    min_inter = np.amin(pairwise_matrix_inter)
                    if dn == 0.0:
                        dn = min_inter / max_intra
                    elif min_inter / max_intra < dn:
                        dn = min_inter / max_intra
        print('DUNN for k = ' + str(nc) + ' is ' + str(dn) + ' ...')
        dunn += [dn]
    return dunn
def bipartite_clustering(D2W, word_cluster_num, doc_cluster_num, metric,
                         criteria):
    W2D = D2W.transpose()
    W2WC = kmean(W2D, word_cluster_num, criteria)
    #word_cluster_num = np.amax(W2WC)+1
    #print "wc:",word_cluster_num
    for loop in range(4):
        #D2WC = D2W.dot(transform_from_index_array(W2WC,W2WC.size,word_cluster_num))
        #print D2WC
        #print loop
        new_centroids = get_new_centroids(W2D, W2WC)
        new_distance_matrix = pairwise_distances(W2D, new_centroids,
                                                 metric=metric)
        # how to calculate distance? maybe 1-matrix?
        #print new_distance_matrix
        D2WC = D2W.dot(new_distance_matrix)
        if loop == 0:
            D2DC = kmean(D2WC, doc_cluster_num, criteria)
        else:
            new_centroids = get_new_centroids(D2WC, D2DC)
            D2DC = kmean(D2WC, doc_cluster_num, criteria, new_centroids)
        #doc_cluster_num = np.amax(D2DC)+1
        #print "dc:",doc_cluster_num
        new_centroids = get_new_centroids(D2W, D2DC)
        new_distance_matrix = pairwise_distances(D2W, new_centroids,
                                                 metric=metric)
        W2DC = W2D.dot(new_distance_matrix)
        new_centroids = get_new_centroids(W2DC, W2WC)
        W2WC = kmean(W2DC, word_cluster_num, criteria, new_centroids)
        #word_cluster_num = np.amax(W2WC)+1
        #print "wc:",word_cluster_num
    return D2DC, W2WC
def generate_dist_stats_feat(metric, X_train, ids_train, X_test, ids_test,
                             indices_dict, qids_test=None):
    if metric == "cosine":
        stats_feat = 0 * np.ones((len(ids_test), stats_feat_num * n_classes),
                                 dtype=float)
        sim = 1. - pairwise_distances(X_test, X_train, metric=metric,
                                      n_jobs=1)
    elif metric == "euclidean":
        stats_feat = -1 * np.ones((len(ids_test),
                                   stats_feat_num * n_classes), dtype=float)
        sim = pairwise_distances(X_test, X_train, metric=metric, n_jobs=1)
    for i in range(len(ids_test)):
        id = ids_test[i]
        if qids_test is not None:
            qid = qids_test[i]
        for j in range(n_classes):
            key = (qid, j + 1) if qids_test is not None else j + 1
            if key in indices_dict:
                inds = indices_dict[key]
                # exclude this sample itself from the list of indices
                inds = [ind for ind in inds if id != ids_train[ind]]
                sim_tmp = sim[i][inds]
                if len(sim_tmp) != 0:
                    feat = [func(sim_tmp) for func in stats_func]
                    ## quantile
                    sim_tmp = pd.Series(sim_tmp)
                    quantiles = sim_tmp.quantile(quantiles_range)
                    feat = np.hstack((feat, quantiles))
                    stats_feat[i, j * stats_feat_num:(j + 1) * stats_feat_num] = feat
    return stats_feat
def test_no_data_conversion_warning():
    # No warnings issued if metric is not a boolean distance function
    rng = np.random.RandomState(0)
    X = rng.randn(5, 4)
    with pytest.warns(None) as records:
        pairwise_distances(X, metric="minkowski")
    assert len(records) == 0
def getSimMat(self, type='euclidean', ftr_type='data', orderFlag=True,
              pca_dim=20):
    if ftr_type == 'ftr':  # use input features
        self.slctData = [ts for ts in self.slctData
                         if ((ts.ftr is not None) and (len(ts.ftr) > 0))]
        dataMat = [ts.ftr for ts in self.slctData]
    elif ftr_type == 'data':  # use input data
        dataMat = [ts.val for ts in self.slctData]
    else:
        print('unknown ftr_type for ftr_type:', ftr_type)

    if pca_dim > len(dataMat):
        pca_dim = int(math.ceil(len(dataMat) / 2.0))

    if type == 'euclidean':  # euclidean distance based on time series data
        self.simMat = skmpw.euclidean_distances(dataMat)
    elif type == 'pca_euc':  # extract feature based on PCA, then use Euclidean distance
        pca = skd.PCA(n_components=pca_dim)
        dataMat = pca.fit_transform(dataMat)
        self.simMat = skmpw.euclidean_distances(dataMat)
    elif type == 'nmf_euc':  # extract feature based on NMF, then use Euclidean distance
        nmf = skd.NMF(n_components=pca_dim)
        dataMat = nmf.fit_transform(dataMat)
        self.simMat = skmpw.euclidean_distances(dataMat)
    elif type == 'ica_euc':  # extract feature based on ICA, then use Euclidean distance
        ica = skd.FastICA(n_components=pca_dim)
        dataMat = ica.fit_transform(dataMat)
        self.simMat = skmpw.euclidean_distances(dataMat)
    elif type == 'cosine':
        self.simMat = skmpw.pairwise_distances(dataMat, metric='cosine')
    elif type == 'pca_cos':  # extract feature based on PCA, then use cosine distance
        pca = skd.PCA(n_components=pca_dim)
        dataMat = pca.fit_transform(dataMat)
        self.simMat = skmpw.pairwise_distances(dataMat, metric='cosine')
    elif type == 'nmf_cos':  # extract feature based on NMF, then use cosine distance
        nmf = skd.NMF(n_components=pca_dim)
        dataMat = nmf.fit_transform(dataMat)
        self.simMat = skmpw.pairwise_distances(dataMat, metric='cosine')
    elif type == 'ica_cos':  # extract feature based on ICA, then use cosine distance
        ica = skd.FastICA(n_components=pca_dim)
        dataMat = ica.fit_transform(dataMat)
        self.simMat = skmpw.pairwise_distances(dataMat, metric='cosine')
    else:
        print('unknown type for similarity matrix: ', type)

    # rearrange the order of data in simMat
    self.slctDataMat = dataMat
    if orderFlag:
        link = spc.hierarchy.linkage(self.simMat)
        dend = spc.hierarchy.dendrogram(link, no_plot=True)
        order = dend['leaves']
        self.slctData = [self.slctData[i] for i in order]  # rearrange order
        self.simMat = [self.simMat[i] for i in order]
        for i in range(len(self.simMat)):
            self.simMat[i] = [self.simMat[i][j] for j in order]
        self.slctDataMat = [self.slctDataMat[i] for i in order]

    # self.patchOrdering = [ts.ptchNm for ts in self.slctData]  # record new ordering
    # Deok wants all the data for each patch in the response
    self.patchOrdering = JSONifyData(self.slctData)
    self.clstData = self.slctData
    self.clstSimMat = self.simMat
def test_radius_neighbors():
    # Checks whether returned distances are less than `radius`.
    # At least one point should be returned when the `radius` is set
    # to mean distance from the considering point to other points in
    # the database.
    # Moreover, this test compares the radius neighbors of LSHForest
    # with the `sklearn.neighbors.NearestNeighbors`.
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = ignore_warnings(LSHForest, category=DeprecationWarning)()
    # Test unfitted estimator
    assert_raises(ValueError, lshf.radius_neighbors, X[0])

    ignore_warnings(lshf.fit)(X)

    for i in range(n_iter):
        # Select a random point in the dataset as the query
        query = X[rng.randint(0, n_samples)].reshape(1, -1)

        # At least one neighbor should be returned when the radius is the
        # mean distance from the query to the points of the dataset.
        mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
        neighbors = lshf.radius_neighbors(query, radius=mean_dist,
                                          return_distance=False)

        assert_equal(neighbors.shape, (1,))
        assert_equal(neighbors.dtype, object)
        assert_greater(neighbors[0].shape[0], 0)
        # All distances to points in the results of the radius query should
        # be less than mean_dist
        distances, neighbors = lshf.radius_neighbors(query,
                                                     radius=mean_dist,
                                                     return_distance=True)
        assert_array_less(distances[0], mean_dist)

    # Multiple points
    n_queries = 5
    queries = X[rng.randint(0, n_samples, n_queries)]
    distances, neighbors = lshf.radius_neighbors(queries,
                                                 return_distance=True)

    # dists and inds should not be 1D arrays or arrays of variable lengths,
    # hence the use of the object dtype.
    assert_equal(distances.shape, (n_queries,))
    assert_equal(distances.dtype, object)
    assert_equal(neighbors.shape, (n_queries,))
    assert_equal(neighbors.dtype, object)

    # Compare with exact neighbor search
    query = X[rng.randint(0, n_samples)].reshape(1, -1)
    mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
    nbrs = NearestNeighbors(algorithm='brute', metric='cosine').fit(X)

    distances_exact, _ = nbrs.radius_neighbors(query, radius=mean_dist)
    distances_approx, _ = lshf.radius_neighbors(query, radius=mean_dist)
def predict(dialogue_session, line):
    lowest = ('x', 1)
    data = dataDict[dialogue_session][1][line, :]
    for vector in vDict:
        predictor = vDict[vector]
        # compute the cosine distance once per candidate vector
        dist = pair.pairwise_distances(predictor, data, 'cosine')[0][0]
        if dist < lowest[1]:
            lowest = (vector, dist)
    return lowest
def cramer_statistic(self, n_jobs=1):
    '''
    Applies the Cramer Statistic to the datasets.

    Parameters
    ----------
    n_jobs : int, optional
        Sets the number of cores to use to calculate pairwise distances.
        Default is 1.
    '''
    # Adjust what we call n, m based on the larger dimension.
    # Then the looping below is valid.
    if self.data_matrix1.shape[0] >= self.data_matrix2.shape[0]:
        m = self.data_matrix1.shape[0]
        n = self.data_matrix2.shape[0]
        larger = self.data_matrix1
        smaller = self.data_matrix2
    else:
        n = self.data_matrix1.shape[0]
        m = self.data_matrix2.shape[0]
        larger = self.data_matrix2
        smaller = self.data_matrix1

    pairdist11 = pairwise_distances(larger, metric="euclidean",
                                    n_jobs=n_jobs)
    pairdist22 = pairwise_distances(smaller, metric="euclidean",
                                    n_jobs=n_jobs)
    pairdist12 = pairwise_distances(larger, smaller,
                                    metric="euclidean", n_jobs=n_jobs)

    # Take sqrt of each
    # We default to using the Cramer kernel in Baringhaus & Franz (2004)
    #   \phi(dist) = sqrt(dist) / 2.
    # The normalization values below reflect this
    pairdist11 = np.sqrt(pairdist11)
    pairdist12 = np.sqrt(pairdist12)
    pairdist22 = np.sqrt(pairdist22)

    term1 = 0.0
    term2 = 0.0
    term3 = 0.0
    for i in range(m):
        for j in range(n):
            term1 += pairdist12[i, j]
        for ii in range(m):
            term2 += pairdist11[i, ii]
        if i < n:
            for jj in range(n):
                term3 += pairdist22[i, jj]
    m, n = float(m), float(n)

    term1 *= (1 / (m * n))
    term2 *= (1 / (2 * m ** 2.))
    term3 *= (1 / (2 * n ** 2.))

    self._distance = (m * n / (m + n)) * (term1 - term2 - term3)
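# The accumulation loops above simply sum every entry of each
# pairwise-distance matrix, so the statistic can also be computed without
# explicit Python loops. A hedged vectorized sketch (an alternative, not
# the original implementation); assumes the pairdist* arrays and the float
# m, n from the method above are in scope.
term1 = pairdist12.sum() / (m * n)
term2 = pairdist11.sum() / (2.0 * m ** 2)
term3 = pairdist22.sum() / (2.0 * n ** 2)
distance = (m * n / (m + n)) * (term1 - term2 - term3)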
def run_step(self, run_number, step_size, howlong):
    dfslot = self.get_input_slot("df")
    df = dfslot.data()
    dfslot.update(run_number)
    if dfslot.has_updated() or dfslot.has_deleted():
        dfslot.reset()
        logger.info("Resetting history because of changes in the input df")
        dfslot.update(run_number, df)
        # TODO: be smarter with changed values

    m = step_size
    indices = dfslot.next_created(m)
    m = indices_len(indices)

    i = None
    j = None
    Si = self._buf.matrix()

    arrayslot = self.get_input_slot("array")
    if arrayslot is not None and arrayslot.data() is not None:
        array = arrayslot.data()
        logger.debug("Using array instead of DataFrame columns")
        if Si is not None:
            i = array[self._last_index]
        j = array[indices]
    if j is None:
        if self.columns is None:
            self.columns = df.columns.delete(
                np.where(df.columns == Module.UPDATE_COLUMN))
        elif not isinstance(self.columns, pd.Index):
            self.columns = pd.Index(self.columns)
        rows = df[self.columns]
        if Si is not None:
            i = rows.loc[self._last_index]
            assert len(i) == len(self._last_index)
        j = rows.loc[fix_loc(indices)]
        assert len(j) == indices_len(indices)

    Sj = pairwise_distances(j, metric=self._metric, n_jobs=self._n_jobs)
    if Si is None:
        mat = self._buf.resize(Sj.shape[0])
        mat[:, :] = Sj
        self._last_index = dfslot.last_index[indices]
    else:
        Sij = pairwise_distances(i, j, metric=self._metric,
                                 n_jobs=self._n_jobs)
        n0 = i.shape[0]
        n1 = n0 + j.shape[0]
        mat = self._buf.resize(n1)
        mat[0:n0, n0:n1] = Sij
        mat[n0:n1, 0:n0] = Sij.T
        mat[n0:n1, n0:n1] = Sj
        self._last_index = self._last_index.append(df.index[indices])
        # truth = pairwise_distances(array[0:n1], metric=self._metric)
        # import pdb
        # pdb.set_trace()
        # assert np.allclose(mat, truth)
    return self._return_run_step(dfslot.next_state(), steps_run=m)
def test_radius_neighbors():
    """Checks whether returned distances are less than `radius`.

    At least one point should be returned when the `radius` is set
    to mean distance from the considering point to other points in
    the database.
    Moreover, this test compares the radius neighbors of LSHForest
    with the `sklearn.neighbors.NearestNeighbors`.
    """
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest()
    # Test unfitted estimator
    assert_raises(ValueError, lshf.radius_neighbors, X[0])

    lshf.fit(X)

    for i in range(n_iter):
        query = X[rng.randint(0, n_samples)]
        mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
        neighbors = lshf.radius_neighbors(query, radius=mean_dist,
                                          return_distance=False)
        # At least one neighbor should be returned.
        assert_greater(neighbors.shape[0], 0)
        # All distances should be less than mean_dist
        distances, neighbors = lshf.radius_neighbors(query,
                                                     radius=mean_dist,
                                                     return_distance=True)
        assert_array_less(distances[0], mean_dist)

    # Multiple points
    n_queries = 5
    queries = X[rng.randint(0, n_samples, n_queries)]
    distances, neighbors = lshf.radius_neighbors(queries,
                                                 return_distance=True)
    assert_equal(neighbors.shape[0], n_queries)
    assert_equal(distances.shape[0], n_queries)
    # dists and inds should not be 2D arrays
    assert_equal(distances.ndim, 1)
    assert_equal(neighbors.ndim, 1)

    # Compare with exact neighbor search
    query = X[rng.randint(0, n_samples)]
    mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
    nbrs = NearestNeighbors(algorithm='brute', metric='cosine')
    nbrs.fit(X)
    distances_approx, _ = lshf.radius_neighbors(query, radius=mean_dist)
    distances_exact, _ = nbrs.radius_neighbors(query, radius=mean_dist)
    # Distances of exact neighbors are less than or equal to approximate
    assert_true(np.all(np.less_equal(np.sort(distances_exact[0]),
                                     np.sort(distances_approx[0]))))
def fit(self, X, y=None, c=None):
    """Fit the model using X as training data.

    Parameters
    ----------
    X : array, shape (n_samples, n_features) or (n_samples, n_samples)
        If the metric is 'precomputed' X must be a square distance
        matrix. Otherwise it contains a sample per row.
    """
    X = check_array(X, accept_sparse=['csr', 'csc', 'coo'],
                    dtype=np.float64)
    random_state = check_random_state(self.random_state)

    if self.early_exaggeration < 1.0:
        raise ValueError("early_exaggeration must be at least 1, but is "
                         "%f" % self.early_exaggeration)

    if self.n_iter < 200:
        raise ValueError("n_iter should be at least 200")

    if self.metric == "precomputed":
        if self.init == 'pca':
            raise ValueError("The parameter init=\"pca\" cannot be used "
                             "with metric=\"precomputed\".")
        if X.shape[0] != X.shape[1]:
            raise ValueError("X should be a square distance matrix")
        distances = X
    else:
        if self.verbose:
            print("[t-SNE] Computing pairwise distances...")

        if self.metric == "euclidean":
            distances = pairwise_distances(X, metric=self.metric,
                                           squared=True)
        else:
            distances = pairwise_distances(X, metric=self.metric)

    # Degrees of freedom of the Student's t-distribution. The suggestion
    # alpha = n_components - 1 comes from "Learning a Parametric Embedding
    # by Preserving Local Structure" Laurens van der Maaten, 2009.
    alpha = max(self.n_components - 1.0, 1)
    n_samples = X.shape[0]
    self.training_data_ = X

    P = _joint_probabilities(distances, self.perplexity, self.verbose)
    self.P = deepcopy(P)

    if self.init == 'pca':
        pca = RandomizedPCA(n_components=self.n_components,
                            random_state=random_state)
        X_embedded = pca.fit_transform(X)
    elif self.init == 'random':
        X_embedded = None
    else:
        raise ValueError("Unsupported initialization scheme: %s"
                         % self.init)

    self.embedding_ = self._tsne(P, alpha, n_samples, random_state,
                                 X_embedded=X_embedded, c=c)
def multiQuadricKernel(X, X2=None, offset=1.0, jobs=1, *args, **kwargs):
    offset = float(offset)
    if X2 is not None:
        distanceMatrix = pairwise.pairwise_distances(X, X2, n_jobs=jobs)
    else:
        distanceMatrix = pairwise.pairwise_distances(X, n_jobs=jobs)
    result = np.sqrt(distanceMatrix**2 + offset**2)
    return result
def knn_dist(x, x_ctrl, s=100, p=1):
    x_tmp = random_subsample(x_ctrl, 200000, replace=False)
    xs = kmeans_subsample(x_tmp, s)
    if p == 1:
        min_dist = np.min(pairwise_distances(X=x, Y=xs, metric="l1"), axis=1)
    elif p == 2:
        min_dist = np.min(pairwise_distances(X=x, Y=xs, metric="l2"), axis=1)
    assert len(min_dist) == x.shape[0]
    return min_dist
def kpca_cluster(data, nclusters=100, ncomponents=40, topwhat=10,
                 zscored=False):
    '''
    Computes clustering of bag-of-words vectors of articles

    INPUT
    folder      model folder
    nclusters   number of clusters
    '''
    from sklearn.cluster import KMeans
    # filtering out some noise words
    stops = [x.lower().strip() for x in open('stopwords.txt').readlines()[6:]]

    # vectorize non-stopwords
    bow = TfidfVectorizer(min_df=2, stop_words=stops)
    X = bow.fit_transform(data)

    # creating bow-index-to-word map
    idx2word = dict(zip(bow.vocabulary_.values(), bow.vocabulary_.keys()))

    # using now stopwords and filtering out digits
    print('Computing pairwise distances')
    K = pairwise_distances(X, metric='l2', n_jobs=1)
    perc = 50.0
    width = percentile(K.flatten(), perc)

    # KPCA transform bow vectors
    Xc = KernelPCA(n_components=ncomponents, kernel='rbf',
                   gamma=width).fit_transform(X)

    if zscored:
        Xc = zscore(Xc)

    # compute clusters
    km = KMeans(n_clusters=nclusters).fit(Xc)
    Xc = km.predict(Xc)

    clusters = []
    for icluster in range(nclusters):
        nmembers = (Xc == icluster).sum()
        if True:  # nmembers < len(data) / 5.0 and nmembers > 1:
            # only group clusters big enough but not too big
            members = (Xc == icluster).nonzero()[0]
            topwordidx = array(X[members, :].sum(axis=0))[0]\
                .argsort()[-topwhat:][::-1]
            topwords = ' '.join([idx2word[wi] for wi in topwordidx])
            meanDist = triu(pairwise_distances(X[members, :], metric='l2',
                                               n_jobs=1)).sum()
            meanDist = meanDist / (len(members) +
                                   (len(members)**2 - len(members)) / 2.0)
            # print(u'Cluster %d' % icluster + u' %d members' % nmembers +
            #       u' mean Distance %f' % meanDist + u'\n\t' + topwords)
            clusters.append({
                'name': 'Cluster-%d' % icluster,
                'description': topwords,
                'members': list(members),
                'meanL2Distances': meanDist
            })

    return clusters
def cauchyKernel(X, X2=None, sigma=1.0, jobs=1, *args, **kwargs):
    # X2 now defaults to None, matching the sibling kernel helpers, so the
    # is-None branch is reachable without passing it explicitly.
    sigma = float(sigma)
    if X2 is not None:
        distanceMatrix = pairwise.pairwise_distances(X, X2, n_jobs=jobs)
    else:
        distanceMatrix = pairwise.pairwise_distances(X, n_jobs=jobs)
    result = 1 / (1 + distanceMatrix**2 / sigma**2)
    return result
def cramer_statistic(self, n_jobs=1):
    '''
    Applies the Cramer Statistic to the datasets.

    Parameters
    ----------
    n_jobs : int, optional
        Sets the number of cores to use to calculate pairwise distances
    '''
    # Adjust what we call n, m based on the larger dimension.
    # Then the looping below is valid.
    if self.data_matrix1.shape[0] >= self.data_matrix2.shape[0]:
        m = self.data_matrix1.shape[0]
        n = self.data_matrix2.shape[0]
        larger = self.data_matrix1
        smaller = self.data_matrix2
    else:
        n = self.data_matrix1.shape[0]
        m = self.data_matrix2.shape[0]
        larger = self.data_matrix2
        smaller = self.data_matrix1

    pairdist11 = pairwise_distances(
        larger, metric="euclidean", n_jobs=n_jobs)
    pairdist22 = pairwise_distances(
        smaller, metric="euclidean", n_jobs=n_jobs)
    pairdist12 = pairwise_distances(
        larger, smaller, metric="euclidean", n_jobs=n_jobs)

    term1 = 0.0
    term2 = 0.0
    term3 = 0.0

    for i in range(m):
        for j in range(n):
            term1 += pairdist12[i, j]
        for ii in range(m):
            term2 += pairdist11[i, ii]
        if i < n:
            for jj in range(n):
                term3 += pairdist22[i, jj]
    m, n = float(m), float(n)

    term1 *= (1 / (m * n))
    term2 *= (1 / (2 * m ** 2.))
    term3 *= (1 / (2 * n ** 2.))

    self.distance = (m * n / (m + n)) * (term1 - term2 - term3)

    return self
def rationalQuadraticKernel(X, X2=None, offset=1.0, jobs=1, *args, **kwargs):
    assert (offset > 0)
    offset = float(offset)
    if X2 is not None:
        distanceMatrix = pairwise.pairwise_distances(X, X2, n_jobs=jobs)
    else:
        distanceMatrix = pairwise.pairwise_distances(X, n_jobs=jobs)
    result = 1 - distanceMatrix**2 / (distanceMatrix**2 + offset)
    return result
def fit(self, X, y):
    X, y = check_X_y(X, y)
    # X, y = check_arrays(X, y, sparse_format="csr")
    # y = column_or_1d(y, warn=True)
    n_samples, n_features = X.shape

    classes = np.unique(y)
    self.classes_ = classes
    n_classes = classes.size
    if n_classes < 2:
        raise ValueError('y has fewer than 2 classes')

    if len(self.centers_) > 0:
        assert len(self.centers_[0]) == n_features

    radii = []
    count = []

    # first pass - only need the radii, because the vectors and the
    # targets are already stored
    pass_number = 0
    i = 0
    for v, t in zip(X, y):
        v = v.reshape(1, -1)
        D = pairwise_distances(v, X).ravel()
        r = max(D[y != t].min() - 1e-10, 1e-10)
        radii.append(r)
        within = D[y == t] <= r
        count.append(within.sum())
        i += 1

    radii = np.array(radii)
    count = np.array(count)

    # second pass
    for v, t in zip(X, y):
        # Go through all of the data points.
        # Select the sphere that contains that point,
        # and the largest number of other points,
        # and add it to the final spheres list.
        v = v.reshape(1, -1)
        D = pairwise_distances(v, X).ravel()
        within_centers = (D <= radii)
        matched = (t == y) & (within_centers)

        idx = np.arange(len(y))
        idx_matched = idx[matched]
        best = idx_matched[np.argmax(count[matched])]

        self._add_center(X[best], radii[best], y[best])

    pass_number += 1
def laplacianKernel(X, X2=None, sigma=1.0, cutoff=None, jobs=1,
                    *args, **kwargs):
    sigma = float(sigma)
    if X2 is not None:
        distanceMatrix = pairwise.pairwise_distances(X, X2, n_jobs=jobs)
    else:
        distanceMatrix = pairwise.pairwise_distances(X, n_jobs=jobs)
    result = np.exp(-distanceMatrix / sigma)
    if cutoff is not None:
        result[result < cutoff] = 0
    return result
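# Usage sketch (illustrative, not from the original source): the four
# kernel helpers above share one calling convention. Assumes
# `import numpy as np` and `from sklearn.metrics import pairwise` as in
# the functions' module.
import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(5, 3)
for kernel in (multiQuadricKernel, cauchyKernel,
               rationalQuadraticKernel, laplacianKernel):
    K = kernel(X)  # X2 defaults to None: distances within X
    print(kernel.__name__, K.shape)  # each yields a (5, 5) Gram matrix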
def kneighbors(X, n_neighbors, metric):
    """Finds the K-neighbors of a point. Based on sklearn.

    Returns the indices of the nearest neighbors of each sample.
    """
    if metric == 'abs_correlation':
        dist = pairwise_distances(X, metric='correlation')
        dist = np.sqrt(-np.log(np.abs(1 - dist)))
    else:
        dist = pairwise_distances(X, metric=metric)
    neigh_ind = dist.argsort(axis=1)
    neigh_ind = neigh_ind[:, :n_neighbors]
    return neigh_ind
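# Usage sketch (illustrative, not from the original source): indices of
# the 3 nearest neighbors of each row under the absolute-correlation
# distance defined above. Assumes numpy and pairwise_distances are
# imported as in the function's module.
import numpy as np

X = np.random.RandomState(0).rand(8, 4)
neigh_ind = kneighbors(X, n_neighbors=3, metric='abs_correlation')
print(neigh_ind.shape)  # (8, 3); column 0 is each point itself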
def jaccard_distance(X, Y=None, n_jobs=-1, **kwds):
    """
    Computes the Jaccard distance between all the pairs of vectors in X.
    If X is not sparse, the function defaults to
    sklearn.metrics.pairwise.pairwise_distances.
    If Y is given, distances between X and Y are computed.

    The Jaccard index is defined as the intersection / union of items in
    the vector (that is, non-sparse indices, regardless of their
    magnitude).

    Parameters:
    -----------
    X: an array (dims: samples X features)
    Y: an optional array (dims: samples_2 X features)
    n_jobs: optionally run on multiple cores
    **kwds: additional parameters to
        sklearn.metrics.pairwise.pairwise_distances

    Returns a square distance matrix. All elements are in [0, 1].
    One may use 1-R for similarity (where R is the return value).

    Examples:
    >>> from scipy.sparse import csr_matrix
    >>> from numpy import matrix
    >>> d = matrix([[1, 1, 1, 0, 0], [0, 1, 1, 0, 1]])
    >>> s = csr_matrix(d)
    >>> jaccard_distance(s)
    matrix([[ 0. ,  0.5],
            [ 0.5,  0. ]])
    >>> jaccard_distance(s, [1, 1, 1, 0, 1])
    matrix([[ 0.25],
            [ 0.25]])
    """
    if Y is None:
        Y = X
    if sparse.issparse(X):
        if not sparse.issparse(Y):
            Y = sparse.csr_matrix(Y)
        mmx = (X != 0)
        mmy = (Y != 0)
        mx = sparse.csr_matrix((np.ones_like(mmx.data, dtype=np.double),
                                mmx.indices, mmx.indptr), shape=mmx.shape)
        if X is Y:
            my = mx
        else:
            my = sparse.csr_matrix((np.ones_like(mmy.data, dtype=np.double),
                                    mmy.indices, mmy.indptr),
                                   shape=mmy.shape)
        m_int = mx * my.T
        m_uni = pairwise_distances(mx, my, metric='manhattan',
                                   n_jobs=n_jobs, **kwds)
        m_uni += m_int
        return 1.0 - (m_int / m_uni)
    else:
        return pairwise_distances(X, metric='jaccard', n_jobs=n_jobs,
                                  **kwds)
def find_reference(raw, n_cluster, pick_types=None, copy=True,
                   flat_threshold=1e-15, n_split=100, plot=True):
    """
    Computes covariance on splits of the raw data, and applies KMeans
    clustering to find the number of disjoint references.
    n_cluster is found with PCA if float.
    """
    import matplotlib.pyplot as plt
    from pyriemann.estimation import Covariances
    from sklearn.cluster import KMeans
    from sklearn.metrics.pairwise import pairwise_distances

    if copy:
        raw = raw.copy()

    # Remove flat lines
    flat = np.where(np.std(raw._data, axis=1) < flat_threshold)[0]
    for ch in flat:
        raw.info['bads'] += [raw.ch_names[ch]]

    # Pick data channels only
    if pick_types is None:
        pick_types = dict(seeg=True, exclude='bads')
    raw.pick_types(**pick_types)

    # Compute covariance on data splits
    n_time = len(raw.times)
    t_max = raw.times[n_time - n_time % n_split - 1]
    raw.crop(0, t_max, copy=False)  # ensure regularly sized splits
    X = np.array(np.array_split(raw._data, n_split, axis=1))
    covs = Covariances().fit_transform(X)

    # Compute cluster for each data split
    cluster = KMeans(n_cluster)
    all_kmeans = list()
    for cov in covs:
        dist = pairwise_distances(cov)
        all_kmeans.append(cluster.fit_predict(dist))

    # Combine clusters
    dist = pairwise_distances(np.array(all_kmeans).T)
    idx = cluster.fit_predict(dist)

    if plot:
        idx_ = np.argsort(idx)
        cov = np.median(covs, axis=0)
        plt.matshow(np.log10(cov)[idx_, :][:, idx_])

    clusters = [np.array(raw.ch_names)[idx == ii] for ii in np.unique(idx)]
    return clusters
def test_neighbors_accuracy_with_n_estimators():
    # Checks whether accuracy increases as `n_estimators` increases.
    n_estimators = np.array([1, 10, 100])
    n_samples = 100
    n_features = 10
    n_iter = 10
    n_points = 5
    rng = np.random.RandomState(42)
    accuracies = np.zeros(n_estimators.shape[0], dtype=float)
    X = rng.rand(n_samples, n_features)

    for i, t in enumerate(n_estimators):
        lshf = ignore_warnings(LSHForest, category=DeprecationWarning)(
            n_candidates=500, n_estimators=t)
        ignore_warnings(lshf.fit)(X)
        for j in range(n_iter):
            query = X[rng.randint(0, n_samples)].reshape(1, -1)
            neighbors = lshf.kneighbors(query, n_neighbors=n_points,
                                        return_distance=False)
            distances = pairwise_distances(query, X, metric='cosine')
            ranks = np.argsort(distances)[0, :n_points]

            intersection = np.intersect1d(ranks, neighbors).shape[0]
            ratio = intersection / float(n_points)
            accuracies[i] = accuracies[i] + ratio

        accuracies[i] = accuracies[i] / float(n_iter)
    # Accuracies should be non-decreasing as n_estimators grows
    assert_true(np.all(np.diff(accuracies) >= 0),
                msg="Accuracies are not non-decreasing.")
    # Highest accuracy should be strictly greater than the lowest
    assert_true(np.ptp(accuracies) > 0,
                msg="Highest accuracy is not strictly greater than lowest.")
def fuzzy_c_means(points, num_centers, m=2., tol=1e-4, max_iter=100,
                  verbose=False):
    '''Uses Fuzzy C-Means to downsample `points`.

    m : aggregation parameter >1, larger implies smoother clusters

    Returns indices of downsampled points.
    '''
    num_points = points.shape[0]
    if num_centers >= num_points:
        return np.arange(num_points)
    # randomly initialize cluster assignments matrix
    assn = np.random.random((points.shape[0], num_centers))
    # iterate assignments until they converge
    for i in range(max_iter):
        # compute centers
        w = assn ** m
        w /= w.sum(axis=0)
        centers = w.T.dot(points)
        # calculate new assignments
        d = pairwise_distances(points, centers)
        d **= 2. / (m - 1)
        np.maximum(d, 1e-10, out=d)
        new_assn = 1. / np.einsum('ik,ij->ik', d, 1. / d)
        # check for convergence
        change = np.linalg.norm(new_assn - assn)
        if verbose:
            print('At iteration %d: change = %g' % (i + 1, change))
        if change < tol:
            break
        assn = new_assn
    else:
        warnings.warn("fuzzy_c_means didn't converge in %d iterations"
                      % max_iter)
    # find points closest to the selected cluster centers
    return d.argmin(axis=0)
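# Usage sketch (illustrative, not from the original source): downsample
# 200 random 2-D points to roughly 10 representatives. Assumes numpy,
# warnings, and pairwise_distances are imported as in the function's
# module.
import numpy as np

points = np.random.RandomState(0).rand(200, 2)
idx = fuzzy_c_means(points, num_centers=10)
print(idx.shape)  # (10,) indices into `points`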
def fh_dist_lines(li1, li2):
    """
    Compute a cheap distance (based on hausdorff-distance) between
    *li1* and *li2*, two LineString.

    Parameters
    ----------
    li1: shapely.geometry.LineString
    li2: shapely.geometry.LineString

    Returns
    -------
    max_dist: Float of the distance between li1 and li2.
    """
    coord_li1 = np.array([i for i in zip(li1.coords.xy[0],
                                         li1.coords.xy[1])])
    coord_li2 = np.array([i for i in zip(li2.coords.xy[0],
                                         li2.coords.xy[1])])
    # swap so that coord_li1 is the shorter of the two coordinate arrays
    # (the original compared len(coord_li2) against itself)
    if len(coord_li1) > len(coord_li2):
        coord_li1, coord_li2 = coord_li2, coord_li1
    dist_mat = pairwise_distances(
        coord_li1, coord_li2, metric='euclidean', n_jobs=2)
    chkl = round(len(coord_li1) / len(coord_li2))
    return max(
        [dist_mat[i, j] for i, j in zip(
            list(range(len(coord_li1))),
            list(nrepeat(range(len(coord_li2)), chkl))[:len(coord_li1)])]
    )
def _fit(self, X, skip_num_points=0):
    """Fit the model using X as training data.

    Note that sparse arrays can only be handled by method='exact'.
    It is recommended that you convert your sparse array to dense
    (e.g. `X.toarray()`) if it fits in memory, or otherwise using a
    dimensionality reduction technique (e.g. TruncatedSVD).

    Parameters
    ----------
    X : array, shape (n_samples, n_features) or (n_samples, n_samples)
        If the metric is 'precomputed' X must be a square distance
        matrix. Otherwise it contains a sample per row. Note that when
        method='barnes_hut', X cannot be a sparse array and if need be
        will be converted to a 32 bit float array. Method='exact' allows
        sparse arrays and 64bit floating point inputs.

    skip_num_points : int (optional, default:0)
        This does not compute the gradient for points with indices below
        `skip_num_points`. This is useful when computing transforms of
        new data where you'd like to keep the old data fixed.
    """
    if self.method not in ['barnes_hut', 'exact']:
        raise ValueError("'method' must be 'barnes_hut' or 'exact'")
    if self.angle < 0.0 or self.angle > 1.0:
        raise ValueError("'angle' must be between 0.0 - 1.0")
    if self.method == 'barnes_hut' and sp.issparse(X):
        raise TypeError('A sparse matrix was passed, but dense '
                        'data is required for method="barnes_hut". Use '
                        'X.toarray() to convert to a dense numpy array if '
                        'the array is small enough for it to fit in '
                        'memory. Otherwise consider dimensionality '
                        'reduction techniques (e.g. TruncatedSVD)')
    else:
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'],
                        dtype=np.float64)
    random_state = check_random_state(self.random_state)

    if self.early_exaggeration < 1.0:
        raise ValueError("early_exaggeration must be at least 1, but is "
                         "%f" % self.early_exaggeration)

    if self.n_iter < 200:
        raise ValueError("n_iter should be at least 200")

    if self.metric == "precomputed":
        if isinstance(self.init, string_types) and self.init == 'pca':
            raise ValueError("The parameter init=\"pca\" cannot be used "
                             "with metric=\"precomputed\".")
        if X.shape[0] != X.shape[1]:
            raise ValueError("X should be a square distance matrix")
        distances = X
    else:
        if self.verbose:
            print("[t-SNE] Computing pairwise distances...")

        if self.metric == "euclidean":
            distances = pairwise_distances(X, metric=self.metric,
                                           squared=True)
        else:
            distances = pairwise_distances(X, metric=self.metric)

        if not np.all(distances >= 0):
            raise ValueError("All distances should be positive, either "
                             "the metric or precomputed distances given "
                             "as X are not correct")

    # Degrees of freedom of the Student's t-distribution. The suggestion
    # degrees_of_freedom = n_components - 1 comes from
    # "Learning a Parametric Embedding by Preserving Local Structure"
    # Laurens van der Maaten, 2009.
    degrees_of_freedom = max(self.n_components - 1.0, 1)
    n_samples = X.shape[0]
    # the number of nearest neighbors to find
    k = min(n_samples - 1, int(3. * self.perplexity + 1))

    neighbors_nn = None
    if self.method == 'barnes_hut':
        if self.verbose:
            print("[t-SNE] Computing %i nearest neighbors..." % k)
        if self.metric == 'precomputed':
            # Use the precomputed distances to find
            # the k nearest neighbors and their distances
            neighbors_nn = np.argsort(distances, axis=1)[:, :k]
        elif self.rho >= 1:
            # Find the nearest neighbors for every point
            bt = BallTree(X)
            # LvdM uses 3 * perplexity as the number of neighbors,
            # and we add one to not count the data point itself.
            # In the event that we have a very small number of points,
            # set the neighbors to n - 1.
            distances_nn, neighbors_nn = bt.query(X, k=k + 1)
            neighbors_nn = neighbors_nn[:, 1:]
        elif self.rho < 1:
            # Use pyFLANN to find the nearest neighbors
            myflann = FLANN()
            testset = X
            params = myflann.build_index(testset, algorithm="autotuned",
                                         target_precision=self.rho,
                                         log_level='info')
            neighbors_nn, distances = myflann.nn_index(
                testset, k + 1, checks=params["checks"])
            neighbors_nn = neighbors_nn[:, 1:]
        P = _joint_probabilities_nn(distances, neighbors_nn,
                                    self.perplexity, self.verbose)
    else:
        P = _joint_probabilities(distances, self.perplexity, self.verbose)
    assert np.all(np.isfinite(P)), "All probabilities should be finite"
    assert np.all(P >= 0), "All probabilities should be zero or positive"
    assert np.all(P <= 1), ("All probabilities should be less "
                            "than or equal to one")

    if isinstance(self.init, np.ndarray):
        X_embedded = self.init
    elif self.init == 'pca':
        pca = PCA(n_components=self.n_components, svd_solver='randomized',
                  random_state=random_state)
        X_embedded = pca.fit_transform(X)
    elif self.init == 'random':
        X_embedded = None
    else:
        raise ValueError("Unsupported initialization scheme: %s"
                         % self.init)

    return self._tsne(P, degrees_of_freedom, n_samples, random_state,
                      X_embedded=X_embedded,
                      neighbors=neighbors_nn,
                      skip_num_points=skip_num_points)
def build_all():
    # For each class, we build all the trees and save them in CSVs
    nar_trees = return_trees_from_merge('~/Documents/s2/tal/discourseAnalysis/data/narrative')
    write_tree_in_csv(nar_trees)
    arg_trees = return_trees_from_merge('~/Documents/s2/tal/discourseAnalysis/data/argumentative/')
    write_tree_in_csv(arg_trees)
    inf_trees = return_trees_from_merge('~/Documents/s2/tal/discourseAnalysis/data/informative/')
    write_tree_in_csv(inf_trees)
    des_trees = []
    #des_trees = return_trees_from_merge('~/Documents/s2/tal/discourseAnalysis/data/informative/')
    #write_tree_in_csv(des_trees)

    # Note: contains pairs of (tree, tree_ID), where tree_ID is the file name.
    all_trees = nar_trees + arg_trees + inf_trees + des_trees
    int2cl = {0: 'narrative', 1: 'argumentative', 2: 'informative',
              3: 'descriptive'}
    path_to_save = '~/Documents/s2/tal/discourseAnalysis/data/'

    y_nar = [0 for t in nar_trees]
    y_arg = [1 for t in arg_trees]
    y_inf = [2 for t in inf_trees]
    y_des = [3 for t in des_trees]
    y = np.array(y_nar + y_arg + y_inf + y_des)
    pickle.dump(y, open(path_to_save + 'labels_test.pkl', 'wb'))

    T = [t[0] for t in all_trees]
    pickle.dump(T, open(path_to_save + 'trees_test.pkl', 'wb'))

    index = ['bin', 'count', 'norm', 'height', 'tfid']

    # Dicts
    D_bin = vectorizers.build_bin_vects(T)
    D_count = vectorizers.build_count_vects(T)
    D_norm = vectorizers.build_norm_vects(T)
    D_height = vectorizers.build_height_vects(T)
    D_tfid = vectorizers.build_tfid_vects(T)

    D_df = pd.DataFrame([D_bin, D_count, D_norm, D_height, D_tfid],
                        index=index)
    D_df = D_df.transpose()
    D_df.to_pickle(path_to_save + 'dicts_test.pkl')

    # Vects
    vectorizer = feature_extraction.DictVectorizer(sparse=False)
    V_bin = vectorizer.fit_transform(D_bin)
    V_count = vectorizer.fit_transform(D_count)
    V_norm = vectorizer.fit_transform(D_norm)
    V_height = vectorizer.fit_transform(D_height)
    V_tfid = vectorizer.fit_transform(D_tfid)

    V_all = np.zeros((len(index), V_bin.shape[0], V_bin.shape[1]))
    V_all = np.array([V_bin, V_count, V_norm, V_height, V_tfid])

    V_df = []
    for i in range(V_all.shape[1]):
        d = {}
        for j, v in enumerate(V_all[:, i]):
            d[index[j]] = v
        V_df.append(d)
    V_df = pd.DataFrame(V_df)
    V_df.to_pickle(path_to_save + 'vects_test.pkl')

    # euclidean distance
    K_bin_eucl_dist = pairwise.pairwise_distances(V_bin, metric='euclidean')
    K_count_eucl_dist = pairwise.pairwise_distances(V_count,
                                                    metric='euclidean')
    K_norm_eucl_dist = pairwise.pairwise_distances(V_norm,
                                                   metric='euclidean')
    K_height_eucl_dist = pairwise.pairwise_distances(V_height,
                                                     metric='euclidean')
    K_tfid_eucl_dist = pairwise.pairwise_distances(V_tfid,
                                                   metric='euclidean')
    K_all_eucl_dist = [K_bin_eucl_dist, K_count_eucl_dist, K_norm_eucl_dist,
                       K_height_eucl_dist, K_tfid_eucl_dist]
    K_all = {'eucl_dist': K_all_eucl_dist}
    pickle.dump(K_all, open(path_to_save + 'kernels_test.pkl', 'wb'))
def _convert_to_similarity_matrix(self, embeddings, metric=None):
    # if self.metric.lower() == 'euclidean':
    if metric is None:
        metric = self.metric.lower()
    # return squareform(pdist(embeddings, metric=metric))
    return pairwise_distances(embeddings, metric=metric)
ratings = pd.DataFrame(df_train.groupby('title')['rating'].mean())
ratings['number_of_ratings'] = df_train.groupby('title')['rating'].count()
movie_matrix = df_train.pivot_table(index='user_id', columns='title',
                                    values='rating')

# Calculate similarity
train_data_matrix = np.zeros((n_users, n_items))
for row in df_train.itertuples():
    train_data_matrix[row[1] - 1, row[2] - 1] = row[3]
test_data_matrix = np.zeros((n_users, n_items))
for line in df_test.itertuples():
    test_data_matrix[line[1] - 1, line[2] - 1] = line[3]
user_similarity = pairwise_distances(train_data_matrix, metric="cosine")
item_similarity = pairwise_distances(train_data_matrix.T, metric="cosine")

# Calculate popularity
item_popular = {}
for i in range(n_items):
    if np.sum(train_data_matrix[:, i]) != 0:
        item_popular[i] = np.sum(train_data_matrix[:, i] != 0)
item_count = len(item_popular)

# Similarity compensates for each person's rating habits and the
# popularity of movies
rate = train_data_matrix.mean(axis=1)
rate2 = (train_data_matrix - rate[:, np.newaxis])
pred_user = rate[:, np.newaxis] + user_similarity.dot(rate2) / np.array(
    [np.abs(user_similarity).sum(axis=1)]).T
# The original line was truncated after `np.array(`; the completion below
# mirrors the normalization used for pred_user above.
pred_item = train_data_matrix.dot(item_similarity) / np.array(
    [np.abs(item_similarity).sum(axis=1)])
resultfile.close() header1 = ['user_id', 'item_id', 'rating'] df = pd.read_csv('D://recommendation//new//movielens1M.txt', sep='\t', names=header1,index_col=False) n_users = df.user_id.unique().shape[0] n_items = df.item_id.unique().shape[0] print ('Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_items)) from sklearn import cross_validation as cv dataset,_ = cv.train_test_split(df, test_size=0) originalMatrix = np.zeros((n_users, n_items)) for line in dataset.itertuples(): originalMatrix[int(line[1])-1, int(line[2])-1] = line[3] from sklearn.metrics.pairwise import pairwise_distances cosine_distance = pairwise_distances(originalMatrix.T, metric='cosine') sim = 1-cosine_distance #based on similarity label = np.array( [ 5,16,9,12,12,16,7,15,19,16,7,12,15,6,19,1,1,9,9,9,5,1,19,9,1,8,15,8,6,2,12,5,10,5,8,1,10,15,5,2,6,9,8,9,1,6,5,12,2,5,17,1,10,15,8,10,6,6,10,9,19,1,15,15,12,15,2,2,9,16,19,8,8,15,10,19,2,8,19,2,6,8,2,2,6,19,15,12,9,10,4,19,12,6,16,13,2,2,2,6,6,12,19,7,6,10,12,2,1,5,11,16,8,13,15,2,8,15,2,10,2,9,8,2,6,15,10,10,2,10,2,19,10,10,12,2,13,13,10,12,7,3,12,6,9,2,6,8,8,5,1,8,16,4,8,8,12,12,6,16,16,6,16,6,16,8,10,9,15,9,8,9,16,12,6,6,0,2,8,9,12,8,8,8,16,12,8,0,15,8,19,10,9,6,12,16,2,16,17,2,8,2,12,9,8,8,12,16,15,2,2,10,2,2,6,9,15,12,2,18,13,6,1,1,9,10,19,2,6,1,7,6,6,12,5,9,9,15,12,15,15,2,10,15,2,6,6,12,6,15,15,9,16,2,15,9,19,15,19,5,6,8,2,2,6,1,12,8,8,15,8,6,9,15,12,12,19,8,8,9,6,6,10,8,2,10,2,16,12,8,15,16,1,8,15,5,15,2,8,1,2,8,16,15,6,6,6,6,2,10,10,12,15,6,9,16,7,5,6,8,2,6,15,2,15,8,9,0,16,0,2,0,7,8,8,10,1,9,7,19,2,1,15,7,6,6,8,6,16,1,12,8,16,19,9,5,5,19,2,12,9,12,2,5,8,0,5,16,8,9,9,1,6,12,8,9,5,12,16,5,6,9,9,19,13,2,19,8,2,2,15,2,19,15,2,10,10,10,10,2,3,10,15,10,19,10,0,15,15,7,15,6,9,12,9,19,6,15,12,9,15,19,9,19,8,0,19,6,12,15,1,7,15,16,16,9,12,12,10,5,1,16,2,8,19,6,8,6,15,9,8,2,15,5,9,8,5,19,19,15,8,2,19,9,8,16,15,6,8,15,1,8,12,5,6,2,6,15,19,5,1,19,8,15,16,15,15,8,12,9,9,6,19,16,2,2,1,2,19,7,8,12,13,19,12,8,9,1,1,15,19,0,19,9,6,9,9,9,19,7,6,8,8,9,2,2,5,19,1,13,6,9,9,6,6,19,6,6,7,9,5,12,7,19,10,9,19,9,8,12,5,16,16,2,1,8,10,12,10,10,2,1,13,15,8,13,2,2,12,2,8,10,10,6,12,10,15,10,3,8,8,10,2,13,7,7,5,5,5,5,10,5,5,7,7,7,7,10,4,8,3,2,15,15,12,0,13,5,15,16,0,19,8,2,2,12,2,13,15,2,8,19,10,10,4,15,6,1,13,13,15,2,2,10,8,10,12,13,8,19,10,10,3,3,2,14,1,16,10,8,3,2,16,10,10,15,10,10,2,3,9,9,9,13,2,13,15,2,2,2,9,3,9,14,9,8,2,6,13,17,2,10,17,10,2,8,10,12,15,10,15,19,15,19,19,10,19,10,9,12,13,2,10,15,2,10,14,7,12,15,15,13,18,8,2,15,10,2,9,6,14,2,17,9,19,13,10,6,10,10,8,13,16,10,18,16,9,19,2,2,19,0,12,10,6,17,15,16,13,11,2,18,2,10,13,10,10,10,2,2,19,9,13,8,12,6,2,9,10,10,10,18,9,10,13,10,19,1,15,16,8,9,9,7,1,16,10,16,3,10,10,10,10,2,15,13,2,9,9,11,12,7,8,12,9,8,10,15,19,12,13,2,15,10,10,17,4,12,17,13,10,4,10,2,13,10,13,15,12,7,2,16,12,10,19,9,12,6,19,15,18,0,10,10,10,2,15,6,16,2,6,7,10,2,15,2,2,5,3,13,9,2,10,10,10,1,15,10,8,15,15,10,3,13,15,10,2,10,0,9,15,19,10,13,15,19,10,15,10,1,0,8,8,10,3,2,17,4,11,4,4,11,11,11,4,4,17,11,4,11,4,11,11,11,4,4,8,17,5,11,4,4,11,5,13,4,17,4,17,4,4,17,4,14,17,17,17,4,13,14,17,4,17,4,4,17,17,4,4,4,4,4,11,4,4,2,2,13,17,13,2,13,15,17,4,13,2,14,11,17,4,13,17,13,10,17,3,17,18,10,10,17,10,8,10,12,8,8,10,19,6,15,2,6,18,9,8,19,9,15,18,10,9,19,12,19,14,17,14,7,12,14,7,12,12,12,14,12,14,7,12,7,14,12,9,13,7,5,7,9,7,7,12,6,11,5,16,13,10,13,6,7,19,2,8,8,16,19,9,6,6,2,8,8,19,8,6,10,1,1,1,10,2,12,4,17,13,17,17,13,13,14,5,4,8,13,11,4,5,11,11,4,14,11,17,4,2,7,5,5,7,5,1,11,1,4,5,13,4,9,16,10,11,4,18,17,10,17,16,4,8,8,15,8,3,2,13,3,2,1,2,9,2,11,11,9,5,0,16,0,4,4,10,
10,7,5,9,10,10,1,2,3,18,2,3,14,6,6,10,4,2,2,13,13,19,19,19,1,14,13,4,17,10,10,10,10,12,18,2,10,6,4,6,10,6,8,6,4,11,2,2,2,1,8,4,11,6,6,4,6,6,8,11,14,0,5,5,5,11,5,11,17,11,11,19,5,11,11,4,5,4,4,5,5,16,8,4,6,11,5,11,5,6,4,11,4,4,11,7,11,11,2,11,11,11,8,4,4,8,5,18,5,6,11,6,5,11,4,1,11,4,11,14,4,18,11,7,5,5,4,14,11,11,4,5,5,11,1,4,5,1,11,8,9,16,11,6,5,6,8,4,11,4,4,5,14,11,11,4,6,5,11,11,11,4,4,7,1,11,4,14,5,4,11,4,8,5,2,2,2,10,2,10,19,10,10,2,15,8,16,11,18,18,18,18,0,0,18,18,0,18,0,11,14,18,18,18,12,16,14,18,0,1,4,11,14,16,4,10,14,8,19,12,6,19,16,1,1,12,10,8,17,12,2,8,8,12,8,2,16,16,16,16,5,16,16,16,16,9,7,9,19,10,13,16,3,5,16,0,12,16,6,5,5,11,5,8,15,8,19,6,16,15,8,7,2,16,16,7,19,6,2,2,6,2,6,2,15,17,10,10,9,10,10,12,15,19,2,9,10,9,19,10,10,8,13,2,9,15,19,7,8,2,10,12,8,8,8,1,13,19,0,12,8,2,15,12,8,9,12,9,0,15,6,8,1,2,19,18,15,10,8,15,12,8,7,2,17,16,6,14,8,6,8,7,10,12,9,15,15,19,17,2,10,15,10,15,8,16,5,8,10,12,8,17,19,8,8,10,13,2,18,1,13,9,2,5,9,2,10,15,10,15,1,10,12,16,10,8,8,13,10,2,13,8,14,6,10,10,0,12,6,2,16,2,2,2,13,2,12,15,16,10,8,10,9,9,3,19,15,10,16,2,2,15,12,2,2,7,2,2,2,16,10,2,6,0,8,10,5,15,15,10,5,2,16,16,12,1,16,16,15,12,6,15,2,16,19,15,8,19,15,16,15,19,19,17,16,15,5,6,8,2,7,9,16,5,2,9,9,19,13,0,2,1,19,19,10,12,3,8,15,6,17,1,10,8,10,1,15,5,19,6,9,16,15,19,6,8,8,13,10,5,15,0,2,10,6,10,6,19,2,5,10,12,2,19,8,2,8,2,1,1,11,2,16,8,1,15,6,19,5,6,2,13,19,9,12,12,16,10,10,1,6,2,8,8,19,8,13,6,12,15,5,6,7,12,16,10,15,1,2,9,10,2,19,9,10,6,10,1,16,2,10,2,19,6,8,1,6,15,5,8,8,6,15,17,17,15,17,10,19,10,19,9,15,5,16,13,15,10,9,12,9,2,18,9,8,2,12,15,0,19,3,15,15,8,9,9,2,18,12,10,8,18,2,7,14,16,10,14,10,19,5,19,18,10,8,18,0,8,16,2,8,10,10,8,19,9,10,9,18,11,19,1,12,10,2,2,1,2,2,13,2,10,8,10,18,10,13,12,15,10,19,15,15,8,10,2,10,16,10,9,1,9,6,15,7,15,6,19,10,10,2,1,2,2,12,10,10,10,3,2,15,12,8,2,19,2,2,10,0,12,2,2,9,15,10,19,15,10,13,19,10,6,16,10,10,10,2,15,16,1,1,1,15,15,12,8,13,10,9,2,9,12,8,8,2,10,2,10,2,2,8,10,2,9,3,16,2,12,1,4,6,10,6,16,16,15,9,1,2,5,14,17,17,4,17,17,13,4,17,17,17,17,17,17,17,4,17,17,17,17,4,4,4,11,4,4,4,4,11,11,5,11,11,11,11,4,11,5,11,4,4,11,6,7,5,0,0,0,0,0,0,0,0,0,0,0,0,0,14,0,0,0,0,0,0,18,18,0,0,0,5,0,0,11,0,0,5,16,16,5,9,7,16,15,13,14,4,5,16,14,12,14,12,12,7,11,11,16,6,1,6,6,0,12,5,8,10,15,15,12,14,15,12,15,12,10,12,15,12,14,19,15,9,15,12,2,12,12,12,9,5,15,10,15,16,12,9,10,2,10,11,11,17,11,17,4,4,6,9,2,17,17,11,12,7,2,7,7,9,12,15,7,15,7,7,12,12,12,12,12,16,12,7,14,17,14,5,2,14,15,8,16,12,0,5,7,11,7,1,18,1,5,14,14,14,0,0,0,0,12,7,9,16,10,2,10,4,17,4,7,7,14,14,7,14,9,16,7,12,9,7,7,7,19,0,0,11,19,15,9,19,6,10,2,10,6,11,7,12,14,2,6,19,16,8,12,15,8,10,17,5,13,4,4,4,17,17,17,17,4,4,17,4,17,19,2,19,13,10,16,5,12,15,10,10,10,13,2,4,4,17,2,4,13,17,13,17,17,17,10,13,13,17,10,10,13,7,13,8,13,15,10,10,17,0,4,10,1,19,3,10,3,19,2,17,17,14,14,13,11,10,7,2,11,1,9,8,10,9,9,10,19,18,15,10,14,15,12,14,9,19,12,12,8,5,9,4,8,8,16,10,2,8,3,16,0,8,10,6,8,2,17,15,14,14,11,4,5,15,2,7,8,12,9,15,13,11,7,5,4,10,2,15,15,10,10,8,14,11,11,8,0,9,13,6,2,9,5,9,2,1,19,2,0,0,1,2,6,10,6,9,9,1,8,0,15,9,8,10,10,14,17,14,19,6,4,15,17,11,16,12,5,8,8,10,1,8,4,17,14,0,19,11,14,0,1,14,7,9,9,7,7,16,0,7,9,9,12,12,12,6,12,15,9,10,0,6,1,15,16,9,1,5,8,11,12,15,14,16,16,9,16,5,5,9,16,16,9,9,7,14,8,7,8,6,8,7,16,9,7,7,8,2,1,9,9,14,9,12,1,2,8,8,8,10,6,8,2,8,8,13,19,19,9,0,15,16,0,10,12,14,5,0,9,12,0,0,18,18,7,10,18,10,14,7,7,5,7,9,9,1,14,14,15,7,8,10,2,2,2,3,12,13,10,17,8,16,15,2,2,2,18,9,19,15,2,12,1,1,10,12,9,15,2,19,15,10,17,2,0,0,18,18,0,7,18,14,15,15,19,14,0,18,14,16,5,14,14,14,14,18,14,18,15,10,7,19,9,1,10,2,13,15,2,0,19,14,14,1
5,14,18,15,10,10,12,15,19,19,10,10,9,4,15,9,19,10,8,5,9,2,12,8,10,8,10,2,1,9,15,6,3,2,2,19,6,8,8,10,10,2,8,2,19,15,9,1,6,10,10,19,0,9,19,2,10,2,2,6,4,14,18,9,16,16,2,10,2,8,12,10,2,19,2,10,16,15,2,13,10,14,18,18,18,18,18,6,5,16,16,9,14,18,18,18,14,18,18,18,18,18,18,18,18,5,10,8,17,18,14,18,14,18,18,18,0,17,17,7,9,10,2,2,19,8,10,10,19,2,8,7,10,10,6,12,9,2,6,2,1,6,9,10,8,2,19,16,1,9,6,10,2,10,1,9,2,12,1,2,1,9,15,8,5,7,9,9,12,2,9,16,12,8,17,2,11,4,4,17,17,19,14,16,14,15,8,11,18,19,3,2,2,14,5,14,19,8,4,8,14,8,18,8,15,8,2,6,10,9,5,16,14,10,2,8,2,8,1,15,19,10,2,10,10,10,19,11,18,18,18,13,18,18,12,0,11,0,0,5,7,0,7,7,9,5,12,12,15,8,9,9,5,12,19,15,9,15,2,13,2,2,13,9,19,19,15,4,17,13,9,10,2,10,9,19,15,19,13,10,9,10,2,19,19,2,2,10,19,9,2,2,8,10,12,17,17,8,2,14,4,18,10,18,2,11,5,4,12,19,19,11,2,2,4,0,0,10,4,11,11,2,17,6,15,14,18,19,15,9,15,6,15,8,10,10,15,12,5,8,2,2,2,10,17,2,0,17,0,0,0,0,13,2,19,15,6,10,3,13,6,10,10,5,5,11,5,4,17,14,14,2,2,17,11,17,19,4,10,10,4,13,10,17,17,17,2,17,17,4,7,8,11,8,4,11,11,11,7,14,6,9,13,15,14,2,10,1,10,19,10,2,2,15,6,17,5,17,17,4,2,4,0,15,6,15,15,15,2,10,18,17,10,5,16,5,4,16,16,16,18,14,2,0,8,5,2,10,6,10,2,10,15,9,1,6,2,2,2,4,10,18,15,14,0,0,0,11,16,18,4,14,0,10,10,10,4,14,17,15,14,16,9,4,14,11,17,5,7,19,19,15,1,6,2,2,12,17,10,8,1,19,15,8,10,10,18,0,1,4,14,19,15,10,14,4,11,17,14,1,11,17,14,17,17,2,8,8,2,1,16,6,10,10,17,7,4,4,4,17,10,4,8,4,17,17,11,4,1,1,14,8,11,1,2,16,5,17,17,4,8,9,1,8,10,2,2,10,12,13,17,10,10,8,13,8,8,6,19,10,17,2,17,4,10,2,8,13,10,8,19,17,17,6,12,1,1,10,2,10,4,14,13,19,12,12,2,12,1,2,2,6,10,19,10,4,11,14,18,17,10,9,1,5,1,15,9,2,15,6,8,2,13,8,6,10,2,19,19,19,2,15,4,2,4,14,4,17,4,4,10,14,14,10,19,13,9,10,5,8,3,12,8,13,10,17,8,9,3,2,17,2,2,15,13,15,10,13,17,17,10,10,17,19,10,10,19,2,2,2,10,7,4,10,1,7,12,9,9,4,1,5,7,5,16,16,7,9,6,1,6,7,7,19,8,1,12,9,12,1,6,9,16,19,10,10,4,4,10,2,10,2,17,19,15,15,10,2,10,10,13,10,10,10,4,10,6,15,9,9,2,10,17,10,13,4,11,13,17,17,3,15,13,10,19,6,2,2,2,3,3,10,19,15,15,8,6,2,4,8,2,13,4,10,2,13,2,18,18,17,4,8,10,2,10,4,6,2,17,18,10,10,9,19,10,2,1,11,11,5,11,11,17,4,14,17,17,10,8,17,17,10,13,17,10,13,13,17,13,10,10,10,17,9,1,9,7,15,15,12,12,15,9,19,7,7,7,15,15,19,2,19,19,17,17,3,1,19,10,2,14,13,17,10,17,17,5,13,14,5,6,6,11,8,6,17,15,6,14,19,19,19,15,4,12,10,16,12,12,16,19,10,9,4,2,17,5,8,7,11,19,15,15,2,2,8,10,0,10,14,4,10,10,8,12,4,4,4,17,5,10,2,2,17,1,12,7,16,6,1,10,15,19,10,10,17,18,16,18,15,13,3,14,2,8,19,4,5,6,14,19,17,11,14,14,11,14,2,9,15,6,19,2,8,17,13,8,14,15,6,13,2,7,7,5,16,6,14,10,2,13,10,19,6,6,2,2,2,10,13,13,11,19,4,4,8,4,4,14,11,5,15,2,9,6,19,10,17,6,15,2,10,15,15,6,2,2,10,10,8,18,18,18,10,14,8,5,2,2,2,2,6,19,10,2,18,17,19,19,7,10,19,15,10,15,10,8,17,17,10,2,10,17,17,17,10,5,10,2,10,2,17,7,19,15,19,6,19,8,10,17,16,9,10,2,17,17,4,6,10,17,14,17,14,10,2,16,14,13,13,13,17,13,2,15,10,10,14,17,10,10,17,14,17,10,10,18,18,0,18,18,18,18,18,10,15,4,12,13,5,12,15,15,4,4,2,17,2,2,14,14,11,11,11,16,3,7,12,12,15,18,0,18,18,18,16,16,5,4,16,16,16,16,9,11 ] )
def Euc_to_fst(vector_lib, n_comp=5, pop_max=8, Iter=20,
               bias_range=[20, 300], Eigen=False, Scale=False, Centre=True):
    ### Select pre- and post-processing measures.
    length_haps = vector_lib.shape[1]
    print('length haps: {}, N iterations: {}, range pops: {}'.format(
        length_haps, Iter, pop_max))

    #### Predict
    predicted = []

    #def controled_fsts(vector_lib,Eigen,length_haps,Scale,Center,N_pops,n_comp,Iter,N_sims,MixL,MixP,Pairs):
    lengths_vector = []

    ### store distances between centroids
    biased_pairwise = []

    ### store PC projection:
    dist_PC_corrected = {x: [] for x in range(n_comp)}

    ### store fsts
    fst_store = []

    ### proceed.
    for rep in range(Iter):
        N_pops = np.random.choice(range(3, pop_max), 1, replace=False)[0]

        ## Population sizes and labels
        bias_scheme = np.random.choice(range(bias_range[0], bias_range[1]),
                                       N_pops, replace=False)
        bias_labels = np.repeat(np.array([x for x in range(N_pops)]),
                                bias_scheme)

        ### triangular matrices extract.
        iu1 = np.triu_indices(N_pops, 1)  # for centroid comparison
        iu_bias = np.triu_indices(sum(bias_scheme), 1)
        iu_control = np.triu_indices(2, 1)

        Pops = np.random.choice(vector_lib.shape[0], N_pops, replace=False)
        #print('Iter: {}, vectors selected: {}, hap length: {}'.format(rep, Pops, length_haps))

        ########## FST
        freqs_selected = vector_lib[Pops, :length_haps]
        Pairwise = Ste.return_fsts2(freqs_selected)
        #fsts_compare = scale(Pairwise.fst)
        fsts_compare = Pairwise.fst
        fst_store.extend(fsts_compare)

        ## lengths
        lengths_vector.extend([length_haps] * len(fsts_compare))

        #### generate data and perform PCA
        data = []
        for k in range(N_pops):
            probs = vector_lib[Pops[k], :]
            m = bias_scheme[k]
            Haps = [[np.random.choice([1, 0], p=[1 - probs[x], probs[x]])
                     for x in range(length_haps)] for acc in range(m)]
            data.extend(Haps)

        data2 = np.array(data)
        if Scale:
            data2 = scale(data2)

        pca = PCA(n_components=n_comp, whiten=False,
                  svd_solver='randomized').fit(data2)
        feat_bias = pca.transform(data2)

        if Eigen:
            feat_bias = feat_bias * pca.explained_variance_ratio_

        #### Centroid distances
        bias_centroids = [
            np.mean(feat_bias[[y for y in range(feat_bias.shape[0])
                               if bias_labels[y] == z], :], axis=0)
            for z in range(N_pops)
        ]
        bias_centroids = np.array(bias_centroids)

        bias_pair_dist = pairwise_distances(bias_centroids,
                                            metric='euclidean')
        bias_pair_dist = bias_pair_dist[iu1]
        #bias_pair_dist = scale(bias_pair_dist)
        biased_pairwise.extend(bias_pair_dist)

    Size = length_haps
    fst_lm_range = [0, .3]
    Lindexes = [x for x in range(len(lengths_vector))
                if lengths_vector[x] == Size
                and fst_store[x] >= fst_lm_range[0]
                and fst_store[x] <= fst_lm_range[1]]

    y_true = [np.log(biased_pairwise[x]) for x in Lindexes]
    fst_x = [np.log(fst_store[x]) for x in Lindexes]

    m_coeff, b = np.polyfit(y_true, fst_x, 1)

    return m_coeff, b, fst_x, y_true
def eval_topk(train_file, test_file, k, batch_size, dist_function, gamma,
              threads):
    train_data = np.load(train_file)
    test_data = np.load(test_file)

    train_z = train_data['z']
    train_labels = train_data['labels']
    n_train, dz = train_z.shape
    print(train_z.shape)

    test_z = test_data['z']
    test_labels = test_data['labels']
    test_pred_probs = test_data['pred_probs']
    test_confs = test_data['confs']
    print(test_confs.shape)
    n_test, n_classes = test_pred_probs.shape

    # scatter the labels
    if len(train_labels.shape) == 1 or train_labels.shape[1] == 1:
        temp = np.zeros((n_train, n_classes), dtype=int)
        temp[np.arange(n_train), train_labels] = 1
        train_labels = temp
    if len(test_labels.shape) == 1 or test_labels.shape[1] == 1:
        temp = np.zeros((n_test, n_classes), dtype=int)
        temp[np.arange(n_test), test_labels] = 1
        test_labels = temp

    sparsity = []
    correct = 0
    agreed = 0
    total_points = 0
    mae = 0.0
    n_batches = int(np.ceil(n_test / batch_size))
    for b in range(n_batches):
        if b < n_batches - 1:
            indices = np.arange(b * batch_size, (b + 1) * batch_size)
        else:
            indices = np.arange(b * batch_size, n_test)
        batch_size_b = len(indices)
        test_points = test_z[indices, :]

        if dist_function == 'Gauss':
            dists = pairwise_distances(test_points, train_z,
                                       metric='sqeuclidean', n_jobs=threads)
            dists = np.exp(-gamma * dists)
        elif dist_function == 'Laplace':
            dists = pairwise_distances(test_points, train_z, metric='l1',
                                       n_jobs=threads)
            dists = 0.5 * np.exp(-1.0 * dists)
        elif dist_function == 'InverseQuad':
            dists = pairwise_distances(test_points, train_z,
                                       metric='sqeuclidean', n_jobs=threads)
            dists = 1.0 / (dists + gamma)
        else:
            raise ValueError("Distance function not recognized.")

        # sort each row by weight (smallest first)
        order = np.argsort(dists, axis=1)

        # trying alternate masking
        mask = np.zeros_like(dists)
        for j in range(len(indices)):
            mask[np.ones(k, dtype=int) * j, order[j, -k:]] = 1

        # hopefully, we get exactly k points per row
        print("{:d}/{:d}".format(b, n_batches),
              np.min(mask.sum(1)), np.max(mask.sum(1)))

        # compute the weighted sums per class using only these points
        class_dists = np.dot(mask * dists, train_labels)
        class_dist_sums = class_dists.sum(1)
        topk_probs = class_dists / class_dist_sums.reshape((batch_size_b, 1))

        # measure accuracy
        correct += np.sum(
            class_dists.argmax(1) == test_labels[indices, :].argmax(1))
        # also measure agreement with predicted labels
        agreed += np.sum(
            class_dists.argmax(1) == test_pred_probs[indices, :].argmax(1))
        total_points += batch_size_b

        test_preds = test_pred_probs[indices, :].argmax(1)
        mae += np.sum(
            np.abs(topk_probs[np.arange(batch_size_b), test_preds]
                   - test_pred_probs[indices, test_preds])) / float(batch_size_b)

    print(total_points, n_test)
    acc = correct / float(total_points)
    print(acc)
    agreement = agreed / float(total_points)
    print(agreement)
    return acc, agreement
def reliefF(X, y, mode="rank", **kwargs): """ This function implements the reliefF feature selection Input ----- X: {numpy array}, shape (n_samples, n_features) input data y: {numpy array}, shape (n_samples,) input class labels kwargs: {dictionary} parameters of reliefF: k: {int} choices for the number of neighbors (default k = 5) Output ------ score: {numpy array}, shape (n_features,) reliefF score for each feature Reference --------- Robnik-Sikonja, Marko et al. "Theoretical and empirical analysis of relieff and rrelieff." Machine Learning 2003. Zhao, Zheng et al. "On Similarity Preserving Feature Selection." TKDE 2013. """ def feature_ranking(score): """ Rank features in descending order according to reliefF score, the higher the reliefF score, the more important the feature is """ idx = np.argsort(score, 0) return idx[::-1] if "k" not in list(kwargs.keys()): k = 5 else: k = kwargs["k"] n_samples, n_features = X.shape # calculate pairwise distances between instances distance = pairwise_distances(X, metric='manhattan') score = np.zeros(n_features) # the number of sampled instances is equal to the number of total instances for idx in range(n_samples): near_hit = [] near_miss = dict() self_fea = X[idx, :] c = np.unique(y).tolist() stop_dict = dict() for label in c: stop_dict[label] = 0 del c[c.index(y[idx])] p_dict = dict() p_label_idx = float(len(y[y == y[idx]])) / float(n_samples) for label in c: p_label_c = float(len(y[y == label])) / float(n_samples) p_dict[label] = p_label_c / (1 - p_label_idx) near_miss[label] = [] distance_sort = [] distance[idx, idx] = np.max(distance[idx, :]) for i in range(n_samples): distance_sort.append([distance[idx, i], int(i), y[i]]) distance_sort.sort(key=lambda x: x[0]) for i in range(n_samples): # find k nearest hit points if distance_sort[i][2] == y[idx]: if len(near_hit) < k: near_hit.append(distance_sort[i][1]) elif len(near_hit) == k: stop_dict[y[idx]] = 1 else: # find k nearest miss points for each label if len(near_miss[distance_sort[i][2]]) < k: near_miss[distance_sort[i][2]].append(distance_sort[i][1]) else: if len(near_miss[distance_sort[i][2]]) == k: stop_dict[distance_sort[i][2]] = 1 stop = True for (key, value) in list(stop_dict.items()): if value != 1: stop = False if stop: break # update reliefF score near_hit_term = np.zeros(n_features) for ele in near_hit: near_hit_term = np.array( abs(self_fea - X[ele, :])) + np.array(near_hit_term) near_miss_term = dict() for (label, miss_list) in list(near_miss.items()): near_miss_term[label] = np.zeros(n_features) for ele in miss_list: near_miss_term[label] = np.array( abs(self_fea - X[ele, :])) + np.array( near_miss_term[label]) score += near_miss_term[label] / (k * p_dict[label]) score -= near_hit_term / k if mode == 'raw': return score elif mode == 'index': return feature_ranking(score) elif mode == 'rank': return reverse_argsort(feature_ranking(score), X.shape[1])
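# Illustrative call to the reliefF scorer above on a toy binary problem.
# mode='raw' is used so the reverse_argsort helper (defined elsewhere in
# this code base) is not needed; the data and k are made up.
import numpy as np

rng = np.random.RandomState(0)
y = rng.randint(0, 2, size=40)
X = rng.randn(40, 5)
X[:, 0] += 3 * y            # make feature 0 informative

scores = reliefF(X, y, mode='raw', k=5)
print(scores.argmax())      # feature 0 should score highest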
def PearsonCorrelation(UserItemMatrix): similarity = 1 - pairwise_distances(UserItemMatrix, metric='correlation') similarity[np.isnan(similarity)] = 0 return similarity
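# Small usage sketch for PearsonCorrelation above: rows are users, columns
# are items, and the result is a user-user Pearson similarity matrix
# (1 minus the correlation distance). The ratings below are made up.
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances

UserItemMatrix = np.array([[5.0, 3.0, 0.0, 1.0],
                           [4.0, 0.0, 0.0, 1.0],
                           [1.0, 1.0, 5.0, 4.0]])
S = PearsonCorrelation(UserItemMatrix)
print(np.round(S, 3))   # symmetric, with ones on the diagonal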
appendix_col = np.zeros(
    (R_train.shape[0], unique_movie_num - R_train.shape[1]))
R_train = np.concatenate((R_train, appendix_col), axis=1)

# Computes the R matrix on testing data
R_test = test_df.pivot(index='userId', columns='movieId',
                       values='rating').fillna(0).values
appendix_row = np.zeros((unique_user_num - R_test.shape[0], R_test.shape[1]))
R_test = np.concatenate((R_test, appendix_row), axis=0)
appendix_col = np.zeros((R_test.shape[0], unique_movie_num - R_test.shape[1]))
R_test = np.concatenate((R_test, appendix_col), axis=1)

########### user-based collaborative filtering ################################
# Computes the user-user euclidean distance matrix (used below as the
# similarity weights)
S_user = pairwise_distances(R_train, metric='euclidean')

# Training set prediction
#mean_user_rating = R_train.mean(axis = 1)
mean_user_rating = np.zeros(R_train.shape[0])
for i in range(R_train.shape[0]):
    mean_user_rating[i] = R_train[i].mean()
mean_user_rating_train = mean_user_rating[:, np.newaxis]
difference_train = R_train - mean_user_rating_train
numerator_train_ub = np.dot(S_user, difference_train)
denominator_train_ub = np.abs(S_user).sum(axis=1)[:, np.newaxis]
prediction_train_ub = mean_user_rating[:, np.newaxis] \
    + numerator_train_ub / denominator_train_ub

# Computes root mean square error of training set
prediction_specified_train_ub = prediction_train_ub[R_train.nonzero()]
def printPrediction(model, smilesData): # FIXME hardcoded smilesDf = pd.DataFrame(smilesData, columns=[cc.exp['params']['data']['smiles']]) input = data.formatSequentialInput(smilesDf) output = model.predict(input) for i, smiles in enumerate(smilesData): print 'Prediction for {}'.format(smiles) print output[i] distanceMatrixCosine = pairwise_distances(output, metric='cosine') distanceMatrixCorrel = pairwise_distances(output, metric='correlation') distanceMatrixEuclid = pairwise_distances(output, metric='euclidean') print 'Distance matrix cosine' print distanceMatrixCosine print 'Distance matrix correlation' print distanceMatrixCorrel print 'Distance matrix euclid' print distanceMatrixEuclid ''' layerIdx = 1 cfg = model.get_config()[:layerIdx+1] cfg[0]['config']['dropout_U'] = 0 cfg[0]['config']['dropout_W'] = 0 print cfg[0] print cfg[1] # del cfg[1] # layerIdx -= 1 # print cfg cfg[layerIdx]['config']['return_sequences'] = True ''' layerIdx = 2 cfg = model.get_config()[:layerIdx+1] del cfg[1] layerIdx -= 1 # print cfg cfg[layerIdx]['config']['return_sequences'] = True seqModel = Sequential.from_config(cfg) seqModel.set_weights(model.get_weights()) seqModel.layers[layerIdx].return_sequences = True outputFunction = K.function([seqModel.layers[0].input], [seqModel.layers[layerIdx].output]) outputSymbols = outputFunction([input])[0] outputLastSymbol = outputSymbols[:,outputSymbols.shape[1]-1,:] distanceMatrixLastSymbolCorrel = np.corrcoef(outputLastSymbol) print 'Distance matrix last symbol correlation' print distanceMatrixLastSymbolCorrel
def test_knn_imputer_weight_distance(na): X = np.array([[0, 0], [na, 2], [4, 3], [5, 6], [7, 7], [9, 8], [11, 10]]) # Test with "distance" weight nn = KNeighborsRegressor(metric="euclidean", weights="distance") X_rows_idx = [0, 2, 3, 4, 5, 6] nn.fit(X[X_rows_idx, 1:], X[X_rows_idx, 0]) knn_imputed_value = nn.predict(X[1:2, 1:])[0] # Manual calculation X_neighbors_idx = [0, 2, 3, 4, 5] dist = nan_euclidean_distances(X[1:2, :], X, missing_values=na) weights = 1 / dist[:, X_neighbors_idx].ravel() manual_imputed_value = np.average(X[X_neighbors_idx, 0], weights=weights) X_imputed_distance1 = np.array( [[0, 0], [manual_imputed_value, 2], [4, 3], [5, 6], [7, 7], [9, 8], [11, 10]] ) # NearestNeighbor calculation X_imputed_distance2 = np.array( [[0, 0], [knn_imputed_value, 2], [4, 3], [5, 6], [7, 7], [9, 8], [11, 10]] ) imputer = KNNImputer(weights="distance", missing_values=na) assert_allclose(imputer.fit_transform(X), X_imputed_distance1) assert_allclose(imputer.fit_transform(X), X_imputed_distance2) # Test with weights = "distance" and n_neighbors=2 X = np.array( [ [na, 0, 0], [2, 1, 2], [3, 2, 3], [4, 5, 5], ] ) # neighbors are rows 1, 2, the nan_euclidean_distances are: dist_0_1 = np.sqrt((3 / 2) * ((1 - 0) ** 2 + (2 - 0) ** 2)) dist_0_2 = np.sqrt((3 / 2) * ((2 - 0) ** 2 + (3 - 0) ** 2)) imputed_value = np.average([2, 3], weights=[1 / dist_0_1, 1 / dist_0_2]) X_imputed = np.array( [ [imputed_value, 0, 0], [2, 1, 2], [3, 2, 3], [4, 5, 5], ] ) imputer = KNNImputer(n_neighbors=2, weights="distance", missing_values=na) assert_allclose(imputer.fit_transform(X), X_imputed) # Test with varying missingness patterns X = np.array( [ [1, 0, 0, 1], [0, na, 1, na], [1, 1, 1, na], [0, 1, 0, 0], [0, 0, 0, 0], [1, 0, 1, 1], [10, 10, 10, 10], ] ) # Get weights of donor neighbors dist = nan_euclidean_distances(X, missing_values=na) r1c1_nbor_dists = dist[1, [0, 2, 3, 4, 5]] r1c3_nbor_dists = dist[1, [0, 3, 4, 5, 6]] r1c1_nbor_wt = 1 / r1c1_nbor_dists r1c3_nbor_wt = 1 / r1c3_nbor_dists r2c3_nbor_dists = dist[2, [0, 3, 4, 5, 6]] r2c3_nbor_wt = 1 / r2c3_nbor_dists # Collect donor values col1_donor_values = np.ma.masked_invalid(X[[0, 2, 3, 4, 5], 1]).copy() col3_donor_values = np.ma.masked_invalid(X[[0, 3, 4, 5, 6], 3]).copy() # Final imputed values r1c1_imp = np.ma.average(col1_donor_values, weights=r1c1_nbor_wt) r1c3_imp = np.ma.average(col3_donor_values, weights=r1c3_nbor_wt) r2c3_imp = np.ma.average(col3_donor_values, weights=r2c3_nbor_wt) X_imputed = np.array( [ [1, 0, 0, 1], [0, r1c1_imp, 1, r1c3_imp], [1, 1, 1, r2c3_imp], [0, 1, 0, 0], [0, 0, 0, 0], [1, 0, 1, 1], [10, 10, 10, 10], ] ) imputer = KNNImputer(weights="distance", missing_values=na) assert_allclose(imputer.fit_transform(X), X_imputed) X = np.array( [ [0, 0, 0, na], [1, 1, 1, na], [2, 2, na, 2], [3, 3, 3, 3], [4, 4, 4, 4], [5, 5, 5, 5], [6, 6, 6, 6], [na, 7, 7, 7], ] ) dist = pairwise_distances( X, metric="nan_euclidean", squared=False, missing_values=na ) # Calculate weights r0c3_w = 1.0 / dist[0, 2:-1] r1c3_w = 1.0 / dist[1, 2:-1] r2c2_w = 1.0 / dist[2, (0, 1, 3, 4, 5)] r7c0_w = 1.0 / dist[7, 2:7] # Calculate weighted averages r0c3 = np.average(X[2:-1, -1], weights=r0c3_w) r1c3 = np.average(X[2:-1, -1], weights=r1c3_w) r2c2 = np.average(X[(0, 1, 3, 4, 5), 2], weights=r2c2_w) r7c0 = np.average(X[2:7, 0], weights=r7c0_w) X_imputed = np.array( [ [0, 0, 0, r0c3], [1, 1, 1, r1c3], [2, 2, r2c2, 2], [3, 3, 3, 3], [4, 4, 4, 4], [5, 5, 5, 5], [6, 6, 6, 6], [r7c0, 7, 7, 7], ] ) imputer_comp_wt = KNNImputer(missing_values=na, weights="distance") 
assert_allclose(imputer_comp_wt.fit_transform(X), X_imputed)
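# Minimal standalone sketch of the manual computation checked above: a
# missing entry is imputed as the 1/d-weighted average of its column over
# donor rows, with d taken from nan_euclidean_distances. The 3x3 matrix is
# illustrative.
import numpy as np
from sklearn.metrics.pairwise import nan_euclidean_distances

na = np.nan
X = np.array([[na, 0.0, 0.0],
              [2.0, 1.0, 2.0],
              [3.0, 2.0, 3.0]])
dist = nan_euclidean_distances(X, missing_values=na)
weights = 1 / dist[0, 1:]                        # donors are rows 1 and 2
imputed = np.average(X[1:, 0], weights=weights)
print(imputed)   # distance-weighted average of 2.0 and 3.0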
def train(ocsvm, X_train, X_test, Y_test, kernel, nu, GridSearch=True,
          **kwargs):

    if X_train.ndim > 2:
        X_train_shape = X_train.shape
        X_train = X_train.reshape(X_train_shape[0], np.prod(X_train_shape[1:]))
    else:
        X_train = X_train

    if kernel in ('DegreeKernel', 'WeightedDegreeKernel'):
        # get_kernel_matrix(kernel=kernel, X_train=X_train, **kwargs)
        # svm.fit(K_train)
        print('unexpected behaviour')
    else:
        if GridSearch and kernel == 'rbf':

            # use grid search cross-validation to select gamma
            print("Using GridSearchCV for hyperparameter selection...")

            # sample small hold-out set from test set for hyperparameter
            # selection. Save as val set.
            n_test_set = len(X_test)
            n_val_set = int(0.1 * n_test_set)
            n_test_out = 0
            n_test_norm = 0
            n_val_out = 0
            n_val_norm = 0
            while (n_test_out == 0) | (n_test_norm == 0) | (n_val_out == 0) | (n_val_norm == 0):
                perm = np.random.permutation(n_test_set)
                X_val = X_test[perm[:n_val_set]]
                y_val = Y_test[perm[:n_val_set]]
                # only accept the split if both val and test set contain
                # outliers and normals, so AUC can be computed on each
                n_val_out = np.sum(Y_test[perm[:n_val_set]])
                n_val_norm = np.sum(Y_test[perm[:n_val_set]] == 0)
                n_test_out = np.sum(Y_test[perm[n_val_set:]])
                n_test_norm = np.sum(Y_test[perm[n_val_set:]] == 0)
            X_test = X_test[perm[n_val_set:]]
            Y_test = Y_test[perm[n_val_set:]]
            n_val = len(y_val)
            n_test_set = len(Y_test)

            val_scores = np.zeros((len(y_val), 1))
            test_scores = np.zeros((len(Y_test), 1))

            cv_auc = 0.0
            cv_acc = 0
            cv_f1 = 0
            g_best = 0.1
            for gamma in np.logspace(-10, -1, num=10, base=2):
                # train on selected gamma
                cv_svm = svm.OneClassSVM(kernel='rbf', nu=nu, gamma=gamma)
                cv_svm.fit(X_train)

                # predict on small hold-out set
                val_acc, _, _, _, val_f1_score, val_auc_roc = predict(cv_svm, X_val, y_val, kernel)

                # save model if F1 score on hold-out set improved
                if val_f1_score > cv_f1:
                    # print('gamma set to: ', g_best)
                    ocsvm = cv_svm
                    g_best = gamma
                    cv_auc = val_auc_roc
                    cv_f1 = val_f1_score

            # save results of best cv run
            # diag['val']['auc'] = cv_auc
            # diag['val']['acc'] = cv_acc

            # re-train on the full training set with the best gamma
            ocsvm = svm.OneClassSVM(kernel='rbf', nu=nu, gamma=g_best)
            ocsvm.fit(X_train)

        else:
            # if rbf-kernel, re-initialize svm with gamma minimizing the
            # numerical error
            if kernel == 'rbf':
                gamma = 1 / (np.max(pairwise_distances(X_train)) ** 2)
                # ocsvm = svm.OneClassSVM(kernel='rbf', nu=nu, gamma=gamma)

            ocsvm.fit(X_train)

    return ocsvm
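# Sketch of the fallback gamma heuristic used in train() above when grid
# search is disabled: gamma is derived from the largest pairwise distance
# in the training data, which keeps exp(-gamma * d^2) away from numerical
# underflow. X_demo, nu and the data are placeholders.
import numpy as np
from sklearn import svm
from sklearn.metrics import pairwise_distances

X_demo = np.random.RandomState(0).randn(50, 3)
gamma = 1 / (np.max(pairwise_distances(X_demo)) ** 2)
ocsvm = svm.OneClassSVM(kernel='rbf', nu=0.1, gamma=gamma).fit(X_demo)
print(gamma, ocsvm.support_.shape)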
def test_agglomerative_clustering():
    # Check that we obtain the correct number of clusters with
    # agglomerative clustering.
    rng = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=bool)
    n_samples = 100
    X = rng.randn(n_samples, 50)
    connectivity = grid_to_graph(*mask.shape)
    for linkage in ("ward", "complete", "average"):
        clustering = AgglomerativeClustering(n_clusters=10,
                                             connectivity=connectivity,
                                             linkage=linkage)
        clustering.fit(X)
        # test caching
        try:
            tempdir = mkdtemp()
            clustering = AgglomerativeClustering(n_clusters=10,
                                                 connectivity=connectivity,
                                                 memory=tempdir,
                                                 linkage=linkage)
            clustering.fit(X)
            labels = clustering.labels_
            assert_true(np.size(np.unique(labels)) == 10)
        finally:
            shutil.rmtree(tempdir)
        # Turn caching off now
        clustering = AgglomerativeClustering(n_clusters=10,
                                             connectivity=connectivity,
                                             linkage=linkage)
        # Check that we obtain the same solution with early-stopping of the
        # tree building
        clustering.compute_full_tree = False
        clustering.fit(X)
        assert_almost_equal(
            normalized_mutual_info_score(clustering.labels_, labels), 1)
        clustering.connectivity = None
        clustering.fit(X)
        assert_true(np.size(np.unique(clustering.labels_)) == 10)
        # Check that a ValueError is raised when the connectivity matrix
        # has the wrong shape
        clustering = AgglomerativeClustering(
            n_clusters=10,
            connectivity=sparse.lil_matrix(connectivity.toarray()[:10, :10]),
            linkage=linkage)
        assert_raises(ValueError, clustering.fit, X)

    # Test that using ward with another metric than euclidean raises an
    # exception
    clustering = AgglomerativeClustering(n_clusters=10,
                                         connectivity=connectivity.toarray(),
                                         affinity="manhattan",
                                         linkage="ward")
    assert_raises(ValueError, clustering.fit, X)

    # Test using another metric than euclidean works with linkage complete
    for affinity in PAIRED_DISTANCES.keys():
        # Compare our (structured) implementation to scipy
        clustering = AgglomerativeClustering(n_clusters=10,
                                             connectivity=np.ones(
                                                 (n_samples, n_samples)),
                                             affinity=affinity,
                                             linkage="complete")
        clustering.fit(X)
        clustering2 = AgglomerativeClustering(n_clusters=10,
                                              connectivity=None,
                                              affinity=affinity,
                                              linkage="complete")
        clustering2.fit(X)
        assert_almost_equal(
            normalized_mutual_info_score(clustering2.labels_,
                                         clustering.labels_), 1)

    # Test that using a distance matrix (affinity = 'precomputed') has same
    # results (with connectivity constraints)
    clustering = AgglomerativeClustering(n_clusters=10,
                                         connectivity=connectivity,
                                         linkage="complete")
    clustering.fit(X)
    X_dist = pairwise_distances(X)
    clustering2 = AgglomerativeClustering(n_clusters=10,
                                          connectivity=connectivity,
                                          affinity='precomputed',
                                          linkage="complete")
    clustering2.fit(X_dist)
    assert_array_equal(clustering.labels_, clustering2.labels_)
def run_cluster(compl, qfib, qsym, cfg):
    """
    """
    cl_radius = cfg.find_orientations.clustering.radius
    min_compl = cfg.find_orientations.clustering.completeness
    algorithm = cfg.find_orientations.clustering.algorithm

    start = time.time()  # time this

    num_above = sum(np.array(compl) > min_compl)
    if num_above == 0:  # nothing to cluster
        qbar = cl = np.array([])
    elif num_above == 1:  # short circuit
        qbar = qfib[:, np.array(compl) > min_compl]
        cl = [1]
    else:
        # use compiled module for distance
        # just to be safe, must order qsym as C-contiguous
        qsym = np.array(qsym.T, order='C').T
        quat_distance = lambda x, y: xfcapi.quat_distance(
            np.array(x, order='C'), np.array(y, order='C'), qsym)

        qfib_r = qfib[:, np.array(compl) > min_compl]

        logger.info("Feeding %d orientations above %.1f%% to clustering",
                    qfib_r.shape[1], 100 * min_compl)

        if algorithm == 'dbscan' and not have_sklearn:
            algorithm = 'fclusterdata'
            logger.warning(
                "sklearn >= 0.14 required for dbscan, using fclusterdata")

        if algorithm == 'dbscan':
            pdist = pairwise_distances(qfib_r.T,
                                       metric=quat_distance,
                                       n_jobs=-1)
            core_samples, labels = dbscan(pdist,
                                          eps=np.radians(cl_radius),
                                          min_samples=1,
                                          metric='precomputed')
            cl = np.array(labels, dtype=int) + 1
        elif algorithm == 'fclusterdata':
            cl = cluster.hierarchy.fclusterdata(qfib_r.T,
                                                np.radians(cl_radius),
                                                criterion='distance',
                                                metric=quat_distance)
        else:
            raise RuntimeError("Clustering algorithm %s not recognized" %
                               algorithm)

        nblobs = len(np.unique(cl))
        qbar = np.zeros((4, nblobs))
        for i in range(nblobs):
            npts = sum(cl == i + 1)
            qbar[:, i] = rot.quatAverage(qfib_r[:, cl == i + 1].reshape(4, npts),
                                         qsym).flatten()

    logger.info("clustering took %f seconds", time.time() - start)
    logger.info(
        "Found %d orientation clusters with >=%.1f%% completeness"
        " and %.2f misorientation",
        qbar.size / 4, 100. * min_compl, cl_radius)

    return np.atleast_2d(qbar), cl
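# Hedged sketch of the precomputed-distance DBSCAN path in run_cluster
# above, using a plain euclidean metric instead of the quaternion distance;
# eps and min_samples are illustrative values.
import numpy as np
from sklearn.cluster import dbscan
from sklearn.metrics import pairwise_distances

rng = np.random.RandomState(0)
pts = np.vstack([rng.randn(20, 3), rng.randn(20, 3) + 10])
pdist = pairwise_distances(pts, metric='euclidean', n_jobs=-1)
core_samples, labels = dbscan(pdist, eps=2.0, min_samples=1,
                              metric='precomputed')
print(np.unique(labels) + 1)   # 1-based cluster ids, as in run_cluster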
# With my understanding, the correct way is the following:
#   u*sqrt(s) as the user-feature array
#   sqrt(s)*vt as the item-feature array
# user_similarity = pairwise_distances(u*sqrt(s), metric='cosine')
# item_similarity = pairwise_distances(sqrt(s)*vt, metric='cosine')
# If a never-before-seen user/item needs a prediction, fold it in:
#   A = U S V^T  ====>  u' = a_new * V * S^(-1)
# then compute the similarity between u' and the other users;
# the same goes for items.
user_train_matrix = u.dot(np.sqrt(s_diag_matrix))
item_train_matrix = np.sqrt(s_diag_matrix).dot(vt)
print str(user_train_matrix.shape), '\tuser_train_matrix\t', type(user_train_matrix)
print str(item_train_matrix.shape), '\titem_train_matrix\t', type(item_train_matrix)

user_similarity = pairwise_distances(user_train_matrix, metric='cosine')
item_similarity = pairwise_distances(item_train_matrix.T, metric='cosine')
print str(user_similarity.shape), '\tuser_similarity\t\t', type(user_similarity)
print str(item_similarity.shape), '\titem_similarity\t\t', type(item_similarity)

### ==== 6) predict by diff type ==== ###
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)  # mean rating per user
        # np.newaxis normalizes the shape, e.g. turning (6,) into (6, 1)
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(
            ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array(
            [np.abs(similarity).sum(axis=1)])
    return pred
def Fst_predict(vector_lib, m_coeff, b, n_comp=5, pop_max=8, Iter=20, bias_range=[20, 300], Eigen=False, Scale=False, Centre=True): ### Select pre and post processing measures. length_haps = vector_lib.shape[1] print('length haps: {}, N iterations: {}, range pops: {}'.format( length_haps, Iter, pop_max)) #### Predict predicted = [] #def controled_fsts(vector_lib,Eigen,length_haps,Scale,Center,N_pops,n_comp,Iter,N_sims,MixL,MixP,Pairs): lengths_vector = [] ### store distances between centroids biased_pairwise = [] ### store PC projection: dist_PC_corrected = {x: [] for x in range(n_comp)} ### store fsts fst_store = [] ### proceed. for rep in range(Iter): N_pops = np.random.choice(range(3, pop_max), 1, replace=False)[0] ## Population Sizes and labels bias_scheme = np.random.choice(range(bias_range[0], bias_range[1]), N_pops, replace=False) bias_labels = np.repeat(np.array([x for x in range(N_pops)]), bias_scheme) ### triangular matrices extract. iu1 = np.triu_indices(N_pops, 1) # for centroid comparison iu_bias = np.triu_indices(sum(bias_scheme), 1) iu_control = np.triu_indices(2, 1) Pops = np.random.choice(vector_lib.shape[0], N_pops, replace=False) #print('Iter: {}, vectors selected: {}, hap length: {}'.format(rep,Pops,length_haps)) ########## FST freqs_selected = vector_lib[Pops, :length_haps] Pairwise = Ste.return_fsts2(freqs_selected) #fsts_compare = scale(Pairwise.fst) fsts_compare = Pairwise.fst fst_store.extend(fsts_compare) ## lengths lengths_vector.extend([length_haps] * len(fsts_compare)) #### generate data and perform PCA data = [] for k in range(N_pops): probs = vector_lib[Pops[k], :] m = bias_scheme[k] Haps = [[ np.random.choice([1, 0], p=[1 - probs[x], probs[x]]) for x in range(length_haps) ] for acc in range(m)] data.extend(Haps) data2 = np.array(data) if Scale: data2 = scale(data2) pca = PCA(n_components=n_comp, whiten=False, svd_solver='randomized').fit(data2) feat_bias = pca.transform(data2) if Eigen: feat_bias = feat_bias * pca.explained_variance_ratio_ #### Centroid distances bias_centroids = [ np.mean(feat_bias[ [y for y in range(feat_bias.shape[0]) if bias_labels[y] == z], :], axis=0) for z in range(N_pops) ] bias_centroids = np.array(bias_centroids) bias_pair_dist = pairwise_distances(bias_centroids, metric='euclidean') bias_pair_dist = bias_pair_dist[iu1] #bias_pair_dist= scale(bias_pair_dist) fst_pred = [np.exp(m_coeff * np.log(x) + b) for x in bias_pair_dist] predicted.extend(fst_pred) fig = [go.Scatter(x=fst_store, y=predicted, mode='markers')] layout = go.Layout(title='test of prediction', yaxis=dict(title='predicted Fst'), xaxis=dict(title='observed Fst')) fig = go.Figure(data=fig, layout=layout) iplot(fig)
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import pairwise_distances

header = ['user_id', 'sport_rating', 'moive_rating']
df = pd.read_csv('user.csv', names=header)
train = df.values

# note: with metric='cosine' these are cosine *distances* (1 - similarity)
user_similarity = pairwise_distances(train, metric='cosine')
# print user_similarity
item_similarity = pairwise_distances(train.T, metric='cosine')
# print item_similarity

mean_user_rating = train.mean(axis=1)
ratings_diff = (train - mean_user_rating[:, np.newaxis])
pred_user = mean_user_rating[:, np.newaxis] + user_similarity.dot(
    ratings_diff) / np.array([np.abs(user_similarity).sum(axis=1)]).T
# print pred_user
pred_item = train.dot(item_similarity) / np.array(
    [np.abs(item_similarity).sum(axis=1)])
# print pred_item
########################################################################
from boto3.dynamodb.conditions import Key, Attr
from boto3.session import Session
import os
import sys
import json
def load(hdf5_file_name, data, minPts, eps = None, quantile = 50,
         subsamples_matrix = None, samples_weights = None,
         metric = 'minkowski', p = 2, verbose = True):
    """Determines the radius 'eps' for DBSCAN clustering of 'data'
    in an adaptive, data-dependent way.

    Parameters
    ----------
    hdf5_file_name : file object or string
        The handle or name of an HDF5 data structure where any array needed
        for DBSCAN and too large to fit into memory is to be stored.

    data : array of shape (n_samples, n_features)
        An array of features retained from the data-set to be analysed.
        Subsamples of this curated data-set can also be analysed by a call
        to DBSCAN by providing an appropriate list of selected samples
        labels, stored in 'subsamples_matrix' (see below).

    subsamples_matrix : array of shape (n_runs, n_subsamples), optional (default = None)
        Each row of this matrix contains a set of indices identifying the
        samples selected from the whole data-set for each of 'n_runs'
        independent rounds of DBSCAN clusterings.

    minPts : int
        The number of points within an epsilon-radius hypersphere for the
        said region to qualify as dense.

    eps : float, optional (default = None)
        Sets the maximum distance separating two data-points for those
        data-points to be considered as part of the same neighborhood.

    quantile : int, optional (default = 50)
        If 'eps' is not provided by the user, it will be determined as the
        'quantile' of the distribution of the k-nearest distances to each
        sample, with k set to 'minPts'.

    samples_weights : array of shape (n_runs, n_samples), optional (default = None)
        Holds the weights of each sample. A sample with weight greater than
        'minPts' is guaranteed to be a core sample; a sample with negative
        weight tends to prevent its 'eps'-neighbors from being core.
        Weights are absolute and default to 1.

    metric : string or callable, optional (default = 'minkowski')
        The metric to use for computing the pairwise distances between
        samples (each sample corresponds to a row in 'data'). If metric is
        a string or callable, it must be compatible with
        metrics.pairwise.pairwise_distances.

    p : float, optional (default = 2)
        If a Minkowski metric is used, 'p' determines its power.

    verbose : Boolean, optional (default = True)
        Whether to display messages reporting the status of the
        computations and the time it took to complete each major stage of
        the algorithm.

    Returns
    -------
    eps : float
        The parameter of DBSCAN clustering specifying if points are
        density-reachable. This is either a copy of the value provided at
        input or, if the user did not specify a value of 'eps' at input,
        the return value is the one determined from the k-distance graph of
        the data-set.

    References
    ----------
    Ester, M., H. P. Kriegel, J. Sander and X. Xu, "A Density-Based
    Algorithm for Discovering Clusters in Large Spatial Databases with
    Noise". In: Proceedings of the 2nd International Conference on
    Knowledge Discovery and Data Mining, Portland, OR, AAAI Press,
    pp. 226-231. 1996
    """

    data = np.array(data, copy = False)

    if data.ndim > 2:
        raise ValueError("\nERROR: DBSCAN_multiplex @ load:\n"
                         "the data array is of dimension %d. Please provide a "
                         "two-dimensional array instead.\n" % data.ndim)

    if subsamples_matrix is None:
        subsamples_matrix = np.arange(data.shape[0], dtype = int)
        subsamples_matrix = subsamples_matrix.reshape(1, -1)
    else:
        subsamples_matrix = np.array(subsamples_matrix, copy = False)

    if subsamples_matrix.ndim > 2:
        raise ValueError("\nERROR: DBSCAN_multiplex @ load:\n"
                         "the array of subsampled indices is of dimension %d. "
                         "Please provide a two-dimensional array instead.\n"
                         % subsamples_matrix.ndim)

    if (data.dtype.char in np.typecodes['AllFloat']
            and not np.isfinite(data.sum())
            and not np.all(np.isfinite(data))):
        raise ValueError('\nERROR: DBSCAN_multiplex @ load:\n'
                         'the data vector contains at least one infinite or '
                         'NaN entry.\n')

    if (subsamples_matrix.dtype.type is np.int_
            and not np.isfinite(subsamples_matrix.sum())
            and not np.all(np.isfinite(subsamples_matrix))):
        raise ValueError('\nERROR: DBSCAN_multiplex @ load:\n'
                         'the array of subsampled indices contains at least '
                         'one infinite or NaN entry.\n')

    if not np.all(subsamples_matrix >= 0):
        raise ValueError('\nERROR: DBSCAN_multiplex @ load:\n'
                         'the sampled indices should all be positive '
                         'integers.\n')

    N_samples = data.shape[0]
    N_runs, N_subsamples = subsamples_matrix.shape

    if N_subsamples > N_samples:
        raise ValueError('\nERROR: DBSCAN_multiplex @ load:\n'
                         'the number of sampled indices cannot exceed the '
                         'total number of samples in the whole data-set.\n')

    for i in xrange(N_runs):
        subsamples_matrix[i] = np.unique(subsamples_matrix[i])

    if not isinstance(minPts, int):
        raise TypeError("\nERROR: DBSCAN_multiplex @ load:\n"
                        "the parameter 'minPts' must be an integer.\n")

    if minPts < 2:
        raise ValueError("\nERROR: DBSCAN_multiplex @ load:\n"
                         "the value of 'minPts' must be larger than 1.\n")

    if eps is None:
        # Determine the parameter 'eps' as the median of the distribution
        # of the maximum of the minPts-nearest neighbors distances for each
        # sample.
        if verbose:
            print("INFO: DBSCAN_multiplex @ load:\n"
                  "starting the determination of an appropriate value of 'eps' for this data-set"
                  " and for the other parameter of the DBSCAN algorithm set to {minPts}.\n"
                  "This might take a while.".format(**locals()))

        beg_eps = time.time()

        quantile = np.rint(quantile)
        quantile = np.clip(quantile, 0, 100)

        k_distances = kneighbors_graph(data, minPts, mode = 'distance',
                                       metric = metric, p = p).data

        radii = np.zeros(N_samples, dtype = float)
        for i in xrange(0, minPts):
            radii = np.maximum(radii, k_distances[i::minPts])

        if quantile == 50:
            eps = round(np.median(radii, overwrite_input = True), 4)
        else:
            eps = round(np.percentile(radii, quantile), 4)

        end_eps = time.time()

        if verbose:
            print("\nINFO: DBSCAN_multiplex @ load:\n"
                  "done with evaluating parameter 'eps' from the data-set provided."
                  " This took {} seconds. Value of epsilon: {}.".format(round(end_eps - beg_eps, 4), eps))
    else:
        if not (isinstance(eps, float) or isinstance(eps, int)):
            raise ValueError("\nERROR: DBSCAN_multiplex @ load:\n"
                             "please provide a numeric value for the radius 'eps'.\n")
        if not eps > 0.0:
            raise ValueError("\nERROR: DBSCAN_multiplex @ load:\n"
                             "the radius 'eps' must be positive.\n")
        eps = round(eps, 4)

    # For all samples with a large enough neighborhood,
    # 'neighborhoods_indices' and 'neighborhoods_indptr' help us find the
    # neighbors to every sample. Note that this definition of neighbors
    # leaves the original point in, which will be considered later.
    if verbose:
        print("\nINFO: DBSCAN_multiplex @ load:\n"
              "identifying the neighbors within a hypersphere of radius {eps} around each sample,"
              " while at the same time evaluating the number of epsilon-neighbors for each sample.\n"
              "This might take a fair amount of time.".format(**locals()))

    beg_neigh = time.time()

    fileh = tables.open_file(hdf5_file_name, mode = 'r+')
    DBSCAN_group = fileh.create_group(fileh.root, 'DBSCAN_group')

    neighborhoods_indices = fileh.create_earray(
        DBSCAN_group, 'neighborhoods_indices', tables.Int32Atom(), (0,),
        'Indices array for sparse matrix of neighborhoods',
        expectedrows = int((N_samples ** 2) / 50))

    # 'neighborhoods_indptr' is such that for each of row i of the data-matrix
    # neighborhoods_indices[neighborhoods_indptr[i]:neighborhoods_indptr[i+1]]
    # contains the column indices of row i from the array of
    # 'eps'-neighborhoods.
    neighborhoods_indptr = np.zeros(1, dtype = np.int64)

    # For each sample, 'neighbors_counts' will keep a tally of the number
    # of its neighbors within a hypersphere of radius 'eps'.
    # Note that the sample itself is counted as part of this neighborhood.
    neighbors_counts = fileh.create_carray(
        DBSCAN_group, 'neighbors_counts', tables.Int32Atom(),
        (N_runs, N_samples),
        'Array of the number of neighbors around each sample of a set of subsampled points',
        filters = None)

    chunks_size = get_chunk_size(N_samples, 3)
    for i in xrange(0, N_samples, chunks_size):
        chunk = data[i:min(i + chunks_size, N_samples)]

        D = pairwise_distances(chunk, data, metric = metric, p = p, n_jobs = 1)
        D = (D <= eps)

        if samples_weights is None:
            for run in xrange(N_runs):
                x = subsamples_matrix[run]
                M = np.take(D, x, axis = 1)

                legit_rows = np.intersect1d(
                    i + np.arange(min(chunks_size, N_samples - i)), x,
                    assume_unique = True)

                M = np.take(M, legit_rows - i, axis = 0)
                neighbors_counts[run, legit_rows] = M.sum(axis = 1)
                del M
        else:
            for run in xrange(N_runs):
                x = subsamples_matrix[run]
                M = np.take(D, x, axis = 1)

                legit_rows = np.intersect1d(
                    i + np.arange(min(chunks_size, N_samples - i)), x,
                    assume_unique = True)

                M = np.take(M, legit_rows - i, axis = 0)
                neighbors_counts[run, legit_rows] = np.array(
                    [np.sum(samples_weights[x[row]]) for row in M])
                del M

        candidates = np.where(D)
        del D

        neighborhoods_indices.append(candidates[1])

        _, nbr = np.unique(candidates[0], return_counts = True)
        counts = np.cumsum(nbr) + neighborhoods_indptr[-1]
        del candidates

        neighborhoods_indptr = np.append(neighborhoods_indptr, counts)

    fileh.create_carray(DBSCAN_group, 'neighborhoods_indptr',
                        tables.Int64Atom(), (N_samples + 1,),
                        'Array of cumulative number of column indices for each row',
                        filters = None)
    fileh.root.DBSCAN_group.neighborhoods_indptr[:] = neighborhoods_indptr[:]

    fileh.create_carray(DBSCAN_group, 'subsamples_matrix',
                        tables.Int32Atom(), (N_runs, N_subsamples),
                        'Array of subsamples indices', filters = None)
    fileh.root.DBSCAN_group.subsamples_matrix[:] = subsamples_matrix[:]

    fileh.close()

    end_neigh = time.time()
    if verbose:
        print("\nINFO: DBSCAN_multiplex @ load:\n"
              "done with the neighborhoods. This step took {} seconds.".format(round(end_neigh - beg_neigh, 4)))

    gc.collect()

    return eps
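# Standalone sketch of the 'eps' heuristic in load() above: for every
# sample take the distance to its minPts-th nearest neighbor, then use a
# quantile of those radii as eps. The reshape below is equivalent to the
# strided maximum in load(); data, minPts and quantile are placeholders.
import numpy as np
from sklearn.neighbors import kneighbors_graph

rng = np.random.RandomState(0)
data = rng.randn(100, 4)
minPts, quantile = 5, 50

k_distances = kneighbors_graph(data, minPts, mode='distance').data
radii = k_distances.reshape(-1, minPts).max(axis=1)   # k-th NN distance per sample
eps = round(np.percentile(radii, quantile), 4)
print(eps)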
def test_pairwise_distances(): # Test the pairwise_distance helper function. rng = np.random.RandomState(0) # Euclidean distance should be equivalent to calling the function. X = rng.random_sample((5, 4)) S = pairwise_distances(X, metric="euclidean") S2 = euclidean_distances(X) assert_array_almost_equal(S, S2) # Euclidean distance, with Y != X. Y = rng.random_sample((2, 4)) S = pairwise_distances(X, Y, metric="euclidean") S2 = euclidean_distances(X, Y) assert_array_almost_equal(S, S2) # Test with tuples as X and Y X_tuples = tuple([tuple([v for v in row]) for row in X]) Y_tuples = tuple([tuple([v for v in row]) for row in Y]) S2 = pairwise_distances(X_tuples, Y_tuples, metric="euclidean") assert_array_almost_equal(S, S2) # "cityblock" uses scikit-learn metric, cityblock (function) is # scipy.spatial. S = pairwise_distances(X, metric="cityblock") S2 = pairwise_distances(X, metric=cityblock) assert_equal(S.shape[0], S.shape[1]) assert_equal(S.shape[0], X.shape[0]) assert_array_almost_equal(S, S2) # The manhattan metric should be equivalent to cityblock. S = pairwise_distances(X, Y, metric="manhattan") S2 = pairwise_distances(X, Y, metric=cityblock) assert_equal(S.shape[0], X.shape[0]) assert_equal(S.shape[1], Y.shape[0]) assert_array_almost_equal(S, S2) # Using size_threshold argument should raise # a deprecation warning assert_warns(DeprecationWarning, manhattan_distances, X, Y, size_threshold=10) # Test cosine as a string metric versus cosine callable # The string "cosine" uses sklearn.metric, # while the function cosine is scipy.spatial S = pairwise_distances(X, Y, metric="cosine") S2 = pairwise_distances(X, Y, metric=cosine) assert_equal(S.shape[0], X.shape[0]) assert_equal(S.shape[1], Y.shape[0]) assert_array_almost_equal(S, S2) # Test with sparse X and Y, # currently only supported for Euclidean, L1 and cosine. X_sparse = csr_matrix(X) Y_sparse = csr_matrix(Y) S = pairwise_distances(X_sparse, Y_sparse, metric="euclidean") S2 = euclidean_distances(X_sparse, Y_sparse) assert_array_almost_equal(S, S2) S = pairwise_distances(X_sparse, Y_sparse, metric="cosine") S2 = cosine_distances(X_sparse, Y_sparse) assert_array_almost_equal(S, S2) S = pairwise_distances(X_sparse, Y_sparse.tocsc(), metric="manhattan") S2 = manhattan_distances(X_sparse.tobsr(), Y_sparse.tocoo()) assert_array_almost_equal(S, S2) S2 = manhattan_distances(X, Y) assert_array_almost_equal(S, S2) # Test with scipy.spatial.distance metric, with a kwd kwds = {"p": 2.0} S = pairwise_distances(X, Y, metric="minkowski", **kwds) S2 = pairwise_distances(X, Y, metric=minkowski, **kwds) assert_array_almost_equal(S, S2) # same with Y = None kwds = {"p": 2.0} S = pairwise_distances(X, metric="minkowski", **kwds) S2 = pairwise_distances(X, metric=minkowski, **kwds) assert_array_almost_equal(S, S2) # Test that scipy distance metrics throw an error if sparse matrix given assert_raises(TypeError, pairwise_distances, X_sparse, metric="minkowski") assert_raises(TypeError, pairwise_distances, X, Y_sparse, metric="minkowski") # Test that a value error is raised if the metric is unknown assert_raises(ValueError, pairwise_distances, X, Y, metric="blah")
def _calculateDistanceMatrix(self): return pairwise_distances(self.emb.x, metric='cosine')
def test_pairwise_distances_argmin_min():
    # Check pairwise minimum distances computation for any metric
    X = [[0], [1]]
    Y = [[-1], [2]]

    Xsp = dok_matrix(X)
    Ysp = csr_matrix(Y, dtype=np.float32)

    # euclidean metric
    D, E = pairwise_distances_argmin_min(X, Y, metric="euclidean")
    D2 = pairwise_distances_argmin(X, Y, metric="euclidean")
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(D2, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # sparse matrix case
    Dsp, Esp = pairwise_distances_argmin_min(Xsp, Ysp, metric="euclidean")
    assert_array_equal(Dsp, D)
    assert_array_equal(Esp, E)
    # We don't want np.matrix here
    assert_equal(type(Dsp), np.ndarray)
    assert_equal(type(Esp), np.ndarray)

    # Non-euclidean scikit-learn metric
    D, E = pairwise_distances_argmin_min(X, Y, metric="manhattan")
    D2 = pairwise_distances_argmin(X, Y, metric="manhattan")
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(D2, [0, 1])
    assert_array_almost_equal(E, [1., 1.])
    D, E = pairwise_distances_argmin_min(Xsp, Ysp, metric="manhattan")
    D2 = pairwise_distances_argmin(Xsp, Ysp, metric="manhattan")
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # Non-euclidean Scipy distance (callable)
    D, E = pairwise_distances_argmin_min(X, Y, metric=minkowski,
                                         metric_kwargs={"p": 2})
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # Non-euclidean Scipy distance (string)
    D, E = pairwise_distances_argmin_min(X, Y, metric="minkowski",
                                         metric_kwargs={"p": 2})
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # Compare with naive implementation
    rng = np.random.RandomState(0)
    X = rng.randn(97, 149)
    Y = rng.randn(111, 149)

    dist = pairwise_distances(X, Y, metric="manhattan")
    dist_orig_ind = dist.argmin(axis=0)
    dist_orig_val = dist[dist_orig_ind, range(len(dist_orig_ind))]

    dist_chunked_ind, dist_chunked_val = pairwise_distances_argmin_min(
        X, Y, axis=0, metric="manhattan")
    np.testing.assert_almost_equal(dist_orig_ind, dist_chunked_ind, decimal=7)
    np.testing.assert_almost_equal(dist_orig_val, dist_chunked_val, decimal=7)

    # Test batch_size deprecation warning
    assert_warns_message(DeprecationWarning, "version 0.22",
                         pairwise_distances_argmin_min, X, Y, batch_size=500,
                         metric='euclidean')
def build_all_2():
    print 'For each class, we build all the trees and save them in CSVs'
    path_to_save = '../data/test/try'
    """
    nar_trees = return_trees_from_merge('~/Documents/s2/tal/discourseAnalysis/data/narrative')
    write_tree_in_csv(nar_trees)
    arg_trees = return_trees_from_merge('~/Documents/s2/tal/discourseAnalysis/data/argumentative')
    write_tree_in_csv(arg_trees)
    inf_trees = return_trees_from_merge('~/Documents/s2/tal/discourseAnalysis/data/informative')
    write_tree_in_csv(inf_trees)
    des_trees = []
    # Note: contains pairs of (tree, tree_ID), where tree_ID is the file name.
    all_trees = nar_trees + arg_trees + inf_trees + des_trees
    int2cl = {0:'narrative', 1:'argumentative', 2:'informative', 3:'descriptive'}
    T = [t[0] for t in all_trees]
    pickle.dump(T,open(path_to_save+'trees.pkl','wb'))"""

    T = pickle.load(open('../data/trees_with_labels.pkl','r'))
    T = [t[0] for t in T]

    """y_nar = [0 for t in nar_trees]
    y_arg = [1 for t in arg_trees]
    y_inf = [2 for t in inf_trees]
    y_des = [3 for t in des_trees]
    y = np.array( y_nar + y_arg + y_inf + y_des )
    pickle.dump(y,open(path_to_save+'labels.pkl','wb'))"""

    index = ['bin','count','norm','height','tfid']

    print 'Dicts'
    D_bin = vectorizers.build_bin_vects(T)
    D_count = vectorizers.build_count_vects(T)
    D_norm = vectorizers.build_norm_vects(T)
    D_height = vectorizers.build_height_vects(T)
    D_tfid = vectorizers.build_tfid_vects(T)
    D_all = {'bin':D_bin ,'count': D_count,'norm': D_norm,'height': D_height,'tfid': D_tfid}
    pickle.dump(D_all,open(path_to_save+'dicts.pkl','wb'))

    print 'Vects'
    vectorizer = feature_extraction.DictVectorizer(sparse=False)
    V_bin = vectorizer.fit_transform(D_bin)
    V_count = vectorizer.fit_transform(D_count)
    V_norm = vectorizer.fit_transform(D_norm)
    V_height = vectorizer.fit_transform(D_height)
    V_tfid = vectorizer.fit_transform(D_tfid)
    V_all = {'bin':V_bin ,'count': V_count,'norm': V_norm,'height': V_height,'tfid': V_tfid}
    pickle.dump(V_all,open(path_to_save+'vects.pkl','wb'))
    #Y = vectorizer.inverse_transform(V_bin)

    print 'Kernels'
    ## tree kernels
    #max_depth = 15
    #T_p = [ctree.prune(t,max_depth) for t in T]
    #K_tree = kernels.compute_gram(T_p,T_p,kernels.tree_kernel)
    #pickle.dump(K_tree,open(path_to_save+'tree_kernel.pkl'))

    print 'vector kernels'
    print 'linear'
    K_bin_lin = pairwise.linear_kernel(V_bin)
    K_count_lin = pairwise.linear_kernel(V_count)
    K_norm_lin = pairwise.linear_kernel(V_norm)
    K_height_lin = pairwise.linear_kernel(V_height)
    K_tfid_lin = pairwise.linear_kernel(V_tfid)
    K_all_lin = {'bin':K_bin_lin, 'count':K_count_lin, 'norm':K_norm_lin, 'height':K_height_lin, 'tfid':K_tfid_lin}

    print 'rbf'
    K_bin_rbf = pairwise.rbf_kernel(V_bin)
    K_count_rbf = pairwise.rbf_kernel(V_count)
    K_norm_rbf = pairwise.rbf_kernel(V_norm)
    K_height_rbf = pairwise.rbf_kernel(V_height)
    K_tfid_rbf = pairwise.rbf_kernel(V_tfid)
    K_all_rbf = {'bin':K_bin_rbf, 'count':K_count_rbf, 'norm':K_norm_rbf, 'height':K_height_rbf, 'tfid':K_tfid_rbf}

    print 'cosine sim'
    K_bin_cos_sim = pairwise.cosine_similarity(V_bin)
    K_count_cos_sim = pairwise.cosine_similarity(V_count)
    K_norm_cos_sim = pairwise.cosine_similarity(V_norm)
    K_height_cos_sim = pairwise.cosine_similarity(V_height)
    K_tfid_cos_sim = pairwise.cosine_similarity(V_tfid)
    K_all_cos_sim = {'bin':K_bin_cos_sim, 'count':K_count_cos_sim, 'norm':K_norm_cos_sim, 'height':K_height_cos_sim, 'tfid':K_tfid_cos_sim}

    print 'euclidean distance'
    K_bin_eucl_dist = pairwise.pairwise_distances(V_bin,metric='euclidean')
    K_count_eucl_dist = pairwise.pairwise_distances(V_count,metric='euclidean')
    K_norm_eucl_dist = pairwise.pairwise_distances(V_norm,metric='euclidean')
    K_height_eucl_dist = pairwise.pairwise_distances(V_height,metric='euclidean')
    K_tfid_eucl_dist = pairwise.pairwise_distances(V_tfid,metric='euclidean')
    K_all_eucl_dist = {'bin':K_bin_eucl_dist, 'count':K_count_eucl_dist, 'norm':K_norm_eucl_dist, 'height':K_height_eucl_dist, 'tfid':K_tfid_eucl_dist}

    print 'minkowski distance'
    K_bin_mink_dist = pairwise.pairwise_distances(V_bin,metric='minkowski')
    K_count_mink_dist = pairwise.pairwise_distances(V_count,metric='minkowski')
    K_norm_mink_dist = pairwise.pairwise_distances(V_norm,metric='minkowski')
    K_height_mink_dist = pairwise.pairwise_distances(V_height,metric='minkowski')
    K_tfid_mink_dist = pairwise.pairwise_distances(V_tfid,metric='minkowski')
    K_all_mink_dist = {'bin':K_bin_mink_dist, 'count':K_count_mink_dist, 'norm':K_norm_mink_dist, 'height':K_height_mink_dist, 'tfid':K_tfid_mink_dist}

    K_all = {'lin':K_all_lin, 'rbf':K_all_rbf, 'cos_sim':K_all_cos_sim,'eucl_dist':K_all_eucl_dist,'mink_dist':K_all_mink_dist}
    pickle.dump(K_all,open(path_to_save+'vect_kernels.pkl','wb'))
    print "done"
def test_pairwise_callable_nonstrict_metric(): # paired_distances should allow callable metric where metric(x, x) != 0 # Knowing that the callable is a strict metric would allow the diagonal to # be left uncalculated and set to 0. assert_equal(pairwise_distances([[1.]], metric=lambda x, y: 5)[0, 0], 5)
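# Quick illustration of what the test above exercises: pairwise_distances
# accepts a user-defined callable that receives two 1-D sample vectors and
# returns a scalar, and the diagonal entries are computed by calling the
# metric as well (hence the nonstrict-metric check).
import numpy as np
from sklearn.metrics import pairwise_distances

X = np.array([[0.0], [1.0], [3.0]])
D = pairwise_distances(X, metric=lambda x, y: np.abs(x - y).sum())
print(D)   # plain L1 distances in this case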
def furthest_sample_pts(pts_input): D = pairwise_distances(pts_input, metric='euclidean') (perm, lambdas) = getGreedyPerm(D) return perm, lambdas
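# getGreedyPerm is not defined in this snippet; the sketch below is a
# common greedy farthest-point-sampling formulation over a precomputed
# distance matrix D, returning the visiting order `perm` and the covering
# radius `lambdas[i]` at each step. Starting from point 0 is an assumption.
import numpy as np

def getGreedyPerm(D):
    n = D.shape[0]
    perm = np.zeros(n, dtype=np.int64)      # perm[0] = 0: greedy start
    lambdas = np.zeros(n)
    ds = D[0, :]                            # distances to the chosen set
    for i in range(1, n):
        idx = np.argmax(ds)                 # farthest point from the set
        perm[i] = idx
        lambdas[i] = ds[idx]
        ds = np.minimum(ds, D[idx, :])
    return perm, lambdas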
def gaus_feats(diags, centers, inertias, eps=1e-10): return np.exp(-pairwise.pairwise_distances(diags, Y=centers) / (inertias + eps))
def lapl_feats(diags, centers, inertias, eps=1e-10): return np.exp(-np.sqrt( pairwise.pairwise_distances(diags, Y=centers) / (inertias + eps)))
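# Toy call to the two kernel-feature helpers above, assuming they are in
# scope together with sklearn.metrics.pairwise; `diags`, `centers` and
# `inertias` are arbitrary placeholders with the shapes the signatures
# suggest (points, kernel centers, per-center bandwidths).
import numpy as np

rng = np.random.RandomState(0)
diags = rng.rand(6, 2)
centers = rng.rand(3, 2)
inertias = np.full(3, 0.1)

print(gaus_feats(diags, centers, inertias).shape)   # (6, 3)
print(lapl_feats(diags, centers, inertias).shape)   # (6, 3)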
for line in ratings.itertuples():
    data_matrix[line[1]-1, line[2]-1] = line[3]

# In[91]:

pd.DataFrame(data=user_prediction).head()
pd.DataFrame(item_prediction).head()

# In[81]:

from sklearn.metrics.pairwise import pairwise_distances
user_similarity = pairwise_distances(data_matrix, metric='cosine')
item_similarity = pairwise_distances(data_matrix.T, metric='cosine')

# In[88]:

# for user based cf - prediction(u,i) = sigma(r(v,i)*similarity(u,v))/sigma(sim(u,v)) where u, v are users and i is an item
# for item based cf - prediction(u,i) = sigma(R(u,N) * similarity(i,N))/sigma(sim(i,N)) where N are the items neighboring i
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred
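# Tiny end-to-end check of predict() above on made-up ratings. As in the
# cells above, the "similarity" matrices here are really cosine distances,
# so the numbers only exercise the code paths, not recommendation quality.
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances

R = np.array([[5.0, 3.0, 0.0],
              [4.0, 0.0, 1.0],
              [1.0, 1.0, 5.0]])
user_sim = pairwise_distances(R, metric='cosine')
item_sim = pairwise_distances(R.T, metric='cosine')
print(predict(R, user_sim, type='user').shape)   # (3, 3)
print(predict(R, item_sim, type='item').shape)   # (3, 3)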
def construct_W(X, **kwargs): """ Construct the affinity matrix W through different ways Notes ----- if kwargs is null, use the default parameter settings; if kwargs is not null, construct the affinity matrix according to parameters in kwargs Input ----- X: {numpy array}, shape (n_samples, n_features) input data kwargs: {dictionary} parameters to construct different affinity matrix W: y: {numpy array}, shape (n_samples, 1) the true label information needed under the 'supervised' neighbor mode metric: {string} choices for different distance measures 'euclidean' - use euclidean distance 'cosine' - use cosine distance (default) neighbor_mode: {string} indicates how to construct the graph 'knn' - put an edge between two nodes if and only if they are among the k nearest neighbors of each other (default) 'supervised' - put an edge between two nodes if they belong to same class and they are among the k nearest neighbors of each other weight_mode: {string} indicates how to assign weights for each edge in the graph 'binary' - 0-1 weighting, every edge receives weight of 1 (default) 'heat_kernel' - if nodes i and j are connected, put weight W_ij = exp(-norm(x_i - x_j)/2t^2) this weight mode can only be used under 'euclidean' metric and you are required to provide the parameter t 'cosine' - if nodes i and j are connected, put weight cosine(x_i,x_j). this weight mode can only be used under 'cosine' metric k: {int} choices for the number of neighbors (default k = 5) t: {float} parameter for the 'heat_kernel' weight_mode fisher_score: {boolean} indicates whether to build the affinity matrix in a fisher score way, in which W_ij = 1/n_l if yi = yj = l; otherwise W_ij = 0 (default fisher_score = false) reliefF: {boolean} indicates whether to build the affinity matrix in a reliefF way, NH(x) and NM(x,y) denotes a set of k nearest points to x with the same class as x, and a different class (the class y), respectively. 
W_ij = 1 if i = j; W_ij = 1/k if x_j \in NH(x_i); W_ij = -1/(c-1)k if x_j \in NM(x_i, y) (default reliefF = false) Output ------ W: {sparse matrix}, shape (n_samples, n_samples) output affinity matrix W """ # default metric is 'cosine' if 'metric' not in kwargs.keys(): kwargs['metric'] = 'cosine' # default neighbor mode is 'knn' and default neighbor size is 5 if 'neighbor_mode' not in kwargs.keys(): kwargs['neighbor_mode'] = 'knn' if kwargs['neighbor_mode'] == 'knn' and 'k' not in kwargs.keys(): kwargs['k'] = 5 if kwargs['neighbor_mode'] == 'supervised' and 'k' not in kwargs.keys(): kwargs['k'] = 5 if kwargs['neighbor_mode'] == 'supervised' and 'y' not in kwargs.keys(): print('Warning: label is required in the supervised neighborMode!!!') exit(0) # default weight mode is 'binary', default t in heat kernel mode is 1 if 'weight_mode' not in kwargs.keys(): kwargs['weight_mode'] = 'binary' if kwargs['weight_mode'] == 'heat_kernel': if kwargs['metric'] != 'euclidean': kwargs['metric'] = 'euclidean' if 't' not in kwargs.keys(): kwargs['t'] = 1 elif kwargs['weight_mode'] == 'cosine': if kwargs['metric'] != 'cosine': kwargs['metric'] = 'cosine' # default fisher_score and reliefF mode are 'false' if 'fisher_score' not in kwargs.keys(): kwargs['fisher_score'] = False if 'reliefF' not in kwargs.keys(): kwargs['reliefF'] = False n_samples, n_features = np.shape(X) # choose 'knn' neighbor mode if kwargs['neighbor_mode'] == 'knn': k = kwargs['k'] if kwargs['weight_mode'] == 'binary': if kwargs['metric'] == 'euclidean': # compute pairwise euclidean distances D = pairwise_distances(X) D **= 2 # sort the distance matrix D in ascending order dump = np.sort(D, axis=1) idx = np.argsort(D, axis=1) # choose the k-nearest neighbors for each instance idx_new = idx[:, 0:k + 1] G = np.zeros((n_samples * (k + 1), 3)) G[:, 0] = np.tile(np.arange(n_samples), (k + 1, 1)).reshape(-1) G[:, 1] = np.ravel(idx_new, order='F') G[:, 2] = 1 # build the sparse affinity matrix W W = csc_matrix((G[:, 2], (G[:, 0], G[:, 1])), shape=(n_samples, n_samples)) bigger = np.transpose(W) > W W = W - W.multiply(bigger) + np.transpose(W).multiply(bigger) return W elif kwargs['metric'] == 'cosine': # normalize the data first X_normalized = np.power(np.sum(X * X, axis=1), 0.5) for i in range(n_samples): X[i, :] = X[i, :] / max(1e-12, X_normalized[i]) # compute pairwise cosine distances D_cosine = np.dot(X, np.transpose(X)) # sort the distance matrix D in descending order dump = np.sort(-D_cosine, axis=1) idx = np.argsort(-D_cosine, axis=1) idx_new = idx[:, 0:k + 1] G = np.zeros((n_samples * (k + 1), 3)) G[:, 0] = np.tile(np.arange(n_samples), (k + 1, 1)).reshape(-1) G[:, 1] = np.ravel(idx_new, order='F') G[:, 2] = 1 # build the sparse affinity matrix W W = csc_matrix((G[:, 2], (G[:, 0], G[:, 1])), shape=(n_samples, n_samples)) bigger = np.transpose(W) > W W = W - W.multiply(bigger) + np.transpose(W).multiply(bigger) return W elif kwargs['weight_mode'] == 'heat_kernel': t = kwargs['t'] # compute pairwise euclidean distances D = pairwise_distances(X) D **= 2 # sort the distance matrix D in ascending order dump = np.sort(D, axis=1) idx = np.argsort(D, axis=1) idx_new = idx[:, 0:k + 1] dump_new = dump[:, 0:k + 1] # compute the pairwise heat kernel distances dump_heat_kernel = np.exp(-dump_new / (2 * t * t)) G = np.zeros((n_samples * (k + 1), 3)) G[:, 0] = np.tile(np.arange(n_samples), (k + 1, 1)).reshape(-1) G[:, 1] = np.ravel(idx_new, order='F') G[:, 2] = np.ravel(dump_heat_kernel, order='F') # build the sparse affinity matrix W W = 
csc_matrix((G[:, 2], (G[:, 0], G[:, 1])), shape=(n_samples, n_samples)) bigger = np.transpose(W) > W W = W - W.multiply(bigger) + np.transpose(W).multiply(bigger) return W elif kwargs['weight_mode'] == 'cosine': # normalize the data first X_normalized = np.power(np.sum(X * X, axis=1), 0.5) for i in range(n_samples): X[i, :] = X[i, :] / max(1e-12, X_normalized[i]) # compute pairwise cosine distances D_cosine = np.dot(X, np.transpose(X)) # sort the distance matrix D in ascending order dump = np.sort(-D_cosine, axis=1) idx = np.argsort(-D_cosine, axis=1) idx_new = idx[:, 0:k + 1] dump_new = -dump[:, 0:k + 1] G = np.zeros((n_samples * (k + 1), 3)) G[:, 0] = np.tile(np.arange(n_samples), (k + 1, 1)).reshape(-1) G[:, 1] = np.ravel(idx_new, order='F') G[:, 2] = np.ravel(dump_new, order='F') # build the sparse affinity matrix W W = csc_matrix((G[:, 2], (G[:, 0], G[:, 1])), shape=(n_samples, n_samples)) bigger = np.transpose(W) > W W = W - W.multiply(bigger) + np.transpose(W).multiply(bigger) return W # choose supervised neighborMode elif kwargs['neighbor_mode'] == 'supervised': k = kwargs['k'] # get true labels and the number of classes y = kwargs['y'] label = np.unique(y) n_classes = np.unique(y).size # construct the weight matrix W in a fisherScore way, W_ij = 1/n_l if yi = yj = l, otherwise W_ij = 0 if kwargs['fisher_score'] is True: W = lil_matrix((n_samples, n_samples)) for i in range(n_classes): class_idx = (y == label[i]) class_idx_all = (class_idx[:, np.newaxis] & class_idx[np.newaxis, :]) W[class_idx_all] = 1.0 / np.sum(np.sum(class_idx)) return W # construct the weight matrix W in a reliefF way, NH(x) and NM(x,y) denotes a set of k nearest # points to x with the same class as x, a different class (the class y), respectively. W_ij = 1 if i = j; # W_ij = 1/k if x_j \in NH(x_i); W_ij = -1/(c-1)k if x_j \in NM(x_i, y) if kwargs['reliefF'] is True: # when xj in NH(xi) G = np.zeros((n_samples * (k + 1), 3)) id_now = 0 for i in range(n_classes): class_idx = np.column_stack(np.where(y == label[i]))[:, 0] D = pairwise_distances(X[class_idx, :]) D **= 2 idx = np.argsort(D, axis=1) idx_new = idx[:, 0:k + 1] n_smp_class = (class_idx[idx_new[:]]).size if len(class_idx) <= k: k = len(class_idx) - 1 G[id_now:n_smp_class + id_now, 0] = np.tile(class_idx, (k + 1, 1)).reshape(-1) G[id_now:n_smp_class + id_now, 1] = np.ravel(class_idx[idx_new[:]], order='F') G[id_now:n_smp_class + id_now, 2] = 1.0 / k id_now += n_smp_class W1 = csc_matrix((G[:, 2], (G[:, 0], G[:, 1])), shape=(n_samples, n_samples)) # when i = j, W_ij = 1 for i in range(n_samples): W1[i, i] = 1 # when x_j in NM(x_i, y) G = np.zeros((n_samples * k * (n_classes - 1), 3)) id_now = 0 for i in range(n_classes): class_idx1 = np.column_stack(np.where(y == label[i]))[:, 0] X1 = X[class_idx1, :] for j in range(n_classes): if label[j] != label[i]: class_idx2 = np.column_stack( np.where(y == label[j]))[:, 0] X2 = X[class_idx2, :] D = pairwise_distances(X1, X2) idx = np.argsort(D, axis=1) idx_new = idx[:, 0:k] n_smp_class = len(class_idx1) * k G[id_now:n_smp_class + id_now, 0] = np.tile(class_idx1, (k, 1)).reshape(-1) G[id_now:n_smp_class + id_now, 1] = np.ravel(class_idx2[idx_new[:]], order='F') G[id_now:n_smp_class + id_now, 2] = -1.0 / ((n_classes - 1) * k) id_now += n_smp_class W2 = csc_matrix((G[:, 2], (G[:, 0], G[:, 1])), shape=(n_samples, n_samples)) bigger = np.transpose(W2) > W2 W2 = W2 - W2.multiply(bigger) + np.transpose(W2).multiply(bigger) W = W1 + W2 return W if kwargs['weight_mode'] == 'binary': if kwargs['metric'] == 'euclidean': G = 
np.zeros((n_samples * (k + 1), 3)) id_now = 0 for i in range(n_classes): class_idx = np.column_stack(np.where(y == label[i]))[:, 0] # compute pairwise euclidean distances for instances in class i D = pairwise_distances(X[class_idx, :]) D **= 2 # sort the distance matrix D in ascending order for instances in class i idx = np.argsort(D, axis=1) idx_new = idx[:, 0:k + 1] n_smp_class = len(class_idx) * (k + 1) G[id_now:n_smp_class + id_now, 0] = np.tile(class_idx, (k + 1, 1)).reshape(-1) G[id_now:n_smp_class + id_now, 1] = np.ravel(class_idx[idx_new[:]], order='F') G[id_now:n_smp_class + id_now, 2] = 1 id_now += n_smp_class # build the sparse affinity matrix W W = csc_matrix((G[:, 2], (G[:, 0], G[:, 1])), shape=(n_samples, n_samples)) bigger = np.transpose(W) > W W = W - W.multiply(bigger) + np.transpose(W).multiply(bigger) return W if kwargs['metric'] == 'cosine': # normalize the data first X_normalized = np.power(np.sum(X * X, axis=1), 0.5) for i in range(n_samples): X[i, :] = X[i, :] / max(1e-12, X_normalized[i]) G = np.zeros((n_samples * (k + 1), 3)) id_now = 0 for i in range(n_classes): class_idx = np.column_stack(np.where(y == label[i]))[:, 0] # compute pairwise cosine distances for instances in class i D_cosine = np.dot(X[class_idx, :], np.transpose(X[class_idx, :])) # sort the distance matrix D in descending order for instances in class i idx = np.argsort(-D_cosine, axis=1) idx_new = idx[:, 0:k + 1] n_smp_class = len(class_idx) * (k + 1) G[id_now:n_smp_class + id_now, 0] = np.tile(class_idx, (k + 1, 1)).reshape(-1) G[id_now:n_smp_class + id_now, 1] = np.ravel(class_idx[idx_new[:]], order='F') G[id_now:n_smp_class + id_now, 2] = 1 id_now += n_smp_class # build the sparse affinity matrix W W = csc_matrix((G[:, 2], (G[:, 0], G[:, 1])), shape=(n_samples, n_samples)) bigger = np.transpose(W) > W W = W - W.multiply(bigger) + np.transpose(W).multiply(bigger) return W elif kwargs['weight_mode'] == 'heat_kernel': G = np.zeros((n_samples * (k + 1), 3)) id_now = 0 for i in range(n_classes): class_idx = np.column_stack(np.where(y == label[i]))[:, 0] # compute pairwise cosine distances for instances in class i D = pairwise_distances(X[class_idx, :]) D **= 2 # sort the distance matrix D in ascending order for instances in class i dump = np.sort(D, axis=1) idx = np.argsort(D, axis=1) idx_new = idx[:, 0:k + 1] dump_new = dump[:, 0:k + 1] t = kwargs['t'] # compute pairwise heat kernel distances for instances in class i dump_heat_kernel = np.exp(-dump_new / (2 * t * t)) n_smp_class = len(class_idx) * (k + 1) G[id_now:n_smp_class + id_now, 0] = np.tile(class_idx, (k + 1, 1)).reshape(-1) G[id_now:n_smp_class + id_now, 1] = np.ravel(class_idx[idx_new[:]], order='F') G[id_now:n_smp_class + id_now, 2] = np.ravel(dump_heat_kernel, order='F') id_now += n_smp_class # build the sparse affinity matrix W W = csc_matrix((G[:, 2], (G[:, 0], G[:, 1])), shape=(n_samples, n_samples)) bigger = np.transpose(W) > W W = W - W.multiply(bigger) + np.transpose(W).multiply(bigger) return W elif kwargs['weight_mode'] == 'cosine': # normalize the data first X_normalized = np.power(np.sum(X * X, axis=1), 0.5) for i in range(n_samples): X[i, :] = X[i, :] / max(1e-12, X_normalized[i]) G = np.zeros((n_samples * (k + 1), 3)) id_now = 0 for i in range(n_classes): class_idx = np.column_stack(np.where(y == label[i]))[:, 0] # compute pairwise cosine distances for instances in class i D_cosine = np.dot(X[class_idx, :], np.transpose(X[class_idx, :])) # sort the distance matrix D in descending order for instances in class i dump = 
np.sort(-D_cosine, axis=1) idx = np.argsort(-D_cosine, axis=1) idx_new = idx[:, 0:k + 1] dump_new = -dump[:, 0:k + 1] n_smp_class = len(class_idx) * (k + 1) G[id_now:n_smp_class + id_now, 0] = np.tile(class_idx, (k + 1, 1)).reshape(-1) G[id_now:n_smp_class + id_now, 1] = np.ravel(class_idx[idx_new[:]], order='F') G[id_now:n_smp_class + id_now, 2] = np.ravel(dump_new, order='F') id_now += n_smp_class # build the sparse affinity matrix W W = csc_matrix((G[:, 2], (G[:, 0], G[:, 1])), shape=(n_samples, n_samples)) bigger = np.transpose(W) > W W = W - W.multiply(bigger) + np.transpose(W).multiply(bigger) return W
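# Example call to construct_W above: a heat-kernel kNN affinity matrix on
# random data. Parameter values are illustrative; the returned matrix is a
# symmetric sparse (n_samples, n_samples) affinity.
import numpy as np

rng = np.random.RandomState(0)
X_demo = rng.rand(20, 4)
W = construct_W(X_demo, metric='euclidean', neighbor_mode='knn', k=5,
                weight_mode='heat_kernel', t=1.0)
print(W.shape, W.nnz)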
def test_weighted_dbscan(): # ensure sample_weight is validated with pytest.raises(ValueError): dbscan([[0], [1]], sample_weight=[2]) with pytest.raises(ValueError): dbscan([[0], [1]], sample_weight=[2, 3, 4]) # ensure sample_weight has an effect assert_array_equal( [], dbscan([[0], [1]], sample_weight=None, min_samples=6)[0] ) assert_array_equal( [], dbscan([[0], [1]], sample_weight=[5, 5], min_samples=6)[0] ) assert_array_equal( [0], dbscan([[0], [1]], sample_weight=[6, 5], min_samples=6)[0] ) assert_array_equal( [0, 1], dbscan([[0], [1]], sample_weight=[6, 6], min_samples=6)[0] ) # points within eps of each other: assert_array_equal( [0, 1], dbscan([[0], [1]], eps=1.5, sample_weight=[5, 1], min_samples=6)[0], ) # and effect of non-positive and non-integer sample_weight: assert_array_equal( [], dbscan([[0], [1]], sample_weight=[5, 0], eps=1.5, min_samples=6)[0] ) assert_array_equal( [0, 1], dbscan([[0], [1]], sample_weight=[5.9, 0.1], eps=1.5, min_samples=6)[ 0 ], # noqa ) assert_array_equal( [0, 1], dbscan([[0], [1]], sample_weight=[6, 0], eps=1.5, min_samples=6)[0], ) assert_array_equal( [], dbscan([[0], [1]], sample_weight=[6, -1], eps=1.5, min_samples=6)[ 0 ], # noqa ) # for non-negative sample_weight, cores should be identical to repetition rng = np.random.RandomState(42) sample_weight = rng.randint(0, 5, X.shape[0]) core1, label1 = dbscan(X, sample_weight=sample_weight) assert len(label1) == len(X) X_repeated = np.repeat(X, sample_weight, axis=0) core_repeated, _ = dbscan(X_repeated) core_repeated_mask = np.zeros(X_repeated.shape[0], dtype=bool) core_repeated_mask[core_repeated] = True core_mask = np.zeros(X.shape[0], dtype=bool) core_mask[core1] = True assert_array_equal(np.repeat(core_mask, sample_weight), core_repeated_mask) # sample_weight should work with precomputed distance matrix D = pairwise_distances(X) core3, label3 = dbscan( D, sample_weight=sample_weight, metric="precomputed" ) # noqa assert_array_equal(core1, core3) assert_array_equal(label1, label3) # sample_weight should work with estimator est = DBSCAN().fit(X, sample_weight=sample_weight) core4 = est.core_sample_indices_ label4 = est.labels_ assert_array_equal(core1, core4) assert_array_equal(label1, label4) est = DBSCAN() label5 = est.fit_predict(X, sample_weight=sample_weight) core5 = est.core_sample_indices_ assert_array_equal(core1, core5) assert_array_equal(label1, label5) assert_array_equal(label1, est.labels_)