def _find_nearest(query_vectors, embeds, n_nearby, batch_size):
    num_embeds = embeds.shape[0]
    num_batches = (num_embeds // batch_size) + (1 if num_embeds % batch_size != 0 else 0)
    nearest_k_fn = _make_nearest_k_fn(k=n_nearby)
    indices = []
    dists = []
    for b in range(num_batches):
        print('Running batch {} of {}\r'.format(1 + b, num_batches), end='')
        sys.stdout.flush()
        start = b * batch_size
        end = min(start + batch_size, embeds.shape[0])
        batch = embeds[start:end, :]
        batch_indices, batch_dists = zip(*pairwise_distances_chunked(
            query_vectors, batch, reduce_func=nearest_k_fn, metric='euclidean'))
        indices.append(np.concatenate(batch_indices) + start)
        dists.append(np.concatenate(batch_dists))

    # Combine results from each batch, sort them and take the n_nearby closest
    indices = np.concatenate(indices, axis=1)
    dists = np.concatenate(dists, axis=1)
    sorted_idx = np.argsort(dists, axis=1)[:, :n_nearby]
    indices = np.concatenate([[row[idx_row]] for row, idx_row in zip(indices, sorted_idx)])
    dists = np.concatenate([[row[idx_row]] for row, idx_row in zip(dists, sorted_idx)])
    return indices, dists
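
# The helper _make_nearest_k_fn is not shown above. A minimal sketch of what it
# plausibly builds (an assumption, not the original implementation): a reduce_func
# for pairwise_distances_chunked that keeps, for each query row of a distance
# chunk, the column indices and distances of its k closest points.
def _make_nearest_k_fn(k):
    def nearest_k(dist_chunk, start):
        # dist_chunk has shape (n_chunk_queries, n_batch_points)
        k_eff = min(k, dist_chunk.shape[1])
        idx = np.argpartition(dist_chunk, k_eff - 1, axis=1)[:, :k_eff]
        dist = np.take_along_axis(dist_chunk, idx, axis=1)
        return idx, dist
    return nearest_k
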
def silhouette_score_(X, labels, metric='euclidean'):
    '''The Silhouette Coefficient is calculated using the mean intra-cluster
    distance (``a``) and the mean nearest-cluster distance (``b``) for each
    sample. The Silhouette Coefficient for a sample is ``(b - a) / max(a, b)``.
    '''
    le = LabelEncoder()
    labels = le.fit_transform(labels)  # resequence labels: 0, 1, ...
    n_samples = len(labels)
    label_freqs = np.bincount(labels)
    # check_number_of_labels(len(le.classes_), n_samples)

    reduce_func = functools.partial(_silhouette_reduce,
                                    labels=labels, label_freqs=label_freqs)
    results = zip(*pairwise_distances_chunked(X, reduce_func=reduce_func,
                                              metric=metric))
    intra_clust_dists, inter_clust_dists = results
    intra_clust_dists = np.concatenate(intra_clust_dists)
    inter_clust_dists = np.concatenate(inter_clust_dists)

    denom = (label_freqs - 1).take(labels, mode='clip')
    with np.errstate(divide="ignore", invalid="ignore"):
        intra_clust_dists /= denom

    sil_samples = inter_clust_dists - intra_clust_dists
    with np.errstate(divide="ignore", invalid="ignore"):
        sil_samples /= np.maximum(intra_clust_dists, inter_clust_dists)

    # nan values are for clusters of size 1, and should be 0
    return (np.mean(np.nan_to_num(intra_clust_dists)),
            np.mean(np.nan_to_num(inter_clust_dists)),
            np.mean(np.nan_to_num(sil_samples)))
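
# _silhouette_reduce is a private scikit-learn helper; the sketch below is an
# approximation of it (an assumption, not copied from the library): for every
# sample in the distance chunk, accumulate the summed distance to each cluster,
# keep the own-cluster sum as the intra-cluster term and the smallest
# other-cluster mean as the nearest-cluster term.
def _silhouette_reduce(D_chunk, start, labels, label_freqs):
    n_chunk = len(D_chunk)
    n_clusters = len(label_freqs)
    clust_dists = np.zeros((n_chunk, n_clusters), dtype=D_chunk.dtype)
    for i in range(n_chunk):
        clust_dists[i] = np.bincount(labels, weights=D_chunk[i],
                                     minlength=n_clusters)
    # summed distance of each chunk sample to its own cluster
    intra_index = (np.arange(n_chunk), labels[start:start + n_chunk])
    intra_clust_dists = clust_dists[intra_index]
    # mean distance to every *other* cluster; the minimum is the nearest cluster
    clust_dists[intra_index] = np.inf
    clust_dists /= label_freqs
    inter_clust_dists = clust_dists.min(axis=1)
    return intra_clust_dists, inter_clust_dists
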
def min_radius(n, data, target, dist_metric):
    """Compute minimum radius of hypersphere such that for each example in the
    data matrix as the centre the sphere will contain at least n examples from
    same class and n examples from a different class.

    Parameters
    ----------
    n : minimum number of examples from same class and different class a
        hypersphere with centre in each example in the dataset should contain
    data : Matrix containing examples' features as rows
    target : Matrix of target variable values
    dist_metric : distance metric for distance matrix computation (see
        documentation on function pairwise_distances from scikit-learn for
        valid distance metric specifiers)

    Returns
    -------
    Minimum acceptable radius of the hypersphere

    Author: Jernej Vivod
    """
    # Allocate array for storing minimum acceptable radius for each example in dataset.
    min_r = np.empty(data.shape[0], dtype=float)

    # Construct distances matrix. Force generation by rows.
    dist_mat = sk_metrics.pairwise_distances_chunked(data, metric=dist_metric,
                                                     n_jobs=-1, working_memory=0)

    # Go over examples and compute minimum acceptable radius for each example.
    for k in np.arange(data.shape[0]):
        dist_from_e = next(dist_mat)[0]  # Get next row of distances matrix
        msk = target == target[k]        # Get mask for examples from same class.
        dist_same = dist_from_e[msk]     # Get minimum distance that includes n examples from same class.
        dist_diff = dist_from_e[~msk]    # Get minimum distance that includes n examples from different class.
        try:
            # Compute minimum radius for this example.
            min_r[k] = np.max((np.sort(dist_same)[n], np.sort(dist_diff)[n - 1]))
        except IndexError:
            raise ValueError(
                'Insufficient examples with class {0} for given value of n (n = {1})'
                .format(target[k], n))

    # Return maximum of array of minimum acceptable radiuses for each example.
    return np.max(min_r)
def get_score_for_item(content, all_content, taxon):
    embedded_sentences_for_taxon = get_embedded_sentences_for_taxon(all_content, taxon)
    if not embedded_sentences_for_taxon:
        return [], -1
    content_generator = pairwise_distances_chunked(
        X=[content],
        Y=embedded_sentences_for_taxon,
        working_memory=0,
        metric='cosine',
        n_jobs=-1)
    cosine_scores = list(enumerate(content_generator))[0][1][0]
    cosine_scores.sort()
    return cosine_scores, cosine_scores.mean()
def haus(data, sample, max_mem=float('inf')):
    if max_mem == float('inf'):
        dist = pairwise_distances(data[sample, :], data, n_jobs=-1)
        return dist.min(0).max()
    else:
        dists = pairwise_distances_chunked(data, data[sample, :],
                                           reduce_func=lambda x, y: x.min(1),
                                           working_memory=max_mem)
        h = 0
        for x in dists:
            h = max([h, max(x)])
            print(h)
        return h
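
# Quick usage sketch with synthetic data (not part of the original snippet):
# approximate the directed Hausdorff distance from the full dataset to a random
# subsample, with chunked distance computation capped at roughly 64 MB.
rng = np.random.default_rng(0)
data = rng.normal(size=(1000, 16))
sample = rng.choice(len(data), size=100, replace=False)
print(haus(data, sample, max_mem=64))
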
def _get_mus_scaling(self, range_scaling):
    reduce_func = partial(self._mus_scaling_reduce_func,
                          range_scaling=range_scaling)
    kwds = {'squared': True}
    chunked_results = list(
        pairwise_distances_chunked(self.X, self.X, reduce_func=reduce_func,
                                   metric='euclidean', n_jobs=self.njobs,
                                   working_memory=1024, **kwds))
    neigh_dist, neigh_ind, mus, rs = zip(*chunked_results)
    return (np.vstack(neigh_dist), np.vstack(neigh_ind),
            np.vstack(mus), np.vstack(rs))
def _return_mus_scaling(self, range_scaling):
    """Adapted from the kneighbors function of scikit-learn:
    https://github.com/scikit-learn/scikit-learn/blob/95119c13af77c76e150b753485c662b7c52a41a2/sklearn/neighbors/_base.py#L596

    It keeps a nearest-neighbor matrix only up to rank 'maxk' (a few tens of
    points) instead of 'range_scaling' (a few thousands), while still computing
    the ratios between neighbors' distances up to neighbor rank 'range_scaling'.
    For big datasets this avoids out-of-memory errors.

    Args:
        range_scaling (int): maximum neighbor rank considered in the
            computation of the mu ratios

    Returns:
        dist (np.ndarray(float)): the FULL distance matrix sorted in increasing
            order of neighbor distances up to maxk
        neigh_ind (np.ndarray(int)): the FULL matrix of the indices of the
            nearest neighbors up to maxk
        mus (np.ndarray(float)): the FULL matrix of the ratios of the neighbor
            distances of order 2**(i+1) and 2**i
        rs (np.ndarray(float)): the FULL matrix of the distances of the
            neighbors involved in the mu estimates
    """
    reduce_func = partial(self._mus_scaling_reduce_func,
                          range_scaling=range_scaling)
    kwds = {"squared": True}
    chunked_results = list(
        pairwise_distances_chunked(
            self.X,
            self.X,
            reduce_func=reduce_func,
            metric=self.metric,
            n_jobs=self.njobs,
            working_memory=1024,
            **kwds,
        ))
    neigh_dist, neigh_ind, mus, rs = zip(*chunked_results)
    return (
        np.vstack(neigh_dist),
        np.vstack(neigh_ind),
        np.vstack(mus),
        np.vstack(rs),
    )
def gen_pairwise_distances(label, **kwargs):
    """
    (1) Represent the presence of cuis in a binary array.
    (2) Calculate pairwise distances between all records in the array.
    (3) Select record pairs with distance below the threshold.
    """
    def apply_threshold(chunk, _):
        similar = chunk < self.distance_threshold
        return pd.DataFrame(similar, columns=self.preprocessed_data.index)

    binary = self.binarize(self.preprocessed_data[label],
                           sparse_output=self.sparse)
    return pairwise_distances_chunked(
        binary,
        metric=self.metric,
        reduce_func=apply_threshold,
        **kwargs,
    )
def func(i, X):
    t0 = perf_counter()
    # D = pairwise_distances(X, n_jobs=-1)
    dtype = [('values', 'f8'), ('indices', 'u8')]
    D = np.memmap(tmp_dir + f"D_{i}.memmap", dtype=dtype, mode="w+",
                  shape=(X.shape[0]**2, ))
    for j in range(0, X.shape[0]**2, chunk_size):
        D["indices"][j:j + chunk_size] = np.arange(j, j + chunk_size)
    row_ptr = 0
    for chunk in pairwise_distances_chunked(X, working_memory=chunk_size):
        length = chunk.ravel().shape[0]
        D["values"][row_ptr:row_ptr + length] = chunk.ravel()
        row_ptr += length
    t1 = perf_counter() - t0
    print(f"Done with distances ({i}) t={t1}")

    t0 = perf_counter()
    D.sort(order='values', axis=0)
    t1 = perf_counter() - t0
    print(f"Done with sorting ({i}) t={t1}")

    t0 = perf_counter()
    for j in range(0, X.shape[0]**2, chunk_size):
        idx = D["indices"][j:j + chunk_size]
        D["values"][idx] = np.arange(j + 1, j + chunk_size + 1)
    ranked = D["values"]
    # np.float64 avoids overflows in the computation that comes next
    # ranked = stats.rankdata(D["values"], method="ordinal").astype(np.float64)
    t1 = perf_counter() - t0
    print(f"Done with ranking ({i}) t={t1}")
    return ranked
def pairwise_jaccard_distances(doclist_X, doclist_Y=None):
    """Calculates the matrix with the pairwise Jaccard *DISTANCES* for one (or
    two) given document list(s). If `doclist_Y` is `None`, the matrix is
    created between every document of `doclist_X`, else between `doclist_X`
    and `doclist_Y`.

    Args:
        doclist_X (list of list of words): X - list of documents (words
            separated by empty space)
        doclist_Y (list of list of words, optional): Y - list of documents.
            Defaults to None.

    Returns:
        generator: generator of vertical chunks of the distance matrix
    """
    # Transform into sparse document-word-matrix
    wordlist_x = doclist_X
    wordlist_y = doclist_Y if doclist_Y is not None else []
    all_words = wordlist_x + wordlist_y

    # split into chunks
    lexicon = corpora.Dictionary()
    for index in range(0, len(all_words), 1000):
        lexicon.add_documents(all_words[index:index + 1000])
    lexicon.filter_extremes(no_below=5, no_above=1, keep_n=None)

    bow_x = []
    for t in wordlist_x:
        bow_x.append(lexicon.doc2bow(t))
    vocab_matrix_x = matutils.corpus2dense(
        bow_x, num_terms=len(lexicon.token2id)).T.astype(bool)

    if doclist_Y is not None:
        bow_y = []
        for t in wordlist_y:
            bow_y.append(lexicon.doc2bow(t))
        vocab_matrix_y = matutils.corpus2dense(
            bow_y, num_terms=len(lexicon.token2id)).T
    else:
        vocab_matrix_y = None

    # Calculate jaccard *DISTANCES* (no similarities yet)
    matrix_gen = pairwise_distances_chunked(vocab_matrix_x, vocab_matrix_y,
                                            metric='jaccard', n_jobs=-1,
                                            working_memory=256)
    return matrix_gen
def get_matrix_params(embeddings, *, ka, neighborhood_size, metric='euclidean',
                      n_jobs=1, working_memory=None):
    reduce_function = partial(reduce_distance_matrix_func, ka=ka,
                              neighborhood_size=neighborhood_size)
    neighbor_coors, neighbor_distances, kernel_width = list(
        map(
            np.concatenate,
            zip(*pairwise_distances_chunked(embeddings,
                                            reduce_func=reduce_function,
                                            metric=metric,
                                            n_jobs=n_jobs,
                                            working_memory=working_memory))))
    return neighbor_coors, neighbor_distances, kernel_width
def fit(self, X, y):
    self.Y = np.unique(y)
    self.L = len(self.Y)
    if self.L < 2:
        raise ValueError("There is only one unique value in target vector y.")
    Y_idx = [np.where(y == k) for k in self.Y]
    self.XY = [X[Y_idx[i]] for i in range(self.L)]
    self.A = np.zeros((self.L, self.L))
    for i in range(self.L):
        for j in range(i, self.L):
            Xi, Xj = self.XY[i], self.XY[j]
            ni, nj = Xi.shape[0], Xj.shape[0]
            self.A[i, j] = 1.0 / (ni * nj) * sum(
                np.sum(M) for M in pairwise_distances_chunked(Xi, Xj))
            if j > i:
                self.A[j, i] = self.A[i, j]
    if self.L > 2:
        k = self.L - 1
        self.B = np.zeros((k, k))
        for i in range(k):
            for j in range(i, k):
                self.B[i, j] = (-self.A[i, j] + self.A[i, k]
                                + self.A[k, j] - self.A[k, k])
                if j > i:
                    self.B[j, i] = self.B[i, j]
    return self
def predict(self, X):
    s = np.zeros(self.L)
    n = X.shape[0]
    for i in range(self.L):
        Xi = self.XY[i]
        ni = Xi.shape[0]
        s[i] = 1.0 / (ni * n) * sum(
            np.sum(M) for M in pairwise_distances_chunked(Xi, X))
    if self.L < 3:
        p = (s[1] - s[0] + self.A[0, 1] - self.A[1, 1]) / (
            -self.A[0, 0] + 2 * self.A[0, 1] - self.A[1, 1])
        if p < 0:
            return np.array([0, 1])
        if p > 1:
            return np.array([1, 0])
        return np.array([p, 1 - p])
    else:
        k = self.L - 1
        t = np.zeros(k)
        for i in range(k):
            t[i] = -s[i] + self.A[i, k] + s[k] - self.A[k, k]
        P = cvx.Variable(k)
        constraints = [P >= 0, cvx.sum(P) <= 1.0]
        problem = cvx.Problem(
            cvx.Minimize(cvx.quad_form(P, self.B) - 2 * P.T @ t),
            constraints)
        problem.solve()
        P = np.array(P.value).squeeze()
        return np.append(P, 1.0 - sum(P))
def corint(x, k1=10, k2=20, DM=None):
    n_elements = len(x)**2  # number of elements
    dists, _ = get_nn(x, k2)

    if DM is None:
        chunked_distmat = pairwise_distances_chunked(x)
    else:
        chunked_distmat = DM

    r1 = np.median(dists[:, k1 - 1])
    r2 = np.median(dists[:, -1])

    n_diagonal_entries = len(x)  # remove diagonal from sum count
    s1 = -n_diagonal_entries
    s2 = -n_diagonal_entries
    for chunk in chunked_distmat:
        s1 += (chunk < r1).sum()
        s2 += (chunk < r2).sum()

    Cr = np.array([s1 / n_elements, s2 / n_elements])
    estq = np.diff(np.log(Cr)) / np.log(r2 / r1)
    return estq
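
# get_nn is not defined in this snippet. A plausible implementation (an
# assumption) returns, for every point, the distances to and indices of its k
# nearest neighbours excluding the point itself, which matches the indexing
# dists[:, k1 - 1] and dists[:, -1] used above.
from sklearn.neighbors import NearestNeighbors

def get_nn(x, k):
    nbrs = NearestNeighbors(n_neighbors=k + 1).fit(x)
    dists, inds = nbrs.kneighbors(x)
    # drop each point's zero distance to itself
    return dists[:, 1:], inds[:, 1:]
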
def main(*, gtex):
    store = h5py.File(gtex, "r", rdcc_nbytes=CACHE_SIZE)
    genes = list(store["genes"][:])
    expressions = store["expressions"]
    n_genes, n_cells = expressions.shape

    Z = transform_rows(expressions, lambda x: np.log10(1 + x), CHUNK_SIZE)
    Z = Z - np.mean(Z, axis=1)[:, None]

    gene_indices = np.arange(n_genes, dtype=np.int32)
    threshold = R2_THRESHOLD
    top_k = MAX_NEIGHBORS

    i = 0
    for chunk in pairwise_distances_chunked(Z, metric="cosine", working_memory=0):
        # Transform cosine distance to cosine correlation.
        score = 1 - chunk[0]
        # Adjust correlation against bias.
        score = score - score.mean()
        # Then, obtain a score similar to coefficient of determination.
        score = score**2
        # Do not count for self correlation.
        score[i] = 0
        tops = np.argsort(score)[::-1][:top_k]
        tops = tops[score[tops] > threshold]
        tops_text = " ".join(f"{j}" for j in tops)
        print(f"{i}: {tops_text}")
        i += 1
def _corrint(self, X):
    n_elements = len(X) ** 2  # number of elements
    dists, _ = get_nn(X, min(self.k2, len(X) - 1))

    if self.DM is False:
        chunked_distmat = pairwise_distances_chunked(X)
    else:
        chunked_distmat = X

    r1 = np.median(dists[:, min(self.k1 - 1, len(X) - 2)])
    r2 = np.median(dists[:, -1])

    n_diagonal_entries = len(X)  # remove diagonal from sum count
    s1 = -n_diagonal_entries
    s2 = -n_diagonal_entries
    for chunk in chunked_distmat:
        s1 += (chunk < r1).sum()
        s2 += (chunk < r2).sum()

    Cr = np.array([s1 / n_elements, s2 / n_elements])
    estq = np.diff(np.log(Cr)) / np.log(r2 / r1)
    return estq[0]
def kneighbors(self, X, n_neighbors=None, return_distance=True): """Find the K-neighbors of a point. Returns indices of and distances to the neighbors of each point. Parameters ---------- X : sktime-format pandas dataframe with shape([n_cases,n_dimensions]), or numpy ndarray with shape([n_cases,n_readings,n_dimensions]) y : {array-like, sparse matrix} Target values of shape = [n_samples] n_neighbors : int Number of neighbors to get (default is the value passed to the constructor). return_distance : boolean, optional. Defaults to True. If False, distances will not be returned Returns ------- dist : array Array representing the lengths to points, only present if return_distance=True ind : array Indices of the nearest points in the population matrix. """ self.check_is_fitted() # Transpose to work correctly with distance functions X = X.transpose((0, 2, 1)) if n_neighbors is None: n_neighbors = self.n_neighbors elif n_neighbors <= 0: raise ValueError("Expected n_neighbors > 0. Got %d" % n_neighbors) else: if not np.issubdtype(type(n_neighbors), np.integer): raise TypeError( "n_neighbors does not take %s value, " "enter integer value" % type(n_neighbors) ) if X is not None: query_is_train = False X = check_array(X, accept_sparse="csr", allow_nd=True) else: query_is_train = True X = self._fit_X # Include an extra neighbor to account for the sample itself being # returned, which is removed later n_neighbors += 1 train_size = self._fit_X.shape[0] if n_neighbors > train_size: raise ValueError( "Expected n_neighbors <= n_samples, " " but n_samples = %d, n_neighbors = %d" % (train_size, n_neighbors) ) n_samples = X.shape[0] sample_range = np.arange(n_samples)[:, None] n_jobs = effective_n_jobs(self.n_jobs) if self._fit_method == "brute": reduce_func = partial( self._kneighbors_reduce_func, n_neighbors=n_neighbors, return_distance=return_distance, ) # for efficiency, use squared euclidean distances kwds = ( {"squared": True} if self.effective_metric_ == "euclidean" else self.effective_metric_params_ ) result = pairwise_distances_chunked( X, self._fit_X, reduce_func=reduce_func, metric=self.effective_metric_, n_jobs=n_jobs, **kwds ) else: raise ValueError("internal: _fit_method not recognized") if return_distance: dist, neigh_ind = zip(*result) result = np.vstack(dist), np.vstack(neigh_ind) else: result = np.vstack(result) if not query_is_train: return result else: # If the query data is the same as the indexed data, we would like # to ignore the first nearest neighbor of every sample, i.e # the sample itself. if return_distance: dist, neigh_ind = result else: neigh_ind = result sample_mask = neigh_ind != sample_range # Corner case: When the number of duplicates are more # than the number of neighbors, the first NN will not # be the sample, but a duplicate. # In that case mask the first duplicate. dup_gr_nbrs = np.all(sample_mask, axis=1) sample_mask[:, 0][dup_gr_nbrs] = False neigh_ind = np.reshape(neigh_ind[sample_mask], (n_samples, n_neighbors - 1)) if return_distance: dist = np.reshape(dist[sample_mask], (n_samples, n_neighbors - 1)) return dist, neigh_ind return neigh_ind
vectordf = df[0].apply(lambda x: list(map(int, x)))
final = pd.DataFrame(vectordf)[0].apply(pd.Series)

# removing more garbage from memory
del(df)
del(vectordf)
gc.collect()

print("Performing hamming distance analysis on high frequency kmers. {}".format(time.asctime()))

if myargs.temp:
    # writes distance matrix to specified location
    filename = os.path.join(myargs.temp, "distance.dat")
    kmerdist = np.memmap(filename, dtype='float32', mode='w+',
                         shape=(comb(final.shape[0], 2, exact=True),))
    if myargs.mem:
        dist_gen = pairwise_distances_chunked(final, metric='hamming',
                                              n_jobs=-1,
                                              working_memory=myargs.mem)
    position = final.shape[0] - 1
    remaining = 1
    total = 0
    del(kmerdist)
    for temp in dist_gen:
        print("Processing distance matrix chunk. {}".format(time.asctime()))
        kmerdist = np.memmap(filename, dtype='float32', mode='r+')
        for r in range(len(temp)):
            kmerdist[total:total + position] = temp[r][remaining:]
            total += position
            position -= 1
            remaining += 1
        del(kmerdist)
        gc.collect()
    kmerdist = np.memmap(filename, dtype='float32', mode='r')
def kneighbors(self, X, n_neighbors=None, sketch_method=None,
               candidates_scale=None, return_distance=False):
    """Fast approximate K-neighbors search for each point using sketches.

    Returns indices of and distances to the neighbors of each point.

    Parameters
    ----------
    X : array-like, shape (n_query, n_features)
        The query point or points.
    n_neighbors : :obj:`int`
        Number of neighbors to get.
    sketch_method : {:obj:`None`, 'symmetric', 'asymmetric', 'g_asymmetric', 'PCA'}, default = :obj:`None`
        Method used to filter candidates before ranking the real distances.
        If a non-None value was passed to the constructor, this value is
        ignored. If both the constructor and this method get None, no sketch
        filter is used and the search behaves like normal KNN. See the
        constructor for more details.
    candidates_scale : :obj:`int`, default is the value passed to the constructor
        Scale up n_neighbors as the number of candidates when filtering using
        the sketch.
    return_distance : :obj:`boolean`, default = :obj:`False`
        If False, distances will not be returned.

    Returns
    -------
    dist : :obj:`array`
        Array representing the lengths to points, only present if
        return_distance = :obj:`True`
    ind : :obj:`array`
        Indices of the nearest points in the population matrix.
    """
    check_is_fitted(self, ["_fit_X"])
    if n_neighbors is None:
        n_neighbors = self.n_neighbors
    X = check_array(X)
    if self.sketch_method is not None:
        sketch_method = self.sketch_method

    # reduce_func for neighbors
    reduce_func_k = partial(self._kneighbors_reduce_func,
                            n_neighbors=n_neighbors,
                            return_distance=return_distance)
    kwds = {'squared': True}
    n_jobs = effective_n_jobs(self.n_jobs)

    # find candidates
    if sketch_method is None:  # KNN
        pass
    else:
        if candidates_scale is None:
            candidates_scale = self.candidates_scale
        n_candidates = self.n_neighbors * candidates_scale
        reduce_func_1 = partial(self._kneighbors_reduce_func,
                                n_neighbors=n_candidates,
                                return_distance=False)
        if sketch_method == 'symmetric':
            sketch_X = self._sketch(X)
            candidates = list(
                pairwise_distances_chunked(sketch_X, self._sketch_X,
                                           reduce_func=reduce_func_1,
                                           metric=paired_hamming_distance,
                                           n_jobs=n_jobs))
        elif sketch_method == 'asymmetric':
            # TODO: sketch X (query points)
            sketch_X, weight = self._sketch(X, return_weight=True)
            # encode sketch_X and weight together
            _sketch_X_weight = sketch_X + weight
            # TODO: filter candidates
            candidates = list(
                pairwise_distances_chunked(_sketch_X_weight, self._sketch_X,
                                           reduce_func=reduce_func_1,
                                           metric=paired_asymmetric_distance,
                                           n_jobs=n_jobs))
        elif sketch_method == 'PCA':
            # sketch X (query points)
            sketch_X = self._pca.transform(X)
            # filter candidates
            candidates = list(
                pairwise_distances_chunked(sketch_X, self._pca_X,
                                           reduce_func=reduce_func_1,
                                           metric=self.effective_metric_,
                                           n_jobs=n_jobs, **kwds))
        elif sketch_method == 'g_asymmetric':
            # TODO: sketch X (query points)
            sketch_X, weight, g_sketch_X, g_weight = self._sketch(
                X, return_weight=True, return_label=True)
            # encode sketch_X and weight together
            _sketch_X_weight = sketch_X + weight
            # TODO: filter label
            Candidate_inds = []
            for g_sketch_X_i, g_weight_i in zip(g_sketch_X, g_weight):
                # for each query point, get its labels
                labels = self._getlabels(g_weight_i, g_sketch_X_i,
                                         self.g_threshold)
                inds = set()
                for label in labels:
                    inds |= self._g_dict[label]
                sketchlist = g_sketch_X_i
                wlist = g_weight_i
                while len(inds) < n_candidates:
                    wlist = [w if w >= self.g_threshold else math.inf
                             for w in g_weight_i]
                    ind = np.argmin(wlist)  # find index of smallest weight
                    sketchlist[ind] = 1 - sketchlist[ind]
                    label = ''.join(str(elm) for elm in sketchlist)
                    inds |= self._g_dict[label]
                del wlist
                # row numbers of data points that matched query point's labels
                Candidate_inds.append(inds)
            # TODO: filter candidates
            candidates = []
            for i in range(len(Candidate_inds)):  # for each query point
                candidate_inds = sorted(list(Candidate_inds[i]))  # matched inds
                tmp1 = self._sketch_X[candidate_inds, :]
                tmp2 = _sketch_X_weight[[i]]
                iinds = list(
                    pairwise_distances_chunked(tmp2, tmp1,
                                               reduce_func=reduce_func_1,
                                               metric=paired_asymmetric_distance,
                                               n_jobs=n_jobs))
                iinds[0][0] = np.array(
                    [candidate_inds[ii] for ii in list(iinds[0][0])])
                candidates += iinds
        else:
            raise ValueError(
                "{0} sketch_method has not been implemented.".format(
                    sketch_method))
        candidates = np.vstack(candidates)

    # result to return
    if return_distance:
        dists = np.empty([0, n_neighbors])
    neight_inds = np.empty([0, n_neighbors], dtype=int)

    # find neighbors
    if sketch_method is None:  # KNN
        # find neighbors from all data points
        result = list(
            pairwise_distances_chunked(X, self._fit_X,
                                       reduce_func=reduce_func_k,
                                       metric=self.effective_metric_,
                                       n_jobs=n_jobs, **kwds))
        if return_distance:
            dist, neigh_ind = zip(*result)
            result = np.vstack(dist), np.vstack(neigh_ind)
        else:
            result = np.vstack(result)
    else:
        # find neighbors from the candidate points
        for i in range(len(candidates)):
            result = list(
                pairwise_distances_chunked(X[[i], :],
                                           self._fit_X[candidates[i]],
                                           reduce_func=reduce_func_k,
                                           metric=self.effective_metric_,
                                           n_jobs=n_jobs, **kwds))
            if return_distance:
                dist, neigh_ind = zip(*result)
                dist = np.vstack(dist)
                neigh_ind = candidates[i][np.vstack(neigh_ind).reshape(-1)]
                dists = np.concatenate((dists, dist), axis=0)
                neight_inds = np.vstack((neight_inds, neigh_ind))
            else:
                neigh_ind = candidates[i][np.vstack(result)[0]]
                neight_inds = np.vstack((neight_inds, neigh_ind))
        if return_distance:
            result = dists, neight_inds
        else:
            result = neight_inds

    return result
# ------------------------------------------------------------------
# Generate taxon_branch_homogeneity_scores.csv
# ------------------------------------------------------------------

# Calculate size and mean cosine scores for each branch
branch_homogeneity = []
for branch in labelled.level1taxon.unique():
    total_cosine_for_branch = np.zeros(1)
    denominator_for_branch = np.zeros(1)
    branch_embeddings = embedded_clean_content[labelled['level1taxon'] == branch]
    branch_size = branch_embeddings.shape[0]
    for chunk in pairwise_distances_chunked(branch_embeddings,
                                            metric='cosine', n_jobs=-1):
        total_cosine_for_branch += np.sum(chunk)
        denominator_for_branch += np.prod(chunk.shape)
    mean_cosine_for_branch = (total_cosine_for_branch / denominator_for_branch).item()
    branch_homogeneity.append([branch, branch_size, mean_cosine_for_branch])

# Put mean cosine scores for each branch into a sorted dataframe
branch_homogeneity_scores = pd.DataFrame(
    branch_homogeneity,
    columns=['branch', 'branch_size', 'mean_cosine_score']
).sort_values('mean_cosine_score', ascending=False)

# Add weighted score
branch_homogeneity_scores['min_max_branch_size'] = (
    branch_homogeneity_scores['branch_size']
    - branch_homogeneity_scores['branch_size'].min()) / (
        branch_homogeneity_scores['branch_size'].max()
        - branch_homogeneity_scores['branch_size'].min())
def kneighbors(self, X, n_neighbors=None, return_distance=True): """Finds the K-neighbors of a point. Returns indices of and distances to the neighbors of each point. Parameters ---------- X : sktime-format pandas dataframe with shape([n_cases,n_dimensions]), or numpy ndarray with shape([n_cases,n_readings,n_dimensions]) y : {array-like, sparse matrix} Target values of shape = [n_samples] n_neighbors : int Number of neighbors to get (default is the value passed to the constructor). return_distance : boolean, optional. Defaults to True. If False, distances will not be returned Returns ------- dist : array Array representing the lengths to points, only present if return_distance=True ind : array Indices of the nearest points in the population matrix. """ self.check_is_fitted() X = check_X(X, enforce_univariate=False) X = nested_to_3d_numpy(X) if n_neighbors is None: n_neighbors = self.n_neighbors elif n_neighbors <= 0: raise ValueError( "Expected n_neighbors > 0. Got %d" % n_neighbors ) else: if not np.issubdtype(type(n_neighbors), np.integer): raise TypeError( "n_neighbors does not take %s value, " "enter integer value" % type(n_neighbors)) if X is not None: query_is_train = False X = check_array(X, accept_sparse='csr', allow_nd=True) else: query_is_train = True X = self._fit_X # Include an extra neighbor to account for the sample itself being # returned, which is removed later n_neighbors += 1 train_size = self._fit_X.shape[0] if n_neighbors > train_size: raise ValueError( "Expected n_neighbors <= n_samples, " " but n_samples = %d, n_neighbors = %d" % (train_size, n_neighbors) ) n_samples = X.shape[0] sample_range = np.arange(n_samples)[:, None] n_jobs = effective_n_jobs(self.n_jobs) if self._fit_method == 'brute': reduce_func = partial(self._kneighbors_reduce_func, n_neighbors=n_neighbors, return_distance=return_distance) # for efficiency, use squared euclidean distances kwds = ({'squared': True} if self.effective_metric_ == 'euclidean' else self.effective_metric_params_) result = pairwise_distances_chunked( X, self._fit_X, reduce_func=reduce_func, metric=self.effective_metric_, n_jobs=n_jobs, **kwds) elif self._fit_method in ['ball_tree', 'kd_tree']: if issparse(X): raise ValueError( "%s does not work with sparse matrices. Densify the data, " "or set algorithm='brute'" % self._fit_method) if LooseVersion(joblib_version) < LooseVersion('0.12'): # Deal with change of API in joblib delayed_query = delayed(self._tree.query, check_pickle=False) parallel_kwargs = {"backend": "threading"} else: delayed_query = delayed(self._tree.query) parallel_kwargs = {"prefer": "threads"} result = Parallel(n_jobs, **parallel_kwargs)( delayed_query( X[s], n_neighbors, return_distance) for s in gen_even_slices(X.shape[0], n_jobs) ) else: raise ValueError("internal: _fit_method not recognized") if return_distance: dist, neigh_ind = zip(*result) result = np.vstack(dist), np.vstack(neigh_ind) else: result = np.vstack(result) if not query_is_train: return result else: # If the query data is the same as the indexed data, we would like # to ignore the first nearest neighbor of every sample, i.e # the sample itself. if return_distance: dist, neigh_ind = result else: neigh_ind = result sample_mask = neigh_ind != sample_range # Corner case: When the number of duplicates are more # than the number of neighbors, the first NN will not # be the sample, but a duplicate. # In that case mask the first duplicate. 
dup_gr_nbrs = np.all(sample_mask, axis=1) sample_mask[:, 0][dup_gr_nbrs] = False neigh_ind = np.reshape( neigh_ind[sample_mask], (n_samples, n_neighbors - 1)) if return_distance: dist = np.reshape( dist[sample_mask], (n_samples, n_neighbors - 1)) return dist, neigh_ind return neigh_ind
A = np.load("audio.npy") #A = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) threshold = 6e-12 def get_edges(D_chunk, start): below_threshold = np.where(D_chunk < threshold) edges = np.array(below_threshold) edges[0] += start dist = D_chunk[below_threshold] return edges, dist g = pairwise_distances_chunked(A, metric='cosine') edge_source = [] edge_dest = [] edge_weight = [] start = 0 while 1: try: D_chunk = next(g) edges, dist = get_edges(D_chunk, start) start += D_chunk.shape[0] edge_source.extend(edges[0]) edge_dest.extend(edges[1]) edge_weight.extend(dist) except StopIteration: break
from dask.multiprocessing import get
from multiprocessing import cpu_count
nCores = cpu_count()

from sklearn.metrics import pairwise_distances_chunked
from sklearn.metrics.pairwise import euclidean_distances
from scipy.spatial.distance import wminkowski


def rf(dchunk, start):
    return dchunk.argsort(axis=1)[:, :knnk]


t1 = time.time()
distMat = pairwise_distances_chunked(kdf, distSample, reduce_func=rf,
                                     metric='nan_euclidean', n_jobs=-1,
                                     force_all_finite=False)
distMat = list(distMat)[0]
imputeIndices = pd.DataFrame(distMat)
t2 = time.time()
print(t2 - t1)


def KNN(kdf):
    # dist = euclidean_distances(kdf, distSample)
    dist = pairwise_distances(kdf, distSample, metric='nan_euclidean',
                              force_all_finite=False)
    indices = dist.argsort(axis=1)[:, :knnk]
    # kdf = kdf[].fillna()
    return indices


kdf.shape
import numpy as np
from sklearn.metrics import pairwise_distances_chunked
import pandas as pd

df = pd.read_csv("data/human/04_sl_input/cell_line_compare_expr.csv")
X = df.iloc[:, 1:10721].transpose()
gen = pairwise_distances_chunked(X)
gen2 = next(gen)  # this was enough
# gen3 = next(gen)
np.savetxt("cell_dist_mat.txt", gen2)
def _evaluate_map(data_loader, model, writer, epoch, logging_label, no_cuda,
                  log_interval, map, **kwargs):
    """
    The evaluation routine

    Parameters
    ----------
    data_loader : torch.utils.data.DataLoader
        The dataloader of the evaluation set
    model : torch.nn.module
        The network model being used
    writer : tensorboardX.writer.SummaryWriter
        The tensorboard writer object. Used to log values on file for the
        tensorboard visualization.
    epoch : int
        Number of the epoch (for logging purposes)
    logging_label : string
        Label for logging purposes. Typically 'test' or 'valid'.
        It's prepended to the logging output path and messages.
    no_cuda : boolean
        Specifies whether the GPU should be used or not. A value of 'True'
        means the CPU will be used.
    log_interval : int
        Interval limiting the logging of mini-batches. Default value of 10.
    map : str
        Specify value for mAP computation. Possible values are ("auto",
        "full", or specify K for AP@K)

    Returns
    -------
    mAP : float
        Mean average precision evaluated on this split
    """
    multi_run = kwargs['run'] if 'run' in kwargs else None

    # Switch to evaluate mode (turn off dropout & such)
    model.eval()

    labels, outputs = [], []

    # For use with the multi-crop transform
    multi_crop = False

    # Iterate over whole evaluation set
    pbar = tqdm(enumerate(data_loader), total=len(data_loader), unit='batch',
                ncols=150, leave=False)
    with torch.no_grad():
        for batch_idx, (data, label) in pbar:
            # Check if data is provided in multi-crop form and process accordingly
            if len(data.size()) == 5:
                multi_crop = True
                bs, ncrops, c, h, w = data.size()
                data = data.view(-1, c, h, w)

            if not no_cuda:
                data = data.cuda()

            # Compute output
            out = model(data)

            if multi_crop:
                out = out.view(bs, ncrops, -1).mean(1)

            # Store output
            outputs.append(out.data.cpu().numpy())
            labels.append(label.data.cpu().numpy())

            # Log progress to console
            if batch_idx % log_interval == 0:
                pbar.set_description(
                    logging_label + ' Epoch: {} [{}/{} ({:.0f}%)]'.format(
                        epoch, batch_idx * len(data),
                        len(data_loader.dataset),
                        100. * batch_idx / len(data_loader)))

    # Measure accuracy (FPR95)
    num_tests = len(data_loader.dataset.file_names)
    labels = np.concatenate(labels, 0).reshape(num_tests)
    outputs = np.concatenate(outputs, 0)

    # Cosine similarity distance
    distances = pairwise_distances_chunked(outputs, metric='cosine', n_jobs=16)
    logging.debug('Computed pairwise distances')

    t = time.time()
    mAP, per_class_mAP = compute_mapk(distances, labels, k=map)
    writer.add_text('Per class mAP at epoch {}\n'.format(epoch),
                    json.dumps(per_class_mAP, indent=2, sort_keys=True))
    logging.debug('Completed evaluation of mAP in {}'.format(
        datetime.timedelta(seconds=int(time.time() - t))))

    logging.info('\33[91m ' + logging_label + ' set: mAP: {}\n\33[0m'.format(mAP))

    # Logging the epoch-wise accuracy
    if multi_run is None:
        writer.add_scalar(logging_label + '/mAP', mAP, epoch)
    else:
        writer.add_scalar(logging_label + '/mAP{}'.format(multi_run), mAP, epoch)

    return mAP
def create_intra_samples_model(self):
    """Create intra sample model

    This model computes depth correlation within samples. It will be used
    later to compute a new intra z-score in a new sample.
    """
    # Keep a row every `step` lines.
    # Reset index because we are going to work on an integer index.
    logging.info(f"Create intra model")
    sub_raw = self.raw.reset_index()
    # sub_raw = sub_raw[sub_raw.index % self.sampling == 0]

    # Create mask index.
    # This is used to avoid pairwise comparison within the same name.
    # For example, if name is [A,A,A,B,B,C], it computes the following mask:
    #       A A A B B C
    #    A  0 0 0 1 1 0
    #    A  0 0 0 1 1 0
    #    A  0 0 0 1 1 0
    #    B  1 1 1 0 0 1
    #    B  1 1 1 0 0 1
    #    C  1 1 1 1 1 0
    index = sub_raw["name"]
    mask = np.array([i[0] == i[1] for i in product(index, index)
                     ]).reshape(len(index), len(index))

    # return to multiindex
    sub_raw = sub_raw.set_index(["name", "chrom", "pos"])

    def _reduce(chunk, start):
        """This function is called internally by pairwise_distances_chunked.
        @see https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise_distances_chunked.html

        It looks for the maximum correlation value in the chunk matrix and
        returns its id. Same-name pairs are skipped by the mask.

        For example:
               A    B    C
            A  NA   0.9  0.8
            B  0.5  NA   0.4
            C  0.3  0.7  NA

        will return a dataframe:
            id  idx  corr
            A   B    0.9
            B   C    0.4
            C   B    0.9
        """
        # skip na values
        chunk[np.isnan(chunk)] = 1
        # the correlation metric from sklearn is 1 - corr
        chunk = 1 - chunk
        rows_size = chunk.shape[0]
        select_mask = mask[start:start + rows_size]
        # look for the id of the maximum correlation value
        idx = np.argmax(np.ma.masked_array(chunk, select_mask), axis=1)
        # We only have idx; now get the correlation value
        corr = []
        for i, index in enumerate(idx):
            corr.append(chunk[i][index])
        # Create a dataframe
        return pd.DataFrame({"idx": idx, "corr": corr},
                            index=range(start, start + rows_size))

    # Perform pairwise correlation using pairwise_distances_chunked to avoid memory limits
    all_reduce_chunk = []
    # -1 means all jobs
    for chunk in pairwise_distances_chunked(sub_raw, metric="correlation",
                                            reduce_func=_reduce,
                                            n_jobs=self.threads):
        all_reduce_chunk.append(chunk)

    self.intra_model = pd.concat(all_reduce_chunk)

    ss = sub_raw.reset_index(drop=True)

    # avoid warning: polynomial.py:630: RuntimeWarning: invalid value encountered in true_divide
    with np.errstate(divide="ignore", invalid="ignore"):
        for i, row in self.intra_model.iterrows():
            j = row["idx"]
            x = ss.loc[i, :]
            y = ss.loc[j, :]
            try:
                coef, intercept = tuple(np.polyfit(x, y, 1))
                yp = x * coef + intercept
                error = yp - y
                std = error.std()
            except:
                coef, intercept = 0, 0
                std = np.NaN

            self.intra_model.loc[i, "coef"] = coef
            self.intra_model.loc[i, "intercept"] = intercept
            self.intra_model.loc[i, "std2"] = std

    self.intra_model = self.intra_model.set_index(sub_raw.index)
def interlist_diversity(predicted_results: pd.DataFrame, click_column: str, k: int,
                        user_id_column: str = Constants.user_id,
                        item_id_column: str = Constants.item_id,
                        user_sample_size: Union[int, float, None] = 10000,
                        seed: int = Constants.default_seed,
                        metric: Union[str, Callable] = 'cosine',
                        num_runs: int = 10,
                        n_jobs: int = 1,
                        working_memory: int = None) -> Tuple[float, int]:
    """
    Inter-List Diversity@k measures the inter-list diversity of the recommendations when only k recommendations are
    made to the user. It measures how users' lists of recommendations differ from each other. This metric has a
    range in :math:`[0, 1]`. The higher this metric is, the more diversified lists of items are recommended to
    different users.

    Let :math:`U` denote the set of :math:`N` unique users, :math:`u_i`, :math:`u_j \in U` denote the i-th and j-th
    user in the user set, :math:`i, j \in \{1,2,\cdots,N\}`. :math:`R_{u_i}` is the binary indicator vector
    representing provided recommendations for :math:`u_i`. :math:`I` is the set of all unique user pairs,
    :math:`\\forall~i<j, \{u_i, u_j\} \in I`.

    .. math::
        Inter \mbox{-} list~diversity = \\frac{\sum_{i,j, \{u_i, u_j\} \in I}(cosine\_distance(R_{u_i}, R_{u_j}))}{|I|}

    By default, the reported metric is averaged over a number of ``num_runs`` (default=10) evaluations with each run
    using ``user_sample_size`` (default=10000) users, to ease the computing process and meanwhile get a close
    approximation of this metric. When ``user_sample_size=None``, all users will be used in evaluation.

    Parameters
    ----------
    predicted_results: pd.DataFrame
        Recommendations data frame with (user_id, item_id, score) in each row.
    k: int
        Top-k recommendations to consider.
    user_id_column: str
        User id column name.
    item_id_column: str
        Item id column name.
    click_column: str
        Recommendation score column name.
    user_sample_size: Union[int, float, None]
        When input is an integer, it defines the number of randomly sampled users. When input is float, it defines
        the proportion of users to randomly sample for evaluation. If it is None, all users are included.
        Default=10,000.
    seed: int
        The seed used to create random state.
    metric: Union[str, Callable]
        Default = 'cosine'. The distance metric leveraged by sklearn.metrics.pairwise_distances_chunked.
        The metric to use when calculating distance between instances in a feature array. If metric is a string,
        it must be one of the options allowed by scipy.spatial.distance.pdist for its metric parameter, or a metric
        listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS. If metric is a callable function, it is called on each pair
        of instances (rows) and the resulting value recorded. The callable should take two arrays from X as input
        and return a value indicating the distance between them.
    num_runs: int
        num_runs is used to report the approximation of Inter-List Diversity over multiple runs on smaller samples
        of users, default=10, for a speed-up on evaluations. The sampling size is defined by user_sample_size.
        The final result is averaged over the multiple runs.
    n_jobs: int
        Number of jobs to use for computation in parallel, leveraged by sklearn.metrics.pairwise_distances_chunked.
        -1 means using all processors. Default=1.
    working_memory: Union[int, None]
        Maximum memory for temporary distance matrix chunks, leveraged by sklearn.metrics.pairwise_distances_chunked.
        Example input: working_memory = 1024. When None (default), the value of
        sklearn.get_config()['working_memory'], i.e. 1024M, is used.

    Returns
    -------
    Inter-list diversity metric, number of unique users as the support to get the metric
    """

    # Sample users
    if user_sample_size is not None:
        results_over_runs = []
        supports_over_runs = []

        # Create a different seed for each run
        rng = np.random.default_rng(seed)
        seeds = rng.integers(0, num_runs * 10, num_runs)

        for i in range(num_runs):
            df = sample_users(predicted_results, user_id_column,
                              user_sample_size, seed=seeds[i])

            res, support = interlist_diversity(df, click_column, k,
                                               user_id_column=user_id_column,
                                               item_id_column=item_id_column,
                                               user_sample_size=None,
                                               metric=metric, n_jobs=n_jobs,
                                               working_memory=working_memory)
            results_over_runs.append(res)
            supports_over_runs.append(support)

        inter_list_diversity = np.mean(results_over_runs)
        support = int(np.mean(supports_over_runs))

        return inter_list_diversity, support

    df = predicted_results

    # Sort by user and score, and take the top k scores.
    df = get_sorted_clicks(df, user_id_column, click_column, k)

    # Given user/item id column names, create sparse matrix as the new representation of user-item interactions.
    sparse_matrix = tocsr(df, user_id_column, item_id_column)

    # Get pairwise cosine distances
    chunked_sum_cosine_distances = map(
        sum,
        pairwise_distances_chunked(sparse_matrix, reduce_func=reduce_func,
                                   metric=metric, n_jobs=n_jobs,
                                   working_memory=working_memory))

    # Sum of all cosine distances of unique pairs
    sum_cosine_distances = sum(list(chunked_sum_cosine_distances)) / 2.0

    # Get the number of pairs
    num_pairs = np.sum(range(sparse_matrix.shape[0]))

    # Calculate metric
    if num_pairs == 0:
        inter_list_diversity = np.nan
        warnings.warn(
            'Inter-List Diversity will be nan when there is only one single user.')
    else:
        inter_list_diversity = sum_cosine_distances / num_pairs
        if np.abs(inter_list_diversity) <= 1e-06:
            inter_list_diversity = 0.0

    # Calculate support, set it to be the number of users
    support = len(df[user_id_column].unique())

    return inter_list_diversity, support
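
# `reduce_func` is referenced above but not defined in this snippet. A minimal
# sketch of its assumed behaviour: it collapses each distance chunk to its
# per-row sums, so that map(sum, ...) yields one scalar per chunk.
def reduce_func(chunk, start):
    return chunk.sum(axis=1)
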
def min_radius(self, n, data, target, dist_metric, mode, **kwargs):
    """
    Compute minimum radius of hypersphere such that for each example in the
    data matrix as the centre the sphere will contain at least n examples from
    same class and n examples from a different class.

    Args:
        n : int -- minimum number of examples from same class and different
            class a hypersphere with centre in each example in the dataset
            should contain
        data : Array[np.float64] -- Matrix containing examples' features as rows
        target : Array[np.int] -- Matrix of target variable values
        dist_metric : Callable[[Array[np.float64], Array[np.float64]], np.float64]
            -- distance metric for distance matrix computation
        mode : str -- equal to 'index' if selecting examples by their index and
            equal to 'example' if passing in explicit examples.
        **kwargs -- argument with keyword learned_metric_func can contain a
            learned metric function.

    Returns:
        np.float64 : Minimum acceptable radius of the hypersphere
    """
    # Allocate array for storing minimum acceptable radius for each example in dataset.
    min_r = np.empty(data.shape[0], dtype=float)

    # Initialize distance matrix.
    dist_mat = None

    # If operating in learned metric space.
    if mode == "index":
        dist_metric_aux = lambda x1, x2: dist_metric(
            np.ones(data.shape[1], dtype=float), x1[np.newaxis], x2[np.newaxis])
        dist_func = partial(kwargs['learned_metric_func'], dist_metric_aux)
        dist_func_adapter = lambda x1, x2: dist_func(
            int(np.where(np.sum(np.equal(x1, data), 1) == data.shape[1])[0][0]),
            int(np.where(np.sum(np.equal(x2, data), 1) == data.shape[1])[0][0]))
        dist_mat = sk_metrics.pairwise_distances_chunked(
            data, metric=dist_func_adapter, working_memory=0)
    elif mode == "example":
        dist_func = lambda x1, x2: dist_metric(
            np.ones(data.shape[1], dtype=float), x1[np.newaxis], x2[np.newaxis])
        dist_mat = sk_metrics.pairwise_distances_chunked(
            data, metric=dist_func, n_jobs=-1, working_memory=0)
    else:
        raise ValueError('Unknown mode specifier {0}'.format(mode))

    # Go over examples and compute minimum acceptable radius for each example.
    for k in np.arange(data.shape[0]):
        dist_from_e = next(dist_mat)[0]  # Get next row of distances matrix.
        msk = target == target[k]        # Get mask for examples from same class.
        dist_same = dist_from_e[msk]     # Get minimum distance that includes n examples from same class.
        dist_diff = dist_from_e[~msk]    # Get minimum distance that includes n examples from different class.
        try:
            # Compute minimum radius for this example.
            min_r[k] = np.max((np.sort(dist_same)[n], np.sort(dist_diff)[n - 1]))
        except IndexError:
            raise ValueError(
                'Insufficient examples with class {0} for given value of n (n = {1})'
                .format(target[k], n))

    # Return maximum of array of minimum acceptable radiuses for each example.
    return np.max(min_r)
def calculate_distances(self):
    print('\n Calculating distances - TFIDF')
    self.tfidf_distances = pairwise_distances_chunked(
        self.tfidf_embeddings, metric='cosine', n_jobs=-1)
    """
def __init__(self, configs: Union[Configuration, dict, nx.DiGraph]):
    if type(configs) == nx.DiGraph:
        # Assume we're creating a copy
        super().__init__(configs)
        return
    elif type(configs) == dict:
        configs = SmallWorldTopology.Configuration(**configs)

    super().__init__()
    self.__dict__.update(asdict(configs))

    assert (
        len(self.minicolumn_shape) == 3
    ), "Minicolumn shape must be of dimension 3 (3D)"
    assert (
        len(self.macrocolumn_shape) == 3
    ), "Macrocolumn shape must be of dimension 3 (3D)"

    # Initial neuron positions (all separated by neuron_spacing)
    i, j, k = np.multiply(self.macrocolumn_shape, self.minicolumn_shape)
    grid = np.mgrid[:i, :j, :k].reshape(3, -1)
    x, y, z = grid * self.neuron_spacing

    # Adding minicolumnSpacing (from random to small world topology)
    if self.minicolumn_spacing > 0:
        for d in range(3):  # For each dimension
            grid[d] //= self.minicolumn_shape[d]
        x += grid[0] * self.minicolumn_spacing
        y += grid[1] * self.minicolumn_spacing
        z += grid[2] * self.minicolumn_spacing

    positions = map(lambda p: {"position": p}, zip(x, y, z))
    self.add_nodes_from(zip(range(len(x)), positions))

    # Distance-based random connectivity
    positions = np.stack(np.asarray(self.nodes.data("position"))[:, 1])
    if (
        self.sparse_init
    ):  # Slower but iterative (for adjacency matrices that don't fit in memory)
        distances = pairwise_distances_chunked(
            positions,
            metric="euclidean",
            n_jobs=-1,
            reduce_func=lambda chunk, start: bsr_matrix(
                np.random.random(chunk.shape)
                < self.p_max * np.exp(-chunk / self.intracolumnar_sparseness)
            ),
            working_memory=self.mem_available,
        )
        adjacency_matrix = vstack(list(distances))
        adjacency_matrix.setdiag(0)  # Avoid self-connections
        self.add_edges_from(zip(*adjacency_matrix.nonzero()))
    else:
        distances = cdist(positions, positions, "euclidean")
        probabilities = self.p_max * np.exp(
            -distances / self.intracolumnar_sparseness
        )
        np.fill_diagonal(probabilities, 0)  # Avoid self-connections
        rand_matrix = np.random.random(probabilities.shape)
        i, j = np.nonzero(rand_matrix < probabilities)
        self.add_edges_from(zip(i, j))

    n_neurons = self.number_of_nodes()
    self.inhibitory_neurons = set(
        np.random.permutation(n_neurons)[: int(n_neurons * self.inhibitory_prob)]
    )

    for u, v in self.edges:
        if u in self.inhibitory_neurons:
            self.edges[u, v]["weight"] = -np.random.uniform(
                *self.inhibitory_init_weight_range
            )
        else:
            self.edges[u, v]["weight"] = np.random.uniform(
                *self.excitatory_init_weight_range
            )

    if self.spectral_radius_norm:
        spectral_radius = lambda matrix: np.max(np.abs(np.linalg.eigvals(matrix)))
        adj = nx.adjacency_matrix(self, weight="weight").todense()
        scale = 1.0 / spectral_radius(np.abs(adj))
        for i, (u, v) in enumerate(self.edges):
            self.edges[u, v]["weight"] = self.edges[u, v]["weight"] * scale

    if _logger.isEnabledFor(logging.INFO):
        # Some extra info about the topology
        out_degrees = np.array(self.out_degree())[:, 1]
        reporter.log_metrics(
            {
                "number-of-neurons": n_neurons,
                "number-of-synapses": self.number_of_edges(),
                "excitatory-ratio": 100.0
                * (1.0 - len(self.inhibitory_neurons) / n_neurons),
                "avg-out-degree": np.mean(out_degrees),
                "nb-out-degree-0": len(out_degrees) - np.count_nonzero(out_degrees),
                "nb-isolates": nx.number_of_isolates(self),
            }
        )