def test_radius_neighbors_graph():
    """Test radius_neighbors_graph to build the Nearest Neighbor graph."""
    X = np.array([[0, 1], [1.01, 1.0], [2, 0]])

    A = neighbors.radius_neighbors_graph(X, 1.5, mode="connectivity")
    assert_array_equal(
        A.todense(),
        [[1.0, 1.0, 0.0], [1.0, 1.0, 1.0], [0.0, 1.0, 1.0]])

    A = neighbors.radius_neighbors_graph(X, 1.5, mode="distance")
    assert_array_almost_equal(
        A.todense(),
        [[0.0, 1.01, 0.0], [1.01, 0.0, 1.40716026], [0.0, 1.40716026, 0.0]])
def test_include_self_neighbors_graph():
    """Test include_self parameter in neighbors_graph."""
    X = [[2, 3], [4, 5]]
    kng = neighbors.kneighbors_graph(X, 1, include_self=True).A
    kng_not_self = neighbors.kneighbors_graph(X, 1, include_self=False).A
    assert_array_equal(kng, [[1.0, 0.0], [0.0, 1.0]])
    assert_array_equal(kng_not_self, [[0.0, 1.0], [1.0, 0.0]])

    rng = neighbors.radius_neighbors_graph(X, 5.0, include_self=True).A
    rng_not_self = neighbors.radius_neighbors_graph(X, 5.0, include_self=False).A
    assert_array_equal(rng, [[1.0, 1.0], [1.0, 1.0]])
    assert_array_equal(rng_not_self, [[0.0, 1.0], [1.0, 0.0]])
def test_radius_neighbors_graph_sparse(seed=36):
    """Test radius_neighbors_graph to build the Nearest Neighbor graph
    for sparse input."""
    rng = np.random.RandomState(seed)
    X = rng.randn(10, 10)
    Xcsr = csr_matrix(X)

    # The loop variable is a radius here, not a neighbor count.
    for radius in [1, 2, 3]:
        for mode in ["connectivity", "distance"]:
            assert_array_almost_equal(
                neighbors.radius_neighbors_graph(X, radius, mode=mode).toarray(),
                neighbors.radius_neighbors_graph(Xcsr, radius, mode=mode).toarray(),
            )
def _get_affinity_matrix(self, X, Y=None):
    """Calculate the affinity matrix from data.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples
        and n_features is the number of features.

        If affinity is "precomputed":
        X : array-like, shape (n_samples, n_samples)
        Interpret X as a precomputed adjacency graph computed from samples.

    Returns
    -------
    affinity_matrix, shape (n_samples, n_samples)
    """
    if self.affinity == 'precomputed':
        self.affinity_matrix_ = X
        return self.affinity_matrix_
    # nearest_neighbors kept for backward compatibility
    if self.affinity == 'nearest_neighbors':
        if sparse.issparse(X):
            warnings.warn("Nearest neighbors affinity currently does "
                          "not support sparse input, falling back to "
                          "rbf affinity")
            self.affinity = "rbf"
        else:
            self.n_neighbors_ = (self.n_neighbors
                                 if self.n_neighbors is not None
                                 else max(int(X.shape[0] / 10), 1))
            self.affinity_matrix_ = kneighbors_graph(X, self.n_neighbors_)
            # currently only symmetric affinity_matrix supported
            self.affinity_matrix_ = 0.5 * (self.affinity_matrix_ +
                                           self.affinity_matrix_.T)
            return self.affinity_matrix_
    if self.affinity == 'radius_neighbors':
        if self.neighbors_radius is None:
            # Default radius; another default, like
            # diam(X)/sqrt(dimensions)/10, could be used instead.
            self.neighbors_radius_ = np.sqrt(X.shape[1])
        else:
            self.neighbors_radius_ = self.neighbors_radius
        self.gamma_ = (self.gamma if self.gamma is not None
                       else 1.0 / X.shape[1])
        self.affinity_matrix_ = radius_neighbors_graph(
            X, self.neighbors_radius_, mode='distance')
        # Gaussian affinity exp(-d**2 / radius**2), computed in place on
        # the sparse matrix's data array (np.exp with an `out` argument).
        self.affinity_matrix_.data **= 2
        self.affinity_matrix_.data /= -self.neighbors_radius_ ** 2
        self.affinity_matrix_.data = np.exp(self.affinity_matrix_.data,
                                            self.affinity_matrix_.data)
        return self.affinity_matrix_
    if self.affinity == 'rbf':
        self.gamma_ = (self.gamma if self.gamma is not None
                       else 1.0 / X.shape[1])
        self.affinity_matrix_ = rbf_kernel(X, gamma=self.gamma_)
        return self.affinity_matrix_
    self.affinity_matrix_ = self.affinity(X)
    return self.affinity_matrix_
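# A minimal, self-contained sketch (with made-up data and radius) of the
# radius-neighbors Gaussian affinity used above: build a sparse distance
# graph, then map each stored distance d to exp(-d**2 / radius**2) in place.
import numpy as np
from sklearn.neighbors import radius_neighbors_graph

X_demo = np.random.RandomState(0).randn(50, 3)
demo_radius = 1.5
affinity = radius_neighbors_graph(X_demo, demo_radius, mode='distance')
affinity.data = np.exp(-affinity.data ** 2 / demo_radius ** 2)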
def test_non_euclidean_kneighbors():
    rng = np.random.RandomState(0)
    X = rng.rand(5, 5)

    # Find a reasonable radius.
    dist_array = pairwise_distances(X).flatten()
    # np.sort returns a copy; the original discarded the result.
    dist_array = np.sort(dist_array)
    radius = dist_array[15]

    # Test kneighbors_graph
    for metric in ['manhattan', 'chebyshev']:
        nbrs_graph = neighbors.kneighbors_graph(
            X, 3, metric=metric).toarray()
        nbrs1 = neighbors.NearestNeighbors(3, metric=metric).fit(X)
        assert_array_equal(nbrs_graph, nbrs1.kneighbors_graph(X).toarray())

    # Test radius_neighbors_graph
    for metric in ['manhattan', 'chebyshev']:
        nbrs_graph = neighbors.radius_neighbors_graph(
            X, radius, metric=metric).toarray()
        nbrs1 = neighbors.NearestNeighbors(metric=metric, radius=radius).fit(X)
        assert_array_equal(nbrs_graph, nbrs1.radius_neighbors_graph(X).toarray())

    # Raise error when wrong parameters are supplied
    X_nbrs = neighbors.NearestNeighbors(3, metric='manhattan')
    X_nbrs.fit(X)
    assert_raises(ValueError, neighbors.kneighbors_graph, X_nbrs, 3,
                  metric='euclidean')
    X_nbrs = neighbors.NearestNeighbors(radius=radius, metric='manhattan')
    X_nbrs.fit(X)
    assert_raises(ValueError, neighbors.radius_neighbors_graph, X_nbrs,
                  radius, metric='euclidean')
def test_radius_neighbors_graph(self):
    x = [[0], [3], [1]]
    df = pdml.ModelFrame(x)

    result = df.neighbors.radius_neighbors_graph(1.5)
    expected = neighbors.radius_neighbors_graph(x, 1.5)
    self.assert_numpy_array_almost_equal(result.toarray(),
                                         expected.toarray())
def example2():
    """Plot the radius-nearest-neighbor graph.

    Points at distance <= radius are treated as neighbors.
    """
    train = np.array([[1, 2, 4, 7, 9, 10]]).transpose()
    graph = radius_neighbors_graph(train, 2.5)  # radius = 2.5
    print(graph)
    print(graph.toarray())
def test_radius_neighbors_graph():
    # Test radius_neighbors_graph to build the Nearest Neighbor graph.
    X = np.array([[0, 1], [1.01, 1.], [2, 0]])

    A = neighbors.radius_neighbors_graph(X, 1.5, mode='connectivity')
    assert_array_equal(
        A.toarray(),
        [[1., 1., 0.], [1., 1., 1.], [0., 1., 1.]])

    A = neighbors.radius_neighbors_graph(X, 1.5, mode='distance')
    assert_array_almost_equal(
        A.toarray(),
        [[0., 1.01, 0.], [1.01, 0., 1.40716026], [0., 1.40716026, 0.]])
def distance_matrix(X, flindex=None, mode='radius_neighbors',
                    neighbors_radius=None, symmetrize=True, n_neighbors=0):
    # Nearest-neighbors mode has issues. To be fixed.
    if mode == 'nearest_neighbors':
        warnings.warn("Nearest neighbors currently does not work, "
                      "falling back to radius neighbors")
        mode = 'radius_neighbors'
    if mode == 'radius_neighbors':
        # Default radius; another default, like
        # diam(X)/sqrt(dimensions)/10, could be used instead.
        neighbors_radius_ = (neighbors_radius if neighbors_radius is not None
                             else 1.0 / X.shape[1])
        if flindex is not None:
            distance_matrix = fl_radius_neighbors_graph(
                X, neighbors_radius_, flindex, mode='distance')
        else:
            distance_matrix = radius_neighbors_graph(
                X, neighbors_radius_, mode='distance')
        return distance_matrix
def _build_graph(self):
    """Compute the graph Laplacian."""
    # Graph sparsification
    if self.sparsify == "epsilonNN":
        self.A_ = radius_neighbors_graph(self.X_, self.radius,
                                         include_self=False)
    else:
        # np.bool is removed in modern numpy; plain bool is equivalent here.
        Q = kneighbors_graph(self.X_, self.n_neighbors,
                             include_self=False).astype(bool)
        if self.sparsify == "kNN":
            self.A_ = (Q + Q.T).astype(np.float64)
        elif self.sparsify == "MkNN":
            self.A_ = (Q.multiply(Q.T)).astype(np.float64)

    # Edge re-weighting
    if self.reweight == "rbf":
        W = rbf_kernel(self.X_, gamma=self.t)
        self.A_ = self.A_.multiply(W)

    return sp.csgraph.laplacian(self.A_, normed=self.normed)
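# A small illustrative sketch (synthetic data, arbitrary parameters) of the
# three sparsification schemes above: epsilon-NN keeps edges within a radius,
# kNN symmetrizes with a union (Q + Q.T), and mutual kNN with an
# intersection (Q.multiply(Q.T)).
import numpy as np
from sklearn.neighbors import kneighbors_graph, radius_neighbors_graph

X_demo = np.random.RandomState(0).rand(30, 2)
eps_graph = radius_neighbors_graph(X_demo, 0.3, include_self=False)
Q = kneighbors_graph(X_demo, 5, include_self=False).astype(bool)
knn_graph = (Q + Q.T).astype(np.float64)         # union: i->j or j->i
mknn_graph = Q.multiply(Q.T).astype(np.float64)  # intersection: i->j and j->i
print(eps_graph.nnz, knn_graph.nnz, mknn_graph.nnz)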
def make_graph_radius(x, radius, metric='euclidean', normalize_dists=True):
    use_sklearn = False
    if use_sklearn:
        dists = radius_neighbors_graph(x, radius, mode='connectivity',
                                       metric=metric)
    else:
        assert metric == 'euclidean'
        p = x.shape[1]
        #max_dist = norm(np.ones(p) - np.zeros(p))
        x = normalize(x)
        #dists = pairwise.pairwise_distances(x, x, metric) / max_dist
        dists = pairwise.pairwise_distances(x, x, metric)
        if normalize_dists:
            dists /= dists.max()
        dists[np.diag_indices_from(dists)] = 0
        dists[dists > radius] = 0
        dists[dists != 0] = 1
    return dists
def build_graph(X, graph_params=GraphParams(), metric='euclidean'):
    """Builds a graph (knn or epsilon) weight matrix W.

    W is sparse - to be optimized somehow.
    """
    graph_type = graph_params.type
    sigma2 = graph_params.sigma2
    graph_thresh = graph_params.thresh
    n = len(X)
    W = np.zeros((n, n))
    # Compare strings with ==, not 'is' (identity), as the original did.
    if graph_type == 'knn':
        D = kneighbors_graph(X, graph_thresh, metric=metric,
                             mode='distance').toarray()
    elif graph_type == 'eps':
        graph_thresh = -sigma2 * np.log(graph_thresh)
        D = radius_neighbors_graph(X, graph_thresh, metric=metric,
                                   mode='distance').toarray()
    W[D > 0] = np.exp(-D[D > 0] / sigma2)
    return W
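# A hedged usage sketch of the same heat-kernel weighting without the
# GraphParams wrapper (data, sigma2 and radius values here are made up):
# distances within the radius map to exp(-d / sigma2), everything else
# stays zero.
import numpy as np
from sklearn.neighbors import radius_neighbors_graph

X_demo = np.random.RandomState(1).rand(40, 2)
sigma2, demo_radius = 0.05, 0.25
D = radius_neighbors_graph(X_demo, demo_radius, mode='distance').toarray()
W = np.zeros_like(D)
W[D > 0] = np.exp(-D[D > 0] / sigma2)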
def neighbors_plot(self):
    import gc
    from numpy import histogram
    import numpy as np
    from sklearn.neighbors import radius_neighbors_graph

    start_pos, end_pos, paths = FileSplitter.points()
    del start_pos, end_pos
    gc.collect()

    neighbors = radius_neighbors_graph(paths, radius=0.005)
    del paths
    gc.collect()

    neighbors = neighbors.toarray()
    x = np.matrix(neighbors)
    x = x.sum(axis=1)
    counts = [d[0, 0] for d in x]
    hist, edges = histogram(counts, bins=10, density=False)
    self.plot_on_bokeh_hist('neighbors_hist.html', '# of Neighbors',
                            '# of Occurrence', 'Neighbors Within Radius',
                            hist, edges)
def fit_predict(self, X):
    # Note: `input` below shadows the Python builtin; it is presumably
    # meant to be a parameter or attribute selecting the input type.
    if type(self.radius_) is int:
        # An integer radius_ is interpreted as a number of neighbors.
        if input == "point cloud":
            adj = kneighbors_graph(X, n_neighbors=self.radius_,
                                   metric=self.metric_)
        if input == "distance matrix":
            adj = np.zeros(X.shape)
            idxs = np.argpartition(X, self.radius_, axis=1)[:, :self.radius_]
            for i in range(len(X)):
                adj[i, idxs[i, :]] = np.ones(len(idxs[i]))
    else:
        if input == "point cloud":
            adj = radius_neighbors_graph(X, radius=self.radius_,
                                         metric=self.metric_)
        if input == "distance matrix":
            adj = np.where(X <= self.radius_, np.ones(X.shape),
                           np.zeros(X.shape))
    _, clusters = csgraph.connected_components(adj)
    return clusters
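# A self-contained sketch (synthetic blobs, arbitrary radius) of the idea in
# fit_predict above: cluster points by taking connected components of a
# radius-neighbors adjacency graph.
import numpy as np
from scipy.sparse import csgraph
from sklearn.neighbors import radius_neighbors_graph

rng = np.random.RandomState(0)
X_demo = np.vstack([rng.randn(20, 2), rng.randn(20, 2) + 10.0])  # two far-apart blobs
adj = radius_neighbors_graph(X_demo, radius=3.0)
n_components, labels = csgraph.connected_components(adj)
print(n_components)  # expected: 2 for these well-separated blobs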
def test_non_euclidean_kneighbors():
    rng = np.random.RandomState(0)
    X = rng.rand(5, 5)

    # Find a reasonable radius.
    dist_array = pairwise_distances(X).flatten()
    # np.sort returns a copy; the original discarded the result.
    dist_array = np.sort(dist_array)
    radius = dist_array[15]

    # Test kneighbors_graph
    for metric in ['manhattan', 'chebyshev']:
        nbrs_graph = neighbors.kneighbors_graph(X, 3, metric=metric).toarray()
        nbrs1 = neighbors.NearestNeighbors(3, metric=metric).fit(X)
        assert_array_equal(nbrs_graph, nbrs1.kneighbors_graph(X).toarray())

    # Test radius_neighbors_graph
    for metric in ['manhattan', 'chebyshev']:
        nbrs_graph = neighbors.radius_neighbors_graph(
            X, radius, metric=metric).toarray()
        nbrs1 = neighbors.NearestNeighbors(metric=metric, radius=radius).fit(X)
        assert_array_equal(nbrs_graph, nbrs1.radius_neighbors_graph(X).toarray())

    # Raise error when wrong parameters are supplied
    X_nbrs = neighbors.NearestNeighbors(3, metric='manhattan')
    X_nbrs.fit(X)
    assert_raises(ValueError, neighbors.kneighbors_graph, X_nbrs, 3,
                  metric='euclidean')
    X_nbrs = neighbors.NearestNeighbors(radius=radius, metric='manhattan')
    X_nbrs.fit(X)
    assert_raises(ValueError, neighbors.radius_neighbors_graph, X_nbrs,
                  radius, metric='euclidean')
def generate_edges(X, mode='kneighbors_graph', n_neighbors=3, radius=0.1):
    """
    Returns array with pairs of indices [vertex_from, vertex_to] and
    weight vector.
    """
    n_neighbors = min(n_neighbors, len(X) - 1)
    if n_neighbors == 0:
        return X[:, 3].reshape(-1, 1), np.zeros((1, 5)), np.zeros((2, 1))
    if mode == 'kneighbors_graph':
        adjacency_matrix = np.array(
            kneighbors_graph(X=X[:, :3], n_neighbors=n_neighbors,
                             mode='distance').todense())
    elif mode == 'radius_neighbors_graph':
        adjacency_matrix = np.array(
            radius_neighbors_graph(X=X[:, :3], radius=radius,
                                   mode='distance').todense())
    else:
        # Raising a bare string is invalid in Python 3; raise an exception.
        raise ValueError('Unknown mode {}'.format(mode))

    rows, cols = np.where(adjacency_matrix > 0)
    edges = np.vstack([rows, cols])
    weights = adjacency_matrix[rows, cols]
    nodes_features = X[:, 3].reshape(-1, 1)
    edges_features = X[edges.T[:, 0]] - X[edges.T[:, 1]]
    return nodes_features, np.c_[edges_features, weights], edges.astype(int)
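# Hypothetical usage of generate_edges above: each row holds 3 coordinates
# plus one scalar feature (the data and radius are made up for illustration).
import numpy as np

points = np.random.RandomState(2).rand(10, 4)  # columns: x, y, z, feature
nodes, edge_feats, edge_index = generate_edges(
    points, mode='radius_neighbors_graph', radius=0.5)
print(nodes.shape, edge_feats.shape, edge_index.shape)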
def main():
    data, _ = make_swiss_roll(random_state=1)

    n_knn = 3
    kng = kneighbors_graph(data, n_neighbors=n_knn)
    title = 'KNN Graph where n_neighbors={}'.format(n_knn)
    plot_graph(data, kng, title=title)

    n_knn = 4
    kng = kneighbors_graph(data, n_neighbors=n_knn)
    title = 'KNN Graph where n_neighbors={}'.format(n_knn)
    # The original computed this title but never passed it.
    plot_graph(data, kng, title=title)

    radius = 6.5
    rng = radius_neighbors_graph(data, radius=radius)
    title = 'RN Graph where radius={}'.format(radius)
    plot_graph(data, rng, title)

    n_neighbors = 5
    delta = 0.95
    ckng = cknneighbors_graph(data, n_neighbors=n_neighbors, delta=delta)
    title = 'CKNN Graph where n_neighbors={}, delta={}'\
        .format(n_neighbors, delta)
    plot_graph(data, ckng, title)
def _build_graph(self):
    """Compute the graph Laplacian."""
    # Graph sparsification
    if self.sparsify == 'epsilonNN':
        self.A_ = radius_neighbors_graph(self.X_, self.radius,
                                         include_self=False)
    else:
        # np.bool is removed in modern numpy; plain bool is equivalent here.
        Q = kneighbors_graph(self.X_, self.n_neighbors,
                             include_self=False).astype(bool)
        if self.sparsify == 'kNN':
            self.A_ = (Q + Q.T).astype(np.float64)
        elif self.sparsify == 'MkNN':
            self.A_ = (Q.multiply(Q.T)).astype(np.float64)

    # Edge re-weighting
    if self.reweight == 'rbf':
        W = rbf_kernel(self.X_, gamma=self.t)
        self.A_ = self.A_.multiply(W)

    return sp.csgraph.laplacian(self.A_, normed=self.normed)
def geodesic_radius(self, points=None, use_cache=True):
    if use_cache and self.geodesic_d is not None:
        return self.geodesic_d
    if points is None:
        points = self.points

    dist = self.euclidean_distances()
    nbrs_inc = np.argsort(dist, axis=1)
    max_dist = -1
    for i in range(dist.shape[0]):
        achieved_neighbors = 0
        while achieved_neighbors < min(self.n_neighbors, dist.shape[0]):
            j = achieved_neighbors
            if max_dist < dist[i][nbrs_inc[i][j]]:
                max_dist = dist[i][nbrs_inc[i][j]]
            achieved_neighbors += 1

    nbrs = (NearestNeighbors(algorithm='auto',
                             n_neighbors=self.n_neighbors,
                             radius=max_dist,
                             n_jobs=self.n_jobs)
            .fit(points))
    kng = radius_neighbors_graph(nbrs, max_dist, mode='distance',
                                 n_jobs=self.n_jobs)
    self.geodesic_d = graph_shortest_path(kng, method='D', directed=False)
    return self.geodesic_d
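# A compact sketch of the same geodesic-distance idea on synthetic data:
# build a radius-neighbors distance graph, then run shortest paths over it.
# scipy's shortest_path is used here in place of sklearn's deprecated
# graph_shortest_path helper; the data and radius are made up.
import numpy as np
from scipy.sparse.csgraph import shortest_path
from sklearn.neighbors import radius_neighbors_graph

X_demo = np.random.RandomState(3).rand(60, 2)
kng_demo = radius_neighbors_graph(X_demo, 0.3, mode='distance')
geodesic = shortest_path(kng_demo, method='D', directed=False)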
def computeGraph(self, R=None, similarity='He', g=1, th_gauss=0.1):
    """
    Computes a sparse graph for the self graph structure.
    The self graph must contain a T-matrix, self.T

    Inputs:
        :self.T: Data matrix
        :R: Radius. Edges link all data pairs at distance lower than R.
            This is to force a sparse graph.
        :similarity: Similarity measure used to compute the affinity matrix.
            Available options are:
            'l1'    : 1 minus l1 distance
            'He'    : 1 minus squared Hellinger's distance (JS)
                      (sklearn-based implementation)
            'Gauss' : An exponential function of the squared l2 distance
        :g: Exponent for the affinity mapping (not used for 'Gauss')
        :th_gauss: Similarity threshold. All similarity values below this
            threshold are set to zero. This is only for the Gauss method;
            the rest of them compute the threshold automatically from R.

    Returns:
        :self.edgeT_id: List of edges, as pairs (i, j) of indices
        :self.affinityT: List of affinity values for each pair in edgeT_id
        :self.df_edges: Pandas dataframe with one row per edge and columns
            'Source', 'Target' and 'Weight'. The weight is equal to the
            (mapped) affinity value.
    """
    logging.info(f"-- Computing graph with {self.n_nodes} nodes")
    logging.info(f"-- Similarity measure: {similarity}")

    # #########################
    # Computing Distance Matrix

    # This is just to abbreviate
    Tg = self.Tg

    # Select distance measure for radius_neighbors_graph
    if similarity in ['Gauss', 'He']:
        d = 'l2'   # Note: l2 seems equivalent to minkowski (p=2)
    elif similarity in ['l1']:
        d = 'l1'   # Note: l1 seems equivalent to manhattan
    else:
        logging.error("computeTsubGraph ERROR: Unknown similarity measure")
        exit()

    # Select secondary radius
    R0 = R

    # Compute the connectivity graph of all pairs of nodes at distance
    # below R0.
    # IMPORTANT: Note that, although radius_neighbors_graph has an option
    # 'distance' that returns the distance values, it cannot be used in
    # any case because the distance matrix does not distinguish between
    # nodes at distance > R0 and nodes at distance = 0
    t0 = time()
    logging.info('-- -- Computing neighbors_graph ...')
    if similarity in ['He']:
        # We must compute the connectivity graph because
        # radius_neighbors_graph loses edges between nodes at zero distance
        D = radius_neighbors_graph(np.sqrt(Tg), radius=R0,
                                   mode='connectivity', metric=d)
    elif similarity in ['l1', 'Gauss']:
        D = radius_neighbors_graph(Tg, radius=R0, mode='connectivity',
                                   metric=d)
    logging.info(f'      in {time()-t0} seconds')

    # ##############################################
    # From distance matrix to list of weighted edges

    # Compute lists with origin, destination and value for all edges in
    # the graph affinity matrix.
    orig_id, dest_id = D.nonzero()

    # Since the graph is undirected, we select ordered pairs orig_id,
    # dest_id only
    self.edgeT_id = list(filter(lambda i: i[0] < i[1],
                                zip(orig_id, dest_id)))

    # ####################
    # Computing Affinities
    logging.info(f"-- -- Computing affinities for {len(self.edgeT_id)}"
                 + " edges ...")
    t0 = time()
    if similarity == 'He':
        # A new self.edgeT_id is returned because the function filters out
        # affinity values below th.
        self.edgeT_id, self.affinityT = self.he_affinity(Tg, R, g)
    elif similarity == 'l1':
        self.edgeT_id, self.affinityT = self.l1_affinity(Tg, R, g)
    elif similarity == 'Gauss':
        self.edgeT_id, self.affinityT = self.l2_affinity(Tg, R, th_gauss)
    else:
        logging.error("computeTsubGraph ERROR: Unknown similarity measure")

    logging.info(f"      reduced to {len(self.edgeT_id)} edges")
    logging.info(f'      Computed in {time()-t0} seconds')
    logging.info(("-- -- Graph generated with {0} nodes and {1} "
                  + "edges").format(self.n_nodes, len(self.edgeT_id)))
    return
def seeds_merge(
    varr: xr.DataArray,
    max_proj: xr.DataArray,
    seeds: pd.DataFrame,
    thres_dist=5,
    thres_corr=0.6,
    noise_freq: Optional[float] = None,
) -> pd.DataFrame:
    """
    Merge seeds based on spatial distance and temporal correlation of their
    activities.

    This function builds an adjacency matrix by thresholding spatial distance
    between seeds and temporal correlation between activities of seeds. It
    then merges seeds using the adjacency matrix by only keeping the seed
    with maximum intensity in the max projection within each connected group
    of seeds. The merge is therefore transitive.

    Parameters
    ----------
    varr : xr.DataArray
        Input movie data. Should have dimensions "height", "width" and
        "frame".
    max_proj : xr.DataArray
        Max projection of the movie data.
    seeds : pd.DataFrame
        Dataframe of seeds to be merged.
    thres_dist : int, optional
        Threshold of distance between seeds in pixels. By default `5`.
    thres_corr : float, optional
        Threshold of temporal correlation between activities of seeds. By
        default `0.6`.
    noise_freq : float, optional
        Cut-off frequency for optional smoothing of activities before
        computing the correlation. If `None` then no smoothing will be done.
        By default `None`.

    Returns
    -------
    seeds : pd.DataFrame
        The resulting seeds dataframe with an additional column "mask_mrg",
        indicating whether the seed should be kept after the merge. If the
        column already exists in input `seeds` it will be overwritten.
    """
    print("computing distance")
    nng = radius_neighbors_graph(seeds[["height", "width"]], thres_dist)
    print("computing correlations")
    adj = adj_corr(varr, nng, seeds[["height", "width"]], noise_freq)
    print("merging seeds")
    adj = adj > thres_corr
    adj = adj + adj.T
    labels = label_connected(adj, only_connected=True)
    iso = np.where(labels < 0)[0]
    seeds_final = set(iso.tolist())
    for cur_cmp in np.unique(labels):
        if cur_cmp < 0:
            continue
        cur_smp = np.where(labels == cur_cmp)[0]
        cur_max = np.array([
            max_proj.sel(height=seeds.iloc[s]["height"],
                         width=seeds.iloc[s]["width"])
            for s in cur_smp
        ])
        max_seed = cur_smp[np.argmax(cur_max)]
        seeds_final.add(max_seed)
    seeds["mask_mrg"] = False
    seeds.loc[list(seeds_final), "mask_mrg"] = True
    return seeds
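# A minimal sketch of just the spatial-thresholding step of seeds_merge:
# seeds within `thres_dist` pixels of each other become adjacent. The seed
# coordinates here are invented for illustration.
import pandas as pd
from sklearn.neighbors import radius_neighbors_graph

seeds_demo = pd.DataFrame({"height": [0, 1, 2, 40], "width": [0, 1, 2, 40]})
nng_demo = radius_neighbors_graph(seeds_demo[["height", "width"]], 5)
print(nng_demo.toarray())  # the first three seeds form one spatial group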
def main(
    fastq_r1,
    fastq_r2,
    barcodes,
    locations,
    tag_sequence,
    output_pdf,
    extra_pdf=None,
    debug=False,
    percentile=95.0,
):
    """
    This script generates some plots for mapping barcoded reads.

    Reads sequences from FASTQ_R1 and FASTQ_R2. Assumes that the first read
    contains a 15bp barcode split across two locations, along with an 8bp
    UMI. The second read is assumed to have TAG_SEQUENCE in bases 20-40.
    """
    create_logger(debug, dryrun=False)
    output_pdf = Path(output_pdf)

    log.debug(f"Reading from {fastq_r1}")
    with gzip.open(fastq_r1, "rt") as fh:
        r1_reads = [line.strip() for line in itertools.islice(fh, 1, None, 4)]

    log.debug(f"Reading from {fastq_r2}")
    with gzip.open(fastq_r2, "rt") as fh:
        r2_reads = [line.strip() for line in itertools.islice(fh, 1, None, 4)]

    log.debug(f"Reading {barcodes}")
    with open(barcodes) as fh:
        raw_bcs = ["".join(line.strip().split(",")) for line in fh]

    log.debug(f"Reading {locations}")
    with open(locations) as fh:
        x = np.array([float(v) for v in fh.readline().strip().split(",")])
        y = np.array([float(v) for v in fh.readline().strip().split(",")])

    xy = np.vstack((x, y)).T

    if extra_pdf is not None:
        extra_pdf_pages = PdfPages(extra_pdf)
        umi_counts = Counter(r[32:41] for r in r1_reads)
        log.debug(
            f"Found {len(umi_counts)} UMIs with {sum(umi_counts.values())} total counts"
        )
        plot_log_hist(umi_counts.values(), "Reads per UMI", extra_pdf_pages)
    else:
        extra_pdf_pages = None

    # pre-emptively remove poly-T/N sequences
    ok_barcodes = [not set(bc).issubset({"T", "N"}) for bc in raw_bcs]
    xy = xy[ok_barcodes, :]
    bead_barcodes = [bc for ok, bc in zip(ok_barcodes, raw_bcs) if ok]
    log.info(f"Read {len(raw_bcs)} barcodes and filtered to {len(bead_barcodes)}")

    seq_barcodes = sorted(r1[:8] + r1[26:32] for r1 in r1_reads)
    # remove poly-T sequence if present
    seq_barcodes = [seq for seq in seq_barcodes if set(seq) != {"T"}]
    log.info(f"Found {len(set(seq_barcodes))} unique barcodes in sequencing data")

    log.info("Computing barcode matching")
    log.debug("Computing radius neighbor graph")
    # adjacency matrix for all beads within radius of each other
    radius_matrix = radius_neighbors_graph(xy, radius=10.0)

    log.debug("Computing hamming neighbor graph")
    # adjacency matrix for all barcodes within hamming distance 1
    hamming_matrix = hamming1_adjacency(bead_barcodes)

    # just multiply together to get the combined adjacency matrix!
    combined_graph = nx.from_scipy_sparse_matrix(
        radius_matrix.multiply(hamming_matrix))

    # add xy coordinates to graph so we can analyze later
    for n, (x, y) in zip(combined_graph.nodes, xy):
        combined_graph.nodes[n]["x"] = x
        combined_graph.nodes[n]["y"] = y

    # get connected components to find groups of similar/close barcodes
    bead_groups = list(nx.connected_components(combined_graph))

    # calculate degenerate (ambiguous bases -> N) barcodes
    degen_bead_barcodes = [
        degen_barcode({bead_barcodes[j] for j in bg}) for bg in bead_groups
    ]

    log.debug(
        f"Collapsed {len(bead_groups)} bead groups into"
        f" {len(set(degen_bead_barcodes))} barcodes"
    )

    # average xy for grouped beads to get centroids
    bead_xy = dict()
    for bg, degen_bc in zip(bead_groups, degen_bead_barcodes):
        bg_graph = combined_graph.subgraph(bg)
        mean_x, mean_y = np.array(
            [[nd["x"], nd["y"]] for _, nd in bg_graph.nodes(data=True)]
        ).mean(0)
        bead_xy[degen_bc] = (mean_x, mean_y)

    barcode_matching = bipartite_matching(
        bead_barcodes, degen_bead_barcodes, bead_groups, seq_barcodes
    )

    if extra_pdf is not None:
        tag_barcodes = [r2[20:40] for r2 in r2_reads]
        tag_counts = Counter(tag_barcodes)
        sum(1 for r1 in r1_reads if (r1[:8] + r1[26:32]) in barcode_matching)

        umis_per_tag = defaultdict(set)
        for r1, r2 in zip(r1_reads, r2_reads):
            umis_per_tag[r2[20:40]].add(r1[32:41])

        plot_log_hist(tag_counts.values(), "Reads per tag", extra_pdf_pages)
        plot_log_hist(
            list(map(len, umis_per_tag.values())), "UMIs per tag", extra_pdf_pages
        )

    log.debug(f"Counting UMIs and reads per bead for sequence {tag_sequence}")
    reads_per_umi_per_bead = defaultdict(Counter)
    umis_per_bead = defaultdict(set)
    reads_per_bead = Counter()

    for r1, r2 in zip(r1_reads, r2_reads):
        seq_bc = r1[:8] + r1[26:32]
        if seq_bc not in barcode_matching:
            continue
        if r2[20:40] != tag_sequence:
            continue
        bead_bc = barcode_matching[seq_bc]
        umi = r1[32:41]

        reads_per_umi_per_bead[bead_bc][umi] += 1
        umis_per_bead[bead_bc].add(umi)
        reads_per_bead[bead_bc] += 1

    filtered_barcodes = [bc for bc in degen_bead_barcodes if umis_per_bead[bc]]
    bead_xy_a = np.vstack([bead_xy[dbc] for dbc in filtered_barcodes])

    with gzip.open(output_pdf.with_suffix(".reads_per_umi.txt.gz"), "wt") as out:
        print("bead_barcodes\tumi\treads", file=out)
        for bc in filtered_barcodes:
            # iterate over the UMIs for this bead (the original indexed the
            # counter with a not-yet-defined `umi` here)
            for umi in reads_per_umi_per_bead[bc]:
                print(f"{bc}\t{umi}\t{reads_per_umi_per_bead[bc][umi]}", file=out)

    with output_pdf.with_suffix(".txt").open("w") as out:
        print("bead_barcode\tumis\treads", file=out)
        for bc in filtered_barcodes:
            print(f"{bc}\t{len(umis_per_bead[bc])}\t{reads_per_bead[bc]}", file=out)

    if extra_pdf is not None:
        plot_log_hist(
            [len(umis_per_bead[bc]) for bc in filtered_barcodes],
            "UMIs per bead",
            extra_pdf_pages,
        )
        extra_pdf_pages.close()

    pdf_pages = PdfPages(output_pdf)

    log.info("Making plots")
    spatial_plot(
        bead_xy_a,
        [len(umis_per_bead[bc]) for bc in filtered_barcodes],
        "UMIs per bead",
        pdf_pages,
        pct=percentile,
    )
    spatial_plot(
        bead_xy_a,
        [np.log10(len(umis_per_bead[bc])) for bc in filtered_barcodes],
        "log10 UMIs per bead",
        pdf_pages,
        pct=percentile,
    )
    spatial_plot(
        bead_xy_a,
        [reads_per_bead[bc] for bc in filtered_barcodes],
        "Reads per bead",
        pdf_pages,
        pct=percentile,
    )
    spatial_plot(
        bead_xy_a,
        [np.log10(1 + reads_per_bead[bc]) for bc in filtered_barcodes],
        "log10 reads per bead",
        pdf_pages,
        pct=percentile,
    )
    pdf_pages.close()
    log.info("Done!")
def traj_segment_generator(pi, env, horizon, adam, vfgrad, stochastic, total_gen):
    gen_graph_this_episode = total_gen  # whether to generate a graph during this episode
    stats = []      # variable to keep statistics and save them on disk
    G = nx.Graph()  # Graph variable
    states = []     # History of visited states
    node_ptr = 0    # Pointer used to keep track of the states' list
    i_episode = 0
    print('New graph at episode {}'.format(i_episode))
    t = 0
    ac = env.action_space.sample()  # not used, just so we have the datatype
    new = True  # marks if we're on first timestep of an episode
    ob = env.reset()
    states.append(ob)

    cur_ep_ret = 0  # return in current episode
    cur_ep_len = 0  # len of current episode
    ep_rets = []    # returns of completed episodes in this segment
    ep_lens = []    # lengths of ...

    # Initialize history arrays
    obs = np.array([ob for _ in range(horizon)])
    rews = np.zeros(horizon, 'float32')
    vpreds = np.zeros(horizon, 'float32')
    sigmapreds = np.zeros(horizon, 'float32')
    news = np.zeros(horizon, 'int32')
    acs = np.array([ac for _ in range(horizon)])
    prevacs = acs.copy()

    while True:
        prevac = ac
        ac, vpred, sigmapred = pi.act(stochastic, ob)
        # Slight weirdness here because we need value function at time T
        # before returning segment [0, T-1] so we get the correct
        # terminal value
        if t > 0 and t % horizon == 0:
            yield {"ob": obs, "rew": rews, "vpred": vpreds,
                   "sigmapred": sigmapreds, "new": news, "ac": acs,
                   "prevac": prevacs, "nextvpred": vpred * (1 - new),
                   "nextsigmapred": sigmapred * (1 - new),
                   "ep_rets": ep_rets, "ep_lens": ep_lens}
            # Be careful!!! if you change the downstream algorithm to
            # aggregate several of these batches, then be sure to do a
            # deepcopy
            ep_rets = []
            ep_lens = []
        i = t % horizon
        obs[i] = ob
        vpreds[i] = vpred
        sigmapreds[i] = sigmapred
        news[i] = new
        acs[i] = ac
        prevacs[i] = prevac

        ob, rew, new, _ = env.step(ac)
        rews[i] = rew
        states.append(ob)
        node_ptr += 1                       # Move pointer
        G.add_edge(node_ptr - 1, node_ptr)  # Add transition to model

        if rew and gen_graph_this_episode and len(states) > 20:
            gen_graph_this_episode = 0         # Only generate one graph per episode
            total_gen = max(0, total_gen - 1)  # Decrease the total amount of graph generations

            # Radius-based nearest-neighbor search to add edges
            radius = 5.
            states = np.array(states)
            adj = nn.radius_neighbors_graph(states, radius)
            adj = adj + nx.adjacency_matrix(G)
            aug_G = nx.from_scipy_sparse_matrix(adj)  # Augmented graph

            # Identify the sources and the sinks
            source = 0
            sink = len(states) - 1
            max_sources = 40  # Max number of sources
            max_sinks = 40    # Max number of sinks
            other_sources = list(range(max_sources))
            other_sinks = list(range(len(states) - max_sinks, len(states)))

            # Create the features and labels for GCN
            features = np.eye(len(states), dtype=np.float32)
            features = sparse_to_tuple(sp.lil_matrix(features))
            labels = np.zeros((len(states)))
            labels[-max_sinks:] = 1
            labels = encode_onehot(labels)

            # Diffuse the reward signal
            diffused = get_graph(aug_G.edges(), adj, features, labels,
                                 source, sink, other_sources, other_sinks)

            # Smooth the diffused result
            interpol = make_interpolater(min(diffused), max(diffused), 0, 1.)
            targets = interpol(diffused)

            # Apply to the value function
            for epo in range(100):
                grads = vfgrad(states, targets, 1.)
                adam.update(grads, 1e-3)
            states = list(states)

        cur_ep_ret += rew
        cur_ep_len += 1
        if new:
            gen_graph_this_episode = total_gen  # Reset the gen_graph variable
            i_episode += 1
            if i_episode % 3 == 0 and gen_graph_this_episode:
                print('New graph at episode {} Remaining graphs {}'.format(
                    i_episode, total_gen))
                G = nx.Graph()
                states = []
                node_ptr = -1  # reset pointer
            ep_rets.append(cur_ep_ret)
            ep_lens.append(cur_ep_len)
            cur_ep_ret = 0
            cur_ep_len = 0
            ob = env.reset()
            states.append(ob)
            node_ptr += 1  # to avoid making edges between a terminal state and an initial state
        t += 1
print("Current Working Directory ", os.getcwd()) cur_data_dir = os.getcwd() mat_fname = pjoin(cur_data_dir, 'isomap.mat') matFile1 = sio.loadmat(mat_fname) data = matFile1['images'] data.shape pixelno = data.shape[0] imageno = data.shape[1] data = data.T A = radius_neighbors_graph(data, eps, mode='connectivity', metric='minkowski', p=P, include_self=False) A.toarray() MIN = np.sum(A.toarray(), axis=1) min(MIN) MIN.shape MAX = np.sum(A.toarray(), axis=1) max(MAX) x, y = A.toarray().nonzero()[0], A.toarray().nonzero()[1] edges = [(i, j) for i, j in zip(x, y)] nodename = range(0, len(data))
def _pairwise_similarity(self, embeddings, edge_type="d"):
    if edge_type == 'd':
        embeddings_X = embeddings[:, 0:int(self.embedding_d / 2)]
        embeddings_Y = embeddings[:, int(self.embedding_d / 2):self.embedding_d]

        if self.directed_distance == "euclidean_ball":
            embeddings_stacked = np.vstack([embeddings_X, embeddings_Y])
            adj = radius_neighbors_graph(embeddings_stacked,
                                         radius=self.margin, n_jobs=-2)
            adj = adj[0:embeddings_X.shape[0], :][:, embeddings_X.shape[0]:]
            print("radius_neighbors_graph")
        elif self.directed_distance == "euclidean":
            adj = pairwise_distances(X=embeddings_X, Y=embeddings_Y,
                                     metric="euclidean", n_jobs=-2)
            # Get node-specific adaptive threshold
            # adj = self.transform_adj_adaptive_threshold(adj, margin=0)
            # adj = self.transform_adj_beta_exp(adj, edge_types="d", sample_negative=self.negative_sampling_ratio)
            adj = np.exp(-2.0 * adj)
            print("Euclidean dist")
        elif self.directed_distance == "cosine":
            adj = pairwise_distances(X=embeddings_X, Y=embeddings_Y,
                                     metric="cosine", n_jobs=-2)
            print("Cosine similarity")
        elif self.directed_distance == "dot_sigmoid":
            adj = np.matmul(embeddings_X, embeddings_Y.T)
            adj = sigmoid(adj)
            print("Dot product & sigmoid")
        elif self.directed_distance == "dot_softmax":
            adj = np.matmul(embeddings_X, embeddings_Y.T)
            adj = softmax(adj)
            print("Dot product & softmax")
    elif edge_type == 'u':
        if self.undirected_distance == "euclidean_ball":
            adj = radius_neighbors_graph(embeddings, radius=self.margin,
                                         n_jobs=-2)
        elif self.undirected_distance == "euclidean":
            adj = pairwise_distances(X=embeddings, metric="euclidean",
                                     n_jobs=-2)
            # adj = np.exp(-2.0 * adj)
            adj = self.transform_adj_beta_exp(adj, edge_types=["u", "u_n"],
                                              sample_negative=False)
            # adj = self.transform_adj_adaptive_threshold(adj, margin=self.margin/2)
            print("Euclidean dist")
        elif self.undirected_distance == "cosine":
            adj = pairwise_distances(X=embeddings, metric="cosine", n_jobs=-2)
        elif self.undirected_distance == "dot_sigmoid":
            adj = np.matmul(embeddings, embeddings.T)
            adj = sigmoid(adj)
        elif self.undirected_distance == "dot_softmax":
            adj = np.matmul(embeddings, embeddings.T)
            adj = softmax(adj)
    else:
        raise Exception("Unsupported edge_type", edge_type)

    return adj
def NNGraph(self, data, limit=0.4):
    # Create the nearest neighbors graph
    graph = radius_neighbors_graph(data, limit, mode='distance',
                                   metric='minkowski', p=2,
                                   metric_params=None, include_self=False)
    graph = graph.toarray()
    return graph
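# Hypothetical standalone use of the same call, without the class wrapper
# (the data and limit are made up): the result is a dense symmetric matrix
# of pairwise distances, zero outside the radius.
import numpy as np
from sklearn.neighbors import radius_neighbors_graph

data_demo = np.random.RandomState(4).rand(10, 2)
graph_demo = radius_neighbors_graph(data_demo, 0.4, mode='distance',
                                    metric='minkowski', p=2,
                                    include_self=False).toarray()
print(graph_demo.shape)  # (10, 10)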
def compute_graph(X, dims, r_cut, metric=pbc):
    # Build a BallTree with a periodic-boundary-condition metric; the extra
    # keyword (dims) is forwarded to the metric.
    BT = BallTree(X, metric=metric, dims=dims)
    rng_con = radius_neighbors_graph(BT, r_cut, mode="connectivity")
    A = np.matrix(rng_con.toarray())
    G = nx.from_numpy_matrix(A)
    return G
from scipy.sparse import csgraph

# https://medium.com/@tomernahshon/spectral-clustering-from-scratch-38c68968eae0

# In[5]:

random_state = 21
X, value = make_moons(150, noise=.07, random_state=random_state)
fig, ax = plt.subplots()
ax.set_title('Truth')
ax.scatter(X[:, 0], X[:, 1], c='k', s=50)

# In[6]:

A = radius_neighbors_graph(X, 0.4, mode='distance')
G = csgraph.laplacian(A, normed=False)

# In[ ]:

radius_neighbors_graph(np.array([[0, 0], [1, 1], [1, 2]]),
                       radius=2, mode='distance').toarray()

# In[10]:

G.toarray()[0, :]

# In[11]:

eigval, eigvec = np.linalg.eig(G.toarray())
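# A hedged continuation of the spectral-clustering recipe above: take the
# eigenvectors for the smallest eigenvalues of the graph Laplacian and
# cluster them with k-means. The parameter choices (0.4 radius, 2 clusters)
# mirror the snippet; this is a sketch, not the blog post's exact code.
import numpy as np
from scipy.sparse import csgraph
from sklearn.cluster import KMeans
from sklearn.datasets import make_moons
from sklearn.neighbors import radius_neighbors_graph

X_demo, _ = make_moons(150, noise=.07, random_state=21)
L = csgraph.laplacian(radius_neighbors_graph(X_demo, 0.4, mode='distance'),
                      normed=False)
eigval, eigvec = np.linalg.eigh(L.toarray())
embedding = eigvec[:, np.argsort(eigval)[:2]]  # two smallest eigenvalues
labels = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(embedding)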
def get_local_densities(data, kernel_mult = 2.0, metric = 'manhattan'):
    """For each sample point of the data-set 'data', estimate a local density
    in feature space by counting the number of neighboring data-points within
    a particular region centered around that sample point.

    Parameters
    ----------
    data : array of shape (n_samples, n_features)
        The data-set, a fraction of whose sample points will be extracted
        by density sampling.

    kernel_mult : float, optional (default = 2.0)
        The kernel multiplier, which determines (in terms of the median of
        the distribution of distances among nearest neighbors) the extent
        of the regions centered around each sample point to consider for
        the computation of the local density associated to that particular
        sample point.

    metric : string, optional (default = 'manhattan')
        The distance metric used to determine the nearest-neighbor to each
        data-point. The DistanceMetric class defined in scikit-learn's
        library lists all available metrics.

    Returns
    -------
    local_densities : array of shape (n_samples,)
        The i-th entry of this vector corresponds to the local density of
        the i-th sample point in the order of the rows of 'data'.
    """
    data = np.atleast_2d(data)
    assert isinstance(kernel_mult, numbers.Real) and kernel_mult > 0

    kernel_width = kernel_mult * median_min_distance(data, metric)
    N_samples = data.shape[0]

    if 8.0 * get_chunk_size(N_samples, 1) > N_samples:
        A = radius_neighbors_graph(data, kernel_width, mode = 'connectivity',
                                   metric = metric, include_self = True)
        rows, _ = A.nonzero()

        with NamedTemporaryFile('w', delete = True, dir = './') as file_name:
            fp = np.memmap(file_name, dtype = int, mode = 'w+',
                           shape = rows.shape)
            fp[:] = rows[:]
            _, counts = np.unique(fp, return_counts = True)

        local_densities = np.zeros(N_samples, dtype = int)
        for i in range(N_samples):  # xrange in the original Python 2 code
            local_densities[i] = counts[i]
    else:
        local_densities = np.zeros(N_samples, dtype = int)

        chunks_size = get_chunk_size(N_samples, 2)
        for i in range(0, N_samples, chunks_size):
            chunk = data[i:min(i + chunks_size, N_samples)]
            D = pairwise_distances(chunk, data, metric, n_jobs = 1)
            D = (D <= kernel_width)
            local_densities[i + np.arange(min(chunks_size, N_samples - i))] \
                = D.sum(axis = 1)

    return local_densities
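# The core counting trick above, in isolation (synthetic data, arbitrary
# kernel width): with include_self=True, each row's nonzero count in the
# radius graph is the number of points within kernel_width of that sample.
import numpy as np
from sklearn.neighbors import radius_neighbors_graph

data_demo = np.random.RandomState(5).rand(100, 2)
kernel_width_demo = 0.2
A_demo = radius_neighbors_graph(data_demo, kernel_width_demo,
                                mode='connectivity', metric='manhattan',
                                include_self=True)
densities_demo = np.asarray(A_demo.sum(axis=1)).ravel().astype(int)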
def main(args):
    name_of_pdf_dir = os.path.basename(args.directory_with_pdfs)
    all_text = get_all_pdf_text_concatenated(args.directory_with_pdfs)
    pars = pd.Series(all_text.split('\n\n')).str.replace('\n', ' ')
    pars.str.len().apply(lambda x: np.log2(x + 1)).astype(int).value_counts()
    # TODO, is this being stored anywhere?
    text_keywords = keywords(all_text, scores=True, lemmatize=True,
                             words=args.num_keywords)

    lower_bound_chars, upper_bound_chars = (args.lower_bound_chars,
                                            args.upper_bound_chars)
    word_count = int((lower_bound_chars + upper_bound_chars) /
                     (2 * (avg_word_len + 1)))

    lens = pars.str.len()  # paragraph lengths
    nice_pars = pars[(lens >= lower_bound_chars)]  # paragraphs we want to use
    nice_pars = nice_pars.apply(
        partial(text_reduce_return, upper_bound_chars=upper_bound_chars,
                max_word_count=word_count)
    )

    vecs = emb(tuple(nice_pars), args.tfhub_sentence_encoder_url).numpy()
    D = sk.metrics.pairwise_distances(vecs, metric='cosine')  # pairwise distances of vectors
    R = scipy.sparse.csgraph.minimum_spanning_tree(D).max()  # reduced graph
    G = neighbors.radius_neighbors_graph(vecs, R, metric='cosine')
    core = nx.k_core(nx.Graph(G))

    # Capitalize all occurrences of keywords for easy display on the output
    # TODO, make matching case insensitive
    pattern = re.compile(f"\\b({tz.pipe(text_keywords, tz.pluck(0), '|'.join)})\\b")
    nice_pars = nice_pars.apply(
        lambda x: re.sub(pattern, lambda m: m.group().upper(), x))
    # TODO add [[]] around our keywords for zettelkasten

    core_nodes = core.nodes
    core_pars = np.array(nice_pars)[core_nodes]
    core_vecs = vecs[core_nodes]
    sil_u, n, lab, sil, p = clust(nx.adjacency_matrix(core), core_vecs, 8)
    layers = nx.onion_layers(core)

    df = pd.DataFrame(
        data=[{"Label": par, "Cluster ID": cid, "Silhouette Score": ss}
              for par, cid, ss in zip(core_pars, lab, sil)])
    df = df[df["Silhouette Score"] > 0]
    df['Cluster ID'] = df.apply(lambda row: "T" + str(row['Cluster ID']),
                                axis=1)

    # add footer to dataframe so that csv export will be imported by
    # gsheet's tree map plotter correctly
    for cluster_id in df['Cluster ID'].unique():
        df = df.append({"Label": cluster_id, "Cluster ID": name_of_pdf_dir,
                        "Silhouette Score": None}, ignore_index=True)
    else:
        df = df.append({"Label": name_of_pdf_dir, "Cluster ID": None,
                        "Silhouette Score": None}, ignore_index=True)

    df.to_csv(args.output_filename, index=False)
    return {
        "text_keywords": text_keywords
    }
    try:
        return summarize(paragraph, word_count=word_count).replace("\n", " ") or \
            paragraph[:upper_bound_chars]
    except ValueError:
        # usually happens if there aren't multiple sentences in paragraph
        return paragraph[:upper_bound_chars]


nice_pars = nice_pars.apply(text_reduce_return)
len(nice_pars), len(pars)

vecs = emb(tuple(nice_pars),
           "https://tfhub.dev/google/universal-sentence-encoder-large/5").numpy()
D = sk.metrics.pairwise_distances(vecs, metric='cosine')  # pairwise distances of vectors
R = scipy.sparse.csgraph.minimum_spanning_tree(D).max()  # reduced graph
G = neighbors.radius_neighbors_graph(vecs, R, metric='cosine')


@curry
def clust(g, v, n):
    pipe = pipeline.Pipeline([
        ('agg', cluster.AgglomerativeClustering(n, connectivity=g,
                                                linkage='ward',
                                                affinity='euclidean'))
    ])
    labels = pipe.fit_predict(v)
    silh = sk.metrics.silhouette_samples(v, labels, metric='cosine')
    return (silh.mean(), n, labels, silh, pipe)


core = nx.k_core(nx.Graph(G))

# Capitalize all occurrences of keywords for easy display on the output
# TODO, make matching case insensitive
pattern = re.compile(f"\\b({tz.pipe(keywords, tz.pluck(0), '|'.join)})\\b")
nice_pars = nice_pars.apply(
    lambda x: re.sub(pattern, lambda m: m.group().upper(), x))
# TODO, add [[]] around our keywords
def get_local_densities(data, kernel_mult=2.0, metric='manhattan'):
    """For each sample point of the data-set 'data', estimate a local density
    in feature space by counting the number of neighboring data-points within
    a particular region centered around that sample point.

    Parameters
    ----------
    data : array of shape (n_samples, n_features)
        The data-set, a fraction of whose sample points will be extracted
        by density sampling.

    kernel_mult : float, optional (default = 2.0)
        The kernel multiplier, which determines (in terms of the median of
        the distribution of distances among nearest neighbors) the extent
        of the regions centered around each sample point to consider for
        the computation of the local density associated to that particular
        sample point.

    metric : string, optional (default = 'manhattan')
        The distance metric used to determine the nearest-neighbor to each
        data-point. The DistanceMetric class defined in scikit-learn's
        library lists all available metrics.

    Returns
    -------
    local_densities : array of shape (n_samples,)
        The i-th entry of this vector corresponds to the local density of
        the i-th sample point in the order of the rows of 'data'.
    """
    data = np.atleast_2d(data)
    assert isinstance(kernel_mult, numbers.Real) and kernel_mult > 0

    kernel_width = kernel_mult * median_min_distance(data, metric)
    N_samples = data.shape[0]

    if 8.0 * get_chunk_size(N_samples, 1) > N_samples:
        A = radius_neighbors_graph(data, kernel_width, mode='connectivity',
                                   metric=metric, include_self=True)
        rows, _ = A.nonzero()

        with NamedTemporaryFile('w', delete=True, dir='./') as file_name:
            fp = np.memmap(file_name, dtype=int, mode='w+', shape=rows.shape)
            fp[:] = rows[:]
            _, counts = np.unique(fp, return_counts=True)

        local_densities = np.zeros(N_samples, dtype=int)
        for i in range(N_samples):  # xrange in the original Python 2 code
            local_densities[i] = counts[i]
    else:
        local_densities = np.zeros(N_samples, dtype=int)

        chunks_size = get_chunk_size(N_samples, 2)
        for i in range(0, N_samples, chunks_size):
            chunk = data[i:min(i + chunks_size, N_samples)]
            D = pairwise_distances(chunk, data, metric, n_jobs=1)
            D = (D <= kernel_width)
            local_densities[i + np.arange(min(chunks_size, N_samples - i))] \
                = D.sum(axis=1)

    return local_densities
# Import libraries
import json
import matplotlib.pyplot as plt, numpy as np, pandas as pd
from sklearn.neighbors import radius_neighbors_graph
from scipy.sparse.csgraph import connected_components

# Contact spacing
dist1 = 0.22
dist2 = 0.46

# Get connected components and distances
data = pd.read_csv("output/dispcont.csv", names=["x", "y", "w", "one"],
                   usecols=["x", "y"])
data["cc"] = connected_components(radius_neighbors_graph(data.values, 0.06))[1]
ccs = data.groupby("cc").count().reset_index().sort_values(
    "x")[-4:]["cc"].values
data["dist"] = np.sqrt(data["x"]**2 + data["y"]**2)

# Get points
refpts = []
pts = []
for cc in ccs:
    # Filter to connected component
    ccdata = data.loc[data["cc"] == cc]

    # Reference point that is closest to center
    refpt = ccdata.loc[ccdata["dist"].idxmin()]
    ccdata["refdist"] = np.sqrt((ccdata["x"] - refpt["x"])**2 +
                                (ccdata["y"] - refpt["y"])**2)
def fit(self, X, y=None):
    """
    Fit the ToMATo class on a point cloud: compute the ToMATo clusters and
    store the corresponding labels in a numpy array called labels_.

    Parameters:
        X (numpy array of shape (num_points) x (num_coordinates)): input
            point cloud.
        y (n x 1 array): point labels (unused).
    """
    num_pts = X.shape[0]

    if self.verbose:
        print("Computing density estimator")
    self.density_estimator.fit(X)
    self.density_values = self.density_estimator.score_samples(X)
    if self.verbose:
        plt.scatter(X[:, 0], X[:, 1], s=5., c=self.density_values)
        plt.show()

    if self.verbose:
        print("Computing underlying graph")
    if self.n_neighbors is not None:
        A = kneighbors_graph(X, self.n_neighbors).toarray()
        A = np.minimum(A + A.T, np.ones(A.shape))
    elif self.radius is not None:
        A = radius_neighbors_graph(X, self.radius).toarray()
    else:
        radius = estimate_scale(X, N=100, inp="point cloud", C=10., beta=0.)
        if self.verbose:
            print("radius = " + str(radius))
        A = radius_neighbors_graph(X, radius).toarray()

    if self.verbose:
        print("Sorting points by density")
    sorted_idxs = np.flip(np.argsort(self.density_values))
    inv_sorted_idxs = np.arange(num_pts)
    for i in range(num_pts):
        inv_sorted_idxs[sorted_idxs[i]] = i

    if self.verbose:
        print("Computing tau")
    if self.tau is not None:
        tau = self.tau
    else:
        st = gd.SimplexTree()
        for i in range(num_pts):
            st.insert([i], filtration=-self.density_values[i])
        for i in range(num_pts):
            for j in range(i + 1, num_pts):
                if A[i, j] == 1.:
                    st.insert([i, j], filtration=max(
                        -self.density_values[i], -self.density_values[j]))
        d = st.persistence()
        plot = gd.plot_persistence_diagram(d)
        plot.show()
        dgm = st.persistence_intervals_in_dimension(0)
        persistences = np.sort([abs(y - x) for (x, y) in dgm])
        if self.n_clusters is not None:
            tau = (persistences[-self.n_clusters - 1] +
                   persistences[-self.n_clusters]) / 2
        else:
            n_clusters = np.argmax(
                np.flip(persistences[1:-1] - persistences[:-2])) + 2
            tau = (persistences[-n_clusters - 1] +
                   persistences[-n_clusters]) / 2
        if self.verbose:
            print("tau = " + str(tau))

    if self.verbose:
        print("Applying UF sequentially")
    diag, parents = {}, -np.ones(num_pts, dtype=np.int32)
    for i in range(num_pts):
        current_pt = sorted_idxs[i]
        neighbors = np.squeeze(np.argwhere(A[current_pt, :] == 1.))
        higher_neighbors = ([nb for nb in neighbors
                             if inv_sorted_idxs[nb] <= i]
                            if len(neighbors.shape) > 0 else [])
        if higher_neighbors == []:
            parents[current_pt] = current_pt
            diag[current_pt] = -np.inf
        else:
            g = higher_neighbors[np.argmax(
                self.density_values[np.array(higher_neighbors)])]
            pg = self.find(g, parents)
            parents[current_pt] = pg
            for neighbor in higher_neighbors:
                pn = self.find(neighbor, parents)
                val = min(self.density_values[pg], self.density_values[pn])
                if pg != pn and val < self.density_values[current_pt] + tau \
                        and val > tau:
                    self.union(pg, pn, parents, self.density_values)
                    pp = (pg if self.density_values[pg] <
                          self.density_values[pn] else pn)
                    diag[pp] = current_pt

    self.labels_ = np.array([self.find(nb, parents) for nb in range(num_pts)])
    self.labels_ = LabelEncoder().fit_transform(
        np.where(self.density_values[self.labels_] > tau,
                 self.labels_, -np.ones(self.labels_.shape)))
def main(argv):
    parser = argparse.ArgumentParser(
        epilog="NOTE: it is important to have a smooth histogram for accurate fitting\n\n")
    parser.add_argument("filename", help="input filename")
    parser.add_argument("-m", "--metric", type=str,
                        help="define the scipy distance to be used (Default: euclidean or hamming for MSA)",
                        default='euclidean')
    parser.add_argument("-x", "--matrix",
                        help="if the input file contains already the complete upper triangle of a distance matrix (2 Formats: (idx_i idx_j distance) or simply distances list ) (Opt)",
                        action="store_true")
    parser.add_argument("-k", "--n_neighbors", type=int,
                        help="nearest_neighbors parameter (Default k=3)",
                        default=3)
    parser.add_argument("-r", "--radius", type=float,
                        help="use neighbor radius instead of nearest_neighbors (Opt)",
                        default=0.)
    parser.add_argument("-b", "--n_bins", type=int,
                        help="number of bins for distance histogram (Default 50)",
                        default=50)
    parser.add_argument("-M", "--r_max", type=float,
                        help="fix the value of distance distribution maximum in the fit (Opt, -1 force the standard fit, avoiding consistency checks)",
                        default=0)
    parser.add_argument("-n", "--r_min", type=float,
                        help="fix the value of shortest distance considered in the fit (Opt, -1 force the standard fit, avoiding consistency checks)",
                        default=-10)
    parser.add_argument("-D", "--direct",
                        help="analyze the direct (not graph) distances (Opt)",
                        action="store_true")
    parser.add_argument("-I", "--projection",
                        help="produce an Isomap projection using the first ID components (Opt)",
                        action="store_true")
    args = parser.parse_args()

    input_f = args.filename
    me = args.metric
    n_neighbors = args.n_neighbors
    radius = args.radius + 0
    MSA = False
    n_bins = args.n_bins
    rmax = args.r_max
    mm = -10000
    print('\nFile name: ', input_f)

    #0 Reading input file
    f1 = open(input_f)
    data = []
    data_line = []
    labels = []
    for line in f1:
        if line[0] == ">":
            MSA = True
            labels.append(line)
        if line[0] != ">" and MSA == True:
            data.append([ord(x) for x in line[:-1]])
            data_line.append(line)
        elif line[0] != "#" and MSA == False:
            data.append([float(x) for x in line.split()])
            data_line.append(line)
    f1.close()
    data = n.asarray(data)
    if MSA:
        me = 'hamming'
    if args.matrix:
        me = 'as from the input file'
    print('Metric: ', me)
    if radius > 0. and (args.direct == False):
        print('Nearest Neighbors Radius:', radius)
    elif (args.direct == False):
        print('Nearest Neighbors number K: ', n_neighbors)
    else:
        print('Distance distributions are calculated based on the direct input-space distances')
    if radius > 0.:
        filename = str(input_f.split('.')[0]) + 'R' + str(radius)
    else:
        filename = str(input_f.split('.')[0]) + 'K' + str(n_neighbors)
    #0

    #1 Computing geodesic distance on connected points of the input file
    #  and relative histogram
    if args.matrix:
        if data.shape[1] == 1:
            dist_mat = distance.squareform(data.ravel())
            mm = dist_mat.shape[1]
        elif data.shape[1] == 3:
            mm = int(max(data[:, 1]))
            dist_mat = n.zeros((mm, mm))
            for i in range(0, data.shape[0]):
                dist_mat[int(data[i, 0]) - 1, int(data[i, 1]) - 1] = data[i, 2]
                dist_mat[int(data[i, 1]) - 1, int(data[i, 0]) - 1] = data[i, 2]
        else:
            print('ERROR: The distances input is not in the right matrix format')
            sys.exit(2)
        print("\n# points: ", mm)
        A = n.zeros((mm, mm))
        rrr = []
        if radius > 0.:
            for i in range(0, mm):
                ll = dist_mat[i] < radius
                A[i, ll] = dist_mat[i, ll]
        else:
            rrr = n.argsort(dist_mat)
            for i in range(0, mm):
                ll = rrr[i, 0:n_neighbors + 1]
                A[i, ll] = dist_mat[i, ll]
            radius = A.max()
        if args.direct:
            C = dist_mat
        else:
            C = graph_shortest_path(A, directed=False)
    else:
        print("\n# points, coordinates: ", data.shape)
        if args.direct:
            C = distance.squareform(distance.pdist(data, me))
        elif radius > 0.:
            A = radius_neighbors_graph(data, radius, metric=me,
                                       mode='distance')
            C = graph_shortest_path(A, directed=False)
        else:
            A = kneighbors_graph(data, n_neighbors, metric=me,
                                 mode='distance')
            C = graph_shortest_path(A, directed=False)
            radius = A.max()

    C = n.asmatrix(C)
    connect = n.zeros(C.shape[0])
    conn = n.zeros(C.shape[0])
    for i in range(0, C.shape[0]):
        conn_points = n.count_nonzero(C[i])
        conn[i] = conn_points
        if conn_points > C.shape[0] / 2.:
            connect[i] = 1
        else:
            C[i] = 0
    if n.count_nonzero(connect) > C.shape[0] / 2.:
        print('Number of connected points:', n.count_nonzero(connect),
              '(', 100 * n.count_nonzero(connect) / C.shape[0], '% )')
    else:
        print('The neighbors graph is highly disconnected, increase K or Radius parameters')
        sys.exit(2)
    if n.count_nonzero(connect) < data.shape[0]:
        data_connect_file = open('connected_data_{0}.dat'.format(filename), "w")
        for i in range(0, C.shape[0]):
            if connect[i] == 1:
                if MSA:
                    data_connect_file.write(labels[i])
                data_connect_file.write(data_line[i])
        data_connect_file.close()

    indices = n.nonzero(n.triu(C, 1))
    dist_list = n.asarray(C[indices])[-1]
    dist_file = open('dist_{0}.dat'.format(filename), "w")
    for i in range(0, len(dist_list)):
        dist_file.write("%s " % ((dist_list[i])))
    dist_file.close()

    h = n.histogram(dist_list, n_bins)
    dx = h[1][1] - h[1][0]
    plt.figure(1)
    plt.plot(h[1][0:n_bins] + dx / 2, h[0], 'o-', label='histogram')
    plt.xlabel('r')
    plt.ylabel('N. counts')
    plt.legend()
    plt.savefig(filename + '_hist.png')

    distr_x = []
    distr_y = []
    avg = n.mean(dist_list)
    std = n.std(dist_list)
    if rmax > 0:
        avg = rmax
        std = min(std, rmax)
        print('\nNOTE: You fixed r_max for the initial fitting, average will have the same value')
    else:
        mm = n.argmax(h[0])
        rmax = h[1][mm] + dx / 2
    if args.r_max == -1:
        print('\nNOTE: You forced r_max to the maximum of the distribution in the initial fitting, avoiding consistency checks with the average')
        avg = rmax
        std = min(std, rmax)
    if args.r_min >= 0:
        print('\nNOTE: You fixed r_min for the initial fitting: r_min = ', args.r_min)
    if args.r_min == -1:
        print('\nNOTE: You forced r_min to the standard procedure in the initial fitting')
    print('\nDistances Statistics:')
    print('Average, standard dev., n_bin, bin_size, r_max, r_NN_max:',
          avg, std, n_bins, dx, rmax, radius, '\n')
    #1

    tmp = 1000000
    if args.r_min >= 0:
        tmp = args.r_min
    elif args.r_min == -1:
        tmp = rmax - std
    if n.fabs(rmax - avg) > std + 2. * dx:
        print('ERROR: There is a problem with the r_max detection:')
        print('       usually either the histogram is not smooth enough (you may consider changing the n_bins with option -b)')
        print('       or r_max and r_avg are too distant and you may consider to fix the first detection of r_max with option -M')
        print('       or to change the neighbor parameter with (-r/-k)')
        plt.show()
        sys.exit()
    elif rmax <= min(radius + dx, tmp):
        print('ERROR: There is a problem with the r_max detection, it is shorter than the largest distance in the neighbors graph.')
        print('       You may consider to fix the first detection of r_max with option -M and/or the r_min with option -n to fix the fit range')
        print('       or to decrease the neighbors parameter with (-r/-k). For example it is possible to enforce the standard fit range with')
        print('       r_min=r_max-2*sigma running option "-n -1"')
        plt.show()
        sys.exit()

    #2 Finding actual r_max and std. dev. to define fitting interval [rmin;rM]
    distr_x = h[1][0:n_bins] + dx / 2
    distr_y = h[0][0:n_bins]
    res = n.empty(25)
    left_distr_x = n.empty(n_bins)
    left_distr_y = n.empty(n_bins)
    left_distr_x = distr_x[n.logical_and(
        n.logical_and(distr_x[:] > rmax - std, distr_x[:] < rmax + std / 2.0),
        distr_y[:] > 0.000001)]
    left_distr_y = n.log(distr_y[n.logical_and(
        n.logical_and(distr_x[:] > rmax - std, distr_x[:] < rmax + std / 2.0),
        distr_y[:] > 0.000001)])
    if left_distr_y.shape[0] < 4:
        print('ERROR: Too few datapoints to fit the distribution:')
        print('       usually either the histogram is not smooth enough (you may consider changing the n_bins with option -b)')
        print('       or the distance distribution itself has some issue')
        plt.show()
        print('R, Dfit, Dmin', 'ERROR3', '\n')
        sys.exit()
    coeff = n.polyfit(left_distr_x, left_distr_y, 2, full='False')
    a0 = coeff[0][0]
    b0 = coeff[0][1]
    c0 = coeff[0][2]
    rmax_old = rmax
    std_old = std
    rmax = -b0 / a0 / 2.0
    if args.r_max > 0:
        rmax = args.r_max
    #if args.r_max == -1: rmax = avg  # to be used in future in case of problem with Ymax
    if a0 < 0 and n.fabs(rmax - rmax_old) < std_old / 2 + dx:
        std = n.sqrt(-1 / a0 / 2.)
    else:
        rmax = avg
        std = std_old
    left_distr_x = distr_x[n.logical_and(
        distr_y[:] > 0.000001,
        n.logical_and(distr_x[:] > rmax - std,
                      distr_x[:] < rmax + std / 2. + dx))]
    left_distr_y = n.log(distr_y[n.logical_and(
        distr_y[:] > 0.000001,
        n.logical_and(distr_x[:] > rmax - std,
                      distr_x[:] < rmax + std / 2. + dx))])
    if left_distr_y.shape[0] < 4:
        print('ERROR: Too few datapoints to fit the distribution:')
        print('       usually either the histogram is not smooth enough (you may consider changing the n_bins with option -b)')
        print('       or the distance distribution itself has some issue')
        plt.show()
        sys.exit()
    coeff = n.polyfit(left_distr_x, left_distr_y, 2, full='False')
    a = coeff[0][0]
    b = coeff[0][1]
    c = coeff[0][2]
    rmax_old = rmax
    std_old = std
    if a < 0.:
        rmax = -b / a / 2.
        std = n.sqrt(-1 / a / 2.)  # it was a0
    rmin = max(rmax - 2 * std - dx / 2, 0.)
    if args.r_min >= 0:
        rmin = args.r_min
    elif rmin < radius and args.r_min != -1:
        rmin = radius
        print('\nWARNING: For internal consistency r_min has been fixed to the largest distance (r_NN_max) in the neighbors graph.')
        print('         It is possible to reset the standard definition of r_min=r_max-2*sigma running with option "-n -1"')
        print('         or you can use -n to manually define a desired value (Example: -n 0.1)\n')
    rM = rmax + dx / 4
    if n.fabs(rmax - rmax_old) > std_old / 4 + dx:  # fit consistency check
        print('\nWARNING: The histogram is probably not smooth enough (you may try to change n_bin with -b), rmax is fixed to the value of first iteration\n')
        rmax = rmax_old
        a = a0
        b = b0
        c = c0
        if args.r_min >= 0:
            rmin = args.r_min
        elif rmin < radius and args.r_min != -1:
            rmin = radius
            print('\nWARNING2: For internal consistency r_min has been fixed to the largest distance in the neighbors graph (r_NN_max).')
            print('          It is possible to reset the standard definition of r_min=r_max-2*sigma running with option "-n -1"')
            print('          or you can use -n to manually define a desired value (Example: -n 0.1)\n')
        rM = rmax + dx / 4
    #2

    #3 Gaussian fitting to determine ratio R
    left_distr_x = distr_x[n.logical_and(
        n.logical_and(distr_x[:] > rmin, distr_x[:] <= rM),
        distr_y[:] > 0.000001)] / rmax
    left_distr_y = n.log(distr_y[n.logical_and(
        n.logical_and(distr_x[:] > rmin, distr_x[:] <= rM),
        distr_y[:] > 0.000001)]) - (4 * a * c - b ** 2) / 4. / a
    if left_distr_y.shape[0] < 4:
        print('ERROR: Too few datapoints to fit the distribution:')
        print('       usually either the histogram is not smooth enough (you may consider changing the n_bins with option -b)')
        print('       or the distance distribution itself has some issue')
        plt.show()
        sys.exit()
    fit = curve_fit(func2, left_distr_x, left_distr_y)
    ratio = n.sqrt(fit[0][0])
    y1 = func2(left_distr_x, fit[0][0])
    #3

    #4 Geodesic D-hypersphere distribution fitting to determine Dfit
    fit = curve_fit(func, left_distr_x, left_distr_y)
    Dfit = (fit[0][0]) + 1
    y2 = func(left_distr_x, fit[0][0], fit[0][1], fit[0][2])
    #4

    #5 Determination of Dmin
    D_file = open('D_residual_{0}.dat'.format(filename), "w")
    for D in range(1, 26):
        y = (func(left_distr_x, D - 1, 1, 0))
        for i in range(0, len(y)):
            res[D - 1] = n.linalg.norm((y) - (left_distr_y)) / n.sqrt(len(y))
        D_file.write("%s " % D)
        D_file.write("%s\n" % res[D - 1])
    Dmin = n.argmax(-res) + 1
    y = func(left_distr_x, Dmin - 1, fit[0][1], 0)
    #5

    #6 Printing results
    print('\nFITTING PARAMETERS:')
    print('rmax, std. dev., rmin', rmax, std, rmin)
    print('\nFITTING RESULTS:')
    print('R, Dfit, Dmin', ratio, Dfit, Dmin, '\n')
    if Dmin == 1:
        print('NOTE: Dmin = 1 could indicate that the choice of the input parameters is not optimal or simply an underestimation of a 2D manifold\n')
    if Dfit > 25:
        print('NOTE: Dfit > 25 could indicate that the choice of the input parameters is not optimal or that the distance distribution itself has some issue\n')
    fit_file = open('fit_{0}.dat'.format(filename), "w")
    for i in range(0, len(y)):
        fit_file.write("%s " % left_distr_x[i])
        fit_file.write("%s " % ((left_distr_y[i])))
        fit_file.write("%s " % ((y1[i])))
        fit_file.write("%s " % ((y2[i])))
        fit_file.write("%s\n" % ((y[i])))
    fit_file.close()
    stat_file = open('statistics_{0}.dat'.format(filename), "w")
    statistics = str('# Npoints, rmax, standard deviation, R, D_fit, Dmin \n# \
{}, {}, {}, {}, {}, {}\n'.format(n.count_nonzero(connect), rmax, std, ratio,
                                 Dfit, Dmin))
    stat_file.write("%s" % statistics)
    for i in range(0, len(distr_x) - 2):
        if distr_y[i] > 0.000001:
            stat_file.write("%s " % distr_x[i])
            stat_file.write("%s " % distr_y[i])
            stat_file.write("%s\n" % n.log(distr_y[i]))
    stat_file.close()

    plt.figure(2)
    plt.plot(left_distr_x, left_distr_y, 'o-', label=str(input_f.split('.')[0]))
    plt.plot(left_distr_x, y1, label='Gaussian fit for R ratio')
    plt.plot(left_distr_x, y2, label='D-Hypersphere Fit for D_fit')
    plt.plot(left_distr_x, y, label='D_min-Hypersphere Distribution')
    plt.xlabel('r/r$_{max}$')
    plt.ylabel('log p(r)/p(r$_{max}$)')
    plt.legend(loc=4)
    plt.savefig(str(input_f.split('.')[0]) + '_fit.png')
    plt.figure(3)
    plt.plot(range(1, 26), res, 'o-', label=str(input_f.split('.')[0]) + ' D_min')
    plt.legend()
    plt.xlabel('D')
    plt.ylabel('RMDS')
    plt.show()
    plt.savefig(str(input_f.split('.')[0]) + '_Dmin.png')
    #6

    #7 Optional: Isomap projection
    if args.projection:
        from sklearn.decomposition import KernelPCA
        C2 = (distance.squareform(dist_list)) ** 2
        C2 = -.5 * C2
        obj_pj = KernelPCA(n_components=100, kernel="precomputed")
        proj = obj_pj.fit_transform(C2)
        n.savetxt('proj_' + str(input_f.split('.')[0]) + '.dat',
                  proj[:, 0:Dmin + 1])
print 'NOTE: it is important to have a smooth histogram for accurate fitting\n'
# Assumes the imports of the original script are in scope: numpy as n,
# matplotlib.pyplot as plt, sys, argparse, scipy.spatial.distance as distance,
# scipy.optimize.curve_fit, sklearn.neighbors.{kneighbors_graph,
# radius_neighbors_graph}, graph_shortest_path (from the old
# sklearn.utils.graph module; scipy.sparse.csgraph.shortest_path is the modern
# equivalent), and the model distributions func and func2 defined elsewhere.
def main(argv):
    parser = argparse.ArgumentParser(
        epilog="NOTE: it is important to have a smooth histogram for accurate fitting\n\n")
    parser.add_argument("filename", help="input filename")
    parser.add_argument(
        "-m", "--metric", type=str,
        help="define the scipy distance to be used (Default: euclidean or hamming for MSA)",
        default='euclidean')
    parser.add_argument(
        "-x", "--matrix",
        help="if the input file contains already the complete upper triangle of a distance matrix "
             "(2 Formats: (idx_i idx_j distance) or simply distances list ) (Opt)",
        action="store_true")
    parser.add_argument("-k", "--n_neighbors", type=int,
                        help="nearest_neighbors parameter (Default k=3)", default=3)
    parser.add_argument("-r", "--radius", type=float,
                        help="use neighbor radius instead of nearest_neighbors (Opt)", default=0.)
    parser.add_argument("-b", "--n_bins", type=int,
                        help="number of bins for distance histogram (Default 50)", default=50)
    parser.add_argument(
        "-M", "--r_max", type=float,
        help="fix the value of distance distribution maximum in the fit "
             "(Opt, -1 force the standard fit, avoiding consistency checks)",
        default=0)
    parser.add_argument(
        "-n", "--r_min", type=float,
        help="fix the value of shortest distance considered in the fit "
             "(Opt, -1 force the standard fit, avoiding consistency checks)",
        default=-10)
    parser.add_argument("-D", "--direct",
                        help="analyze the direct (not graph) distances (Opt)",
                        action="store_true")
    parser.add_argument("-I", "--projection",
                        help="produce an Isomap projection using the first ID components (Opt)",
                        action="store_true")
    args = parser.parse_args()

    input_f = args.filename
    me = args.metric
    n_neighbors = args.n_neighbors
    radius = args.radius
    MSA = False
    n_bins = args.n_bins
    rmax = args.r_max
    mm = -10000
    print('\nFile name: ', input_f)

    #0 Reading input file
    f1 = open(input_f)
    data = []
    data_line = []
    labels = []
    for line in f1:
        if line[0] == ">":
            MSA = True
            labels.append(line)
        if line[0] != ">" and MSA == True:
            data.append([ord(x) for x in line[:-1]])
            data_line.append(line)
        elif line[0] != "#" and MSA == False:
            data.append([float(x) for x in line.split()])
            data_line.append(line)
    f1.close()
    data = n.asarray(data)
    if MSA:
        me = 'hamming'
    if args.matrix:
        me = 'as from the input file'
    print('Metric: ', me)
    if radius > 0. and (args.direct == False):
        print('Nearest Neighbors Radius:', radius)
    elif (args.direct == False):
        print('Nearest Neighbors number K: ', n_neighbors)
    else:
        print('Distance distributions are calculated based on the direct input-space distances ')
    if radius > 0.:
        filename = str(input_f.split('.')[0]) + 'R' + str(radius)
    else:
        filename = str(input_f.split('.')[0]) + 'K' + str(n_neighbors)
    #0

    #1 Computing geodesic distance on connected points of the input file and relative histogram
    if args.matrix:
        if data.shape[1] == 1:
            dist_mat = distance.squareform(data.ravel())
            mm = dist_mat.shape[1]
        elif data.shape[1] == 3:
            mm = int(max(data[:, 1]))
            dist_mat = n.zeros((mm, mm))
            for i in range(0, data.shape[0]):
                dist_mat[int(data[i, 0]) - 1, int(data[i, 1]) - 1] = data[i, 2]
                dist_mat[int(data[i, 1]) - 1, int(data[i, 0]) - 1] = data[i, 2]
        else:
            print('ERROR: The distances input is not in the right matrix format')
            sys.exit(2)
        print("\n# points: ", mm)
        A = n.zeros((mm, mm))
        rrr = []
        if radius > 0.:
            for i in range(0, mm):
                ll = dist_mat[i] < radius
                A[i, ll] = dist_mat[i, ll]
        else:
            rrr = n.argsort(dist_mat)
            for i in range(0, mm):
                ll = rrr[i, 0:n_neighbors + 1]
                A[i, ll] = dist_mat[i, ll]
            radius = A.max()
        if args.direct:
            C = dist_mat
        else:
            C = graph_shortest_path(A, directed=False)
    else:
        print("\n# points, coordinates: ", data.shape)
        if args.direct:
            C = distance.squareform(distance.pdist(data, me))
        elif radius > 0.:
            A = radius_neighbors_graph(data, radius, metric=me, mode='distance')
            C = graph_shortest_path(A, directed=False)
        else:
            A = kneighbors_graph(data, n_neighbors, metric=me, mode='distance')
            C = graph_shortest_path(A, directed=False)
            radius = A.max()
    C = n.asmatrix(C)
    connect = n.zeros(C.shape[0])
    conn = n.zeros(C.shape[0])
    for i in range(0, C.shape[0]):
        conn_points = n.count_nonzero(C[i])
        conn[i] = conn_points
        if conn_points > C.shape[0] / 2.:
            connect[i] = 1
        else:
            C[i] = 0
    if n.count_nonzero(connect) > C.shape[0] / 2.:
        print('Number of connected points:', n.count_nonzero(connect),
              '(', 100 * n.count_nonzero(connect) / C.shape[0], '% )')
    else:
        print('The neighbors graph is highly disconnected, increase K or Radius parameters')
        sys.exit(2)
    if n.count_nonzero(connect) < data.shape[0]:
        data_connect_file = open('connected_data_{0}.dat'.format(filename), "w")
        for i in range(0, C.shape[0]):
            if connect[i] == 1:
                if MSA:
                    data_connect_file.write(labels[i])
                data_connect_file.write(data_line[i])
        data_connect_file.close()
    indices = n.nonzero(n.triu(C, 1))
    dist_list = n.asarray(C[indices])[-1]
    dist_file = open('dist_{0}.dat'.format(filename), "w")
    for i in range(0, len(dist_list)):
        dist_file.write("%s " % dist_list[i])
    dist_file.close()
    h = n.histogram(dist_list, n_bins)
    dx = h[1][1] - h[1][0]
    plt.figure(1)
    plt.plot(h[1][0:n_bins] + dx / 2, h[0], 'o-', label='histogram')
    plt.xlabel('r')
    plt.ylabel('N. counts')
    plt.legend()
    plt.savefig(filename + '_hist.png')
    distr_x = []
    distr_y = []
    avg = n.mean(dist_list)
    std = n.std(dist_list)
    if rmax > 0:
        avg = rmax
        std = min(std, rmax)
        print('\nNOTE: You fixed r_max for the initial fitting, average will have the same value')
    else:
        mm = n.argmax(h[0])
        rmax = h[1][mm] + dx / 2
    if args.r_max == -1:
        print('\nNOTE: You forced r_max to the maximum of the distribution in the initial fitting, '
              'avoiding consistency checks with the average')
        avg = rmax
        std = min(std, rmax)
    if args.r_min >= 0:
        print('\nNOTE: You fixed r_min for the initial fitting: r_min = ', args.r_min)
    if args.r_min == -1:
        print('\nNOTE: You forced r_min to the standard procedure in the initial fitting')
    print('\nDistances Statistics:')
    print('Average, standard dev., n_bin, bin_size, r_max, r_NN_max:',
          avg, std, n_bins, dx, rmax, radius, '\n')
    #1

    tmp = 1000000
    if (args.r_min >= 0):
        tmp = args.r_min
    elif (args.r_min == -1):
        tmp = rmax - std
    if (n.fabs(rmax - avg) > std + 2. * dx):
        print('ERROR: There is a problem with the r_max detection:')
        print('       usually either the histogram is not smooth enough (you may consider changing the n_bins with option -b)')
        print('       or r_max and r_avg are too distant and you may consider to fix the first detection of r_max with option -M')
        print('       or to change the neighbor parameter with (-r/-k)')
        plt.show()
        sys.exit()
    elif (rmax <= min(radius + dx, tmp)):
        print('ERROR: There is a problem with the r_max detection, it is shorter than the largest distance in the neighbors graph.')
        print('       You may consider to fix the first detection of r_max with option -M and/or the r_min with option -n to fix the fit range')
        print('       or to decrease the neighbors parameter with (-r/-k). For example it is possible to enforce the standard fit range with')
        print('       r_min=r_max-2*sigma running option "-n -1"')
        plt.show()
        sys.exit()

    #2 Finding actual r_max and std. dev. to define fitting interval [rmin;rM]
    distr_x = h[1][0:n_bins] + dx / 2
    distr_y = h[0][0:n_bins]
    res = n.empty(25)
    left_distr_x = distr_x[n.logical_and(
        n.logical_and(distr_x[:] > rmax - std, distr_x[:] < rmax + std / 2.0),
        distr_y[:] > 0.000001)]
    left_distr_y = n.log(distr_y[n.logical_and(
        n.logical_and(distr_x[:] > rmax - std, distr_x[:] < rmax + std / 2.0),
        distr_y[:] > 0.000001)])
    if (left_distr_y.shape[0] < 4):
        print('ERROR: Too few datapoints to fit the distribution:')
        print('       usually either the histogram is not smooth enough (you may consider changing the n_bins with option -b)')
        print('       or the distance distribution itself has some issue')
        plt.show()
        print('R, Dfit, Dmin', 'ERROR3', '\n')
        sys.exit()
    # full=True returns (coefficients, residuals, rank, singular values, rcond);
    # the coefficient array is coeff[0]
    coeff = n.polyfit(left_distr_x, left_distr_y, 2, full=True)
    a0 = coeff[0][0]
    b0 = coeff[0][1]
    c0 = coeff[0][2]
    rmax_old = rmax
    std_old = std
    rmax = -b0 / a0 / 2.0
    if (args.r_max > 0):
        rmax = args.r_max
    #if(args.r_max==-1) : rmax=avg  # to be used in future in case of problem with Ymax
    if a0 < 0 and n.fabs(rmax - rmax_old) < std_old / 2 + dx:
        std = n.sqrt(-1 / a0 / 2.)
    else:
        rmax = avg
        std = std_old
    left_distr_x = distr_x[n.logical_and(
        distr_y[:] > 0.000001,
        n.logical_and(distr_x[:] > rmax - std, distr_x[:] < rmax + std / 2. + dx))]
    left_distr_y = n.log(distr_y[n.logical_and(
        distr_y[:] > 0.000001,
        n.logical_and(distr_x[:] > rmax - std, distr_x[:] < rmax + std / 2. + dx))])
    if (left_distr_y.shape[0] < 4):
        print('ERROR: Too few datapoints to fit the distribution:')
        print('       usually either the histogram is not smooth enough (you may consider changing the n_bins with option -b)')
        print('       or the distance distribution itself has some issue')
        plt.show()
        sys.exit()
    coeff = n.polyfit(left_distr_x, left_distr_y, 2, full=True)
    a = coeff[0][0]
    b = coeff[0][1]
    c = coeff[0][2]
    rmax_old = rmax
    std_old = std
    if a < 0.:
        rmax = -b / a / 2.
        std = n.sqrt(-1 / a / 2.)  # it was a0
    rmin = max(rmax - 2 * std - dx / 2, 0.)
    if (args.r_min >= 0):
        rmin = args.r_min
    elif (rmin < radius and args.r_min != -1):
        rmin = radius
        print('\nWARNING: For internal consistency r_min has been fixed to the largest distance (r_NN_max) in the neighbors graph.')
        print('         It is possible to reset the standard definition of r_min=r_max-2*sigma running with option "-n -1"')
        print('         or you can use -n to manually define a desired value (Example: -n 0.1)\n')
    rM = rmax + dx / 4
    if (n.fabs(rmax - rmax_old) > std_old / 4 + dx):  # fit consistency check
        print('\nWARNING: The histogram is probably not smooth enough (you may try to change n_bins with -b), rmax is fixed to the value of the first iteration\n')
        rmax = rmax_old
        a = a0
        b = b0
        c = c0
        if (args.r_min >= 0):
            rmin = args.r_min
        elif (rmin < radius and args.r_min != -1):
            rmin = radius
            print('\nWARNING2: For internal consistency r_min has been fixed to the largest distance in the neighbors graph (r_NN_max).')
            print('          It is possible to reset the standard definition of r_min=r_max-2*sigma running with option "-n -1"')
            print('          or you can use -n to manually define a desired value (Example: -n 0.1)\n')
        rM = rmax + dx / 4
    #2

    #3 Gaussian Fitting to determine ratio R
    left_distr_x = distr_x[n.logical_and(
        n.logical_and(distr_x[:] > rmin, distr_x[:] <= rM),
        distr_y[:] > 0.000001)] / rmax
    left_distr_y = n.log(distr_y[n.logical_and(
        n.logical_and(distr_x[:] > rmin, distr_x[:] <= rM),
        distr_y[:] > 0.000001)]) - (4 * a * c - b**2) / 4. / a
    if (left_distr_y.shape[0] < 4):
        print('ERROR: Too few datapoints to fit the distribution:')
        print('       usually either the histogram is not smooth enough (you may consider changing the n_bins with option -b)')
        print('       or the distance distribution itself has some issue')
        plt.show()
        sys.exit()
    fit = curve_fit(func2, left_distr_x, left_distr_y)
    ratio = n.sqrt(fit[0][0])
    y1 = func2(left_distr_x, fit[0][0])
    #3

    #4 Geodesics D-Hypersphere Distribution Fitting to determine Dfit
    fit = curve_fit(func, left_distr_x, left_distr_y)
    Dfit = (fit[0][0]) + 1
    y2 = func(left_distr_x, fit[0][0], fit[0][1], fit[0][2])
    #4

    #5 Determination of Dmin
    D_file = open('D_residual_{0}.dat'.format(filename), "w")
    for D in range(1, 26):
        y = func(left_distr_x, D - 1, 1, 0)
        # RMSD between the trial D-hypersphere curve and the data (one scalar per D)
        res[D - 1] = n.linalg.norm(y - left_distr_y) / n.sqrt(len(y))
        D_file.write("%s " % D)
        D_file.write("%s\n" % res[D - 1])
    D_file.close()
    Dmin = n.argmax(-res) + 1
    y = func(left_distr_x, Dmin - 1, fit[0][1], 0)
    #5

    #6 Printing results
    print('\nFITTING PARAMETERS:')
    print('rmax, std. dev., rmin', rmax, std, rmin)
    print('\nFITTING RESULTS:')
    print('R, Dfit, Dmin', ratio, Dfit, Dmin, '\n')
    if (Dmin == 1):
        print('NOTE: Dmin = 1 could indicate that the choice of the input parameters is not optimal or simply an underestimation of a 2D manifold\n')
    if (Dfit > 25):
        print('NOTE: Dfit > 25 could indicate that the choice of the input parameters is not optimal or that the distance distribution itself has some issue\n')
    fit_file = open('fit_{0}.dat'.format(filename), "w")
    for i in range(0, len(y)):
        fit_file.write("%s " % left_distr_x[i])
        fit_file.write("%s " % left_distr_y[i])
        fit_file.write("%s " % y1[i])
        fit_file.write("%s " % y2[i])
        fit_file.write("%s\n" % y[i])
    fit_file.close()
    stat_file = open('statistics_{0}.dat'.format(filename), "w")
    statistics = '# Npoints, rmax, standard deviation, R, D_fit, Dmin \n# {}, {}, {}, {}, {}, {}\n'.format(
        n.count_nonzero(connect), rmax, std, ratio, Dfit, Dmin)
    stat_file.write("%s" % statistics)
    for i in range(0, len(distr_x) - 2):
        if distr_y[i] > 0.000001:
            stat_file.write("%s " % distr_x[i])
            stat_file.write("%s " % distr_y[i])
            stat_file.write("%s\n" % n.log(distr_y[i]))
    stat_file.close()
    plt.figure(2)
    plt.plot(left_distr_x, left_distr_y, 'o-', label=str(input_f.split('.')[0]))
    plt.plot(left_distr_x, y1, label='Gaussian fit for R ratio')
    plt.plot(left_distr_x, y2, label='D-Hypersphere Fit for D_fit')
    plt.plot(left_distr_x, y, label='D_min-Hypersphere Distribution')
    plt.xlabel('r/r$_{max}$')
    plt.ylabel('log p(r)/p(r$_{max}$)')
    plt.legend(loc=4)
    plt.savefig(str(input_f.split('.')[0]) + '_fit.png')
    plt.figure(3)
    plt.plot(range(1, 26), res, 'o-', label=str(input_f.split('.')[0]) + ' D_min')
    plt.legend()
    plt.xlabel('D')
    plt.ylabel('RMSD')
    # save before show, so the saved figure is not blank once the window is closed
    plt.savefig(str(input_f.split('.')[0]) + '_Dmin.png')
    plt.show()
    #6

    #7 Optional: Isomap projection
    if args.projection:
        from sklearn.decomposition import KernelPCA
        C2 = (distance.squareform(dist_list))**2
        C2 = -.5 * C2
        obj_pj = KernelPCA(n_components=100, kernel="precomputed")
        proj = obj_pj.fit_transform(C2)
        n.savetxt('proj_' + str(input_f.split('.')[0]) + '.dat', proj[:, 0:Dmin + 1])
    print('NOTE: it is important to have a smooth histogram for accurate fitting\n')
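# Illustrative sketch of the geodesic-distance pipeline the script above relies
# on: build a k-NN distance graph, take shortest paths as geodesic distances,
# and histogram them. graph_shortest_path came from the old sklearn.utils.graph
# module; scipy.sparse.csgraph.shortest_path is used here as its equivalent.
# The data and parameter values below are invented for illustration.
import numpy as np
from scipy.sparse.csgraph import shortest_path
from sklearn.neighbors import kneighbors_graph

rng = np.random.RandomState(0)
X = rng.randn(200, 3)                          # hypothetical dataset
A = kneighbors_graph(X, 3, mode='distance')    # k=3, matching the -k default
C = shortest_path(A, directed=False)           # geodesic (graph) distances
geodesics = C[np.triu_indices_from(C, k=1)]
geodesics = geodesics[np.isfinite(geodesics)]  # drop disconnected pairs (inf)
counts, edges = np.histogram(geodesics, bins=50)  # matching the -b default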
# Assumes the original module's imports: json, os, pickle, numpy as np, Bing
# from geopy.geocoders, unique_everseen (e.g. from more_itertools),
# radius_neighbors_graph from sklearn.neighbors, and the project helpers apis,
# randomDistrict and getNearestDistrictData.
def makeGraph(dataset, model, districtStats, R, sigma):
    RADIUS_OF_EARTH = 6378
    dataFile = json.load(open(dataset))
    dates = [date for date in dataFile]
    # Saving locations from dictionary
    placesList = []
    for date in dates:
        for state in list(dataFile[date]):
            if state == 'TT':
                continue  # 'TT' is the country-wide total, not a state
            try:
                for district in list(dataFile[date][state]['districts']):
                    if district == 'Unknown' or district == 'Other State':
                        district = randomDistrict(state)
                    place = district + ',' + state + ',' + 'India'
                    if place not in placesList:
                        placesList.append(place)
            except KeyError:
                place = state + ',' + 'India'
                if place not in placesList:
                    placesList.append(place)
    print('Updated places')
    # Geolocator, we save stuff to geoUP.p
    geolocator = Bing(api_key=apis.bing())
    uniquePlacesList = list(unique_everseen(placesList))
    geocodedDistrictList = list(districtStats['Coordinates'])
    geocodedUniqueNearestDistrictList = list(
        np.zeros_like(uniquePlacesList).astype(str))
    # Initialize if not present
    if not os.path.exists('data/geoUP.p'):
        geocodedUniquePlacesList = list(
            np.zeros_like(uniquePlacesList).astype(str))
        with open('data/geoUP.p', 'wb') as f:
            pickle.dump(geocodedUniquePlacesList, f)
    # Add new locations if any
    with open('data/geoUP.p', 'rb') as f:
        geocodedUniquePlacesList = pickle.load(f)
    for i in range(len(uniquePlacesList)):
        if geocodedUniquePlacesList[i] == '':
            # geocode once per place instead of twice
            location = geolocator.geocode(uniquePlacesList[i])
            geocodedUniquePlacesList[i] = (location.latitude, location.longitude)
    print('Geo mapping stuff done')
    # Save to pickle
    with open('data/geoUP.p', 'wb') as f:
        pickle.dump(geocodedUniquePlacesList, f)
    for i in range(len(uniquePlacesList)):
        _, _, coordinate, _ = getNearestDistrictData(
            model, districtStats, geocodedUniquePlacesList[i])
        geocodedUniqueNearestDistrictList[i] = coordinate
    # Map case counts into the final time-resolved array
    numberOfDistricts = len(geocodedDistrictList)
    numberOfDates = len(dates)
    arrayFinal = np.zeros((numberOfDates, numberOfDistricts, 3))
    print('Making final time resolved array')
    for dateIndex in range(numberOfDates):
        for districtIndex in range(numberOfDistricts):
            date = dates[dateIndex]
            district = list(districtStats['Coordinates'])[districtIndex]
            place = None  # guard: stays None when the district is not listed
            try:
                place = uniquePlacesList[
                    geocodedUniqueNearestDistrictList.index(district)]
            # If that district is not enlisted in corona affected places
            except ValueError:
                pass
            if place:
                dump = place.split(',')
                number = 0
                # Check to see if district or state only data
                if len(dump) == 2:
                    try:
                        number = dataFile[date][dump[0]]['total']['confirmed']
                    # If that state does not exist on that date
                    except KeyError:
                        pass
                else:
                    try:
                        number = dataFile[date][dump[1]]['districts'][
                            dump[0]]['total']['confirmed']
                    # If that district does not exist in this state on the date
                    except KeyError:
                        pass
                arrayFinal[dateIndex, districtIndex, 0] = number
                arrayFinal[dateIndex, districtIndex, 1] = list(
                    districtStats['Literacy rate'])[districtIndex]
                arrayFinal[dateIndex, districtIndex, 2] = list(
                    districtStats['Population'])[districtIndex]
    print('Array made')
    E = radius_neighbors_graph(model, R / RADIUS_OF_EARTH, mode='distance',
                               metric='haversine').toarray()
    W = 1 - np.exp(-(E * E) / sigma)
    adj = np.where(W > 0, 1, 0)
    # edge = W.reshape(1, W.shape[0]*W.shape[1])
    return arrayFinal, W, adj
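# Illustrative sketch of the haversine radius graph used by makeGraph above:
# the haversine metric expects (latitude, longitude) pairs *in radians* and
# returns great-circle distances on the unit sphere, so dividing the cutoff R
# (in km) by the Earth radius puts both in the same units. The coordinates and
# cutoff below are invented for illustration.
import numpy as np
from sklearn.neighbors import radius_neighbors_graph

RADIUS_OF_EARTH = 6378  # km, matching the constant above
coords_deg = np.array([[28.61, 77.21],   # hypothetical (lat, lon) points
                       [19.08, 72.88],
                       [28.70, 77.10]])
coords_rad = np.radians(coords_deg)
E = radius_neighbors_graph(coords_rad, 500.0 / RADIUS_OF_EARTH,
                           mode='distance', metric='haversine').toarray()
distances_km = E * RADIUS_OF_EARTH  # convert unit-sphere distances back to km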
def epsilon_graph(X, e):
    # Connect every pair of points closer than e; edge weights are distances.
    A = radius_neighbors_graph(X, e, mode='distance', include_self=False)
    # The original called A.toarray() without using the result (a no-op);
    # returning the dense array appears to be the intent.
    return A.toarray()
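# Illustrative call of epsilon_graph on invented toy data: points within e of
# each other get an edge weighted by their distance, everything else stays 0.
import numpy as np

X_demo = np.array([[0.0, 0.0], [0.5, 0.0], [3.0, 3.0]])
A_demo = epsilon_graph(X_demo, e=1.0)
# A_demo[0, 1] == A_demo[1, 0] == 0.5; the far point (row 2) has no edges.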
from sklearn.datasets import make_circles

random_state = 21
#X_mn, y_mn = make_moons(150, noise=.07, random_state=random_state)
#X_mn, y_mn = make_circles(150, noise=.07, random_state=random_state)
X_mn, y_mn = make_circles(n_samples=400, factor=.3, noise=0.025)
cmap = 'viridis'
dot_size = 50
#fig, ax = plt.subplots(figsize=(9, 7))
#ax.set_title('Data with ground truth labels - linear separation not possible',
#             fontsize=18, fontweight='demi')
#ax.scatter(X_mn[:, 0], X_mn[:, 1], c=y_mn, s=dot_size, cmap=cmap)
#fig.show()
A = radius_neighbors_graph(X_mn, 0.4, mode='distance', metric='minkowski',
                           p=2, metric_params=None, include_self=False)
# A = kneighbors_graph(X_mn, 2, mode='connectivity', metric='minkowski', p=2,
#                      metric_params=None, include_self=False)
A = A.toarray()
print(A.shape)
"""
fig, ax = plt.subplots(figsize=(9, 7))
ax.set_title('5 first datapoints', fontsize=18, fontweight='demi')
ax.set_xlim(-1, 2)
ax.set_ylim(-1, 1)
ax.scatter(X_mn[:5, 0], X_mn[:5, 1], s=dot_size, cmap=cmap)
for i in range(5):
    ax.annotate(i, (X_mn[i, 0], X_mn[i, 1]))
fig.show()
"""
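# Illustrative follow-up for the snippet above: with factor=.3 and radius 0.4,
# the two circles should fall into separate connected components of the
# epsilon-graph. A quick check of that claim, assuming scipy is available (the
# exact component count depends on the noise draw).
from scipy.sparse.csgraph import connected_components

n_components, component_labels = connected_components(A, directed=False)
print(n_components)  # expected: 2 when each ring is internally connected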
# Assumes TF1-style sessions (tf), numpy as np, networkx as nx, scipy.sparse
# as sp, itertools, matplotlib.pyplot as plt, sklearn.neighbors as nn, and the
# project helpers featurize_state, sparse_to_tuple, encode_onehot, get_graph;
# gen_graph, seed and plots are module-level globals.
def actor_critic(sess, gcn, placeholders, env, estimator_policy,
                 estimator_value, num_episodes, discount_factor=1.0):
    global gen_graph
    G = nx.Graph()
    colors = []
    totsteps = 0
    positions = np.arange(-1.2, 0.6, 0.01)
    velocities = np.arange(-0.07, 0.07, 0.001)
    vinput = []
    for vel in velocities:
        for pos in positions:
            vinput.append([pos, vel])
    vinput = np.array(vinput)
    name = "res/mountain_graph{}_seed{}.csv".format(gen_graph, seed)
    change_lr = 0
    stats = []
    states = []
    done = False
    node_ptr = 0
    for i_episode in range(num_episodes):
        if i_episode % 3 == 0 and gen_graph:
            print('new graph')
            G = nx.Graph()
            states = []
            node_ptr = 0
            # curr_episode=i_episode-1
        if i_episode % 5 == 0 and i_episode != 0:
            np.savetxt(name, stats, delimiter=',')
        state = env.reset()
        states.append(state)
        rewards = 0
        losses = 0
        for t in itertools.count():
            action = estimator_policy.predict([state])
            next_state, reward, done, _ = env.step(action)
            rewards += reward
            # reward+=1
            # print(reward)
            node_ptr += 1
            G.add_edge(node_ptr - 1, node_ptr)
            # Calculate TD Target
            value_next = estimator_value.predict([next_state])
            td_target = reward + (1 - done) * discount_factor * value_next
            advantage = td_target - estimator_value.predict([state])
            lr = 1e-3 if not change_lr else 1e-4
            estimator_value.update([state], td_target, lr)
            loss = estimator_policy.update([state], advantage, action, lr)
            losses += loss
            state = next_state
            states.append(state)
            if done:
                node_ptr += 1  # avoid making edges between a terminal state and an initial state
                totsteps += t
                print("\rEpisode {}/{} Steps {} Total Steps {} ({}) Loss: {}".format(
                    i_episode, num_episodes, t, totsteps, rewards, losses / t))
                stats.append(totsteps)
                rewards = 0
                if plots:
                    # pos = {i: (states[i][0], states[i][1]) for i in range(len(states))}
                    # this_color = [i_episode+1] * (t+1)
                    # colors += this_color
                    # fig, ax = plt.subplots()
                    # plt.xlim((-1.2, 0.6))
                    # plt.ylim((-0.07, 0.07))
                    # nx.draw(G, pos, with_labels=False, font_size=7, node_size=5, node_color='blue')
                    # plt.savefig("graphs/graph{}.png".format(i_episode+1))
                    # plt.clf(); plt.close()
                    v_preds = estimator_value.predict(vinput).reshape(
                        len(velocities), len(positions))
                    fig, ax = plt.subplots()
                    ax.imshow(v_preds, interpolation='nearest', alpha=1.)
                    # ax.autoscale(False)
                    # nx.draw(G, pos, with_labels=False, font_size=7, node_size=5, node_color=colors)
                    # plt.axis('off')
                    plt.xticks([])
                    plt.yticks([])
                    plt.title("Actor-Critic", fontsize=17)
                    plt.xlabel('Position', fontsize=17)
                    plt.ylabel('Velocity', fontsize=17)
                    # plt.title("Diffusion-Based Approximate Value Function")
                    plt.savefig("vpreds0/vpred{}.png".format(i_episode))
                    plt.clf()
                    plt.close()
                if t < env._max_episode_steps - 1 and gen_graph:
                    gen_graph = 0
                    # change_lr=1
                    aspect = (0.6 + 1.2) / (2 * 0.07)
                    metric = lambda p0, p1: np.sqrt(
                        (p1[0] - p0[0]) * (p1[0] - p0[0]) +
                        (p1[1] - p0[1]) * (p1[1] - p0[1]) * aspect)
                    # dist='euclidean'
                    radius = 0.02
                    real_states = np.array(states)
                    adj = nn.radius_neighbors_graph(real_states, radius, metric=metric)
                    adj = adj + nx.adjacency_matrix(G)
                    # from_scipy_sparse_matrix is the pre-3.0 networkx name;
                    # newer versions call it from_scipy_sparse_array
                    gg = nx.from_scipy_sparse_matrix(adj)
                    source = 0
                    sink = len(real_states) - 1
                    # max_sources = 40
                    # max_sinks = 40
                    # other_sources = range(max_sources)
                    # other_sinks = range(len(real_states)-max_sinks, len(real_states))
                    other_sources = [source]
                    other_sinks = [sink]
                    max_sinks = 1
                    features = featurize_state(real_states)
                    # features = real_states
                    # features = np.eye(len(real_states), dtype=np.float32)
                    features = sparse_to_tuple(sp.lil_matrix(features))
                    labels = np.zeros((len(real_states)))
                    labels[-max_sinks:] = 1
                    labels = encode_onehot(labels)
                    V_weights = get_graph(sess, gcn, placeholders, gg.edges(), gg,
                                          real_states, adj, features, labels,
                                          source, sink, other_sources,
                                          other_sinks, featurize_state)
                    targets = V_weights
                    gcn_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, gcn.name)
                    vf_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value_estimator")
                    for g_v, v_v in zip(gcn_vars, vf_vars):
                        if '_1_' in g_v.name:
                            if 'bias' in g_v.name:
                                sess.run(tf.assign(v_v, tf.expand_dims(g_v[1], 0)))
                            else:
                                sess.run(tf.assign(v_v, tf.expand_dims(g_v[:, 1], 1)))
                        else:
                            sess.run(tf.assign(v_v, g_v))
                    # pos = {i: (real_states[i][0], real_states[i][1]) for i in range(len(real_states))}
                    # fig, ax = plt.subplots()
                    # nx.draw(gg, pos, with_labels=False, font_size=10, node_size=25, node_color=targets)
                    # plt.savefig("updated_graph/last.png")
                    # plt.show(); plt.close()
                    # for epo in range(30):
                    #     estimator_value.update(real_states, targets, 1e-3)
                    #     fig, ax = plt.subplots()
                    #     v_preds = estimator_value.predict(vinput).reshape(len(velocities), len(positions))
                    #     ax.imshow(v_preds, interpolation='nearest', alpha=1.)
                    #     plt.savefig("updated_preds/iter{}.png".format(epo))
                    #     plt.clf(); plt.close()
                    fig, ax = plt.subplots()
                    v_preds = estimator_value.predict(vinput).reshape(
                        len(velocities), len(positions))
                    ax.imshow(v_preds, interpolation='nearest', alpha=1.)
                    # ax.autoscale(False)
                    # nx.draw(G, pos, with_labels=False, font_size=7, node_size=5)
                    # plt.show(); plt.close()
                break
    return stats
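# Illustrative standalone version of the neighbor-graph step above: the code
# passes a Python callable as the metric, rescaling the velocity axis before
# taking a Euclidean distance. Callable metrics force the brute-force neighbor
# search, so this is slow for large state sets. The states below are invented.
import numpy as np
from sklearn.neighbors import radius_neighbors_graph

aspect = (0.6 + 1.2) / (2 * 0.07)  # position range over velocity range

def mountain_car_metric(p0, p1):
    # Anisotropic Euclidean distance: stretch the velocity difference.
    return np.sqrt((p1[0] - p0[0]) ** 2 + ((p1[1] - p0[1]) ** 2) * aspect)

states_demo = np.array([[-0.5, 0.0], [-0.49, 0.001], [0.3, 0.05]])
adj_demo = radius_neighbors_graph(states_demo, 0.02, metric=mountain_car_metric)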
# Assumes the module-level imports of the original file: numpy as np,
# numpy.linalg as LA, networkx as nx, itertools.repeat, a KDTree
# implementation, spherical_to_cartesian, plus the project helpers
# (conic_matrix, get_cliques_by_length, nadir_attitude, camera_matrix,
# crater_camera_homography, conic_center, shift_nd, CoplanarInvariants, const).
def __init__(self, lat, long, major_axis, minor_axis, psi, crater_id=None,
             Rbody=const.RMOON, radius=const.TRIAD_RADIUS,
             vcam_alt=const.DB_CAM_ALTITUDE, sort_ij=True):
    """Crater database abstraction keyed by crater triads that generate
    projective invariants using information about their elliptical shape and
    relative positions [1]. Input is a crater dataset [2] that has positional
    and geometrical (ellipse parameters) information; output is an array of
    7 features per crater triad.

    Parameters
    ----------
    lat : np.ndarray
        Crater latitude [radians]
    long : np.ndarray
        Crater longitude [radians]
    major_axis : np.ndarray
        Crater major axis [km]
    minor_axis : np.ndarray
        Crater minor axis [km]
    psi : np.ndarray
        Crater ellipse tilt angle, major axis w.r.t. East-West direction (0, pi) [radians]
    crater_id : np.ndarray, optional
        Crater identifier, defaults to enumerated array over len(lat)
    Rbody : float, optional
        Body radius, defaults to RMOON [km]
    radius : float, int
        Maximum radius to consider two craters connected, defaults to TRIAD_RADIUS [km]
    vcam_alt : float, int
        Altitude of virtual per-triad camera
    sort_ij : bool
        Whether to sort triad features with I_ij being the lowest absolute value

    References
    ----------
    .. [1] Christian, J. A., Derksen, H., & Watkins, R. (2020). Lunar Crater Identification in Digital Images. https://arxiv.org/abs/2009.01228
    .. [2] Robbins, S. J. (2019). A New Global Database of Lunar Impact Craters >1–2 km: 1. Crater Locations and Sizes, Comparisons With Published Databases, and Global Analysis. Journal of Geophysical Research: Planets, 124(4), 871–892. https://doi.org/10.1029/2018JE005592
    """
    if crater_id is None:
        self.crater_id = np.arange(len(lat))
    else:
        self.crater_id = crater_id
    self._lat = lat
    self._long = long
    self._C_cat = conic_matrix(major_axis, minor_axis, psi)
    x, y, z = map(np.array, spherical_to_cartesian(Rbody, self._lat, self._long))
    self._r_craters = np.array((x, y, z)).T[..., None]

    # Construct adjacency matrix and generate Graph instance
    self._adjacency_matrix = radius_neighbors_graph(np.array([x, y, z]).T,
                                                    radius, mode='distance',
                                                    metric='euclidean',
                                                    n_jobs=-1)
    self._graph = nx.from_scipy_sparse_matrix(self._adjacency_matrix)

    # Get all crater triads using cycle basis with length = 3
    # https://en.wikipedia.org/wiki/Cycle_basis
    # The following returns a nx3 array containing the indices of crater triads
    crater_triads = np.array(get_cliques_by_length(self._graph, 3))

    # Project crater triads into virtual image plane using homography
    r_M_ijk = np.moveaxis(
        np.concatenate((x[crater_triads].T[None, ...],
                        y[crater_triads].T[None, ...],
                        z[crater_triads].T[None, ...]), axis=0),
        0, 2)[..., None]
    r_centroid = np.mean(r_M_ijk, axis=0)
    r_vcam = r_centroid + (r_centroid / LA.norm(r_centroid, axis=1)[..., None]) * vcam_alt
    T_CM = np.concatenate(nadir_attitude(r_vcam), axis=-1)
    if (LA.matrix_rank(T_CM) != 3).any():
        raise Warning("Invalid camera attitude matrices present!:\n", T_CM)
    K = camera_matrix()
    P_MC = K @ LA.inv(T_CM) @ np.concatenate(
        (np.tile(np.identity(3), (len(r_vcam), 1, 1)), -r_vcam), axis=2)
    H_C_triads = np.array(list(map(crater_camera_homography, r_M_ijk, repeat(P_MC))))

    # Ensure all crater triads are clockwise
    C_triads = np.array(list(map(lambda vertex: self._C_cat[vertex], crater_triads.T)))
    A_i, A_j, A_k = map(
        lambda T, C: LA.inv(T).transpose((0, 2, 1)) @ C @ LA.inv(T),
        H_C_triads, C_triads)
    r_i, r_j, r_k = map(conic_center, (A_i, A_j, A_k))
    cw_value = LA.det(
        np.moveaxis(
            np.array([[r_i[:, 0], r_i[:, 1], np.ones_like(r_i[:, 0])],
                      [r_j[:, 0], r_j[:, 1], np.ones_like(r_i[:, 0])],
                      [r_k[:, 0], r_k[:, 1], np.ones_like(r_i[:, 0])]]),
            -1, 0))
    clockwise = cw_value < 0
    line = cw_value == 0
    clockwise = clockwise[~line]
    crater_triads = crater_triads[~line]
    H_C_triads = H_C_triads[:, ~line]
    crater_triads[np.argwhere(~clockwise), [0, 1]] = \
        crater_triads[np.argwhere(~clockwise), [1, 0]]
    H_C_triads[[0, 1], np.argwhere(~clockwise)] = \
        H_C_triads[[1, 0], np.argwhere(~clockwise)]
    C_triads = np.array(list(map(lambda vertex: self._C_cat[vertex], crater_triads.T)))
    A_i, A_j, A_k = map(
        lambda T, C: LA.inv(T).transpose((0, 2, 1)) @ C @ LA.inv(T),
        H_C_triads, C_triads)
    invariants = CoplanarInvariants(crater_triads, A_i, A_j, A_k, normalize_det=True)
    self._features = invariants.get_pattern()
    self._crater_triads = invariants.crater_triads
    if sort_ij:
        ij_idx = np.abs(self._features[:, :3]).argmin(1)
        self._features = np.concatenate(
            (shift_nd(self._features[:, :3], -ij_idx),
             shift_nd(self._features[:, 3:6], -ij_idx),
             self._features[:, [-1]]), axis=-1)
        self._crater_triads = shift_nd(self._crater_triads, -ij_idx)
        too_close = np.logical_or.reduce(
            (np.abs((self._features[:, 0] - self._features[:, 2]) / self._features[:, 0]) < 0.1,
             np.abs((self._features[:, 0] - self._features[:, 1]) / self._features[:, 0]) < 0.1))
        self._features = np.concatenate(
            (self._features,
             np.concatenate(
                 (np.roll(self._features[too_close, :3], 1),
                  np.roll(self._features[too_close, :3], 1),
                  self._features[too_close, -1:]), axis=-1)), axis=0)
        self._crater_triads = np.concatenate(
            (self._crater_triads, np.roll(self._crater_triads[too_close], 1)))
    self._kdtree = KDTree(self._features)
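# Illustrative, stripped-down version of the triad-extraction step above: turn
# the radius graph into a networkx graph and collect 3-cycles (triangles).
# get_cliques_by_length is project-specific, so plain clique enumeration is
# used instead (enumerate_all_cliques can be expensive on dense graphs); newer
# networkx versions spell the converter from_scipy_sparse_array rather than
# from_scipy_sparse_matrix. The points and radius below are invented.
import numpy as np
import networkx as nx
from sklearn.neighbors import radius_neighbors_graph

rng = np.random.RandomState(0)
points = rng.randn(50, 3)  # hypothetical stand-in for crater positions
adjacency = radius_neighbors_graph(points, 1.0, mode='distance')
graph = nx.from_scipy_sparse_array(adjacency)
triads = np.array([c for c in nx.enumerate_all_cliques(graph) if len(c) == 3])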
# Assumes the original module's imports: numpy as np, matplotlib.pyplot as
# plt, matplotlib.patheffects as pa, pymatgen's Element, a jmol_colors lookup
# table, radius_neighbors_graph from sklearn.neighbors, and the project
# helpers make_supercell and group_layers.
def _plot_atoms_general(
    self,
    ax,
    atol,
    max_bond_length,
    atom_size,
    bond_line_width,
    scaling_matrix,
    midpoint,
    scan_size,
    legend,
    top,
    structure,
    atom_axis_bounds,
    atoms_box,
    legend_atom_size,
):
    supercell = make_supercell(
        structure,
        scaling_matrix=np.hstack([scaling_matrix, 1]),
    )
    inds, heights = group_layers(supercell, atol=atol)
    if top:
        surface_inds = inds[-1]
    else:
        surface_inds = inds[0]
    surface_atom_coords = supercell.cart_coords[surface_inds]
    surface_atom_symbols = np.array(supercell.species, dtype='str')[surface_inds]
    surface_atom_species = np.zeros(surface_atom_symbols.shape, dtype=int)
    surface_atom_sizes = np.zeros(surface_atom_symbols.shape, dtype=float)
    unique_species = np.unique(surface_atom_symbols)
    unique_elements = [Element(i) for i in unique_species]
    unique_zs = [Element(i).Z for i in unique_species]
    for i, z in enumerate(unique_elements):
        surface_atom_species[np.isin(surface_atom_symbols, unique_species[i])] = z.Z
        surface_atom_sizes[np.isin(surface_atom_symbols, unique_species[i])] = z.atomic_radius
    surface_atom_sizes /= surface_atom_sizes.max()
    colors = jmol_colors[surface_atom_species]
    shifted_point = midpoint - (scan_size / 2)
    surface_atom_coords[:, 0] -= shifted_point[0]
    surface_atom_coords[:, 1] -= shifted_point[1]
    # Atoms closer than max_bond_length are considered bonded
    neighbor_graph = radius_neighbors_graph(
        X=surface_atom_coords,
        radius=max_bond_length,
    ).toarray()
    bonds = []
    for i in range(neighbor_graph.shape[0]):
        for j in range(neighbor_graph.shape[0]):
            if neighbor_graph[i, j] > 0:
                to_append = [
                    surface_atom_coords[i],
                    surface_atom_coords[j],
                    [np.nan, np.nan, np.nan],  # NaN row breaks the line between bonds
                ]
                bonds.append(to_append)
    bonds = np.vstack(bonds)
    ax_atoms = ax.inset_axes(bounds=atom_axis_bounds)
    ax_atoms.set_xlim(atom_axis_bounds[0] * scan_size,
                      (atom_axis_bounds[0] + atom_axis_bounds[2]) * scan_size)
    ax_atoms.set_ylim(atom_axis_bounds[1] * scan_size,
                      (atom_axis_bounds[1] + atom_axis_bounds[3]) * scan_size)
    ax_atoms.set_facecolor((0, 0, 0, 0))
    ax_atoms.tick_params(
        bottom=False,
        left=False,
        labelbottom=False,
        labelleft=False,
    )
    if not atoms_box:
        ax_atoms.spines['left'].set_visible(False)
        ax_atoms.spines['right'].set_visible(False)
        ax_atoms.spines['top'].set_visible(False)
        ax_atoms.spines['bottom'].set_visible(False)
    ax_atoms.plot(
        bonds[:, 0],
        bonds[:, 1],
        color='lightgrey',
        linewidth=bond_line_width,
        zorder=5,
        path_effects=[
            pa.Stroke(linewidth=bond_line_width + 2, foreground='black'),
            pa.Normal()
        ],
    )
    ax_atoms.scatter(
        surface_atom_coords[:, 0],
        surface_atom_coords[:, 1],
        c=colors,
        ec='black',
        s=atom_size * surface_atom_sizes,
        zorder=10,
    )
    if legend:
        legend_lines = []
        legend_labels = []
        for name, color, element in zip(unique_species, jmol_colors[unique_zs],
                                        unique_elements):
            legend_lines.append(
                plt.scatter(
                    [-1],
                    [-1],
                    color=color,
                    s=legend_atom_size * element.atomic_radius,
                    ec='black',
                ))
            legend_labels.append(f'{name}')
        leg = ax.get_legend()
        if leg is None:
            handles = legend_lines
            labels = legend_labels
        else:
            handles = [l._legmarker for l in leg.legendHandles]
            labels = [text._text for text in leg.texts]
            handles.extend(legend_lines)
            labels.extend(legend_labels)
        l = ax.legend(
            handles,
            labels,
            ncol=1,
            loc='upper right',
            framealpha=1,
        )
        l.set_zorder(200)
        frame = l.get_frame()
        frame.set_facecolor('white')
        frame.set_edgecolor('black')
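# Illustrative alternative to the bond-finding double loop above: the same
# (i, j) pairs can be read directly off the sparse neighbor graph instead of
# scanning the dense matrix. Coordinates and cutoff are invented.
import numpy as np
from sklearn.neighbors import radius_neighbors_graph

coords = np.array([[0.0, 0.0, 0.0], [1.5, 0.0, 0.0], [10.0, 0.0, 0.0]])
graph = radius_neighbors_graph(X=coords, radius=2.0)  # max_bond_length stand-in
i_idx, j_idx = graph.nonzero()
bond_pairs = np.column_stack((i_idx, j_idx))
bond_pairs = bond_pairs[bond_pairs[:, 0] < bond_pairs[:, 1]]  # keep i < j only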