Example 1
def test_radius_neighbors_graph():
    """Test radius_neighbors_graph to build the Nearest Neighbor graph."""
    X = np.array([[0, 1], [1.01, 1.0], [2, 0]])

    A = neighbors.radius_neighbors_graph(X, 1.5, mode="connectivity")
    assert_array_equal(A.todense(), [[1.0, 1.0, 0.0], [1.0, 1.0, 1.0], [0.0, 1.0, 1.0]])

    A = neighbors.radius_neighbors_graph(X, 1.5, mode="distance")
    assert_array_almost_equal(A.todense(), [[0.0, 1.01, 0.0], [1.01, 0.0, 1.40716026], [0.0, 1.40716026, 0.0]])
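The off-diagonal entries of the distance-mode matrix are plain Euclidean distances between rows of X; a minimal standalone check of the 1.40716026 value, assuming only NumPy:

import numpy as np

# Distance between [1.01, 1.0] and [2, 0], the only pair besides the
# first two points that falls within the 1.5 radius.
print(np.hypot(2 - 1.01, 0 - 1.0))  # 1.4071602680994203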
Example 2
def test_include_self_neighbors_graph():
    """Test include_self parameter in neighbors_graph"""
    X = [[2, 3], [4, 5]]
    kng = neighbors.kneighbors_graph(X, 1, include_self=True).A
    kng_not_self = neighbors.kneighbors_graph(X, 1, include_self=False).A
    assert_array_equal(kng, [[1.0, 0.0], [0.0, 1.0]])
    assert_array_equal(kng_not_self, [[0.0, 1.0], [1.0, 0.0]])

    rng = neighbors.radius_neighbors_graph(X, 5.0, include_self=True).A
    rng_not_self = neighbors.radius_neighbors_graph(X, 5.0, include_self=False).A
    assert_array_equal(rng, [[1.0, 1.0], [1.0, 1.0]])
    assert_array_equal(rng_not_self, [[0.0, 1.0], [1.0, 0.0]])
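A caveat worth noting when combining include_self=True with mode="distance": a self-edge has distance 0, which a sparse matrix cannot distinguish from an absent edge. A small sketch using only scikit-learn's public API:

import numpy as np
from sklearn.neighbors import radius_neighbors_graph

X = [[2, 3], [4, 5]]
# In connectivity mode the diagonal carries explicit 1.0 self-loops...
print(radius_neighbors_graph(X, 5.0, mode="connectivity", include_self=True).toarray())
# ...but in distance mode the self-distance is 0, so the dense view of the
# diagonal looks the same whether or not the self-edge is included.
print(radius_neighbors_graph(X, 5.0, mode="distance", include_self=True).toarray())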
Example 3
def test_radius_neighbors_graph_sparse(seed=36):
    """Test radius_neighbors_graph to build the Nearest Neighbor graph
    for sparse input."""
    rng = np.random.RandomState(seed)
    X = rng.randn(10, 10)
    Xcsr = csr_matrix(X)

    for n_neighbors in [1, 2, 3]:  # note: this value is passed below as the radius
        for mode in ["connectivity", "distance"]:
            assert_array_almost_equal(
                neighbors.radius_neighbors_graph(X, n_neighbors, mode=mode).toarray(),
                neighbors.radius_neighbors_graph(Xcsr, n_neighbors, mode=mode).toarray(),
            )
Example 4
    def _get_affinity_matrix(self, X, Y=None):
        """Calculate the affinity matrix from data.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training vector, where n_samples is the number of samples
            and n_features is the number of features.

            If affinity is "precomputed":
            X : array-like, shape (n_samples, n_samples).
            Interpret X as a precomputed adjacency graph computed from
            samples.

        Returns
        -------
        affinity_matrix, shape (n_samples, n_samples)
        """
        if self.affinity == 'precomputed':
            self.affinity_matrix_ = X
            print(type(self.affinity_matrix_))
            return self.affinity_matrix_
            
        # nearest_neigh kept for backward compatibility 
        if self.affinity == 'nearest_neighbors':
            if sparse.issparse(X):
                warnings.warn("Nearest neighbors affinity currently does "
                              "not support sparse input, falling back to "
                              "rbf affinity")
                self.affinity = "rbf"
            else:
                self.n_neighbors_ = (self.n_neighbors
                                     if self.n_neighbors is not None
                                     else max(int(X.shape[0] / 10), 1))
                self.affinity_matrix_ = kneighbors_graph(X, self.n_neighbors_)
                # currently only symmetric affinity_matrix supported
                self.affinity_matrix_ = 0.5 * (self.affinity_matrix_ +
                                               self.affinity_matrix_.T)
                return self.affinity_matrix_
        if self.affinity == 'radius_neighbors':
            if self.neighbors_radius is None:
                self.neighbors_radius_ = np.sqrt(X.shape[1])
                # TODO: consider another default value, like diam(X)/sqrt(dimensions)/10
            else:
                self.neighbors_radius_ = self.neighbors_radius
                
            self.gamma_ = (self.gamma
                           if self.gamma is not None else 1.0 / X.shape[1])
            self.affinity_matrix_ = radius_neighbors_graph(X, self.neighbors_radius_, mode='distance')
            
            self.affinity_matrix_.data **= 2
            self.affinity_matrix_.data /= -self.neighbors_radius_ ** 2
            self.affinity_matrix_.data = np.exp(self.affinity_matrix_.data,
                                                out=self.affinity_matrix_.data)
            return self.affinity_matrix_
        if self.affinity == 'rbf':
            self.gamma_ = (self.gamma
                           if self.gamma is not None else 1.0 / X.shape[1])
            self.affinity_matrix_ = rbf_kernel(X, gamma=self.gamma_)
            return self.affinity_matrix_
        self.affinity_matrix_ = self.affinity(X)
        return self.affinity_matrix_
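The 'radius_neighbors' branch above turns a distance-mode sparse graph into Gaussian affinities in place, mapping each stored distance d to exp(-d**2 / r**2). A minimal standalone sketch of the same transform, on hypothetical toy data, assuming only NumPy and scikit-learn:

import numpy as np
from sklearn.neighbors import radius_neighbors_graph

X = np.random.RandomState(0).randn(20, 3)
r = 1.5
G = radius_neighbors_graph(X, r, mode='distance')
# Operate on the sparse .data array only, so structural zeros stay zeros.
G.data = np.exp(-(G.data ** 2) / r ** 2)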
Example 5
def test_non_euclidean_kneighbors():
    rng = np.random.RandomState(0)
    X = rng.rand(5, 5)

    # Find a reasonable radius.
    dist_array = pairwise_distances(X).flatten()
    dist_array.sort()  # np.sort returns a copy; sort in place before indexing
    radius = dist_array[15]

    # Test kneighbors_graph
    for metric in ['manhattan', 'chebyshev']:
        nbrs_graph = neighbors.kneighbors_graph(
            X, 3, metric=metric).toarray()
        nbrs1 = neighbors.NearestNeighbors(3, metric=metric).fit(X)
        assert_array_equal(nbrs_graph, nbrs1.kneighbors_graph(X).toarray())

    # Test radius_neighbors_graph
    for metric in ['manhattan', 'chebyshev']:
        nbrs_graph = neighbors.radius_neighbors_graph(
            X, radius, metric=metric).toarray()
        nbrs1 = neighbors.NearestNeighbors(metric=metric, radius=radius).fit(X)
        assert_array_equal(nbrs_graph,
                           nbrs1.radius_neighbors_graph(X).toarray())

    # Raise an error when wrong parameters are supplied.
    X_nbrs = neighbors.NearestNeighbors(3, metric='manhattan')
    X_nbrs.fit(X)
    assert_raises(ValueError, neighbors.kneighbors_graph, X_nbrs, 3,
                  metric='euclidean')
    X_nbrs = neighbors.NearestNeighbors(radius=radius, metric='manhattan')
    X_nbrs.fit(X)
    assert_raises(ValueError, neighbors.radius_neighbors_graph, X_nbrs,
                  radius, metric='euclidean')
Example 6
    def test_radius_neighbors_graph(self):
        x = [[0], [3], [1]]
        df = pdml.ModelFrame(x)

        result = df.neighbors.radius_neighbors_graph(1.5)
        expected = neighbors.radius_neighbors_graph(x, 1.5)

        self.assert_numpy_array_almost_equal(result.toarray(), expected.toarray())
Example 7
def example2():
    """Plot the radius nearest-neighbor graph.
    Points at distance <= radius are treated as neighbors.
    """
    train = np.array([[1,2,4,7,9,10]]).transpose()
    graph = radius_neighbors_graph(train, 2.5) # radius = 2.5
    print(graph)
    print(graph.toarray())
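For this 1-D input the result is easy to verify by hand: with the default include_self=False, the only pairs within distance 2.5 are (1,2), (2,4), (7,9) and (9,10). The expected connectivity matrix, as a quick check:

import numpy as np
from sklearn.neighbors import radius_neighbors_graph

train = np.array([[1, 2, 4, 7, 9, 10]]).transpose()
expected = np.array([[0, 1, 0, 0, 0, 0],
                     [1, 0, 1, 0, 0, 0],
                     [0, 1, 0, 0, 0, 0],
                     [0, 0, 0, 0, 1, 0],
                     [0, 0, 0, 1, 0, 1],
                     [0, 0, 0, 0, 1, 0]], dtype=float)
assert np.array_equal(radius_neighbors_graph(train, 2.5).toarray(), expected)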
Example 9
def distance_matrix( X, flindex = None, mode='radius_neighbors', 
                     neighbors_radius=None, symmetrize = True, n_neighbors=0 ):
    # Nearest-neighbors mode has issues. To be fixed.
    if mode == 'nearest_neighbors':
        warnings.warn("Nearest neighbors currently does not work; "
                      "falling back to radius neighbors")
        mode = 'radius_neighbors'

    if mode == 'radius_neighbors':
        neighbors_radius_ = (neighbors_radius
                             if neighbors_radius is not None
                             else 1.0 / X.shape[1])
        # TODO: consider another default value, like diam(X)/sqrt(dimensions)/10
        if flindex is not None:
            distance_matrix = fl_radius_neighbors_graph(X, neighbors_radius_, flindex, mode='distance')
        else:
            distance_matrix = radius_neighbors_graph(X, neighbors_radius_, mode='distance')
        return distance_matrix
Example 10
    def _build_graph(self):
        """Compute the graph Laplacian."""

        # Graph sparsification
        if self.sparsify == "epsilonNN":
            self.A_ = radius_neighbors_graph(self.X_, self.radius, include_self=False)
        else:
            Q = kneighbors_graph(self.X_, self.n_neighbors,
                                 include_self=False).astype(bool)  # np.bool was removed from NumPy

            if self.sparsify == "kNN":
                self.A_ = (Q + Q.T).astype(np.float64)
            elif self.sparsify == "MkNN":
                self.A_ = (Q.multiply(Q.T)).astype(np.float64)

        # Edge re-weighting
        if self.reweight == "rbf":
            W = rbf_kernel(self.X_, gamma=self.t)
            self.A_ = self.A_.multiply(W)

        return sp.csgraph.laplacian(self.A_, normed=self.normed)
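The three sparsifiers differ only in how the directed kNN relation Q is symmetrized: "kNN" takes the union Q + Q.T, while "MkNN" (mutual kNN) takes the intersection Q.multiply(Q.T). A small illustration of the difference on a toy outlier, assuming only scikit-learn and NumPy:

import numpy as np
from sklearn.neighbors import kneighbors_graph

X = np.array([[0.0], [0.1], [5.0]])          # the third point is an outlier
Q = kneighbors_graph(X, 1, include_self=False).astype(bool)
union = (Q + Q.T).astype(np.float64)         # "kNN": edge if either side agrees
mutual = Q.multiply(Q.T).astype(np.float64)  # "MkNN": edge only if both agree
print(union.toarray())   # the outlier keeps its one-way edge to its neighbor
print(mutual.toarray())  # the outlier ends up isolated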
Example 11
def make_graph_radius(x, radius, metric='euclidean', normalize_dists=True):
    use_sklearn = False
    if use_sklearn:
        dists = radius_neighbors_graph(x,
                                       radius,
                                       mode='connectivity',
                                       metric=metric)
    else:
        assert metric == 'euclidean'
        p = x.shape[1]
        #max_dist = norm(np.ones(p) - np.zeros(p))
        x = normalize(x)
        #dists = pairwise.pairwise_distances(x,x,metric) / max_dist
        dists = pairwise.pairwise_distances(x, x, metric)
        if normalize_dists:
            dists /= dists.max()
        dists[np.diag_indices_from(dists)] = 0
        dists[dists > radius] = 0
        dists[dists != 0] = 1
    return dists
Example 12
def build_graph(X, graph_params=GraphParams(), metric='euclidean'):
    """Builds a graph (knn or epsilon) weight matrix W
    W is sparse - to be optimized somehow
    """
    graph_type = graph_params.type
    sigma2 = graph_params.sigma2
    graph_thresh = graph_params.thresh
    n = len(X)
    W = np.zeros((n, n))
    if graph_type == 'knn':  # compare strings with ==, not 'is'
        D = kneighbors_graph(X, graph_thresh, metric=metric,
                             mode='distance').toarray()
    elif graph_type == 'eps':
        graph_thresh = -sigma2 * np.log(graph_thresh)
        D = radius_neighbors_graph(X,
                                   graph_thresh,
                                   metric=metric,
                                   mode='distance').toarray()
    W[D > 0] = np.exp(-D[D > 0] / sigma2)
    return W
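The 'eps' branch rewrites the weight threshold as a distance cutoff: w = exp(-d/sigma2) is decreasing in d, so w >= thresh is equivalent to d <= -sigma2 * ln(thresh), which is the radius handed to radius_neighbors_graph. A quick numeric check of that identity, assuming only NumPy:

import numpy as np

sigma2, thresh = 0.5, 0.01
d_cut = -sigma2 * np.log(thresh)      # the radius used by the 'eps' branch
d = np.linspace(0, 2 * d_cut, 1000)
w = np.exp(-d / sigma2)
# Weights at or above the threshold correspond exactly to d <= d_cut.
assert np.array_equal(w >= thresh, d <= d_cut)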
Example 13
    def neighbors_plot(self):
        import gc
        from numpy import histogram
        import numpy as np
        from sklearn.neighbors import radius_neighbors_graph

        start_pos, end_pos, paths = FileSplitter.points()
        del start_pos, end_pos
        gc.collect()
        neighbors = radius_neighbors_graph(paths, radius=0.005)
        del paths
        gc.collect()
        neighbors = neighbors.toarray()
        x = np.matrix(neighbors)
        x = x.sum(axis=1)
        counts = [d[0, 0] for d in x]
        hist, edges = histogram(counts, bins=10, density=False)
        self.plot_on_bokeh_hist('neighbors_hist.html', '# of Neighbors',
                                '# of Occurrences', 'Neighbors Within Radius',
                                hist, edges)
Example 14
 def fit_predict(self, X):
     if type(self.radius_) is int:
         if input == "point cloud":  # NB: "input" shadows the builtin; assumed to be a module-level mode flag
             adj = kneighbors_graph(X,
                                    n_neighbors=self.radius_,
                                    metric=self.metric_)
         if input == "distance matrix":
             adj = np.zeros(X.shape)
             idxs = np.argpartition(X, self.radius_,
                                    axis=1)[:, :self.radius_]
             for i in range(len(X)):
                 adj[i, idxs[i, :]] = np.ones(len(idxs[i]))
     else:
         if input == "point cloud":
             adj = radius_neighbors_graph(X,
                                          radius=self.radius_,
                                          metric=self.metric_)
         if input == "distance matrix":
             adj = np.where(X <= self.radius_, np.ones(X.shape),
                            np.zeros(X.shape))
     _, clusters = csgraph.connected_components(adj)
     return clusters
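Stripped of the class plumbing, the clustering recipe here is generic: build a radius graph, then label its connected components. A self-contained sketch on hypothetical toy data (the self.radius_ and input flags above are specific to the source project), assuming scikit-learn and SciPy:

import numpy as np
from scipy.sparse import csgraph
from sklearn.neighbors import radius_neighbors_graph

rng = np.random.RandomState(0)
# Two well-separated blobs; points within a blob are far less than 2 apart.
X = np.vstack([rng.randn(20, 2) * 0.1, rng.randn(20, 2) * 0.1 + 10])
adj = radius_neighbors_graph(X, radius=2.0)
n_components, labels = csgraph.connected_components(adj)
print(n_components)  # expected: 2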
Example 16
def generate_edges(X, mode='kneighbors_graph', n_neighbors=3, radius=0.1):
    """
    returns array with pairs of indices [vertex_from, vertex_to] and weight vector
    """
    n_neighbors = min(n_neighbors, len(X) - 1)
    if n_neighbors == 0:
        return X[:, 3].reshape(-1, 1), np.zeros((1, 5)), np.zeros((2, 1))
    if mode == 'kneighbors_graph':
        adjacency_matrix = np.array((kneighbors_graph(X=X[:, :3], 
                                                      n_neighbors=n_neighbors, mode='distance')).todense())
    elif mode == 'radius_neighbors_graph':
        adjacency_matrix = np.array((radius_neighbors_graph(X=X[:, :3], 
                                                            radius=radius, mode='distance')).todense())
    else:
        raise ValueError('Unknown mode {}'.format(mode))  # raising a str is invalid in Python 3
    rows, cols = np.where(adjacency_matrix > 0)
    edges = np.vstack([rows, cols])
    weights = adjacency_matrix[rows, cols]
    
    nodes_features = X[:, 3].reshape(-1, 1)    
    edges_features = X[edges.T[:, 0]] - X[edges.T[:, 1]]
                
    return nodes_features, np.c_[edges_features, weights], edges.astype(int)
Example 17
def main():
    data, _ = make_swiss_roll(random_state=1)

    n_knn = 3
    kng = kneighbors_graph(data, n_neighbors=n_knn)
    title = 'KNN Graph where n_neighbors={}'.format(n_knn)
    plot_graph(data, kng, title=title)

    n_knn = 4
    kng = kneighbors_graph(data, n_neighbors=n_knn)
    title = 'KNN Graph where n_neighbors={}'.format(n_knn)
    plot_graph(data, kng, title=title)

    radius = 6.5
    rng = radius_neighbors_graph(data, radius=radius)
    title = 'RN Graph where radius={}'.format(radius)
    plot_graph(data, rng, title)

    n_neighbors = 5
    delta = 0.95
    ckng = cknneighbors_graph(data, n_neighbors=n_neighbors, delta=delta)
    title = 'CKNN Graph where n_neighbors={}, delta={}'\
        .format(n_neighbors, delta)
    plot_graph(data, ckng, title)
Example 19
 def geodesic_radius(self, points=None, use_cache=True):
     if use_cache and self.geodesic_d is not None:
         return self.geodesic_d
     if points is None:
         points = self.points
     dist = self.euclidean_distances()
     nbrs_inc = np.argsort(dist, axis=1)
     max_dist = -1
     for i in range(dist.shape[0]):
         achieved_neighbors = 0
         while achieved_neighbors < min(self.n_neighbors, dist.shape[0]):
             j = achieved_neighbors
             if max_dist < dist[i][nbrs_inc[i][j]]:
                 max_dist = dist[i][nbrs_inc[i][j]]
             achieved_neighbors += 1
     nbrs = (NearestNeighbors(algorithm='auto',
                              n_neighbors=self.n_neighbors,
                              radius=max_dist,
                              n_jobs=self.n_jobs)
             .fit(points))
     kng = radius_neighbors_graph(
        nbrs, max_dist, mode='distance', n_jobs=self.n_jobs)
     self.geodesic_d = graph_shortest_path(kng, method='D', directed=False)
     return self.geodesic_d
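The geodesic part of this method can be sketched without the class state: build a distance-mode radius graph, then run Dijkstra over it. A minimal version using SciPy's shortest-path routine (graph_shortest_path from older scikit-learn behaves the same way), on hypothetical toy data:

import numpy as np
from scipy.sparse.csgraph import shortest_path
from sklearn.neighbors import radius_neighbors_graph

X = np.random.RandomState(0).rand(30, 2)
kng = radius_neighbors_graph(X, 0.4, mode='distance')
# Dijkstra over the sparse neighborhood graph approximates geodesic
# (along-the-manifold) distances between all pairs of points.
geo = shortest_path(kng, method='D', directed=False)
print(geo.shape)  # (30, 30); np.inf marks pairs in different components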
Example 20
    def computeGraph(self, R=None, similarity='He', g=1, th_gauss=0.1):
        """
        Computes a sparse graph for the self graph structure.
        The self graph must contain a T-matrix, self.T

        Inputs:
            :self.T:   Data matrix
            :R: Radius. Edges link all data pairs at distance lower than R.
                This is to force a sparse graph.
            :similarity: Similarity measure used to compute affinity matrix
                Available options are:
                    'l1'     :1 minus l1 distance
                    'He'     :1 minus squared Hellinger's distance (JS)
                              (sklearn-based implementation)
                    'Gauss'  :An exponential function of the squared l2
                              distance
            :g: Exponent for the affinity mapping (not used for 'Gauss')
            :th_gauss: Similarity threshold. All similarity values below this
                threshold are set to zero. This applies only to the 'Gauss'
                method; the others compute the threshold automatically from R.

        Returns:
            :self.edgeT_id:  List of edges, as pairs (i, j) of indices
            :self.affinityT: List of affinity values for each pair in edgeT_id
            :self.df_edges:  Pandas dataframe with one row per edge and columns
                'Source', 'Target' and 'Weight'. The weight is equal to the
                (mapped) affinity value
        """

        logging.info(f"-- Computing graph with {self.n_nodes} nodes")
        logging.info(f"-- Similarity measure: {similarity}")

        # #########################
        # Computing Distance Matrix

        # This is just to abbreviate
        Tg = self.Tg

        # Select Distance measure for radius_neighbor_graph
        if similarity in ['Gauss', 'He']:
            d = 'l2'     # Note: l2 seems equivalent to minkowski (p=2)
        elif similarity in ['l1']:
            d = 'l1'     # Note: l1 seems equivalent to manhattan
        else:
            logging.error("computeTsubGraph ERROR: Unknown similarity measure")
            exit()

        # Select secondary radius
        R0 = R

        # Compute the connectivity graph of all pairs of nodes at distance
        # below R0.
        # IMPORTANT: Note that, although radius_neighbors_graph has a
        # 'distance' mode that returns the distance values, it cannot be used
        # here because the distance matrix does not distinguish between
        # nodes at distance > R0 and nodes at distance = 0.
        t0 = time()
        logging.info(f'-- -- Computing neighbors_graph ...')
        if similarity in ['He']:
            # We must compute the connectivity graph because
            # radius_neighbors_graph loses edges between nodes at zero
            # distance
            D = radius_neighbors_graph(np.sqrt(Tg), radius=R0,
                                       mode='connectivity', metric=d)
        elif similarity in ['l1', 'Gauss']:
            D = radius_neighbors_graph(Tg, radius=R0, mode='connectivity',
                                       metric=d)

        logging.info(f'       in {time()-t0} seconds')

        # ##############################################
        # From distance matrix to list of weighted edges

        # Compute lists with origin, destination and value for all edges in
        # the graph affinity matrix.
        orig_id, dest_id = D.nonzero()

        # Since the graph is undirected, we select ordered pairs orig_id,
        # dest_id only
        self.edgeT_id = list(filter(lambda i: i[0] < i[1],
                             zip(orig_id, dest_id)))

        # ####################
        # Computing Affinities

        logging.info(f"-- -- Computing affinities for {len(self.edgeT_id)}" +
                     " edges ...",)
        t0 = time()

        if similarity == 'He':
            # A new self.edgeT_id is returned because the function filters out
            # affinity values below th.
            self.edgeT_id, self.affinityT = self.he_affinity(Tg, R, g)

        elif similarity == 'l1':
            self.edgeT_id, self.affinityT = self.l1_affinity(Tg, R, g)

        elif similarity == 'Gauss':
            self.edgeT_id, self.affinityT = self.l2_affinity(Tg, R, th_gauss)
        else:
            logging.error("computeTsubGraph ERROR: Unknown similarity measure")

        logging.info(f"      reduced to {len(self.edgeT_id)} edges")
        logging.info(f'      Computed in {time()-t0} seconds')

        logging.info(("-- -- Graph generated with {0} nodes and {1} " +
                      "edges").format(self.n_nodes, len(self.edgeT_id)))

        return
Example 21
def seeds_merge(
    varr: xr.DataArray,
    max_proj: xr.DataArray,
    seeds: pd.DataFrame,
    thres_dist=5,
    thres_corr=0.6,
    noise_freq: Optional[float] = None,
) -> pd.DataFrame:
    """
    Merge seeds based on spatial distance and temporal correlation of their
    activities.

    This function builds an adjacency matrix by thresholding the spatial
    distance between seeds and the temporal correlation between their
    activities. It then merges seeds using the adjacency matrix, keeping only
    the seed with maximum intensity in the max projection within each
    connected group of seeds. The merge is therefore transitive.

    Parameters
    ----------
    varr : xr.DataArray
        Input movie data. Should have dimension "height", "width" and "frame".
    max_proj : xr.DataArray
        Max projection of the movie data.
    seeds : pd.DataFrame
        Dataframe of seeds to be merged.
    thres_dist : int, optional
        Threshold of distance between seeds in pixel. By default `5`.
    thres_corr : float, optional
        Threshold of temporal correlation between activities of seeds. By
        default `0.6`.
    noise_freq : float, optional
        Cut-off frequency for optional smoothing of activities before computing
        the correlation. If `None` then no smoothing will be done. By default
        `None`.

    Returns
    -------
    seeds : pd.DataFrame
        The resulting seeds dataframe with an additional column "mask_mrg",
        indicating whether the seed should be kept after the merge. If the
        column already exists in input `seeds` it will be overwritten.
    """
    print("computing distance")
    nng = radius_neighbors_graph(seeds[["height", "width"]], thres_dist)
    print("computing correlations")
    adj = adj_corr(varr, nng, seeds[["height", "width"]], noise_freq)
    print("merging seeds")
    adj = adj > thres_corr
    adj = adj + adj.T
    labels = label_connected(adj, only_connected=True)
    iso = np.where(labels < 0)[0]
    seeds_final = set(iso.tolist())
    for cur_cmp in np.unique(labels):
        if cur_cmp < 0:
            continue
        cur_smp = np.where(labels == cur_cmp)[0]
        cur_max = np.array([
            max_proj.sel(height=seeds.iloc[s]["height"],
                         width=seeds.iloc[s]["width"]) for s in cur_smp
        ])
        max_seed = cur_smp[np.argmax(cur_max)]
        seeds_final.add(max_seed)
    seeds["mask_mrg"] = False
    seeds.loc[list(seeds_final), "mask_mrg"] = True
    return seeds
Example 22
def main(
    fastq_r1,
    fastq_r2,
    barcodes,
    locations,
    tag_sequence,
    output_pdf,
    extra_pdf=None,
    debug=False,
    percentile=95.0,
):
    """
    This script generates some plots for mapping barcoded reads.

    Reads sequences from FASTQ_R1 and FASTQ_R2. Assumes that the first read
    contains a 15bp barcode split across two locations, along with an 8bp UMI.
    The second read is assumed to have TAG_SEQUENCE in bases 20-40.
    """
    create_logger(debug, dryrun=False)

    output_pdf = Path(output_pdf)

    log.debug(f"Reading from {fastq_r1}")
    with gzip.open(fastq_r1, "rt") as fh:
        r1_reads = [line.strip() for line in itertools.islice(fh, 1, None, 4)]

    log.debug(f"Reading from {fastq_r2}")
    with gzip.open(fastq_r2, "rt") as fh:
        r2_reads = [line.strip() for line in itertools.islice(fh, 1, None, 4)]

    log.debug(f"Reading {barcodes}")
    with open(barcodes) as fh:
        raw_bcs = ["".join(line.strip().split(",")) for line in fh]

    log.debug(f"Reading {locations}")
    with open(locations) as fh:
        x = np.array([float(v) for v in fh.readline().strip().split(",")])
        y = np.array([float(v) for v in fh.readline().strip().split(",")])
        xy = np.vstack((x, y)).T

    if extra_pdf is not None:
        extra_pdf_pages = PdfPages(extra_pdf)

        umi_counts = Counter(r[32:41] for r in r1_reads)
        log.debug(
            f"Found {len(umi_counts)} UMIs with {sum(umi_counts.values())} total counts"
        )

        plot_log_hist(umi_counts.values(), "Reads per UMI", extra_pdf_pages)
    else:
        extra_pdf_pages = None

    # pre-emptively remove poly-T/N sequences
    ok_barcodes = [not set(bc).issubset({"T", "N"}) for bc in raw_bcs]
    xy = xy[ok_barcodes, :]
    bead_barcodes = [bc for ok, bc in zip(ok_barcodes, raw_bcs) if ok]

    log.info(f"Read {len(raw_bcs)} barcodes and filtered to {len(bead_barcodes)}")

    seq_barcodes = sorted(r1[:8] + r1[26:32] for r1 in r1_reads)
    # remove poly-T sequence if present
    seq_barcodes = [seq for seq in seq_barcodes if set(seq) != {"T"}]

    log.info(f"Found {len(set(seq_barcodes))} unique barcodes in sequencing data")

    log.info("Computing barcode matching")

    log.debug("Computing radius neighbor graph")
    # adjacency matrix for all beads within radius of each other
    radius_matrix = radius_neighbors_graph(xy, radius=10.0)

    log.debug("Computing hamming neighbor graph")
    # adjacency matrix for all barcodes within hamming distance 1
    hamming_matrix = hamming1_adjacency(bead_barcodes)

    # just multiply together to get the combined adjacency matrix!
    combined_graph = nx.from_scipy_sparse_matrix(radius_matrix.multiply(hamming_matrix))

    # add xy coordinates to graph so we can analyze later
    for n, (x, y) in zip(combined_graph.nodes, xy):
        combined_graph.nodes[n]["x"] = x
        combined_graph.nodes[n]["y"] = y

    # get connected components to find groups of similar/close barcodes
    bead_groups = list(nx.connected_components(combined_graph))

    # calculate degenerate (ambiguous bases -> N) barcodes
    degen_bead_barcodes = [
        degen_barcode({bead_barcodes[j] for j in bg}) for bg in bead_groups
    ]

    log.debug(
        f"Collapsed {len(bead_groups)} bead groups into"
        f" {len(set(degen_bead_barcodes))} barcodes"
    )

    # average xy for grouped beads to get centroids
    bead_xy = dict()
    for bg, degen_bc in zip(bead_groups, degen_bead_barcodes):
        bg_graph = combined_graph.subgraph(bg)
        mean_x, mean_y = np.array(
            [[nd["x"], nd["y"]] for _, nd in bg_graph.nodes(data=True)]
        ).mean(0)
        bead_xy[degen_bc] = (mean_x, mean_y)

    barcode_matching = bipartite_matching(
        bead_barcodes, degen_bead_barcodes, bead_groups, seq_barcodes
    )

    if extra_pdf is not None:
        tag_barcodes = [r2[20:40] for r2 in r2_reads]
        tag_counts = Counter(tag_barcodes)

        sum(1 for r1 in r1_reads if (r1[:8] + r1[26:32]) in barcode_matching)

        umis_per_tag = defaultdict(set)
        for r1, r2 in zip(r1_reads, r2_reads):
            umis_per_tag[r2[20:40]].add(r1[32:41])

        plot_log_hist(tag_counts.values(), "Reads per tag", extra_pdf_pages)
        plot_log_hist(
            list(map(len, umis_per_tag.values())), "UMIs per tag", extra_pdf_pages
        )

    log.debug(f"Counting UMIs and reads per bead for sequence {tag_sequence}")
    reads_per_umi_per_bead = defaultdict(Counter)
    umis_per_bead = defaultdict(set)
    reads_per_bead = Counter()

    for r1, r2 in zip(r1_reads, r2_reads):
        seq_bc = r1[:8] + r1[26:32]

        if seq_bc not in barcode_matching:
            continue
        if r2[20:40] != tag_sequence:
            continue

        bead_bc = barcode_matching[seq_bc]
        umi = r1[32:41]

        reads_per_umi_per_bead[bead_bc][umi] += 1
        umis_per_bead[bead_bc].add(umi)
        reads_per_bead[bead_bc] += 1

    filtered_barcodes = [bc for bc in degen_bead_barcodes if umis_per_bead[bc]]
    bead_xy_a = np.vstack([bead_xy[dbc] for dbc in filtered_barcodes])

    with gzip.open(output_pdf.with_suffix(".reads_per_umi.txt.gz"), "wt") as out:
        print("bead_barcodes\tumi\treads", file=out)
        for bc in filtered_barcodes:
            for umi in reads_per_umi_per_bead[bc]:
                print(f"{bc}\t{umi}\t{reads_per_umi_per_bead[bc][umi]}", file=out)

    with output_pdf.with_suffix(".txt").open("w") as out:
        print("bead_barcode\tumis\treads", file=out)
        for bc in filtered_barcodes:
            print(f"{bc}\t{len(umis_per_bead[bc])}\t{reads_per_bead[bc]}", file=out)

    if extra_pdf is not None:
        plot_log_hist(
            [len(umis_per_bead[bc]) for bc in filtered_barcodes],
            "UMIs per bead",
            extra_pdf_pages,
        )

        extra_pdf_pages.close()

    pdf_pages = PdfPages(output_pdf)

    log.info("Making plots")
    spatial_plot(
        bead_xy_a,
        [len(umis_per_bead[bc]) for bc in filtered_barcodes],
        "UMIs per bead",
        pdf_pages,
        pct=percentile,
    )

    spatial_plot(
        bead_xy_a,
        [np.log10(len(umis_per_bead[bc])) for bc in filtered_barcodes],
        "log10 UMIs per bead",
        pdf_pages,
        pct=percentile,
    )

    spatial_plot(
        bead_xy_a,
        [reads_per_bead[bc] for bc in filtered_barcodes],
        "Reads per bead",
        pdf_pages,
        pct=percentile,
    )

    spatial_plot(
        bead_xy_a,
        [np.log10(1 + reads_per_bead[bc]) for bc in filtered_barcodes],
        "log10 reads per bead",
        pdf_pages,
        pct=percentile,
    )

    pdf_pages.close()
    log.info("Done!")
Example 23
def traj_segment_generator(pi, env, horizon, adam, vfgrad, stochastic, total_gen):


    gen_graph_this_episode=total_gen # to generate or not a graph during this episode
    stats=[] # variable to keep statistics and save them on disk
    G = nx.Graph() # Graph variable
    states= [] # History of visited states
    node_ptr=0 # Pointer used to keep track of the states' list


    i_episode=0
    print('New graph at episode {}'.format(i_episode))



    t = 0
    ac = env.action_space.sample() # not used, just so we have the datatype
    new = True # marks if we're on first timestep of an episode
    ob = env.reset()
    states.append(ob)

    cur_ep_ret = 0 # return in current episode
    cur_ep_len = 0 # len of current episode
    ep_rets = [] # returns of completed episodes in this segment
    ep_lens = [] # lengths of ...

    # Initialize history arrays
    obs = np.array([ob for _ in range(horizon)])
    rews = np.zeros(horizon, 'float32')
    vpreds = np.zeros(horizon, 'float32')
    sigmapreds = np.zeros(horizon, 'float32')
    news = np.zeros(horizon, 'int32')
    acs = np.array([ac for _ in range(horizon)])
    prevacs = acs.copy()

    while True:
        prevac = ac
        ac, vpred, sigmapred = pi.act(stochastic, ob)        
        # Slight weirdness here because we need value function at time T
        # before returning segment [0, T-1] so we get the correct
        # terminal value
        if t > 0 and t % horizon == 0:
            yield {"ob" : obs, "rew" : rews, "vpred" : vpreds, "sigmapred": sigmapreds, "new" : news,
                    "ac" : acs, "prevac" : prevacs, "nextvpred": vpred * (1 - new), 
                    "nextsigmapred": sigmapred * (1-new),
                    "ep_rets" : ep_rets, "ep_lens" : ep_lens}
            # Be careful!!! if you change the downstream algorithm to aggregate
            # several of these batches, then be sure to do a deepcopy
            ep_rets = []
            ep_lens = []
        i = t % horizon
        obs[i] = ob
        vpreds[i] = vpred
        sigmapreds[i] = sigmapred
        news[i] = new
        acs[i] = ac
        prevacs[i] = prevac



        ob, rew, new, _ = env.step(ac)
        rews[i] = rew
        states.append(ob)
        node_ptr+=1 # Move pointer
        G.add_edge(node_ptr-1,node_ptr) # Add transition to model


        if rew and gen_graph_this_episode and len(states) > 20:

            gen_graph_this_episode=0 # Only generate one graph per episode
            total_gen=max(0,total_gen-1) # Decrease the total amount of graph generations


            # Radius-based nearest-neighbours search to add edges
            radius = 5. 
            states = np.array(states)
            adj = nn.radius_neighbors_graph(states,radius)
            adj = adj+nx.adjacency_matrix(G)
            aug_G = nx.from_scipy_sparse_matrix(adj) # Augmented Graph


            # Identify the sources and the sinks
            source = 0 
            sink = len(states) -1
            max_sources = 40    # Max number of sources
            max_sinks=40        # Max number of sinks
            other_sources =list(range(max_sources))
            other_sinks =list(range(len(states)-max_sinks,len(states)))

            # Create the features and labels for GCN
            features = np.eye(len(states), dtype=np.float32)
            features = sparse_to_tuple(sp.lil_matrix(features))
            labels = np.zeros((len(states)))
            labels[-max_sinks:] = 1
            labels = encode_onehot(labels)


            # Diffuse the reward signal
            diffused = get_graph(aug_G.edges(),adj,features,labels,source,sink,other_sources,other_sinks)

            #Smoothen the diffused result
            interpol = make_interpolater(min(diffused),max(diffused),0,1.)
            targets = interpol(diffused) 

            #Apply to the value function           
            for epo in range(100):
                grads = vfgrad(states,targets,1.)
                adam.update(grads, 1e-3)           
            states= list(states)




        cur_ep_ret += rew
        cur_ep_len += 1
        if new:

            gen_graph_this_episode=total_gen # Reset the gen_graph variable 
            i_episode+=1
            if i_episode % 3 ==0 and gen_graph_this_episode:
                print('New graph at episode {} Remaining graphs {}'.format(i_episode,total_gen))
                G = nx.Graph()
                states= []
                node_ptr=-1 # reset pointer

            ep_rets.append(cur_ep_ret)
            ep_lens.append(cur_ep_len)
            cur_ep_ret = 0
            cur_ep_len = 0
            ob = env.reset()

            
            states.append(ob) 
            node_ptr+=1 # to avoid making edges between a terminal state and an initial state

            
        t += 1
Example 24
# Imports implied by the names used below.
import os
from os.path import join as pjoin

import numpy as np
import scipy.io as sio
from sklearn.neighbors import radius_neighbors_graph

print("Current Working Directory ", os.getcwd())

cur_data_dir = os.getcwd()
mat_fname = pjoin(cur_data_dir, 'isomap.mat')
matFile1 = sio.loadmat(mat_fname)

data = matFile1['images']

data.shape
pixelno = data.shape[0]
imageno = data.shape[1]
data = data.T

# 'eps' (the radius) and 'P' (the Minkowski power) are assumed to be defined earlier.
A = radius_neighbors_graph(data,
                           eps,
                           mode='connectivity',
                           metric='minkowski',
                           p=P,
                           include_self=False)
A.toarray()
MIN = np.sum(A.toarray(), axis=1)
min(MIN)
MIN.shape
MAX = np.sum(A.toarray(), axis=1)
max(MAX)

x, y = A.toarray().nonzero()[0], A.toarray().nonzero()[1]

edges = [(i, j) for i, j in zip(x, y)]

nodename = range(0, len(data))
Example 25
    def _pairwise_similarity(self, embeddings, edge_type="d"):
        if edge_type == 'd':
            embeddings_X = embeddings[:, 0:int(self.embedding_d / 2)]
            embeddings_Y = embeddings[:,
                                      int(self.embedding_d /
                                          2):self.embedding_d]

            if self.directed_distance == "euclidean_ball":
                embeddings_stacked = np.vstack([embeddings_X, embeddings_Y])
                adj = radius_neighbors_graph(embeddings_stacked,
                                             radius=self.margin,
                                             n_jobs=-2)
                adj = adj[0:embeddings_X.shape[0], :][:,
                                                      embeddings_X.shape[0]:]
                print("radius_neighbors_graph")

            elif self.directed_distance == "euclidean":
                adj = pairwise_distances(X=embeddings_X,
                                         Y=embeddings_Y,
                                         metric="euclidean",
                                         n_jobs=-2)

                # Get node-specific adaptive threshold
                # adj = self.transform_adj_adaptive_threshold(adj, margin=0)
                # adj = self.transform_adj_beta_exp(adj, edge_types="d", sample_negative=self.negative_sampling_ratio)
                adj = np.exp(-2.0 * adj)
                print("Euclidean dist")

            elif self.directed_distance == "cosine":
                adj = pairwise_distances(X=embeddings_X,
                                         Y=embeddings_Y,
                                         metric="cosine",
                                         n_jobs=-2)
                print("Cosine similarity")

            elif self.directed_distance == "dot_sigmoid":
                adj = np.matmul(embeddings_X, embeddings_Y.T)
                adj = sigmoid(adj)
                print("Dot product & sigmoid")
            elif self.directed_distance == "dot_softmax":
                adj = np.matmul(embeddings_X, embeddings_Y.T)
                adj = softmax(adj)
                print("Dot product & softmax")

        elif edge_type == 'u':
            if self.undirected_distance == "euclidean_ball":
                adj = radius_neighbors_graph(embeddings,
                                             radius=self.margin,
                                             n_jobs=-2)

            elif self.undirected_distance == "euclidean":
                adj = pairwise_distances(X=embeddings,
                                         metric="euclidean",
                                         n_jobs=-2)
                # adj = np.exp(-2.0 * adj)
                adj = self.transform_adj_beta_exp(adj,
                                                  edge_types=["u", "u_n"],
                                                  sample_negative=False)
                # adj = self.transform_adj_adaptive_threshold(adj, margin=self.margin/2)
                print("Euclidean dist")

            elif self.undirected_distance == "cosine":
                adj = pairwise_distances(X=embeddings,
                                         metric="cosine",
                                         n_jobs=-2)

            elif self.undirected_distance == "dot_sigmoid":
                adj = np.matmul(embeddings, embeddings.T)
                adj = sigmoid(adj)
            elif self.undirected_distance == "dot_softmax":
                adj = np.matmul(embeddings, embeddings.T)
                adj = softmax(adj)
        else:
            raise Exception("Unsupported edge_type", edge_type)
        return adj
Example 26
 def NNGraph(self, data, limit=0.4):
     # Create the nearest neighbors graph
     graph = radius_neighbors_graph(data, limit, mode='distance', metric='minkowski', p=2, metric_params=None, include_self=False)
     graph = graph.toarray()
     return graph
Example 27
def compute_graph(X, dims, r_cut, metric=pbc):
    BT = BallTree(X, metric=metric, dims=dims)
    rng_con = radius_neighbors_graph(BT, r_cut, mode="connectivity")
    A = np.matrix(rng_con.toarray())
    G = nx.from_numpy_matrix(A)
    return G
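radius_neighbors_graph accepts a prebuilt BallTree in place of the raw data, which is what compute_graph exploits to use a custom metric. A minimal sketch with a standard metric (the pbc periodic-boundary metric above is project-specific), assuming scikit-learn and networkx:

import numpy as np
import networkx as nx
from sklearn.neighbors import BallTree, radius_neighbors_graph

X = np.random.RandomState(0).rand(50, 3)
tree = BallTree(X)  # building the index once lets several queries reuse it
rng_con = radius_neighbors_graph(tree, 0.3, mode='connectivity')
G = nx.from_scipy_sparse_array(rng_con)  # from_numpy_matrix is deprecated in networkx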
Example 28
from scipy.sparse import csgraph

# https://medium.com/@tomernahshon/spectral-clustering-from-scratch-38c68968eae0

# In[5]:

random_state = 21
X, value = make_moons(150, noise=.07, random_state=random_state)
fig, ax = plt.subplots()
ax.set_title('Truth')
ax.scatter(X[:, 0], X[:, 1], c='k', s=50)

# In[6]:

A = radius_neighbors_graph(X, 0.4, mode='distance')
G = csgraph.laplacian(A, normed=False)

# In[ ]:

radius_neighbors_graph(np.array([[0, 0], [1, 1], [1, 2]]),
                       radius=2,
                       mode='distance').toarray()

# In[10]:

G.toarray()[0, :]

# In[11]:

eigval, eigvec = np.linalg.eig(G.toarray())
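The fragment stops at the eigendecomposition; the usual next step in this spectral-clustering walkthrough is to embed each point with the eigenvectors of the smallest Laplacian eigenvalues and run k-means on that embedding. A hedged sketch, reusing eigval, eigvec and np from above:

from sklearn.cluster import KMeans

order = np.argsort(eigval.real)
embedding = eigvec.real[:, order[:2]]   # eigenvectors of the 2 smallest eigenvalues
labels = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(embedding)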
Example 29
def get_local_densities(data, kernel_mult=2.0, metric='manhattan'):
    """For each sample point of the data-set 'data', estimate a local density in feature
        space by counting the number of neighboring data-points within a particular
        region centered around that sample point.
    
    Parameters
    ----------
    data : array of shape (n_samples, n_features)
        The data-set, a fraction of whose sample points will be extracted
        by density sampling.
    
    kernel_mult : float, optional (default = 2.0)
        The kernel multiplier, which determine (in terms of the median of the distribution
        of distances among nearest neighbors) the extent of the regions centered
        around each sample point to consider for the computation of the local density
        associated to that particular sample point.
    
    metric : string, optional (default = 'manhattan')
        The distance metric used to determine the nearest-neighbor to each data-point.
        The DistanceMetric class defined in scikit-learn's library lists all available
        metrics.
    
    Returns
    -------
    local_densities : array of shape (n_samples,)
        The i-th entry of this vector corresponds to the local density of the i-th sample
        point in the order of the rows of 'data'.  
    """
    
    data = np.atleast_2d(data)
    
    assert isinstance(kernel_mult, numbers.Real) and kernel_mult > 0
    
    kernel_width = kernel_mult * median_min_distance(data, metric)
    
    N_samples = data.shape[0]

    if 8.0 * get_chunk_size(N_samples, 1) > N_samples:
        A = radius_neighbors_graph(data, kernel_width, mode = 'connectivity', metric = metric, include_self = True)

        rows, _ = A.nonzero()
        with NamedTemporaryFile('w', delete = True, dir = './') as file_name:
            fp = np.memmap(file_name, dtype = int, mode = 'w+', shape = rows.shape)
            fp[:] = rows[:]
            _, counts = np.unique(fp, return_counts = True)

        local_densities = np.zeros(N_samples, dtype = int)
        for i in range(N_samples):  # range (xrange is Python 2 only)
            local_densities[i] = counts[i]
    else:
        local_densities = np.zeros(N_samples, dtype = int)

        chunks_size = get_chunk_size(N_samples, 2)
        for i in range(0, N_samples, chunks_size):
            chunk = data[i:min(i + chunks_size, N_samples)]

            D = pairwise_distances(chunk, data, metric, n_jobs = 1)

            D = (D <= kernel_width)

            local_densities[i + np.arange(min(chunks_size, N_samples - i))] = D.sum(axis = 1)
        
    return local_densities
Example 30
def main(args):
    name_of_pdf_dir = os.path.basename(args.directory_with_pdfs)

    all_text = get_all_pdf_text_concatenated(args.directory_with_pdfs)

    pars = pd.Series(all_text.split('\n\n')).str.replace('\n', ' ')

    pars.str.len().apply(lambda x: np.log2(x + 1)).astype(int).value_counts()  # TODO, is this being stored anywhere?

    text_keywords = keywords(all_text, scores=True, lemmatize=True, words=args.num_keywords)

    lower_bound_chars, upper_bound_chars = args.lower_bound_chars, args.upper_bound_chars
    # avg_word_len is assumed to be defined elsewhere in the module.
    word_count = int((lower_bound_chars + upper_bound_chars) / (2 * (avg_word_len + 1)))
    lens = pars.str.len()  # paragraph lengths
    nice_pars = pars[(lens >= lower_bound_chars)]  # paragraphs we want to use

    nice_pars = nice_pars.apply(
        partial(text_reduce_return,
                upper_bound_chars=upper_bound_chars, max_word_count=word_count)
    )

    vecs = emb(tuple(nice_pars), args.tfhub_sentence_encoder_url).numpy()

    D = sk.metrics.pairwise_distances(vecs, metric='cosine')  # pairwise distances of vectors
    R = scipy.sparse.csgraph.minimum_spanning_tree(D).max()  # reduced graph
    G = neighbors.radius_neighbors_graph(vecs, R, metric='cosine')

    core = nx.k_core(nx.Graph(G))

    # Capitalize all occurrences of keywords for easy display on the output
    # TODO, make matching case insensitive
    pattern = re.compile(f"\\b({tz.pipe(text_keywords, tz.pluck(0), '|'.join)})\\b")
    nice_pars = nice_pars.apply(
        lambda x: re.sub(pattern, lambda m: m.group().upper(), x))  # TODO add [[]] around our keywords for zettelkasten

    core_nodes = core.nodes
    core_pars = np.array(nice_pars)[core_nodes]
    core_vecs = vecs[core_nodes]

    sil_u, n, lab, sil, p = clust(nx.adjacency_matrix(core), core_vecs, 8)

    layers = nx.onion_layers(core)

    df = pd.DataFrame(
        data=[{"Label": par, "Cluster ID": cid, "Silhouette Score": ss} for par, cid, ss in zip(core_pars, lab, sil)])

    df = df[df["Silhouette Score"] > 0]

    df['Cluster ID'] = df.apply(lambda row: "T" + str(row['Cluster ID']), axis=1)

    # add footer to dataframe so that csv export will be imported by gsheet's tree map plotter correctly
    for cluster_id in df['Cluster ID'].unique():
        df = df.append({"Label": cluster_id, "Cluster ID": name_of_pdf_dir, "Silhouette Score": None},
                       ignore_index=True)
    else:
        df = df.append({"Label": name_of_pdf_dir, "Cluster ID": None, "Silhouette Score": None}, ignore_index=True)

    df.to_csv(args.output_filename, index=False)

    return {
        "text_keywords": text_keywords
    }
Example 31
def text_reduce_return(paragraph):  # signature assumed; the def line is missing from this excerpt
    try:
        return summarize(paragraph, word_count=word_count).replace("\n", " ") or \
               paragraph[:upper_bound_chars]
    except ValueError:  # usually happens if there aren't multiple sentences in paragraph
        return paragraph[:upper_bound_chars]

nice_pars = nice_pars.apply(text_reduce_return)


len(nice_pars), len(pars)

vecs = emb(tuple(nice_pars), "https://tfhub.dev/google/universal-sentence-encoder-large/5").numpy()

D = sk.metrics.pairwise_distances(vecs, metric='cosine')  # pairwise distances of vectors
R = scipy.sparse.csgraph.minimum_spanning_tree(D).max()  # reduced graph
G = neighbors.radius_neighbors_graph(vecs, R, metric='cosine')

@curry
def clust(g, v, n):
    pipe = pipeline.Pipeline([
        ('agg', cluster.AgglomerativeClustering(n, connectivity=g, linkage='ward', affinity='euclidean'))
    ])
    labels = pipe.fit_predict(v)
    silh = sk.metrics.silhouette_samples(v, labels, metric='cosine')
    return (silh.mean(), n, labels, silh, pipe)

core = nx.k_core(nx.Graph(G))

# Capitalize all occurrences of keywords for easy display on the output
pattern = re.compile(f"\\b({tz.pipe(keywords, tz.pluck(0), '|'.join)})\\b")  # TODO, make matching case insensitive
nice_pars = nice_pars.apply(lambda x: re.sub(pattern, lambda m: m.group().upper(), x))  # TODO, add [[]] around our keywords
Example 33
# Import libraries
import json
import matplotlib.pyplot as plt, numpy as np, pandas as pd
from sklearn.neighbors import radius_neighbors_graph
from scipy.sparse.csgraph import connected_components

# Contact spacing
dist1 = 0.22
dist2 = 0.46

# Get connected components and distances
data = pd.read_csv("output/dispcont.csv",
                   names=["x", "y", "w", "one"],
                   usecols=["x", "y"])
data["cc"] = connected_components(radius_neighbors_graph(data.values, 0.06))[1]
ccs = data.groupby("cc").count().reset_index().sort_values(
    "x")[-4:]["cc"].values
data["dist"] = np.sqrt(data["x"]**2 + data["y"]**2)

# Get points
refpts = []
pts = []
for cc in ccs:
    # Filter to connected component
    ccdata = data.loc[data["cc"] == cc].copy()  # copy so the column assignment below is safe

    # Reference point that is closest to center
    refpt = ccdata.loc[ccdata["dist"].idxmin()]
    ccdata["refdist"] = np.sqrt((ccdata["x"] - refpt["x"])**2 +
                                (ccdata["y"] - refpt["y"])**2)
Example 34
    def fit(self, X, y=None):
        """
        Fit the ToMATo class on a point cloud: compute the ToMATo clusters and store the corresponding labels in a numpy array called labels_

        Parameters:
            X (numpy array of shape (num_points) x (num_coordinates)): input point cloud.
            y (n x 1 array): point labels (unused).
        """
        num_pts = X.shape[0]

        if self.verbose:
            print("Computing density estimator")
        self.density_estimator.fit(X)
        self.density_values = self.density_estimator.score_samples(X)
        if self.verbose:
            plt.scatter(X[:,0], X[:,1], s=5., c=self.density_values)
            plt.show()

        if self.verbose:
            print("Computing underlying graph")
        if self.n_neighbors is not None: 
            A = kneighbors_graph(X, self.n_neighbors).toarray()
            A = np.minimum(A + A.T, np.ones(A.shape))
        elif self.radius is not None:
            A = radius_neighbors_graph(X, self.radius).toarray()
        else:
            radius = estimate_scale(X, N=100, inp="point cloud", C=10., beta=0.)
            if self.verbose:
                print("radius = " + str(radius))
            A = radius_neighbors_graph(X, radius).toarray()

        if self.verbose:
            print("Sorting points by density")
        sorted_idxs = np.flip(np.argsort(self.density_values))
        inv_sorted_idxs = np.arange(num_pts)
        for i in range(num_pts):
            inv_sorted_idxs[sorted_idxs[i]] = i

        if self.verbose:
            print("Computing tau")
        if self.tau is not None:
            tau = self.tau
        else:
            st = gd.SimplexTree()
            for i in range(num_pts):
                st.insert([i], filtration=-self.density_values[i])
            for i in range(num_pts):
                for j in range(i+1,num_pts):
                    if A[i,j] == 1.:
                        st.insert([i,j], filtration=max(-self.density_values[i],-self.density_values[j]))
            d = st.persistence()
            plot = gd.plot_persistence_diagram(d)
            plot.show()
            dgm = st.persistence_intervals_in_dimension(0)
            persistences = np.sort([abs(y-x) for (x,y) in dgm])
            if self.n_clusters is not None:
                tau = (persistences[-self.n_clusters-1] + persistences[-self.n_clusters]) / 2
            else:
                n_clusters = np.argmax(np.flip(persistences[1:-1] - persistences[:-2])) + 2
                tau = (persistences[-n_clusters-1] + persistences[-n_clusters]) / 2
        if self.verbose:
            print("tau = " + str(tau))

        if self.verbose:
            print("Applying UF sequentially")
        diag, parents = {}, -np.ones(num_pts, dtype=np.int32)
        for i in range(num_pts):

            current_pt = sorted_idxs[i]
            neighbors = np.squeeze(np.argwhere(A[current_pt,:] == 1.))
            higher_neighbors = [n for n in neighbors if inv_sorted_idxs[n] <= i] if len(neighbors.shape) > 0 else []

            if higher_neighbors == []:

                parents[current_pt] = current_pt
                diag[current_pt] = -np.inf

            else:

                g = higher_neighbors[np.argmax(self.density_values[np.array(higher_neighbors)])]
                pg = self.find(g, parents)
                parents[current_pt] = pg

                for neighbor in higher_neighbors:

                    pn = self.find(neighbor, parents)
                    val = min(self.density_values[pg], self.density_values[pn])

                    if pg != pn and val < self.density_values[current_pt] + tau and val > tau:
                        self.union(pg, pn, parents, self.density_values)
                        pp = pg if self.density_values[pg] < self.density_values[pn] else pn
                        diag[pp] = current_pt

        self.labels_ = np.array([self.find(n, parents) for n in range(num_pts)])
        self.labels_ = LabelEncoder().fit_transform(np.where(self.density_values[self.labels_] > tau, self.labels_, -np.ones(self.labels_.shape)))
Example 35
def main(argv):
     
     parser = argparse.ArgumentParser(epilog="NOTE: it is important to have a smooth histogram for accurate fitting\n\n")
     parser.add_argument("filename", help="input filename")
     
     parser.add_argument("-m", "--metric" , type=str,  help="define the scipy distance to be used   (Default: euclidean or hamming for MSA)",default='euclidean')
     parser.add_argument("-x", "--matrix", help="if the input file contains already the complete upper triangle of a distance matrix (2 Formats: (idx_i idx_j distance) or simply distances list ) (Opt)", action="store_true")
     parser.add_argument("-k", "--n_neighbors", type=int, help="nearest_neighbors parameter (Default k=3)", default=3)
     parser.add_argument("-r", "--radius", type=float, help="use neighbor radius instead of nearest_neighbors  (Opt)",default=0.)
     parser.add_argument("-b", "--n_bins", type=int, help="number of bins for distance histogram (Default 50)",default=50)
     parser.add_argument("-M", "--r_max", type=float, help="fix the value of distance distribution maximum in the fit (Opt, -1 force the standard fit, avoiding consistency checks)",default=0)
     parser.add_argument("-n", "--r_min", type=float, help="fix the value of shortest distance considered in the fit (Opt, -1 force the standard fit, avoiding consistency checks)",default=-10)
     parser.add_argument("-D", "--direct", help="analyze the direct (not graph) distances (Opt)", action="store_true")
     parser.add_argument("-I", "--projection", help="produce an Isomap projection using the first ID components (Opt)", action="store_true")
     
     args = parser.parse_args()
     input_f = args.filename
     me=args.metric
     n_neighbors = args.n_neighbors
     radius=args.radius+0
     MSA=False
     n_bins = args.n_bins
     rmax=args.r_max
     mm=-10000

     print '\nFile name: ', input_f
     
     #0 Reading input file
     f1 = open(input_f)
     data = []
     data_line = []
     labels = []

     for line in f1:
         if line[0]==">" : 
               MSA=True
               labels.append(line)
         if line[0]!=">" and MSA==True : 
               data.append([ord(x) for x in line[:-1]])
               data_line.append(line)
         elif line[0]!="#" and MSA==False : 
               data.append([float(x) for x in line.split()])
               data_line.append(line) 
     f1.close()

     data = n.asarray(data)
     if MSA : me='hamming'
     if args.matrix : me='as from the input file'
     print 'Metric: ', me
     if radius>0. and (args.direct==False) : print 'Nearest Neighbors Radius:', radius
     elif (args.direct==False): print 'Nearest Neighbors number K: ', n_neighbors
     else : print 'Distance distributions are calculated from the direct input-space distances'
     
     if radius>0. :  
        filename = str(input_f.split('.')[0])+'R'+str(radius)
     else  :
        filename = str(input_f.split('.')[0])+'K'+str(n_neighbors)
     #0
      
     #1 Computing geodesic distance on connected points of the input file and relative histogram
     if args.matrix :
        if data.shape[1] == 1 :
           dist_mat=distance.squareform(data.ravel())
           mm=dist_mat.shape[1]
        elif data.shape[1] == 3 : 
           mm=int(max(data[:,1]))
           dist_mat=n.zeros((mm,mm))
           for i in range(0,data.shape[0]):
               dist_mat[int(data[i,0])-1,int(data[i,1])-1]=data[i,2]
               dist_mat[int(data[i,1])-1,int(data[i,0])-1]=data[i,2]
        else : print 'ERROR: The distances input is not in the right matrix format' ; sys.exit(2)

        print "\n# points: ", mm

        A=n.zeros((mm,mm))
        rrr=[]
           
        if radius > 0. :
           for i in range(0,mm):
               ll=dist_mat[i] < radius
               A[i,ll]=dist_mat[i,ll]
        else :
           rrr=n.argsort(dist_mat)
           for i in range(0,mm):
               ll=rrr[i,0:n_neighbors+1]
               A[i,ll]=dist_mat[i,ll]
           radius = A.max()

        if args.direct : C=dist_mat
        else : C= graph_shortest_path(A,directed=False)
        
     else : 
        print "\n# points, coordinates: ", data.shape
        if args.direct : C=distance.squareform(distance.pdist(data,me));
        elif radius>0. :
           A = radius_neighbors_graph(data, radius,metric=me,mode='distance')
           C= graph_shortest_path(A,directed=False)
        else  :
           A = kneighbors_graph(data, n_neighbors,metric=me,mode='distance')
           C= graph_shortest_path(A,directed=False)
           radius=A.max()

     C=n.asmatrix(C)
     connect=n.zeros(C.shape[0])
     conn=n.zeros(C.shape[0])
     for i in range(0,C.shape[0]) :
         conn_points=n.count_nonzero(C[i])
         conn[i]=conn_points
         if conn_points > C.shape[0]/2. : connect[i]=1
         else : C[i]=0

     if n.count_nonzero(connect) > C.shape[0]/2. :
        print 'Number of connected points:', n.count_nonzero(connect), '(',100*n.count_nonzero(connect)/C.shape[0],'% )'
     else : print 'The neighbors graph is highly disconnected, increase K or Radius parameters' ; sys.exit(2)

     if n.count_nonzero(connect) < data.shape[0] :
        data_connect_file = open('connected_data_{0}.dat'.format(filename), "w")
        for i in range(0,C.shape[0]) :
            if connect[i]==1 :
               if MSA : data_connect_file.write(labels[i])
               data_connect_file.write(data_line[i])
        data_connect_file.close()

     
     indices = n.nonzero(n.triu(C,1))
     dist_list = n.asarray( C[indices] )[-1]
     
     dist_file= open('dist_{0}.dat'.format(filename), "w")

     for i in range(0, len(dist_list)):
         dist_file.write("%s " % ((dist_list[i])))
     dist_file.close()

     h=n.histogram(dist_list,n_bins)
     dx=h[1][1]-h[1][0]


     plt.figure(1)
     plt.plot(h[1][0:n_bins]+dx/2,h[0],'o-',label='histogram')
     plt.xlabel('r')
     plt.ylabel('N. counts')
     plt.legend()
     plt.savefig(filename+'_hist.png')
     distr_x = []
     distr_y = []

     avg=n.mean(dist_list)
     std=n.std(dist_list)

     if rmax> 0 : 
        avg=rmax
        std=min(std,rmax)
        print '\nNOTE: You fixed r_max for the initial fitting, average will have the same value' 
     else : 
        mm=n.argmax(h[0])
        rmax=h[1][mm]+dx/2

     if args.r_max== -1 : 
        print '\nNOTE: You forced r_max to the maximum of the distribution in the initial fitting, avoiding consistency checks with the average'
        avg=rmax
        std=min(std,rmax)

     if args.r_min>= 0 : print '\nNOTE: You fixed r_min for the initial fitting: r_min = ',args.r_min
     if args.r_min== -1 : print '\nNOTE: You forced r_min to the standard procedure in the initial fitting'
     
     print '\nDistances Statistics:'
     print 'Average, standard dev., n_bin, bin_size, r_max, r_NN_max:', avg , std, n_bins, dx, rmax, radius,'\n'
     #1
     tmp=1000000
     if(args.r_min>=0) : tmp=args.r_min
     elif(args.r_min==-1) : tmp=rmax-std
       
     if(n.fabs(rmax-avg)>std+2.*dx) :
        print 'ERROR: There is a problem with the r_max detection:'
        print '       usually either the histogram is not smooth enough (you may consider changing n_bins with option -b),'
        print '       or r_max and r_avg are too distant: consider fixing the first detection of r_max with option -M'
        print '       or changing the neighbor parameter with (-r/-k)'
        plt.show()
        sys.exit()
     elif(rmax<= min(radius+dx,tmp)) :
        print 'ERROR: There is a problem with the r_max detection: it is shorter than the largest distance in the neighbors graph.'
        print '       Consider fixing the first detection of r_max with option -M and/or r_min with option -n to set the fit range,'
        print '       or decreasing the neighbors parameter with (-r/-k). For example, the standard fit range'
        print '       r_min=r_max-2*sigma can be enforced by running with option "-n -1"'
        plt.show()
        sys.exit()

     #2 Finding actual r_max and std. dev. to define fitting interval [rmin;rM] 
     distr_x=h[1][0:n_bins]+dx/2
     distr_y=h[0][0:n_bins]
     
     res= n.empty(25)
     left_distr_x = n.empty(n_bins)
     left_distr_y = n.empty(n_bins)

     left_distr_x= distr_x[n.logical_and(n.logical_and(distr_x[:]>rmax-std, distr_x[:]<rmax+std/2.0),distr_y[:]>0.000001)]
     left_distr_y= n.log(distr_y[n.logical_and(n.logical_and(distr_x[:]>rmax-std, distr_x[:]<rmax+std/2.0),distr_y[:]>0.000001)])

     if(left_distr_y.shape[0]<4) :
        print('ERROR: Too few datapoints to fit the distribution:')
        print('       usually either the histogram is not smooth enough (you may consider changing the n_bins with option -b)')
        print('       or the distance distribution itself has some issue')
        plt.show()
        print('R, Dfit, Dmin', 'ERROR3' , '\n')
        sys.exit()

     coeff = n.polyfit(left_distr_x,left_distr_y,2,full=True)   # full output: coeff[0] is the coefficient array (the original full='False' is a truthy string, so this was already the behavior)
     a0=coeff[0][0]
     b0=coeff[0][1]
     c0=coeff[0][2]
       
     rmax_old=rmax
     std_old=std
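     # For a Gaussian bulk, log p(r) = -(r - rmax)^2/(2*sigma^2) + const, i.e. a
     # parabola a*r^2 + b*r + c with a = -1/(2*sigma^2) and b = rmax/sigma^2,
     # hence rmax = -b/(2a) and sigma = sqrt(-1/(2a)) in the lines below.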
     rmax = -b0/a0/2.0
     
     if(args.r_max>0) : rmax=args.r_max 
     #if(args.r_max==-1) : rmax=avg   #to be used in future in case of problem with Ymax   
     if a0<0 and n.fabs(rmax-rmax_old)<std_old/2+dx :
        std=n.sqrt(-1/a0/2.)
     else:
        rmax=avg
        std=std_old

     left_distr_x= distr_x[n.logical_and(distr_y[:]>0.000001,n.logical_and(distr_x[:]>rmax-std, distr_x[:]<rmax+std/2.+dx))]
     left_distr_y= n.log(distr_y[n.logical_and(distr_y[:]>0.000001, n.logical_and(distr_x[:]>rmax-std, distr_x[:]<rmax+std/2.+dx))])

     if(left_distr_y.shape[0]<4) :
        print('ERROR: Too few datapoints to fit the distribution:')
        print('       usually either the histogram is not smooth enough (you may consider changing the n_bins with option -b)')
        print('       or the distance distribution itself has some issue')
        plt.show()
        sys.exit()

     coeff = n.polyfit(left_distr_x,left_distr_y,2,full=True)
     a=coeff[0][0]
     b=coeff[0][1]
     c=coeff[0][2]
     
     rmax_old=rmax
     std_old=std
     if a<0. :
        rmax = -b/a/2. 
        std=n.sqrt(-1/a/2.)   # it was a0
     
     rmin=max(rmax-2*std-dx/2,0.)
     if(args.r_min>=0) : 
        rmin=args.r_min
     elif (rmin < radius and args.r_min!=-1) : 
        rmin = radius 
        print '\nWARNING: For internal consistency, r_min has been fixed to the largest distance (r_NN_max) in the neighbors graph.'
        print '         The standard definition r_min=r_max-2*sigma can be restored by running with option "-n -1",'
        print '         or use -n to set a desired value manually (Example: -n 0.1)\n'
          
     rM=rmax+dx/4
 
     if(n.fabs(rmax-rmax_old)>std_old/4+dx ) :    # fit consistency check
       print '\nWARNING: The histogram is probably not smooth enough (you may try changing n_bins with -b); rmax is fixed to the value of the first iteration\n'

       rmax=rmax_old
       a=a0
       b=b0
       c=c0

       if(args.r_min>=0) :
          rmin=args.r_min
       elif (rmin < radius and args.r_min!=-1) :
          rmin = radius
          print '\nWARNING2: For internal consistency, r_min has been fixed to the largest distance in the neighbors graph (r_NN_max).'
          print '          The standard definition r_min=r_max-2*sigma can be restored by running with option "-n -1",'
          print '          or use -n to set a desired value manually (Example: -n 0.1)\n'
       rM=rmax+dx/4
     #2

     #3 Gaussian Fitting to determine ratio R
     
     left_distr_x= distr_x[n.logical_and(n.logical_and(distr_x[:]>rmin,distr_x[:]<=rM),distr_y[:]>0.000001)]/rmax
     left_distr_y= n.log(distr_y[n.logical_and(n.logical_and(distr_x[:]>rmin,distr_x[:]<=rM),distr_y[:]>0.000001)])-(4*a*c-b**2)/4./a

     if(left_distr_y.shape[0]<4) :
        print('ERROR: Too few datapoints to fit the distribution:')
        print('       usually either the histogram is not smooth enough (you may consider changing the n_bins with option -b)')
        print('       or the distance distribution itself has some issue')
        plt.show()
        sys.exit()

     fit =  curve_fit(func2,left_distr_x,left_distr_y)
     ratio=n.sqrt(fit[0][0])
     y1=func2(left_distr_x,fit[0][0])
     #3
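     # (func2 is defined elsewhere in the original file. Given x = r/rmax and
     # ratio = sqrt(fit[0][0]) = rmax/sigma, a plausible Gaussian form -- an
     # assumption, not the original code -- is func2(x, a) = -a*(x - 1)**2/2,
     # with the fitted a playing the role of R**2.)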

     #4 Geodesics D-Hypersphere Distribution Fitting to determine Dfit

     fit = curve_fit(func,left_distr_x,left_distr_y)
     Dfit=(fit[0][0])+1


     y2=func(left_distr_x,fit[0][0],fit[0][1],fit[0][2])
     #4
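     # (func is likewise defined elsewhere: the log of the geodesic distance
     # distribution on a D-dimensional hypersphere, taking D-1 as its first
     # argument -- hence Dfit = fit[0][0] + 1 above. Its exact form is not
     # shown in this snippet.)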

     
     #5 Determination of Dmin

     D_file = open('D_residual_{0}.dat'.format(filename), "w")
     
     for D in range(1,26):
         y = func(left_distr_x, D-1, 1, 0)
         # RMS residual of the fixed-dimension model against the data
         res[D-1] = n.linalg.norm(y - left_distr_y) / n.sqrt(len(y))
         D_file.write("%s " % D)
         D_file.write("%s\n" % res[D-1])
     D_file.close()

     Dmin = n.argmin(res) + 1

     y=func(left_distr_x,Dmin-1,fit[0][1],0)
     #5

     #6 Printing results
     print '\nFITTING PARAMETERS:' 
     print 'rmax, std. dev., rmin', rmax,std,rmin
     print '\nFITTING RESULTS:' 
     print 'R, Dfit, Dmin', ratio,Dfit,Dmin , '\n'

     if(Dmin == 1) : print 'NOTE: Dmin = 1 could indicate that the choice of the input parameters is not optimal, or simply an underestimation of a 2D manifold\n'
     if(Dfit > 25) : print('NOTE: Dfit > 25 could indicate that the choice of the input parameters is not optimal or that the distance distribution itself has some issue\n')
     fit_file= open('fit_{0}.dat'.format(filename), "w")

     for i in range(0, len(y)):
         fit_file.write("%s " % left_distr_x[i])
         fit_file.write("%s " % ((left_distr_y[i])))
         fit_file.write("%s " % ((y1[i])))
         fit_file.write("%s " % ((y2[i])))
         fit_file.write("%s\n" % ((y[i])))
     fit_file.close() 

             
     stat_file= open('statistics_{0}.dat'.format(filename), "w")
     statistics = str('# Npoints, rmax, standard deviation, R, D_fit, Dmin \n# \
     {}, {}, {}, {}, {}, {}\n'.format(n.count_nonzero(connect),rmax,std,ratio,Dfit,Dmin))
     stat_file.write("%s" % statistics)
     for i in range(0, len(distr_x)-2):
       if distr_y[i]>0.000001 : 
	 stat_file.write("%s " % distr_x[i])
	 stat_file.write("%s " % distr_y[i])
	 stat_file.write("%s\n" % n.log(distr_y[i]))
     stat_file.close()
     
     plt.figure(2)
     plt.plot(left_distr_x,left_distr_y,'o-',label=str(input_f.split('.')[0]))
     plt.plot(left_distr_x,y1,label='Gaussian fit for R ratio')
     plt.plot(left_distr_x,y2,label='D-Hypersphere Fit for D_fit')
     plt.plot(left_distr_x,y,label='D_min-Hypersphere Distribution')
     plt.xlabel('r/r$_{max}$')
     plt.ylabel('log p(r)/p(r$_{max}$)')
     plt.legend(loc=4)
     plt.savefig(str(input_f.split('.')[0])+'_fit.png')  
     

     plt.figure(3)
     plt.plot(range(1,26),res,'o-',label=str(input_f.split('.')[0])+' D_min')
     plt.legend()
     plt.xlabel('D')
     plt.ylabel('RMSD')
     plt.savefig(str(input_f.split('.')[0])+'_Dmin.png')
     plt.show()


     #6
   
     #7 Optional: Isomap projection
     if args.projection :
        from sklearn.decomposition import KernelPCA
        C2=(distance.squareform(dist_list))**2
        C2=-.5*C2
        obj_pj=KernelPCA(n_components=100,kernel="precomputed")
        proj=obj_pj.fit_transform(C2)
        n.savetxt('proj_'+str(input_f.split('.')[0])+'.dat',proj[:,0:Dmin+1])
     print 'NOTE: it is important to have a smooth histogram for accurate fitting\n'
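
# Sanity sketch (not part of the example): the parabola identities used above,
# rmax = -b/(2a) and sigma = sqrt(-1/(2a)), recover the mean and width of a
# Gaussian from a degree-2 fit of its log-density.
import numpy as np

r = np.linspace(0.5, 1.5, 40)
rmax_true, sigma_true = 1.0, 0.2
logp = -(r - rmax_true)**2 / (2 * sigma_true**2)
a, b, c = np.polyfit(r, logp, 2)
print(-b / (2 * a), np.sqrt(-1 / (2 * a)))  # ~1.0 and ~0.2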
Esempio n. 37
0
def makeGraph(dataset, model, districtStats, R, sigma):

    RADIUS_OF_EARTH = 6378

    dataFile = json.load(open(dataset))
    dates = [date for date in dataFile]

    # Saving locations from dictionary
    placesList = []
    for date in dates:
        for state in list(dataFile[date]):
            if state == 'TT':
                # 'TT' is the country-wide aggregate, not a place; skip it
                continue
            try:
                for district in list(dataFile[date][state]['districts']):
                    if district == 'Unknown' or district == 'Other State':
                        district = randomDistrict(state)
                    place = district + ',' + state + ',' + 'India'
                    if not place in placesList:
                        placesList.append(place)

            except KeyError:
                place = state + ',' + 'India'
                if not place in placesList:
                    placesList.append(place)
    print('Updated places')

    # Geolocator, we save stuff to geoUP.p
    geolocator = Bing(api_key=apis.bing())

    uniquePlacesList = list(unique_everseen(placesList))
    geocodedDistrictList = list(districtStats['Coordinates'])
    geocodedUniqueNearestDistrictList = list(
        np.zeros_like(uniquePlacesList).astype(str))

    # Initialize if not present
    if not os.path.exists('data/geoUP.p'):
        geocodedUniquePlacesList = list(
            np.zeros_like(uniquePlacesList).astype(str))
        with open('data/geoUP.p', 'wb') as f:
            pickle.dump(geocodedUniquePlacesList, f)

    # Add new locations if any
    with open('data/geoUP.p', 'rb') as f:
        geocodedUniquePlacesList = pickle.load(f)
        for i in range(len(uniquePlacesList)):
            if geocodedUniquePlacesList[i] == '':
                location = geolocator.geocode(uniquePlacesList[i])
                geocodedUniquePlacesList[i] = (location.latitude,
                                               location.longitude)
    print('Geo mapping stuff done')

    # Save to pickle
    with open('data/geoUP.p', 'wb') as f:
        pickle.dump(geocodedUniquePlacesList, f)

    for i in range(len(uniquePlacesList)):
        _, _, coordinate, _ = getNearestDistrictData(
            model, districtStats, geocodedUniquePlacesList[i])
        geocodedUniqueNearestDistrictList[i] = coordinate

    # Map counts, literacy and population into per-date, per-district arrays
    numberOfDistricts = len(geocodedDistrictList)
    numberOfDates = len(dates)
    arrayFinal = np.zeros((numberOfDates, numberOfDistricts, 3))
    print('Making final time resolved array')

    for dateIndex in range(numberOfDates):
        for districtIndex in range(numberOfDistricts):
            date = dates[dateIndex]
            district = list(districtStats['Coordinates'])[districtIndex]
            try:
                place = uniquePlacesList[
                    geocodedUniqueNearestDistrictList.index(district)]

            # If that district is not enlisted in corona affected places
            except ValueError:
                place = None

            if place:
                dump = place.split(',')
                number = 0

                # Check to see if district or state only data
                if len(dump) == 2:
                    try:
                        number = dataFile[date][dump[0]]['total']['confirmed']
                    # If that state does not exist on that date
                    except KeyError:
                        pass
                else:
                    try:
                        number = dataFile[date][dump[1]]['districts'][
                            dump[0]]['total']['confirmed']
                    # If that district does not exist in this state on the date
                    except KeyError:
                        pass

                arrayFinal[dateIndex, districtIndex, 0] = number
                arrayFinal[dateIndex, districtIndex, 1] = list(
                    districtStats['Literacy rate'])[districtIndex]
                arrayFinal[dateIndex, districtIndex, 2] = list(
                    districtStats['Population'])[districtIndex]

            else:
                pass

    print('Array made')

    E = radius_neighbors_graph(model,
                               R / RADIUS_OF_EARTH,
                               mode='distance',
                               metric='haversine').toarray()
    W = 1 - np.exp(-(E * E) / sigma)
    adj = np.where(W > 0, 1, 0)
    # edge = W.reshape(1, W.shape[0]*W.shape[1])
    return arrayFinal, W, adj
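
# Note on the haversine pattern above: scikit-learn's haversine metric expects
# (latitude, longitude) in radians and returns great-circle distances in
# radians, which is why the radius is passed as R / RADIUS_OF_EARTH. A minimal
# standalone sketch with synthetic coordinates (an illustration, not the
# project's data):
import numpy as np
from sklearn.neighbors import radius_neighbors_graph

RADIUS_OF_EARTH = 6378  # km
coords_deg = np.array([[28.61, 77.21],   # Delhi
                       [28.70, 77.10],   # a nearby point
                       [19.08, 72.88]])  # Mumbai, far away
E = radius_neighbors_graph(np.radians(coords_deg), 100 / RADIUS_OF_EARTH,
                           mode='distance', metric='haversine').toarray()
print(E * RADIUS_OF_EARTH)  # km; nonzero only between the two nearby points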
Esempio n. 38
0
def epsilon_graph(X, e):
    A = radius_neighbors_graph(X, e, mode='distance', include_self=False)
    A = A.toarray()  # materialize the sparse graph as a dense array
    return A
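
# Usage sketch for the helper above (synthetic data, assumed imports):
import numpy as np
X = np.random.RandomState(0).rand(10, 2)
A = epsilon_graph(X, 0.3)            # dense (10, 10) array of distances <= 0.3
print(A.shape, np.allclose(A, A.T))  # symmetric by construction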
Esempio n. 39
0
from sklearn.datasets import make_circles
from sklearn.neighbors import radius_neighbors_graph
import matplotlib.pyplot as plt
random_state = 21
#X_mn, y_mn = make_moons(150, noise=.07, random_state=random_state)
#X_mn, y_mn = make_circles(150, noise=.07, random_state=random_state)
X_mn, y_mn = make_circles(n_samples=400, factor=.3, noise=0.025)
cmap = 'viridis'
dot_size = 50
#fig, ax = plt.subplots(figsize=(9,7))
#ax.set_title('Data with ground truth labels - linear separation not
# possible', fontsize=18, fontweight='demi')
#ax.scatter(X_mn[:, 0], X_mn[:, 1],c=y_mn,s=dot_size, cmap=cmap)
#fig.show()
A = radius_neighbors_graph(X_mn,
                           0.4,
                           mode='distance',
                           metric='minkowski',
                           p=2,
                           metric_params=None,
                           include_self=False)
# A = kneighbors_graph(X_mn, 2, mode='connectivity', metric='minkowski', p=2, metric_params=None, include_self=False)
A = A.toarray()
print(A.shape)
"""
fig, ax = plt.subplots(figsize=(9,7))
ax.set_title('5 first datapoints', fontsize=18, fontweight='demi')
ax.set_xlim(-1, 2)
ax.set_ylim(-1,1)
ax.scatter(X_mn[:5, 0], X_mn[:5, 1],s=dot_size, cmap=cmap)
for i in range(5):
  ax.annotate(i, (X_mn[i,0],X_mn[i,1]))
fig.show()
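
# A natural follow-up check (an addition, assuming A from above): does the
# 0.4-radius graph separate the two circles into distinct components?
from scipy.sparse.csgraph import connected_components

n_components, labels = connected_components(A, directed=False)
print(n_components)  # ideally 2, one component per circle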
Esempio n. 40
0
def actor_critic(sess, gcn, placeholders, env, estimator_policy, estimator_value, num_episodes, discount_factor=1.0):
  global gen_graph

  G = nx.Graph()
  colors= []

  totsteps =0
  positions = np.arange(-1.2,0.6,0.01)
  velocities = np.arange(-0.07,0.07,0.001)

  vinput= []
  for vel in velocities:
    for pos in positions:
      vinput.append([pos,vel])
  vinput = np.array(vinput)

  name = "res/mountain_graph{}_seed{}.csv".format(gen_graph,seed)
  change_lr=0


  stats = []
  states= []
  done=False
  node_ptr=0
  for i_episode in range(num_episodes):

    # pdb.set_trace()
    if i_episode % 3 ==0 and gen_graph:
      print('new graph')
      G = nx.Graph()
      states= []
      node_ptr=0
      # curr_episode=i_episode-1

    if i_episode % 5 == 0 and i_episode!=0:
      np.savetxt(name,stats,delimiter=',') 

    state = env.reset()
    states.append(state) 
    rewards = 0
    losses = 0

    for t in itertools.count():
      

      # pdb.set_trace()
      action = estimator_policy.predict([state])
      next_state, reward, done, _ = env.step(action)
      rewards += reward
      # reward+=1
      # print(reward)

      node_ptr+=1
      G.add_edge(node_ptr-1,node_ptr)
      

      
      # Calculate TD Target
      value_next = estimator_value.predict([next_state])
      td_target = reward + (1-done) * discount_factor * value_next
      advantage = td_target - estimator_value.predict([state])


      lr = 1e-3 if not change_lr else 1e-4
      estimator_value.update([state], td_target,lr)
      loss = estimator_policy.update([state], advantage, action, lr)
      losses += loss

      state = next_state
      states.append(state) 
      # print(node_ptr, len(states)-1)


      if done:
        node_ptr+=1 # to avoid making edges between a terminal state and an initial state
        totsteps+=t 

        print("\rEpisode {}/{} Steps {} Total Steps {} ({}) Loss: {}".format(i_episode, num_episodes, t,totsteps, rewards,losses/t) )
        stats.append(totsteps)
        rewards =0


        if plots:


          # pos = {i:(states[i][0],states[i][1]) for i in range(len(states))}
          # this_color = [i_episode+1] * (t+1)
          # colors += this_color
          # fig,ax = plt.subplots()
          # plt.xlim((-1.2,0.6))
          # plt.ylim((-0.07,0.07))   
          # nx.draw(G,pos, with_labels=False, font_size=7, node_size=5,node_color='blue')
          # # plt.show()
          # plt.savefig("graphs/graph{}.png".format(i_episode+1))
          # plt.clf();plt.close()

          v_preds = estimator_value.predict(vinput).reshape(len(velocities), len(positions))
          fig,ax = plt.subplots()
          ax.imshow(v_preds, interpolation='nearest', alpha=1.)
          # ax.autoscale(False)
          # nx.draw(G,pos, with_labels=False, font_size=7, node_size=5,node_color=colors)
          # plt.show()
          # plt.axis('off')
          plt.xticks([])
          plt.yticks([])
          plt.title("Actor-Critic",fontsize=17)
          plt.xlabel('Position',fontsize=17)
          plt.ylabel('Velocity',fontsize=17)
          # plt.title("Diffusion-Based Approximate Value Function")
          plt.savefig("vpreds0/vpred{}.png".format(i_episode))
          plt.clf();plt.close()




        if t<env._max_episode_steps-1 and gen_graph:

          gen_graph=0
          # change_lr=1
          # pdb.set_trace()
          aspect = (0.6 + 1.2) / (2*0.07)
          metric = lambda p0, p1: np.sqrt((p1[0] - p0[0]) * (p1[0] - p0[0]) + (p1[1] - p0[1]) * (p1[1] - p0[1]) * aspect)
          # dist='euclidean'


          radius = 0.02
          real_states = np.array(states)
          adj = nn.radius_neighbors_graph(real_states,radius,metric=metric)
          adj = adj+nx.adjacency_matrix(G)
          gg = nx.from_scipy_sparse_matrix(adj)

          source = 0 
          sink = len(real_states) -1
          # max_sources = 40
          # max_sinks=40
          # other_sources =range(max_sources)
          # other_sinks =range(len(real_states)-max_sinks,len(real_states))
          other_sources=[source]
          other_sinks=[sink]
          max_sinks=1

          # pdb.set_trace()
          features = featurize_state(real_states)
          # features = real_states
          # features = np.eye(len(real_states), dtype=np.float32)
          features = sparse_to_tuple(sp.lil_matrix(features))

          labels = np.zeros((len(real_states)))
          labels[-max_sinks:] = 1
          labels = encode_onehot(labels)


          V_weights = get_graph(sess,gcn,placeholders,gg.edges(),gg,real_states,adj,features,labels,source,sink,other_sources,other_sinks,featurize_state)
          targets = V_weights

          # pdb.set_trace()
          gcn_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, gcn.name)
          vf_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "value_estimator")

          for g_v,v_v in zip(gcn_vars,vf_vars):
            if '_1_' in g_v.name:
              
              if 'bias' in g_v.name:
                # pdb.set_trace()
                sess.run(tf.assign(v_v,tf.expand_dims(g_v[1],0) ))
              else:
                sess.run(tf.assign(v_v,tf.expand_dims(g_v[:,1],1) ))
            else:
              
              sess.run(tf.assign(v_v, g_v))

          
          # pos = {i:(real_states[i][0],real_states[i][1]) for i in range(len(real_states))}
          # fig,ax = plt.subplots()
          # nx.draw(gg,pos, with_labels=False, font_size=10, node_size=25,node_color=targets)
          # # plt.savefig("updated_graph/last.png")
          # # plt.clf();plt.close()          
          # plt.show()
          # plt.close()

          # for epo in range(30):
          #   estimator_value.update(real_states, targets,1e-3)
            # fig,ax = plt.subplots()
            # v_preds=estimator_value.predict(vinput).reshape(len(velocities),len(positions))
            # ax.imshow(v_preds, interpolation='nearest', alpha=1.)
            # # ax.autoscale(False)
            # # nx.draw(G,pos, with_labels=False, font_size=7, node_size=5,node_color=colors)
            # # plt.show()
            # plt.savefig("updated_preds/iter{}.png".format(epo))
            # plt.clf();plt.close()
          
          fig,ax = plt.subplots()
          v_preds = estimator_value.predict(vinput).reshape(len(velocities), len(positions))
          ax.imshow(v_preds, interpolation='nearest', alpha=1.)
          # ax.autoscale(False)
          # nx.draw(G,pos, with_labels=False, font_size=7, node_size=5)
          # plt.show()
          # plt.close()
          # pdb.set_trace()

        break
  return stats
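
# Isolating one detail from the snippet above: the custom metric stretches the
# velocity axis so a single radius is meaningful on the anisotropic
# (position, velocity) state space. Standalone sketch with synthetic states
# (illustrative radius, not the original data):
import numpy as np
from sklearn.neighbors import radius_neighbors_graph

aspect = (0.6 + 1.2) / (2 * 0.07)  # position span / velocity span
metric = lambda p0, p1: np.sqrt((p1[0] - p0[0])**2 + aspect * (p1[1] - p0[1])**2)

states = np.random.RandomState(0).uniform([-1.2, -0.07], [0.6, 0.07], (50, 2))
adj = radius_neighbors_graph(states, 0.1, metric=metric)
print(adj.nnz, "edges")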
Esempio n. 41
0
    def __init__(self,
                 lat,
                 long,
                 major_axis,
                 minor_axis,
                 psi,
                 crater_id=None,
                 Rbody=const.RMOON,
                 radius=const.TRIAD_RADIUS,
                 vcam_alt=const.DB_CAM_ALTITUDE,
                 sort_ij=True):
        """Crater database abstraction keyed by crater triads that generate projective invariants using information
        about their elliptical shape and relative positions [1]. Input is a crater dataset [2] that has positional
        and geometrical (ellipse parameters) information; output is an array of 7 features per crater triad.

        Parameters
        ----------
        lat : np.ndarray
            Crater latitude [radians]
        long : np.ndarray
            Crater longitude [radians]
        major_axis : np.ndarray
            Crater major axis [km]
        minor_axis : np.ndarray
            Crater minor axis [km]
        psi : np.ndarray
            Crater ellipse tilt angle, major axis w.r.t. East-West direction (0, pi) [radians]
        crater_id : np.ndarray, optional
            Crater identifier, defaults to enumerated array over len(lat)
        Rbody : float, optional
            Body radius, defaults to RMOON [km]
        radius : float, int
            Maximum radius to consider two craters connected, defaults to TRIAD_RADIUS [km]
        vcam_alt : float, int
            Altitude of virtual per-triad camera
        sort_ij : bool
            Whether to sort triad features with I_ij being the lowest absolute value

        References
        ----------
        .. [1] Christian, J. A., Derksen, H., & Watkins, R. (2020). Lunar Crater Identification in Digital Images. https://arxiv.org/abs/2009.01228
        .. [2] Robbins, S. J. (2019). A New Global Database of Lunar Impact Craters >1–2 km: 1. Crater Locations and Sizes, Comparisons With Published Databases, and Global Analysis. Journal of Geophysical Research: Planets, 124(4), 871–892. https://doi.org/10.1029/2018JE005592
        """

        if crater_id is None:
            self.crater_id = np.arange(len(lat))
        else:
            self.crater_id = crater_id

        self._lat = lat
        self._long = long
        self._C_cat = conic_matrix(major_axis, minor_axis, psi)

        x, y, z = map(np.array,
                      spherical_to_cartesian(Rbody, self._lat, self._long))

        self._r_craters = np.array((x, y, z)).T[..., None]
        """
        Construct adjacency matrix and generate Graph instance
        """
        self._adjacency_matrix = radius_neighbors_graph(np.array([x, y, z]).T,
                                                        radius,
                                                        mode='distance',
                                                        metric='euclidean',
                                                        n_jobs=-1)

        self._graph = nx.from_scipy_sparse_matrix(self._adjacency_matrix)
        """
        Get all crater triads using cycle basis with length = 3
        https://en.wikipedia.org/wiki/Cycle_basis
        
        The following returns a nx3 array containing the indices of crater triads
        """
        crater_triads = np.array(get_cliques_by_length(self._graph, 3))
        """
        Project crater triads into virtual image plane using homography
        """
        r_M_ijk = np.moveaxis(
            np.concatenate(
                (x[crater_triads].T[None, ...], y[crater_triads].T[None, ...],
                 z[crater_triads].T[None, ...]),
                axis=0), 0, 2)[..., None]
        r_centroid = np.mean(r_M_ijk, axis=0)
        r_vcam = r_centroid + (
            r_centroid / LA.norm(r_centroid, axis=1)[..., None]) * vcam_alt

        T_CM = np.concatenate(nadir_attitude(r_vcam), axis=-1)
        if (LA.matrix_rank(T_CM) != 3).any():
            raise Warning("Invalid camera attitude matrices present!:\n", T_CM)

        K = camera_matrix()
        P_MC = K @ LA.inv(T_CM) @ np.concatenate(
            (np.tile(np.identity(3), (len(r_vcam), 1, 1)), -r_vcam), axis=2)

        H_C_triads = np.array(
            list(map(crater_camera_homography, r_M_ijk, repeat(P_MC))))
        """
        Ensure all crater triads are clockwise
        """
        C_triads = np.array(
            list(map(lambda vertex: self._C_cat[vertex], crater_triads.T)))

        A_i, A_j, A_k = map(
            lambda T, C: LA.inv(T).transpose((0, 2, 1)) @ C @ LA.inv(T),
            H_C_triads, C_triads)
        r_i, r_j, r_k = map(conic_center, (A_i, A_j, A_k))

        cw_value = LA.det(
            np.moveaxis(
                np.array([[r_i[:, 0], r_i[:, 1],
                           np.ones_like(r_i[:, 0])],
                          [r_j[:, 0], r_j[:, 1],
                           np.ones_like(r_i[:, 0])],
                          [r_k[:, 0], r_k[:, 1],
                           np.ones_like(r_i[:, 0])]]), -1, 0))
        clockwise = cw_value < 0
        line = cw_value == 0

        clockwise = clockwise[~line]
        crater_triads = crater_triads[~line]
        H_C_triads = H_C_triads[:, ~line]

        crater_triads[np.argwhere(~clockwise),
                      [0, 1]] = crater_triads[np.argwhere(~clockwise), [1, 0]]
        H_C_triads[[0, 1], np.argwhere(~clockwise)] = H_C_triads[
            [1, 0], np.argwhere(~clockwise)]

        C_triads = np.array(
            list(map(lambda vertex: self._C_cat[vertex], crater_triads.T)))
        A_i, A_j, A_k = map(
            lambda T, C: LA.inv(T).transpose((0, 2, 1)) @ C @ LA.inv(T),
            H_C_triads, C_triads)

        invariants = CoplanarInvariants(crater_triads,
                                        A_i,
                                        A_j,
                                        A_k,
                                        normalize_det=True)

        self._features = invariants.get_pattern()
        self._crater_triads = invariants.crater_triads

        if sort_ij:
            ij_idx = np.abs(self._features[:, :3]).argmin(1)
            self._features = np.concatenate(
                (shift_nd(self._features[:, :3], -ij_idx),
                 shift_nd(self._features[:, 3:6],
                          -ij_idx), self._features[:, [-1]]),
                axis=-1)
            self._crater_triads = shift_nd(self._crater_triads, -ij_idx)

            too_close = np.logical_or.reduce(
                (np.abs((self._features[:, 0] - self._features[:, 2]) /
                        self._features[:, 0]) < 0.1,
                 np.abs((self._features[:, 0] - self._features[:, 1]) /
                        self._features[:, 0]) < 0.1))

            self._features = np.concatenate(
                (self._features,
                 np.concatenate(
                     (np.roll(self._features[too_close, :3],
                              1), np.roll(self._features[too_close, :3],
                                          1), self._features[too_close, -1:]),
                     axis=-1)),
                axis=0)

            self._crater_triads = np.concatenate(
                (self._crater_triads, np.roll(self._crater_triads[too_close],
                                              1)))

        self._kdtree = KDTree(self._features)
Esempio n. 42
0
    def _plot_atoms_general(
        self,
        ax,
        atol,
        max_bond_length,
        atom_size,
        bond_line_width,
        scaling_matrix,
        midpoint,
        scan_size,
        legend,
        top,
        structure,
        atom_axis_bounds,
        atoms_box,
        legend_atom_size,
    ):
        supercell = make_supercell(
            structure,
            scaling_matrix=np.hstack([scaling_matrix, 1]),
        )
        inds, heights = group_layers(supercell, atol=atol)

        if top:
            surface_inds = inds[-1]
        else:
            surface_inds = inds[0]

        surface_atom_coords = supercell.cart_coords[surface_inds]
        surface_atom_symbols = np.array(supercell.species,
                                        dtype='str')[surface_inds]
        surface_atom_species = np.zeros(surface_atom_symbols.shape, dtype=int)
        surface_atom_sizes = np.zeros(surface_atom_symbols.shape, dtype=float)
        unique_species = np.unique(surface_atom_symbols)
        unique_elements = [Element(i) for i in unique_species]
        unique_zs = [Element(i).Z for i in unique_species]

        for i, z in enumerate(unique_elements):
            surface_atom_species[np.isin(surface_atom_symbols,
                                         unique_species[i])] = z.Z
            surface_atom_sizes[np.isin(surface_atom_symbols,
                                       unique_species[i])] = z.atomic_radius

        surface_atom_sizes /= surface_atom_sizes.max()

        colors = jmol_colors[surface_atom_species]

        shifted_point = midpoint - (scan_size / 2)
        surface_atom_coords[:, 0] -= shifted_point[0]
        surface_atom_coords[:, 1] -= shifted_point[1]

        neighbor_graph = radius_neighbors_graph(
            X=surface_atom_coords,
            radius=max_bond_length,
        ).toarray()

        bonds = []

        for i in range(neighbor_graph.shape[0]):
            # Upper triangle only, so each bond is added once
            for j in range(i + 1, neighbor_graph.shape[0]):
                if neighbor_graph[i, j] > 0:
                    to_append = [
                        surface_atom_coords[i],
                        surface_atom_coords[j],
                        [np.nan, np.nan, np.nan],
                    ]
                    bonds.append(to_append)

        bonds = np.vstack(bonds)

        ax_atoms = ax.inset_axes(bounds=atom_axis_bounds)
        ax_atoms.set_xlim(atom_axis_bounds[0] * scan_size,
                          (atom_axis_bounds[0] + atom_axis_bounds[2]) *
                          scan_size)
        ax_atoms.set_ylim(atom_axis_bounds[1] * scan_size,
                          (atom_axis_bounds[1] + atom_axis_bounds[3]) *
                          scan_size)
        ax_atoms.set_facecolor((0, 0, 0, 0))

        ax_atoms.tick_params(
            bottom=False,
            left=False,
            labelbottom=False,
            labelleft=False,
        )

        if not atoms_box:
            ax_atoms.spines['left'].set_visible(False)
            ax_atoms.spines['right'].set_visible(False)
            ax_atoms.spines['top'].set_visible(False)
            ax_atoms.spines['bottom'].set_visible(False)

        ax_atoms.plot(
            bonds[:, 0],
            bonds[:, 1],
            color='lightgrey',
            linewidth=bond_line_width,
            zorder=5,
            path_effects=[
                pa.Stroke(linewidth=bond_line_width + 2, foreground='black'),
                pa.Normal()
            ],
        )
        ax_atoms.scatter(
            surface_atom_coords[:, 0],
            surface_atom_coords[:, 1],
            c=colors,
            ec='black',
            s=atom_size * surface_atom_sizes,
            zorder=10,
        )

        if legend:
            legend_lines = []
            legend_labels = []
            for name, color, element in zip(unique_species,
                                            jmol_colors[unique_zs],
                                            unique_elements):
                legend_lines.append(
                    plt.scatter(
                        [-1],
                        [-1],
                        color=color,
                        s=legend_atom_size * element.atomic_radius,
                        ec='black',
                    ))
                legend_labels.append(f'{name}')

            leg = ax.get_legend()

            if leg is None:
                handles = legend_lines
                labels = legend_labels
            else:
                handles = [l._legmarker for l in leg.legendHandles]
                labels = [text._text for text in leg.texts]
                handles.extend(legend_lines)
                labels.extend(legend_labels)

            l = ax.legend(
                handles,
                labels,
                ncol=1,
                loc='upper right',
                framealpha=1,
            )
            l.set_zorder(200)
            frame = l.get_frame()
            frame.set_facecolor('white')
            frame.set_edgecolor('black')