Example #1
    def calculate_potential(self, diff_op, t):
        """Calculates the diffusion potential

        Parameters
        ----------

        diff_op : array-like, shape=[n_samples, n_samples] or [n_landmarks, n_landmarks]
            The diffusion operator fit on the input data

        t : int
            power to which the diffusion operator is raised;
            sets the level of diffusion

        Returns
        -------

        diff_potential : array-like, shape=[n_samples, n_samples]
            The diffusion potential fit on the input data
        """
        tasklogger.log_start("diffusion potential")
        # diffused diffusion operator
        diff_op_t = np.linalg.matrix_power(diff_op, t)

        if self.gamma == 1:
            # handling small values
            diff_op_t = diff_op_t + 1e-7
            diff_potential = -1 * np.log(diff_op_t)
        elif self.gamma == -1:
            diff_potential = diff_op_t
        else:
            c = (1 - self.gamma) / 2
            diff_potential = ((diff_op_t)**c) / c
        tasklogger.log_complete("diffusion potential")
        return diff_potential
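A minimal standalone sketch of the same potential transform (assuming a hand-built row-stochastic `diff_op`; `t` and `gamma` are illustrative values):

import numpy as np

# toy row-stochastic diffusion operator on 3 points
diff_op = np.array([[0.8, 0.1, 0.1],
                    [0.1, 0.8, 0.1],
                    [0.1, 0.1, 0.8]])
t, gamma = 3, 1

diff_op_t = np.linalg.matrix_power(diff_op, t)
if gamma == 1:
    # log potential; the small constant guards against log(0)
    diff_potential = -np.log(diff_op_t + 1e-7)
elif gamma == -1:
    diff_potential = diff_op_t
else:
    c = (1 - gamma) / 2
    diff_potential = diff_op_t**c / c
print(diff_potential.shape)  # (3, 3)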
Example #2
    def fit(self, X):
        if len(X.shape) != 3:
            raise ValueError("Expected X to be a tensor with three dimensions."
                             " Got shape {}".format(X.shape))

        if self.normalize:
            X = utils.normalize(X)

        tasklogger.log_start("multislice kernel")
        K = kernel.multislice_kernel(X,
                                     intraslice_knn=self.intraslice_knn,
                                     interslice_knn=self.interslice_knn,
                                     decay=self.decay,
                                     n_pca=self.n_pca,
                                     distance=self.knn_dist,
                                     n_jobs=self.n_jobs)
        tasklogger.log_complete("multislice kernel")
        tasklogger.log_start("graph and diffusion operator")
        n_landmark = self.n_landmark if self.n_landmark < K.shape[0] else None
        self.graph = graphtools.Graph(K,
                                      precomputed="affinity",
                                      n_landmark=n_landmark,
                                      n_svd=self.n_svd,
                                      n_jobs=self.n_jobs,
                                      verbose=self.verbose,
                                      random_state=self.random_state,
                                      **(self.kwargs))
        # accessing diff_op forces lazy computation of the diffusion operator
        self.diff_op
        tasklogger.log_complete("graph and diffusion operator")
        result = super().fit(self.graph)
        return result
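For context, a hedged sketch of calling a fit like this on a (n_slices, n_samples, n_features) tensor (the `m_phate.M_PHATE` entry point is an assumption based on this project's structure):

import numpy as np
import m_phate

trace = np.random.normal(size=(10, 100, 20))  # (n_slices, n_samples, n_features)
m_phate_op = m_phate.M_PHATE()
m_phate_op.fit(trace)  # raises ValueError unless the input has three dimensions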
Example #3
    def fit_transform(self, X, graph=None, **kwargs):
        """Computes the diffusion operator and the position of the cells in the
        embedding space

        Parameters
        ----------
        X : array, shape=[n_samples, n_features]
            input data with `n_samples` samples and `n_features`
            dimensions. Accepted data types: `numpy.ndarray`,
            `scipy.sparse.spmatrix`, `pd.DataFrame`, `anndata.AnnData`.

        graph : `graphtools.Graph`, optional (default: None)
            If given, provides a precomputed kernel matrix with which to
            perform diffusion.

        kwargs : further arguments for `MAGIC.transform()`
            Keyword arguments as specified in :func:`~magic.MAGIC.transform`

        Returns
        -------
        X_magic : array, shape=[n_samples, n_genes]
            The gene expression values after diffusion
        """
        tasklogger.log_start('MAGIC')
        self.fit(X, graph=graph)
        X_magic = self.transform(**kwargs)
        tasklogger.log_complete('MAGIC')
        return X_magic
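A typical call, for reference (assuming the `magic` package; `genes` is forwarded to `transform()` via `**kwargs`):

import numpy as np
import magic

X = np.random.poisson(1.0, (200, 50)).astype(float)  # toy counts matrix
magic_op = magic.MAGIC()
X_magic = magic_op.fit_transform(X, genes="all_genes")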
Example #4
    def fit_transform(self, X, **kwargs):
        """Computes the diffusion operator and the position of the cells in the
        embedding space

        Parameters
        ----------
        X : array, shape=[n_samples, n_features]
            input data with `n_samples` samples and `n_features`
            dimensions. Accepted data types: `numpy.ndarray`,
            `scipy.sparse.spmatrix`, `pd.DataFrame`, `anndata.AnnData`. If
            `knn_dist` is 'precomputed', `data` should be an n_samples x
            n_samples distance or affinity matrix

        kwargs : further arguments for `PHATE.transform()`
            Keyword arguments as specified in :func:`~phate.PHATE.transform`

        Returns
        -------
        embedding : array, shape=[n_samples, n_dimensions]
            The cells embedded in a lower dimensional space using PHATE
        """
        tasklogger.log_start('PHATE')
        self.fit(X)
        embedding = self.transform(**kwargs)
        tasklogger.log_complete('PHATE')
        return embedding
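An end-to-end usage sketch (assuming the `phate` package and toy data):

import numpy as np
import phate

X = np.random.normal(size=(200, 50))        # 200 cells x 50 features
phate_op = phate.PHATE(n_components=2, random_state=42)
embedding = phate_op.fit_transform(X)       # shape (200, 2)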
Example #5
    def _reduce_data(self):
        """Private method to reduce data dimension.

        If data is dense, uses randomized PCA. If data is sparse, uses
        randomized SVD.
        TODO: should we subtract and store the mean?

        Returns
        -------
        Reduced data matrix
        """
        if self.n_pca is not None and self.n_pca < self.data.shape[1]:
            tasklogger.log_start("PCA")
            if sparse.issparse(self.data):
                if isinstance(self.data, sparse.coo_matrix) or \
                        isinstance(self.data, sparse.lil_matrix) or \
                        isinstance(self.data, sparse.dok_matrix):
                    self.data = self.data.tocsr()
                self.data_pca = TruncatedSVD(self.n_pca,
                                             random_state=self.random_state)
            else:
                self.data_pca = PCA(self.n_pca,
                                    svd_solver='randomized',
                                    random_state=self.random_state)
            self.data_pca.fit(self.data)
            data_nu = self.data_pca.transform(self.data)
            tasklogger.log_complete("PCA")
            return data_nu
        else:
            data_nu = self.data
            if sparse.issparse(data_nu) and not isinstance(
                    data_nu,
                (sparse.csr_matrix, sparse.csc_matrix, sparse.bsr_matrix)):
                data_nu = data_nu.tocsr()
            return data_nu
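The same dense-versus-sparse branching as a standalone function, for clarity (a sketch assuming scikit-learn and scipy, without the class state):

from scipy import sparse
from sklearn.decomposition import PCA, TruncatedSVD

def reduce_data(data, n_pca=100, random_state=None):
    # reduce to n_pca dimensions: TruncatedSVD if sparse, randomized PCA if dense
    if n_pca is None or n_pca >= data.shape[1]:
        return data
    if sparse.issparse(data):
        # TruncatedSVD operates on sparse input directly (no mean centering)
        op = TruncatedSVD(n_pca, random_state=random_state)
    else:
        op = PCA(n_pca, svd_solver='randomized', random_state=random_state)
    return op.fit_transform(data)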
Example #6
    def fit(self, X):
        """Computes the diffusion operator

        Parameters
        ----------
        X : array, shape=[n_samples, n_features]
            input data with `n_samples` samples and `n_features`
            dimensions. Accepted data types: `numpy.ndarray`,
            `scipy.sparse.spmatrix`, `pd.DataFrame`, `anndata.AnnData`. If
            `knn_dist` is 'precomputed', `data` should be an n_samples x
            n_samples distance or affinity matrix

        Returns
        -------
        phate_operator : PHATE
            The estimator object
        """
        X, n_pca, precomputed, update_graph = self._parse_input(X)

        if precomputed is None:
            tasklogger.log_info(
                "Running PHATE on {} cells and {} genes.".format(
                    X.shape[0], X.shape[1]))
        else:
            tasklogger.log_info(
                "Running PHATE on precomputed {} matrix with {} cells.".format(
                    precomputed, X.shape[0]))

        if self.n_landmark is None or X.shape[0] <= self.n_landmark:
            n_landmark = None
        else:
            n_landmark = self.n_landmark

        if self.graph is not None and update_graph:
            self._update_graph(X, precomputed, n_pca, n_landmark)

        self.X = X

        if self.graph is None:
            tasklogger.log_start("graph and diffusion operator")
            self.graph = graphtools.Graph(
                X,
                n_pca=n_pca,
                n_landmark=n_landmark,
                distance=self.knn_dist,
                precomputed=precomputed,
                knn=self.knn,
                decay=self.decay,
                thresh=1e-4,
                n_jobs=self.n_jobs,
                verbose=self.verbose,
                random_state=self.random_state,
                **(self.kwargs))
            tasklogger.log_complete("graph and diffusion operator")

        # landmark op doesn't build unless forced
        self.diff_op
        return self
Example #7
def PCA(X, *args, is_graph=False, seed=None, n_components=2, **kwargs):
    X = scprep.utils.toarray(X)
    tasklogger.log_start("PCA")
    Y = sklearn.decomposition.PCA(*args,
                                  n_components=n_components,
                                  random_state=seed,
                                  **kwargs).fit_transform(X)
    tasklogger.log_complete("PCA")
    return Y
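Usage of this wrapper is then a one-liner (assuming numpy is imported as np; `seed` is forwarded as `random_state`):

Y = PCA(np.random.normal(size=(100, 20)), n_components=2, seed=42)  # shape (100, 2)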
Example #8
def ISOMAP(X, *args, is_graph=False, seed=None, **kwargs):
    np.random.seed(seed)
    if is_graph:
        X = utils.geodesic_distance(X)
    tasklogger.log_start("ISOMAP")
    Y = Isomap_(*args, precomputed=is_graph, random_state=seed,
                **kwargs).fit_transform(X)
    tasklogger.log_complete("ISOMAP")
    return Y
Example #9
    def _calculate_potential(self,
                             t=None,
                             t_max=100,
                             plot_optimal_t=False,
                             ax=None):
        """Calculates the diffusion potential

        Parameters
        ----------

        t : int
            power to which the diffusion operator is raised;
            sets the level of diffusion

        t_max : int, default: 100
            Maximum value of `t` to test

        plot_optimal_t : boolean, default: False
            If true, plots the Von Neumann Entropy and knee point

        ax : matplotlib.Axes, default: None
            If plot=True and ax is not None, plots the VNE on the given axis
            Otherwise, creates a new axis and displays the plot

        Returns
        -------

        diff_potential : array-like, shape=[n_samples, n_samples]
            The diffusion potential fit on the input data
        """
        if t is None:
            t = self.t
        if self._diff_potential is None:
            if t == 'auto':
                t = self._find_optimal_t(t_max=t_max,
                                         plot=plot_optimal_t,
                                         ax=ax)
            else:
                t = self.t
            tasklogger.log_start("diffusion potential")
            # diffused diffusion operator
            diff_op_t = np.linalg.matrix_power(self.diff_op, t)
            if self.gamma == 1:
                # handling small values
                diff_op_t = diff_op_t + 1e-7
                self._diff_potential = -1 * np.log(diff_op_t)
            elif self.gamma == -1:
                self._diff_potential = diff_op_t
            else:
                c = (1 - self.gamma) / 2
                self._diff_potential = ((diff_op_t)**c) / c
            tasklogger.log_complete("diffusion potential")
        elif plot_optimal_t:
            self._find_optimal_t(t_max=t_max, plot=plot_optimal_t, ax=ax)

        return self._diff_potential
Example #10
    def build_kernel(self):
        """Build the MNN kernel.

        Build a mutual nearest neighbors kernel.

        Returns
        -------
        K : kernel matrix, shape=[n_samples, n_samples]
            symmetric matrix with ones down the diagonal
            and no negative entries.
        """
        tasklogger.log_start("subgraphs")
        self.subgraphs = []
        from .api import Graph
        # iterate through sample ids
        for i, idx in enumerate(self.samples):
            tasklogger.log_debug("subgraph {}: sample {}, "
                                 "n = {}, knn = {}".format(
                                     i, idx, np.sum(self.sample_idx == idx),
                                     self.weighted_knn[i]))
            # select data for sample
            data = self.data_nu[self.sample_idx == idx]
            # build a kNN graph for cells within sample
            graph = Graph(data,
                          n_pca=None,
                          knn=self.weighted_knn[i],
                          decay=self.decay,
                          distance=self.distance,
                          thresh=self.thresh,
                          verbose=self.verbose,
                          random_state=self.random_state,
                          n_jobs=self.n_jobs,
                          initialize=False)
            self.subgraphs.append(graph)  # append to list of subgraphs
        tasklogger.log_complete("subgraphs")

        if self.thresh > 0 or self.decay is None:
            K = sparse.lil_matrix(
                (self.data_nu.shape[0], self.data_nu.shape[0]))
        else:
            K = np.zeros([self.data_nu.shape[0], self.data_nu.shape[0]])
        for i, X in enumerate(self.subgraphs):
            for j, Y in enumerate(self.subgraphs):
                tasklogger.log_start("kernel from sample {} to {}".format(
                    self.samples[i], self.samples[j]))
                Kij = Y.build_kernel_to_data(X.data_nu,
                                             knn=self.weighted_knn[i])
                if i == j:
                    # downweight within-batch affinities by beta
                    Kij = Kij * self.beta
                K = set_submatrix(K, self.sample_idx == self.samples[i],
                                  self.sample_idx == self.samples[j], Kij)
                tasklogger.log_complete("kernel from sample {} to {}".format(
                    self.samples[i], self.samples[j]))
        return K
Example #11
def TSNE(X, *args, is_graph=False, metric='euclidean', seed=None, **kwargs):
    if is_graph:
        X = utils.geodesic_distance(X)
        metric = 'precomputed'
    tasklogger.log_start("TSNE")
    Y = sklearn.manifold.TSNE(*args,
                              metric=metric,
                              random_state=seed,
                              **kwargs).fit_transform(X)
    tasklogger.log_complete("TSNE")
    return Y
Example #12
def Spring(X, *args, is_graph=False, seed=None, **kwargs):
    np.random.seed(seed)
    if not is_graph:
        G = graphtools.Graph(X, knn=3, decay=None, use_pygsp=True)
    else:
        G = pygsp.graphs.Graph(X)
    # note: networkx >= 3.0 removed from_numpy_matrix in favor of from_numpy_array
    G = networkx.from_numpy_matrix(G.W.toarray())
    tasklogger.log_start("Spring")
    X = networkx.spring_layout(G, *args, **kwargs)
    tasklogger.log_complete("Spring")
    X = np.vstack(list(X.values()))
    return X
Example #13
def measure_method(data_noised, method, labels, data_name, subsample_idx=None):
    if subsample_idx is not None:
        data_noised = data_noised[subsample_idx]
    tasklogger.log_start(method.__name__, logger="demap")
    embedding = method(data_noised)
    tasklogger.log_complete(method.__name__, logger="demap")
    ari_score = demap.ari.ARI(labels, embedding, subsample_idx=subsample_idx)
    df = pd.DataFrame(
        {
            "dataset": data_name,
            "method": method.__name__,
            "ARI": ari_score
        },
        index=[""])
    return df
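A hedged usage sketch (the embedding method below is a toy stand-in; `demap.ari.ARI` is assumed from the code above):

import numpy as np

def pca_embed(X):
    # toy embedding method; its __name__ labels the results row
    X = X - X.mean(axis=0)
    _, _, Vt = np.linalg.svd(X, full_matrices=False)
    return X @ Vt[:2].T

data_noised = np.random.normal(size=(300, 40))
labels = np.random.randint(0, 3, 300)  # hypothetical ground-truth labels
df = measure_method(data_noised, pca_embed, labels, "toy_dataset")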
Example #14
    def build_kernel_to_data(self, Y, knn=None):
        """Build transition matrix from new data to the graph

        Creates a transition matrix such that `Y` can be approximated by
        a linear combination of landmarks. Any
        transformation of the landmarks can be trivially applied to `Y` by
        performing

        `transform_Y = transitions.dot(transform)`

        Parameters
        ----------

        Y: array-like, [n_samples_y, n_features]
            new data for which an affinity matrix is calculated
            to the existing data. `n_features` must match
            either the ambient or PCA dimensions

        knn : int, optional (default: None)
            number of nearest neighbors used to set the adaptive
            bandwidth. If None, defaults to `self.knn`

        Returns
        -------

        transitions : array-like, [n_samples_y, self.data.shape[0]]
            Transition matrix from `Y` to `self.data`

        Raises
        ------

        ValueError: if `precomputed` is not `None`, then the graph cannot
        be extended.
        """
        if knn is None:
            knn = self.knn
        if self.precomputed is not None:
            raise ValueError("Cannot extend kernel on precomputed graph")
        else:
            tasklogger.log_start("affinities")
            Y = self._check_extension_shape(Y)
            pdx = cdist(Y, self.data_nu, metric=self.distance)
            knn_dist = np.partition(pdx, knn, axis=1)[:, :knn]
            epsilon = np.max(knn_dist, axis=1)
            pdx = (pdx.T / epsilon).T
            K = np.exp(-1 * pdx**self.decay)
            # handle nan
            K = np.where(np.isnan(K), 1, K)
            K[K < self.thresh] = 0
            tasklogger.log_complete("affinities")
        return K
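In isolation, the kernel rescales each new point's distances by its knn-th neighbor distance (an adaptive bandwidth) and applies an alpha-decaying exponential; a minimal numpy sketch with illustrative `knn`, `decay`, and `thresh` values:

import numpy as np
from scipy.spatial.distance import cdist

Y = np.random.normal(size=(10, 3))       # new points
data = np.random.normal(size=(50, 3))    # existing graph data
knn, decay, thresh = 5, 40, 1e-4

pdx = cdist(Y, data, metric='euclidean')
knn_dist = np.partition(pdx, knn, axis=1)[:, :knn]
epsilon = np.max(knn_dist, axis=1)       # adaptive bandwidth per new point
pdx = (pdx.T / epsilon).T                # rescale each row by its own bandwidth
K = np.exp(-pdx ** decay)                # alpha-decaying kernel
K = np.where(np.isnan(K), 1, K)          # 0/0 distances count as full affinity
K[K < thresh] = 0                        # drop negligible affinities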
Example #15
    def _find_optimal_t(self, t_max=100, plot=False, ax=None):
        """Find the optimal value of t

        Selects the optimal value of t based on the knee point of the
        Von Neumann Entropy of the diffusion operator.

        Parameters
        ----------
        t_max : int, default: 100
            Maximum value of t to test

        plot : boolean, default: False
            If true, plots the Von Neumann Entropy and knee point

        ax : matplotlib.Axes, default: None
            If plot=True and ax is not None, plots the VNE on the given axis
            Otherwise, creates a new axis and displays the plot

        Returns
        -------
        t_opt : int
            The optimal value of t
        """
        tasklogger.log_start("optimal t")
        t, h = self._von_neumann_entropy(t_max=t_max)
        t_opt = vne.find_knee_point(y=h, x=t)
        tasklogger.log_info("Automatically selected t = {}".format(t_opt))
        tasklogger.log_complete("optimal t")

        if plot:
            if ax is None:
                fig, ax = plt.subplots()
                show = True
            else:
                show = False
            ax.plot(t, h)
            ax.scatter(t_opt, h[t == t_opt], marker='*', c='k', s=50)
            ax.set_xlabel("t")
            ax.set_ylabel("Von Neumann Entropy")
            ax.set_title("Optimal t = {}".format(t_opt))
            if show:
                plt.show()

        self.optimal_t = t_opt

        return t_opt
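`vne.find_knee_point` is internal to PHATE; a common stand-in (an assumption, not the library's exact algorithm) picks the point farthest from the chord joining the curve's endpoints:

import numpy as np

def find_knee_point(y, x):
    # knee = point of maximum perpendicular distance from the endpoint chord
    x, y = np.asarray(x, dtype=float), np.asarray(y, dtype=float)
    dx, dy = x[-1] - x[0], y[-1] - y[0]
    d = np.abs(dy * (x - x[0]) - dx * (y - y[0])) / np.hypot(dx, dy)
    return x[np.argmax(d)]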
Example #16
def graphDiffusionCoordinates(G, n_eigenvectors=None):
    # diffusion maps with normalized Laplacian
    tasklogger.log_start("eigendecomposition")
    if n_eigenvectors is None:
        G.compute_fourier_basis()
    else:
        # temporary workaround until pygsp updates to pypi
        from scipy import sparse
        G._e, G._U = sparse.linalg.eigsh(G.L, n_eigenvectors, which='SM')
    tasklogger.log_complete("eigendecomposition")
    phi, lmbda = G.U, G.e
    # smallest to largest
    lmbda_idx = np.argsort(lmbda)
    phi, lmbda = phi[:, lmbda_idx], lmbda[lmbda_idx]
    # trim the trivial (constant) first eigenvector
    phi, lmbda = phi[:, 1:], lmbda[1:]
    return phi, lmbda
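Usage with a small pygsp graph (a sketch; the `Sensor` and `compute_laplacian` arguments assume the pygsp API):

import pygsp

G = pygsp.graphs.Sensor(64, seed=42)
G.compute_laplacian('normalized')  # the function above assumes a normalized Laplacian
phi, lmbda = graphDiffusionCoordinates(G, n_eigenvectors=10)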
Example #17
def MDS(X,
        *args,
        is_graph=False,
        dissimilarity='euclidean',
        seed=None,
        n_jobs=15,
        **kwargs):
    if is_graph:
        X = utils.geodesic_distance(X)
        dissimilarity = 'precomputed'
    tasklogger.log_start("MDS")
    Y = sklearn.manifold.MDS(*args,
                             dissimilarity=dissimilarity,
                             random_state=seed,
                             n_jobs=n_jobs,
                             **kwargs).fit_transform(X)
    tasklogger.log_complete("MDS")
    return Y
Example #18
File: magic.py Project: akv84/MAGIC
    def fit_transform(self, X, graph=None, **kwargs):
        """Computes the diffusion operator and the position of the cells in the
        embedding space

        Parameters
        ----------
        X : array, shape=[n_samples, n_features]
            input data with `n_samples` samples and `n_features`
            dimensions. Accepted data types: `numpy.ndarray`,
            `scipy.sparse.spmatrix`, `pd.DataFrame`, `anndata.AnnData`.

        graph : `graphtools.Graph`, optional (default: None)
            If given, provides a precomputed kernel matrix with which to
            perform diffusion.

        genes : list or {"all_genes", "pca_only"}, optional (default: None)
            List of genes, either as integer indices or column names
            if input data is a pandas DataFrame. If "all_genes", the entire
            smoothed matrix is returned. If "pca_only", PCA on the smoothed
            data is returned. If None, the entire matrix is also
            returned, but a warning may be raised if the resultant matrix
            is very large.

        t_max : int, optional, default: 20
            maximum t to test if `t` is set to 'auto'

        plot_optimal_t : boolean, optional, default: False
            If true and `t` is set to 'auto', plot the disparity used to
            select t

        ax : matplotlib.axes.Axes, optional
            If given and `plot_optimal_t` is true, plot will be drawn
            on the given axis.

        Returns
        -------
        X_magic : array, shape=[n_samples, n_genes]
            The gene expression values after diffusion
        """
        tasklogger.log_start('MAGIC')
        self.fit(X, graph=graph)
        X_magic = self.transform(**kwargs)
        tasklogger.log_complete('MAGIC')
        return X_magic
Example #19
def measure_method(data, data_noised, method, data_name, subsample_idx=None):
    if subsample_idx is not None:
        data_noised = data_noised[subsample_idx]
    tasklogger.log_start(method.__name__, logger="demap")
    embedding = method(data_noised)
    tasklogger.log_complete(method.__name__, logger="demap")
    demap_score = demap.DEMaP(data,
                              embedding,
                              knn=5,
                              subsample_idx=subsample_idx)
    df = pd.DataFrame(
        {
            "dataset": data_name,
            "method": method.__name__,
            "demap": demap_score
        },
        index=[""],
    )
    return df
Example #20
    def call(self, bam_dir, out_dir):
        """call CNV for each chromosome

        Parameters
        ----------
        bam_dir : directory path which contains all BAM files

        out_dir : the output directory

        Returns
        -------
        self
        """
        Y_path = os.path.join(out_dir, 'temp.Y.csv')
        nor_Y_path = os.path.join(out_dir, 'temp.norY.csv')
        ref_path = os.path.join(out_dir, 'temp.ref.csv')
        gini_path = os.path.join(out_dir, 'temp.gini.csv')
        ploidy_path = os.path.join(out_dir, 'temp.ploidy.csv')
        scope_path = os.path.join(out_dir, 'run-scope.R')
        utils.write_scope(scope_path)
        command = 'Rscript {0} {1} {2} {3} {4} {5} {6} {7} {8} {9} {10} {11}'.format(
            scope_path, bam_dir, Y_path, ref_path, gini_path, ploidy_path,
            nor_Y_path, self.seq, self.reg, self.ref, self.mapq, self.bin_len)
        code = os.system(command)
        if code != 0:
            sys.exit(1)
        tasklogger.log_start('SCYN')
        Y = pd.read_csv(Y_path, index_col=0)
        nor_Y = pd.read_csv(nor_Y_path, index_col=0)
        ref = pd.read_csv(ref_path, index_col=0)
        gini = pd.read_csv(gini_path, index_col=0)
        ploidy = pd.read_csv(ploidy_path, index_col=0)
        self.meta_info = pd.DataFrame(index=['c_gini', 'c_ploidy'], columns=Y.columns)
        self.meta_info.loc['c_gini'] = gini.T.iloc[0].values
        self.meta_info.loc['c_ploidy'] = ploidy.T.iloc[0].values
        self.meta_info = self.meta_info.T
        self._cal_cnv(ref, Y, nor_Y)
        self.bin_info = ref

        # clean up temp files
        utils.clean_up([Y_path, nor_Y_path, ref_path,
                        gini_path, ploidy_path, scope_path])

        tasklogger.log_complete('SCYN')

        return self
Example #21
    def fit(self, X, Y, q=None):
        if hasattr(self, "phi_X"):
            tasklogger.log_info("Using precomputed diffusion coordinates.")
        else:
            tasklogger.log_start("diffusion coordinates")
            if q is None:
                with parallel.ParallelQueue(n_jobs=min(2, self.n_jobs)) as q:
                    return self.fit(X, Y, q)
            else:
                q.queue(
                    math.diffusionCoordinates,
                    X,
                    decay=self.decay_X,
                    knn=self.knn_X,
                    n_pca=self.n_pca_X if self.n_pca_X is not None
                    and self.n_pca_X < min(X.shape) else None,
                    n_eigenvectors=self.n_eigenvectors,
                    n_jobs=max(self.n_jobs // 2, 1),
                    verbose=self.verbose,
                    random_state=self.random_state,
                )
                q.queue(
                    math.diffusionCoordinates,
                    Y,
                    decay=self.decay_Y,
                    knn=self.knn_Y,
                    n_pca=self.n_pca_Y if self.n_pca_Y is not None
                    and self.n_pca_Y < min(Y.shape) else None,
                    n_eigenvectors=self.n_eigenvectors,
                    n_jobs=max(self.n_jobs // 2, 1),
                    verbose=self.verbose,
                    random_state=self.random_state,
                )
            (phi_X, lambda_X), (phi_Y, lambda_Y) = q.run()
            self.phi_X = phi_X
            self.lambda_X = lambda_X
            self.phi_Y = phi_Y
            self.lambda_Y = lambda_Y
            tasklogger.log_complete("diffusion coordinates")
        return self
Example #22
def PHATE(X,
          *args,
          is_graph=False,
          knn_dist='euclidean',
          solver='smacof',
          verbose=0,
          seed=None,
          n_jobs=15,
          **kwargs):
    if knn_dist is None:
        if is_graph:
            knn_dist = 'precomputed'
    tasklogger.log_start("PHATE")
    Y = phate.PHATE(*args,
                    knn_dist=knn_dist,
                    verbose=verbose,
                    random_state=seed,
                    n_jobs=n_jobs,
                    mds_solver=solver,
                    **kwargs).fit_transform(X)
    tasklogger.log_complete("PHATE")
    return Y
Example #23
    def impute(self, data):
        """Main function of I-Impute

        Parameters
        ----------
        data : matrix, shape (m x n)
            The raw reads count matrix

        Returns
        -------
        imputed_data: matrix, shape (m x n)
            The imputed matrix, pandas Dataframe object
        """
        tasklogger.log_start('I-Impute')
        imputed_data = None
        if self.iteration:
            exp_mse = 1
            mse = 100
            previous_imputed_data = data
            iteration = 1
            while mse > exp_mse:
                tasklogger.log_info(
                    'iteratively impute for the {0}th time'.format(iteration))
                current_imputed_data = self._cimpute(previous_imputed_data)
                dist_matrix = (current_imputed_data - previous_imputed_data)**2
                n_values = data.shape[0] * data.shape[1]
                # root-mean-square change between successive iterations
                mse = np.sqrt(dist_matrix.values.sum() / n_values)
                previous_imputed_data = current_imputed_data
                iteration += 1

            imputed_data = previous_imputed_data
        else:
            imputed_data = self._cimpute(data)

        tasklogger.log_complete('I-Impute')
        return imputed_data
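A usage sketch (the `IImpute` constructor and its `iteration` flag are assumptions based on the method above):

import numpy as np
import pandas as pd

data = pd.DataFrame(np.random.poisson(1.0, (100, 50)).astype(float))  # toy counts
imputer = IImpute(iteration=True)  # hypothetical constructor for the class above
imputed = imputer.impute(data)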
Example #24
    def build_landmark_op(self):
        """Build the landmark operator

        Calculates spectral clusters on the kernel, and calculates transition
        probabilities between cluster centers by using transition probabilities
        between samples assigned to each cluster.
        """
        tasklogger.log_start("landmark operator")
        is_sparse = sparse.issparse(self.kernel)
        # spectral clustering
        tasklogger.log_start("SVD")
        _, _, VT = randomized_svd(self.diff_aff,
                                  n_components=self.n_svd,
                                  random_state=self.random_state)
        tasklogger.log_complete("SVD")
        tasklogger.log_start("KMeans")
        kmeans = MiniBatchKMeans(self.n_landmark,
                                 init_size=3 * self.n_landmark,
                                 batch_size=10000,
                                 random_state=self.random_state)
        self._clusters = kmeans.fit_predict(self.diff_op.dot(VT.T))
        # some clusters are not assigned
        tasklogger.log_complete("KMeans")

        # transition matrices
        pmn = self._landmarks_to_data()

        # row normalize
        pnm = pmn.transpose()
        pmn = normalize(pmn, norm='l1', axis=1)
        pnm = normalize(pnm, norm='l1', axis=1)
        landmark_op = pmn.dot(pnm)  # sparsity agnostic matrix multiplication
        if is_sparse:
            # no need to have a sparse landmark operator
            landmark_op = landmark_op.toarray()
        # store output
        self._landmark_op = landmark_op
        self._transitions = pnm
        tasklogger.log_complete("landmark operator")
Example #25
    def align(self,
              X,
              Y,
              phi_X=None,
              phi_Y=None,
              lambda_X=None,
              lambda_Y=None):
        """Harmonic alignment

        Parameters
        ----------
        X : array-like, shape=[n_samples, n_features]
            Input dataset
        Y : array-like, shape=[m_samples, n_features]
            Input dataset
        phi_{X,Y} : array-like, shape=[{n,m}_samples, {n,m}_samples], optional (default: None)
            Precomputed Laplacian eigenvectors
        lambda_{X,Y} : list-like, shape=[{n,m}_samples], optional (default: None)
            Precomputed Laplacian eigenvalues

        Returns
        -------
        graph : graphtools.Graph
            The joint graph built on the aligned data, from which an
            aligned embedding can be computed
        """
        tasklogger.log_start("Harmonic Alignment")
        np.random.seed(self.random_state)
        # normalized L with diffusion coordinates
        with parallel.ParallelQueue(n_jobs=min(2, self.n_jobs)) as q:
            if (phi_X is not None or phi_Y is not None or lambda_X is not None
                    or lambda_Y is not None):
                if None in (phi_X, phi_Y, lambda_X, lambda_Y):
                    raise RuntimeError(
                        "If a precomputed eigensystem is provided, all of"
                        " `phi_X, phi_Y, lambda_X, lambda_Y` must be provided."
                        " Got phi_X={}, phi_Y={}, lambda_X={}, lambda_Y={}".
                        format(phi_X, phi_Y, lambda_X, lambda_Y))
                else:
                    self.phi_X, self.phi_Y = phi_X, phi_Y
                    self.lambda_X, self.lambda_Y = lambda_X, lambda_Y
            self.fit(X, Y, q)
            # evaluate wavelets over data in the spectral domain
            tasklogger.log_start("wavelets")
            transform = build_wavelet_transform(
                X,
                self.phi_X,
                self.lambda_X,
                Y,
                self.phi_Y,
                self.lambda_Y,
                self.n_filters,
                self.overlap,
                q=q,
            )
            tasklogger.log_complete("wavelets")
        #  compute transformed data
        tasklogger.log_start("transformed data")
        self.phi_combined, self.lambda_combined = combine_eigenvectors(
            transform, self.phi_X, self.phi_Y, self.lambda_X, self.lambda_Y)
        E = self.phi_combined @ np.diag(self.lambda_combined**self.t)
        # build the joint diffusion map
        tasklogger.log_start("graph Laplacian")
        self.graph = graphtools.Graph(
            E,
            knn=self.knn_XY,
            decay=self.decay_XY,
            n_pca=self.n_pca_XY if self.n_pca_XY is not None
            and self.n_pca_XY < min(E.shape) else None,
            use_pygsp=True,
            thresh=1e-4,
            anisotropy=1,
            lap_type="normalized",
            n_jobs=self.n_jobs,
            verbose=self.verbose,
            random_state=self.random_state,
        )
        tasklogger.log_complete("graph Laplacian")
        tasklogger.log_complete("transformed data")
        tasklogger.log_complete("Harmonic Alignment")
        return self.graph
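A usage sketch (the `HarmonicAlignment` constructor and its arguments are assumptions inferred from the attributes used above):

import numpy as np

X = np.random.normal(size=(100, 20))
Y = np.random.normal(size=(120, 20))
op = HarmonicAlignment(n_filters=4)  # hypothetical constructor
graph = op.align(X, Y)               # returns the joint graph built above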
Example #26
def test_tasks():
    logger = tasklogger.log_start("test")
    assert time.time() - logger.tasks['test'] < 0.01
    time.sleep(logger.min_runtime)
    tasklogger.log_complete("test")
    assert 'test' not in logger.tasks
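Every example here follows the same start/complete pattern; a minimal standalone sketch of the tasklogger calls they rely on:

import time
import tasklogger

tasklogger.log_start("outer task")
tasklogger.log_start("inner task")
time.sleep(0.1)
tasklogger.log_complete("inner task")  # logs elapsed time for the inner task
tasklogger.log_complete("outer task")  # nested tasks are timed independently
tasklogger.log_info("a plain informational message")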
Example #27
    def fit(self, X):
        """Computes the diffusion operator

        Parameters
        ----------
        X : array, shape=[n_samples, n_features]
            input data with `n_samples` samples and `n_features`
            dimensions. Accepted data types: `numpy.ndarray`,
            `scipy.sparse.spmatrix`, `pd.DataFrame`, `anndata.AnnData`.

        Returns
        -------
        magic_operator : MAGIC
            The estimator object
        """
        if self.knn_dist == 'precomputed':
            if isinstance(X, sparse.coo_matrix):
                X = X.tocsr()
            # a distance matrix has zeros on the diagonal; an affinity matrix does not
            if X[0, 0] == 0:
                precomputed = "distance"
            else:
                precomputed = "affinity"
            tasklogger.log_info(
                "Using precomputed {} matrix...".format(precomputed))
            n_pca = None
        else:
            precomputed = None
            if self.n_pca is None or X.shape[1] <= self.n_pca:
                n_pca = None
            else:
                n_pca = self.n_pca

        if self.graph is not None:
            if self.X is not None and not \
                    utils.matrix_is_equivalent(X, self.X):
                """
                If the same data is used, we can reuse existing kernel and
                diffusion matrices. Otherwise we have to recompute.
                """
                self.graph = None
            else:
                try:
                    self.graph.set_params(decay=self.a,
                                          knn=self.k + 1,
                                          distance=self.knn_dist,
                                          precomputed=precomputed,
                                          n_jobs=self.n_jobs,
                                          verbose=self.verbose,
                                          n_pca=n_pca,
                                          thresh=1e-4,
                                          random_state=self.random_state)
                    tasklogger.log_info(
                        "Using precomputed graph and diffusion operator...")
                except ValueError as e:
                    # something changed that should have invalidated the graph
                    tasklogger.log_debug("Reset graph due to {}".format(
                        str(e)))
                    self.graph = None

        self.X = X

        if utils.has_empty_columns(X):
            warnings.warn("Input matrix contains unexpressed genes. "
                          "Please remove them prior to running MAGIC.")

        if self.graph is None:
            # reset X_magic in case it was previously set
            self.X_magic = None
            tasklogger.log_start("graph and diffusion operator")
            self.graph = graphtools.Graph(X,
                                          n_pca=n_pca,
                                          knn=self.k + 1,
                                          decay=self.a,
                                          thresh=1e-4,
                                          n_jobs=self.n_jobs,
                                          verbose=self.verbose,
                                          random_state=self.random_state)
            tasklogger.log_complete("graph and diffusion operator")

        return self
Example #28
    def impute(self,
               data,
               t_max=20,
               plot=False,
               ax=None,
               max_genes_compute_t=500,
               threshold=0.001):
        """Peform MAGIC imputation

        Parameters
        ----------
        data : graphtools.Graph, graphtools.Data or array-like
            Input data
        t_max : int, optional (default: 20)
            Maximum value of t to consider for optimal t selection
        plot : bool, optional (default: False)
            Plot the optimal t selection graph
        ax : matplotlib.Axes, optional (default: None)
            Axis on which to plot. If None, a new axis is created
        max_genes_compute_t : int, optional (default: 500)
            Above this number, genes will be subsampled for
            optimal t selection
        threshold : float, optional (default: 0.001)
            Threshold after which Procrustes disparity is considered
            to have converged for optimal t selection

        Returns
        -------
        X_magic : array-like, shape=[n_samples, n_pca]
            Imputed data
        """
        if not isinstance(data, graphtools.base.Data):
            data = graphtools.base.Data(data, n_pca=self.n_pca)
        data_imputed = data.data_nu

        if data_imputed.shape[1] > max_genes_compute_t:
            subsample_genes = np.random.choice(data_imputed.shape[1],
                                               max_genes_compute_t,
                                               replace=False)
        else:
            subsample_genes = None
        if hasattr(data, "data_pca"):
            weights = None  # data.data_pca.explained_variance_ratio_
        else:
            weights = None
        if self.t == 'auto':
            _, data_prev = self.calculate_error(
                data_imputed,
                data_prev=None,
                weights=weights,
                subsample_genes=subsample_genes)
            error_vec = []
            t_opt = None
        else:
            t_opt = self.t

        tasklogger.log_start("imputation")

        # classic magic
        # the diffusion matrix is powered when t has been specified by
        # the user and the diffusion matrix is smaller than the data
        # matrix: (M^t) * D
        if (t_opt is not None) and \
                (self.diff_op.shape[1] < data_imputed.shape[1]):
            diff_op_t = np.linalg.matrix_power(self.diff_op, t_opt)
            data_imputed = diff_op_t.dot(data_imputed)

        # fast magic
        # a while loop is used when the dimensions of the diffusion matrix
        # are greater than those of the data matrix, or when t is not specified
        # (so as to allow for the calculation of the optimal t value)
        else:
            i = 0
            while (t_opt is None and i < t_max) or \
                    (t_opt is not None and i < t_opt):
                i += 1
                data_imputed = self.diff_op.dot(data_imputed)
                if self.t == 'auto':
                    error, data_prev = self.calculate_error(
                        data_imputed,
                        data_prev,
                        weights=weights,
                        subsample_genes=subsample_genes)
                    error_vec.append(error)
                    tasklogger.log_debug("{}: {}".format(i, error_vec))
                    if error < threshold and t_opt is None:
                        t_opt = i + 1
                        tasklogger.log_info(
                            "Automatically selected t = {}".format(t_opt))

        tasklogger.log_complete("imputation")

        if plot:
            # continue to t_max
            tasklogger.log_start("optimal t plot")
            if t_opt is None:
                # never converged
                warnings.warn("optimal t > t_max ({})".format(t_max),
                              RuntimeWarning)
            else:
                data_overimputed = data_imputed
                while i < t_max:
                    i += 1
                    data_overimputed = self.diff_op.dot(data_overimputed)
                    error, data_prev = self.calculate_error(
                        data_overimputed,
                        data_prev,
                        weights=weights,
                        subsample_genes=subsample_genes)
                    error_vec.append(error)

            # create axis
            if ax is None:
                fig, ax = plt.subplots()
                show = True
            else:
                show = False

            # plot
            x = np.arange(len(error_vec)) + 1
            ax.plot(x, error_vec)
            if t_opt is not None:
                ax.plot(
                    t_opt,
                    error_vec[t_opt - 1],
                    'ro',
                    markersize=10,
                )
            ax.plot(x, np.full(len(error_vec), threshold), 'k--')
            ax.set_xlabel('t')
            ax.set_ylabel('disparity(data_{t}, data_{t-1})')
            ax.set_xlim([1, len(error_vec)])
            plt.tight_layout()
            tasklogger.log_complete("optimal t plot")
            if show:
                plt.show(block=False)

        return data_imputed
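The two branches above are algebraically identical: powering the operator first or applying it t times gives the same result, and the cheaper option depends on the matrix shapes. A quick numpy check with dense, illustrative sizes:

import numpy as np

rng = np.random.default_rng(0)
P = rng.random((50, 50))
P /= P.sum(axis=1, keepdims=True)          # row-stochastic operator
D = rng.random((50, 500))                  # data wider than the operator
t = 5

exact = np.linalg.matrix_power(P, t) @ D   # classic MAGIC: power, then apply
iterative = D.copy()
for _ in range(t):
    iterative = P @ iterative              # fast MAGIC: apply t times
assert np.allclose(exact, iterative)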
Example #29
n = trace.shape[0]
m = trace.shape[1]

neuron_ids = np.tile(np.arange(m), n)
layer_ids = np.tile(data['layer'], n)
epoch = np.repeat(np.arange(n), m)

digit_ids = np.repeat(np.arange(10), 10)
digit_activity = np.array([
    np.sum(np.abs(trace[:, :, digit_ids == digit]), axis=2)
    for digit in np.unique(digit_ids)
])
most_active_digit = np.argmax(digit_activity, axis=0).flatten()

tasklogger.log_start("Naive DR")
trace_flat = trace.reshape(-1, trace.shape[-1])
tasklogger.log_start("PHATE")
phate_naive_op = phate.PHATE(verbose=0)
phate_naive = phate_naive_op.fit_transform(trace_flat)
tasklogger.log_complete("PHATE")
tasklogger.log_start("DM")
dm_naive = m_phate.kernel.DM(phate_naive_op.graph)
tasklogger.log_complete("DM")
tasklogger.log_start("t-SNE")
tsne_naive = TSNE().fit_transform(trace_flat)
tasklogger.log_complete("t-SNE")
tasklogger.log_start("ISOMAP")
isomap_naive = Isomap().fit_transform(trace_flat)
tasklogger.log_complete("ISOMAP")
tasklogger.log_complete("Naive DR")
Example #30
    def transform(self, X=None, t_max=100, plot_optimal_t=False, ax=None):
        """Computes the position of the cells in the embedding space

        Parameters
        ----------
        X : array, optional, shape=[n_samples, n_features]
            input data with `n_samples` samples and `n_features`
            dimensions. Not required, since PHATE does not currently embed
            cells not given in the input matrix to `PHATE.fit()`.
            Accepted data types: `numpy.ndarray`,
            `scipy.sparse.spmatrix`, `pd.DataFrame`, `anndata.AnnData`. If
            `knn_dist` is 'precomputed', `data` should be an n_samples x
            n_samples distance or affinity matrix

        t_max : int, optional, default: 100
            maximum t to test if `t` is set to 'auto'

        plot_optimal_t : boolean, optional, default: False
            If true and `t` is set to 'auto', plot the Von Neumann
            entropy used to select t

        ax : matplotlib.axes.Axes, optional
            If given and `plot_optimal_t` is true, plot will be drawn
            on the given axis.

        Returns
        -------
        embedding : array, shape=[n_samples, n_dimensions]
            The cells embedded in a lower dimensional space using PHATE
        """
        if self.graph is None:
            raise NotFittedError("This PHATE instance is not fitted yet. Call "
                                 "'fit' with appropriate arguments before "
                                 "using this method.")
        elif X is not None and not utils.matrix_is_equivalent(X, self.X):
            # fit to external data
            warnings.warn(
                "Pre-fit PHATE should not be used to transform a "
                "new data matrix. Please fit PHATE to the new"
                " data by running 'fit' with the new data.", RuntimeWarning)
            if isinstance(self.graph, graphtools.graphs.TraditionalGraph) and \
                    self.graph.precomputed is not None:
                raise ValueError("Cannot transform additional data using a "
                                 "precomputed distance matrix.")
            else:
                if self.embedding is None:
                    self.transform()
                transitions = self.graph.extend_to_data(X)
                return self.graph.interpolate(self.embedding, transitions)
        else:
            diff_potential = self._calculate_potential(
                t_max=t_max, plot_optimal_t=plot_optimal_t, ax=ax)
            if self.embedding is None:
                tasklogger.log_start("{} MDS".format(self.mds))
                self.embedding = mds.embed_MDS(diff_potential,
                                               ndim=self.n_components,
                                               how=self.mds,
                                               distance_metric=self.mds_dist,
                                               n_jobs=self.n_jobs,
                                               seed=self.random_state,
                                               verbose=max(
                                                   self.verbose - 1, 0))
                tasklogger.log_complete("{} MDS".format(self.mds))
            if isinstance(self.graph, graphtools.graphs.LandmarkGraph):
                tasklogger.log_debug("Extending to original data...")
                return self.graph.interpolate(self.embedding)
            else:
                return self.embedding