Example #1
 def symmetrize_kernel(self, K):
     if self.kernel_symm == 'gamma' and self.gamma is not None and \
             not isinstance(self.gamma, numbers.Number):
         # matrix gamma
         # Gamma can be a matrix specifying a value for each pair of
         # batches. This allows technical replicates and experimental
         # samples to be corrected simultaneously
         tasklogger.log_debug("Using gamma symmetrization. "
                              "Gamma:\n{}".format(self.gamma))
         for i, sample_i in enumerate(self.samples):
             for j, sample_j in enumerate(self.samples):
                 if j < i:
                     continue
                 Kij = K[np.ix_(self.sample_idx == sample_i,
                                self.sample_idx == sample_j)]
                 Kji = K[np.ix_(self.sample_idx == sample_j,
                                self.sample_idx == sample_i)]
                 Kij_symm = self.gamma[i, j] * \
                     elementwise_minimum(Kij, Kji.T) + \
                     (1 - self.gamma[i, j]) * \
                     elementwise_maximum(Kij, Kji.T)
                 K = set_submatrix(K, self.sample_idx == sample_i,
                                   self.sample_idx == sample_j, Kij_symm)
                 if not i == j:
                     K = set_submatrix(K, self.sample_idx == sample_j,
                                       self.sample_idx == sample_i,
                                       Kij_symm.T)
     else:
         K = super().symmetrize_kernel(K)
     return K
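
The helpers elementwise_minimum, elementwise_maximum and set_submatrix used above are imported from elsewhere in graphtools; as a rough sketch for dense inputs (the package versions also handle scipy.sparse matrices), they could look like:

import numpy as np

def elementwise_minimum(A, B):
    # entrywise minimum of two same-shaped matrices
    return np.minimum(A, B)

def elementwise_maximum(A, B):
    # entrywise maximum of two same-shaped matrices
    return np.maximum(A, B)

def set_submatrix(K, rows, cols, values):
    # overwrite the block of K selected by boolean row/column masks
    K[np.ix_(rows, cols)] = values
    return K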
Example #2
    def build_kernel(self):
        """Build the MNN kernel.

        Build a mutual nearest neighbors kernel.

        Returns
        -------
        K : kernel matrix, shape=[n_samples, n_samples]
            symmetric matrix with ones down the diagonal
            with no negative entries.
        """
        tasklogger.log_start("subgraphs")
        self.subgraphs = []
        from .api import Graph
        # iterate through sample ids
        for i, idx in enumerate(self.samples):
            tasklogger.log_debug("subgraph {}: sample {}, "
                                 "n = {}, knn = {}".format(
                                     i, idx, np.sum(self.sample_idx == idx),
                                     self.weighted_knn[i]))
            # select data for sample
            data = self.data_nu[self.sample_idx == idx]
            # build a kNN graph for cells within sample
            graph = Graph(data,
                          n_pca=None,
                          knn=self.weighted_knn[i],
                          decay=self.decay,
                          distance=self.distance,
                          thresh=self.thresh,
                          verbose=self.verbose,
                          random_state=self.random_state,
                          n_jobs=self.n_jobs,
                          initialize=False)
            self.subgraphs.append(graph)  # append to list of subgraphs
        tasklogger.log_complete("subgraphs")

        if self.thresh > 0 or self.decay is None:
            K = sparse.lil_matrix(
                (self.data_nu.shape[0], self.data_nu.shape[0]))
        else:
            K = np.zeros([self.data_nu.shape[0], self.data_nu.shape[0]])
        for i, X in enumerate(self.subgraphs):
            for j, Y in enumerate(self.subgraphs):
                tasklogger.log_start("kernel from sample {} to {}".format(
                    self.samples[i], self.samples[j]))
                Kij = Y.build_kernel_to_data(X.data_nu,
                                             knn=self.weighted_knn[i])
                if i == j:
                    # downweight within-batch affinities by beta
                    Kij = Kij * self.beta
                K = set_submatrix(K, self.sample_idx == self.samples[i],
                                  self.sample_idx == self.samples[j], Kij)
                tasklogger.log_complete("kernel from sample {} to {}".format(
                    self.samples[i], self.samples[j]))
        return K
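
A toy illustration of the block assembly performed by the loop above, using the dense set_submatrix sketch from Example #1 (two hypothetical batches, random affinities):

import numpy as np

sample_idx = np.array([0, 0, 1, 1, 1])   # batch label per cell
beta = 0.5
rng = np.random.default_rng(42)
K = np.zeros((5, 5))
for i in [0, 1]:
    for j in [0, 1]:
        Kij = rng.uniform(size=(np.sum(sample_idx == i),
                                np.sum(sample_idx == j)))
        if i == j:
            Kij = Kij * beta              # downweight within-batch affinities
        K = set_submatrix(K, sample_idx == i, sample_idx == j, Kij)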
Example #3
    def __init__(self, kernel_symm='+', gamma=None, initialize=True, **kwargs):
        self.kernel_symm = kernel_symm
        self.gamma = gamma
        self._check_symmetrization(kernel_symm, gamma)

        if initialize:
            tasklogger.log_debug("Initializing kernel...")
            self.K  # accessing the property triggers kernel construction
        else:
            tasklogger.log_debug("Not initializing kernel.")
        super().__init__(**kwargs)
Example #4
def cmdscale_fast(D, ndim):
    """Fast CMDS using random SVD

    Parameters
    ----------
    D : array-like, input data [n_samples, n_dimensions]

    ndim : int, number of dimensions in which to embed `D`

    Returns
    -------
    Y : array-like, embedded data [n_samples, ndim]
    """
    tasklogger.log_debug("Performing classic MDS on {} of shape {}...".format(
        type(D).__name__, D.shape))
    D = D**2
    D = D - D.mean(axis=0)[None, :]
    D = D - D.mean(axis=1)[:, None]
    pca = PCA(n_components=ndim, svd_solver='randomized')
    Y = pca.fit_transform(D)
    return Y
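
A minimal usage sketch, assuming the module's imports (sklearn.decomposition.PCA and tasklogger) are in scope:

import numpy as np
from scipy.spatial.distance import pdist, squareform

X = np.random.normal(size=(100, 10))
D = squareform(pdist(X))       # symmetric pairwise distance matrix
Y = cmdscale_fast(D, ndim=2)   # classic MDS embedding
print(Y.shape)                 # (100, 2)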
Example #5
 def _update_graph(self, X, precomputed, n_pca, n_landmark):
     if self.X is not None and not utils.matrix_is_equivalent(
             X, self.X):
         """
         If the same data is used, we can reuse existing kernel and
         diffusion matrices. Otherwise we have to recompute.
         """
         self._reset_graph()
     else:
         try:
             self.graph.set_params(
                 decay=self.decay, knn=self.knn, distance=self.knn_dist,
                 precomputed=precomputed,
                 n_jobs=self.n_jobs, verbose=self.verbose, n_pca=n_pca,
                 n_landmark=n_landmark,
                 random_state=self.random_state)
             tasklogger.log_info(
                 "Using precomputed graph and diffusion operator...")
         except ValueError as e:
             # something changed that should have invalidated the graph
             tasklogger.log_debug("Reset graph due to {}".format(
                 str(e)))
             self._reset_graph()
Example #6
 def symmetrize_kernel(self, K):
     # symmetrize
     if self.kernel_symm == "+":
         tasklogger.log_debug("Using addition symmetrization.")
         K = (K + K.T) / 2
     elif self.kernel_symm == "*":
         tasklogger.log_debug("Using multiplication symmetrization.")
         K = K.multiply(K.T)
     elif self.kernel_symm == 'gamma':
         tasklogger.log_debug(
             "Using gamma symmetrization (gamma = {}).".format(self.gamma))
         K = self.gamma * elementwise_minimum(K, K.T) + \
             (1 - self.gamma) * elementwise_maximum(K, K.T)
     elif self.kernel_symm is None:
         tasklogger.log_debug("Using no symmetrization.")
         pass
     else:
         # this should never happen
      raise ValueError(
          "Expected kernel_symm in ['+', '*', 'gamma', None]. "
          "Got {}".format(self.kernel_symm))
     return K
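
On a small dense matrix the three symmetrizations behave as follows (note that K.multiply(K.T) is the scipy.sparse spelling; the dense numpy equivalent is K * K.T):

import numpy as np

K = np.array([[1.0, 0.8],
              [0.2, 1.0]])
K_add = (K + K.T) / 2                    # '+' : off-diagonals become 0.5
K_mult = K * K.T                         # '*' : off-diagonals become 0.16
gamma = 0.9
K_gamma = gamma * np.minimum(K, K.T) + \
    (1 - gamma) * np.maximum(K, K.T)     # 'gamma' : off-diagonals become 0.26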
Example #7
def run_magic_from_file(
        filename,
        # data loading params
        sparse=True,
        gene_names=None,
        cell_names=None,
        cell_axis=None,
        gene_labels=None,
        allow_duplicates=None,
        genome=None,
        metadata_channels=None,
        # filtering params
        min_library_size=2000,
        min_cells_per_gene=10,
        # normalization params
        library_size_normalize=True,
        transform='sqrt',
        pseudocount=None,
        cofactor=None,
        # kernel params
        knn=5,
        decay=15,
        n_pca=100,
        knn_dist='euclidean',
        n_jobs=1,
        random_state=42,
        verbose=1,
        # magic params
        t_magic='auto',
        genes=None,
        # output params
        output='magic.csv',
        validate=False):
    """Run MAGIC on a file

    Parameters
    ----------
    filename : str
        Allowed types: csv, tsv, mtx, hdf5/h5 (10X format),
        directory/zip (10X format)
    sparse : bool (recommended: True for scRNAseq, False for CyTOF)
        Force data sparsity. If `None`, sparsity is determined by data type.
    gene_names : str, list or bool
        Allowed values:
        - if filetype is csv or fcs, `True` says gene names are data
        headers, `str` gives a path to a separate csv or tsv file containing
        gene names, list gives an array of gene names, `False` means
        no gene names are given
        - if filetype is mtx, `str` gives a path to a separate csv or tsv file
        containing gene names, list gives an array of gene names, or `False`
        means no gene names are given
        - if filetype is hdf5, h5, directory or zip, must be `None`.
    cell_names : str, list or bool
        Allowed values:
        - if filetype is csv or fcs, `True` says cell names are data
        headers, `str` gives a path to a separate csv or tsv file containing
        cell names, list gives an array of cell names, `False` means
        no cell names are given
        - if filetype is mtx, `str` gives a path to a separate csv or tsv file
        containing cell names, list gives an array of cell names, or `False`
        means no cell names are given
        - if filetype is hdf5, h5, directory or zip, must be `None`.
    cell_axis : {'row', 'column'}
        States whether cells are on rows or columns. If cell_axis=='row',
        data is of shape [n_cells, n_genes]. If cell_axis=='column', data is of
        shape [n_genes, n_cells]. Only valid for filetype mtx and csv
    gene_labels : {'symbol', 'id', 'both'}
        Choice of gene labels for 10X data. Recommended: 'both'
        Only valid for directory, zip, hdf5, h5
    allow_duplicates : bool
        Allow duplicate gene names in 10X data. Recommended: True
        Only valid for directory, zip, hdf5, h5
    genome : str
        Genome name. Only valid for hdf5, h5
    metadata_channels : list of str (recommended: ['Time', 'Event_length', 'DNA1', 'DNA2', 'Cisplatin', 'beadDist', 'bead1'])
        Names of channels in fcs data which are not real measurements.
        Only valid if datatype is fcs.
    min_library_size : int or `None`, optional (default: 2000)
        Cutoff for library size normalization. If `None`,
        library size filtering is not used
    min_cells_per_gene : int or `None`, optional (default: 10)
        Minimum non-zero cells for a gene to be used. If `None`,
        genes are not removed
    library_size_normalize : `bool`, optional (default: True)
        Use library size normalization
    transform : {'sqrt', 'log', 'arcsinh', None}
        How to transform the data. If `None`, no transformation is done
    pseudocount : float (recommended: 1)
        Number of pseudocounts to add to genes prior to log transformation
    cofactor : float (recommended: 5)
        Factor by which to divide genes prior to arcsinh transformation
    knn : int, optional, default: 5
        number of nearest neighbors on which to build kernel
    decay : int, optional, default: 15
        sets decay rate of kernel tails.
        If None, alpha decaying kernel is not used
    n_pca : int, optional, default: 100
        Number of principal components to use for calculating
        neighborhoods. For extremely large datasets, using
        n_pca < 20 allows neighborhoods to be calculated in
        roughly log(n_samples) time.
    knn_dist : string, optional, default: 'euclidean'
        recommended values: 'euclidean', 'cosine'
        Any metric from `scipy.spatial.distance` can be used
        distance metric for building kNN graph.
    n_jobs : integer, optional, default: 1
        The number of jobs to use for the computation.
        If -1 all CPUs are used. If 1 is given, no parallel computing code is
        used at all, which is useful for debugging.
        For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for
        n_jobs = -2, all CPUs but one are used
    random_state : integer or numpy.RandomState, optional, default: 42
        The generator used to initialize random PCA.
        If an integer is given, it fixes the seed
    verbose : `int` or `boolean`, optional (default: 1)
        If `True` or `> 0`, print status messages
    t_magic : int, optional, default: 'auto'
        power to which the diffusion operator is powered for MAGIC.
        This sets the level of diffusion. If 'auto', t is selected
        according to the Procrustes disparity of the diffused data
    genes : list or {"all_genes", "pca_only"}, optional (default: None)
        List of genes to return from MAGIC,
        either as integer indices or column names
        if input data is a pandas DataFrame. If "all_genes", the entire
        smoothed matrix is returned. If "pca_only", PCA on the smoothed
        data is returned. If None, the entire matrix is also
        returned, but a warning may be raised if the resultant matrix
        is very large.
    output : str, optional (default: 'magic.csv')
        Output CSV file to save smoothed data matrix
    validate : bool, optional (default: False)
        If True, compare the output to a precomputed reference result and
        raise an error if the results differ
    """
    # check arguments
    filetype = check_filetype(filename)
    load_fn, load_kws = check_load_args(filetype,
                                        sparse=sparse,
                                        gene_names=gene_names,
                                        cell_names=cell_names,
                                        cell_axis=cell_axis,
                                        gene_labels=gene_labels,
                                        allow_duplicates=allow_duplicates,
                                        genome=genome,
                                        metadata_channels=metadata_channels)
    transform_fn, transform_kws = check_transform_args(transform=transform,
                                                       pseudocount=pseudocount,
                                                       cofactor=cofactor)

    # set up logging
    # https://github.com/scottgigante/tasklogger
    tasklogger.set_level(verbose)

    # load data
    # example: scprep.io.load_csv("data.csv")
    # https://scprep.readthedocs.io/en/stable/reference.html#module-scprep.io
    tasklogger.log_info("Loading data from {}...".format(filename))
    data = load_fn(filename, **load_kws)
    data = scprep.sanitize.check_numeric(data, copy=True)
    tasklogger.log_info("Loaded {} cells and {} genes.".format(
        data.shape[0], data.shape[1]))

    # filter data
    # https://scprep.readthedocs.io/en/stable/reference.html#module-scprep.filter
    if min_library_size is not None:
        tasklogger.log_info("Filtering cells by library size >= {}...".format(
            min_library_size))
        data = scprep.filter.filter_library_size(data, cutoff=min_library_size)
        tasklogger.log_info("Retained {} cells.".format(data.shape[0]))
    if min_cells_per_gene is not None:
        tasklogger.log_info(
            "Filtering genes by min cells >= {}...".format(min_cells_per_gene))
        data = scprep.filter.filter_rare_genes(data,
                                               min_cells=min_cells_per_gene)
        tasklogger.log_info("Retained {} genes.".format(data.shape[1]))

    # normalize data
    # https://scprep.readthedocs.io/en/stable/reference.html#module-scprep.normalize
    if library_size_normalize:
        tasklogger.log_info("Library size normalizing data...")
        data = scprep.normalize.library_size_normalize(data)

    # transform data
    # example: data = scprep.transform.sqrt(data)
    # https://scprep.readthedocs.io/en/stable/reference.html#module-scprep.transform
    if transform is not None:
        tasklogger.log_info("Applying {} transform...".format(transform))
        data = transform_fn(data, **transform_kws)

    # run MAGIC
    # https://magic.readthedocs.io/
    magic_op = magic.MAGIC(knn=knn,
                           decay=decay,
                           t=t_magic,
                           n_pca=n_pca,
                           knn_dist=knn_dist,
                           n_jobs=n_jobs,
                           random_state=random_state,
                           verbose=verbose)
    magic_data = magic_op.fit_transform(data, genes=genes)

    # save as csv
    magic_data = pd.DataFrame(magic_data)
    if cell_axis in ['col', 'column']:
        magic_data = magic_data.T
    tasklogger.log_info("Saving data to {}...".format(output))
    magic_data.to_csv(output)
    tasklogger.log_info("Complete.".format(output))
    if validate:
        correct_magic_data = scprep.io.load_csv(
            'https://raw.githubusercontent.com/KrishnaswamyLab/magic-docker/'
            'master/magic-validate.csv',
            sparse=False)
        try:
            np.testing.assert_equal(scprep.utils.toarray(magic_data),
                                    scprep.utils.toarray(correct_magic_data))
            tasklogger.log_debug(
                "Validation complete, output is equal to expected")
        except AssertionError:
            np.testing.assert_allclose(
                scprep.utils.toarray(magic_data),
                scprep.utils.toarray(correct_magic_data),
                atol=1e-14)
            tasklogger.log_debug(
                "Validation complete, output is numerically equivalent to expected"
            )
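
A usage sketch with hypothetical file paths (all names below are placeholders):

# smooth a scRNA-seq counts matrix stored as CSV and save the result
run_magic_from_file('counts.csv',            # hypothetical input file
                    gene_names=True,         # gene names in the CSV header
                    cell_names=True,
                    cell_axis='row',
                    transform='sqrt',
                    knn=5,
                    output='counts_magic.csv')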
Example #8
def test_log():
    tasklogger.log_debug("debug")
    tasklogger.log_info("info")
    tasklogger.log_warning("warning")
    tasklogger.log_error("error")
    tasklogger.log_critical("critical")
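
These calls are filtered by the logger's level; with tasklogger's set_level, level 1 shows info and above, while level 2 also shows debug messages:

import tasklogger

tasklogger.set_level(1)   # INFO: prints "info", "warning", "error", "critical"
test_log()
tasklogger.set_level(2)   # DEBUG: additionally prints "debug"
test_log()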
Example #9
    def impute(self,
               data,
               t_max=20,
               plot=False,
               ax=None,
               max_genes_compute_t=500,
               threshold=0.001):
        """Peform MAGIC imputation

        Parameters
        ----------
        data : graphtools.Graph, graphtools.Data or array-like
            Input data
        t_max : int, optional (default: 20)
            Maximum value of t to consider for optimal t selection
        plot : bool, optional (default: False)
            Plot the optimal t selection graph
        ax : matplotlib.Axes, optional (default: None)
            Axis on which to plot. If None, a new axis is created
        max_genes_compute_t : int, optional (default: 500)
            Above this number, genes will be subsampled for
            optimal t selection
        threshold : float, optional (default: 0.001)
            Threshold after which Procrustes disparity is considered
            to have converged for optimal t selection

        Returns
        -------
        X_magic : array-like, shape=[n_samples, n_pca]
            Imputed data
        """
        if not isinstance(data, graphtools.base.Data):
            data = graphtools.base.Data(data, n_pca=self.n_pca)
        data_imputed = data.data_nu

        if data_imputed.shape[1] > max_genes_compute_t:
            subsample_genes = np.random.choice(data_imputed.shape[1],
                                               max_genes_compute_t,
                                               replace=False)
        else:
            subsample_genes = None
        if hasattr(data, "data_pca"):
            weights = None  # data.data_pca.explained_variance_ratio_
        else:
            weights = None
        if self.t == 'auto':
            _, data_prev = self.calculate_error(
                data_imputed,
                data_prev=None,
                weights=weights,
                subsample_genes=subsample_genes)
            error_vec = []
            t_opt = None
        else:
            t_opt = self.t

        tasklogger.log_start("imputation")

        # classic magic
        # the diffusion matrix is powered when t has been specified by
        # the user and the diffusion matrix has fewer columns than the
        # data matrix: (M^t) * D
        if (t_opt is not None) and \
                (self.diff_op.shape[1] < data_imputed.shape[1]):
            diff_op_t = np.linalg.matrix_power(self.diff_op, t_opt)
            data_imputed = diff_op_t.dot(data_imputed)

        # fast magic
        # a while loop is used when the dimensions of the diffusion matrix
        # are greater than those of the data matrix, or when t is not specified
        # (so as to allow for the calculation of the optimal t value)
        else:
            i = 0
            while (t_opt is None and i < t_max) or \
                    (t_opt is not None and i < t_opt):
                i += 1
                data_imputed = self.diff_op.dot(data_imputed)
                if self.t == 'auto':
                    error, data_prev = self.calculate_error(
                        data_imputed,
                        data_prev,
                        weights=weights,
                        subsample_genes=subsample_genes)
                    error_vec.append(error)
                    tasklogger.log_debug("{}: {}".format(i, error_vec))
                    if error < threshold and t_opt is None:
                        t_opt = i + 1
                        tasklogger.log_info(
                            "Automatically selected t = {}".format(t_opt))

        tasklogger.log_complete("imputation")

        if plot:
            # continue to t_max
            tasklogger.log_start("optimal t plot")
            if t_opt is None:
                # never converged
                warnings.warn("optimal t > t_max ({})".format(t_max),
                              RuntimeWarning)
            else:
                data_overimputed = data_imputed
                while i < t_max:
                    i += 1
                    data_overimputed = self.diff_op.dot(data_overimputed)
                    error, data_prev = self.calculate_error(
                        data_overimputed,
                        data_prev,
                        weights=weights,
                        subsample_genes=subsample_genes)
                    error_vec.append(error)

            # create axis
            if ax is None:
                fig, ax = plt.subplots()
                show = True
            else:
                show = False

            # plot
            x = np.arange(len(error_vec)) + 1
            ax.plot(x, error_vec)
            if t_opt is not None:
                ax.plot(
                    t_opt,
                    error_vec[t_opt - 1],
                    'ro',
                    markersize=10,
                )
            ax.plot(x, np.full(len(error_vec), threshold), 'k--')
            ax.set_xlabel('t')
            ax.set_ylabel('disparity(data_{t}, data_{t-1})')
            ax.set_xlim([1, len(error_vec)])
            plt.tight_layout()
            tasklogger.log_complete("optimal t plot")
            if show:
                plt.show(block=False)

        return data_imputed
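
The smoothing step at the heart of this method is repeated multiplication by a row-stochastic diffusion operator; a self-contained numpy sketch of the "classic" and "fast" routes (tiny random operator, hypothetical t):

import numpy as np

rng = np.random.default_rng(0)
diff_op = rng.uniform(size=(50, 50))
diff_op = diff_op / diff_op.sum(axis=1, keepdims=True)  # row-stochastic
data = rng.normal(size=(50, 500))
t = 3

# classic: power the operator once, then apply it to the data
classic = np.linalg.matrix_power(diff_op, t).dot(data)

# fast: apply the operator t times without forming diff_op ** t
fast = data
for _ in range(t):
    fast = diff_op.dot(fast)

assert np.allclose(classic, fast)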
Example #10
    def fit(self, X):
        """Computes the diffusion operator

        Parameters
        ----------
        X : array, shape=[n_samples, n_features]
            input data with `n_samples` samples and `n_features`
            dimensions. Accepted data types: `numpy.ndarray`,
            `scipy.sparse.spmatrix`, `pd.DataFrame`, `anndata.AnnData`.

        Returns
        -------
        magic_operator : MAGIC
            The estimator object
        """
        if self.knn_dist == 'precomputed':
            if isinstance(X, sparse.coo_matrix):
                X = X.tocsr()
            if X[0, 0] == 0:
                precomputed = "distance"
            else:
                precomputed = "affinity"
            tasklogger.log_info(
                "Using precomputed {} matrix...".format(precomputed))
            n_pca = None
        else:
            precomputed = None
            if self.n_pca is None or X.shape[1] <= self.n_pca:
                n_pca = None
            else:
                n_pca = self.n_pca

        if self.graph is not None:
            if self.X is not None and not \
                    utils.matrix_is_equivalent(X, self.X):
                """
                If the same data is used, we can reuse existing kernel and
                diffusion matrices. Otherwise we have to recompute.
                """
                self.graph = None
            else:
                try:
                    self.graph.set_params(decay=self.a,
                                          knn=self.k + 1,
                                          distance=self.knn_dist,
                                          precomputed=precomputed,
                                          n_jobs=self.n_jobs,
                                          verbose=self.verbose,
                                          n_pca=n_pca,
                                          thresh=1e-4,
                                          random_state=self.random_state)
                    tasklogger.log_info(
                        "Using precomputed graph and diffusion operator...")
                except ValueError as e:
                    # something changed that should have invalidated the graph
                    tasklogger.log_debug("Reset graph due to {}".format(
                        str(e)))
                    self.graph = None

        self.X = X

        if utils.has_empty_columns(X):
            warnings.warn("Input matrix contains unexpressed genes. "
                          "Please remove them prior to running MAGIC.")

        if self.graph is None:
            # reset X_magic in case it was previously set
            self.X_magic = None
            tasklogger.log_start("graph and diffusion operator")
            self.graph = graphtools.Graph(X,
                                          n_pca=n_pca,
                                          knn=self.k + 1,
                                          decay=self.a,
                                          thresh=1e-4,
                                          n_jobs=self.n_jobs,
                                          verbose=self.verbose,
                                          random_state=self.random_state)
            tasklogger.log_complete("graph and diffusion operator")

        return self
Example #11
def embed_MDS(X,
              ndim=2,
              how='metric',
              distance_metric='euclidean',
              n_jobs=1,
              seed=None,
              verbose=0):
    """Performs classic, metric, and non-metric MDS

    Metric MDS is initialized using classic MDS,
    non-metric MDS is initialized using metric MDS.

    Parameters
    ----------
    X: ndarray [n_samples, n_samples]
        2 dimensional input data array with n_samples
        embed_MDS does not check for matrix squareness,
        but this is necessary for PHATE

    ndim : int, optional, default: 2
        number of dimensions in which the data will be embedded

    how : string, optional, default: 'metric'
        choose from ['classic', 'metric', 'nonmetric']
        which MDS algorithm is used for dimensionality reduction

    distance_metric : string, optional, default: 'euclidean'
        choose from ['cosine', 'euclidean']
        distance metric for MDS

    n_jobs : integer, optional, default: 1
        The number of jobs to use for the computation.
        If -1 all CPUs are used. If 1 is given, no parallel computing code is
        used at all, which is useful for debugging.
        For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for
        n_jobs = -2, all CPUs but one are used

    seed: integer or numpy.RandomState, optional
        The generator used to initialize SMACOF (metric, nonmetric) MDS
        If an integer is given, it fixes the seed
        Defaults to the global numpy random number generator

    Returns
    -------
    Y : ndarray [n_samples, ndim]
        low dimensional embedding of X using MDS
    """
    if how not in ['classic', 'metric', 'nonmetric']:
        raise ValueError("Allowable 'how' values for MDS: 'classic', "
                         "'metric', or 'nonmetric'. "
                         "'{}' was passed.".format(how))

    # MDS embeddings, each gives a different output.
    X_dist = squareform(pdist(X, distance_metric))

    # initialize all by CMDS
    Y = cmdscale_fast(X_dist, ndim)
    if how in ['metric', 'nonmetric']:
        tasklogger.log_debug("Performing metric MDS on "
                             "{} of shape {}...".format(
                                 type(X_dist), X_dist.shape))
        # Metric MDS from sklearn
        Y, _ = smacof(X_dist,
                      n_components=ndim,
                      metric=True,
                      max_iter=3000,
                      eps=1e-6,
                      random_state=seed,
                      n_jobs=n_jobs,
                      n_init=1,
                      init=Y,
                      verbose=verbose)
    if how == 'nonmetric':
        tasklogger.log_debug("Performing non-metric MDS on "
                             "{} of shape {}...".format(
                                 type(X_dist), X_dist.shape))
        # Nonmetric MDS from sklearn using metric MDS as an initialization
        Y, _ = smacof(X_dist,
                      n_components=ndim,
                      metric=False,
                      max_iter=3000,
                      eps=1e-6,
                      random_state=seed,
                      n_jobs=n_jobs,
                      n_init=1,
                      init=Y,
                      verbose=verbose)
    return Y
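
A usage sketch, assuming embed_MDS and its dependencies (pdist, squareform, smacof, cmdscale_fast) are in scope:

import numpy as np

X = np.random.normal(size=(200, 20))
Y = embed_MDS(X, ndim=2, how='metric', distance_metric='euclidean',
              n_jobs=1, seed=42)
print(Y.shape)   # (200, 2)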
Example #12
    def fit(self, X, graph=None):
        """Computes the diffusion operator

        Parameters
        ----------
        X : array, shape=[n_samples, n_features]
            input data with `n_samples` samples and `n_features`
            dimensions. Accepted data types: `numpy.ndarray`,
            `scipy.sparse.spmatrix`, `pd.DataFrame`, `anndata.AnnData`.
        graph : `graphtools.Graph`, optional (default: None)
            If given, provides a precomputed kernel matrix with which to
            perform diffusion.

        Returns
        -------
        magic_operator : MAGIC
            The estimator object
        """
        if self.n_pca is None or X.shape[1] <= self.n_pca:
            n_pca = None
        else:
            n_pca = self.n_pca

        tasklogger.log_info("Running MAGIC on {} cells and {} genes.".format(
            X.shape[0], X.shape[1]))

        if graph is None:
            graph = self.graph
            if self.X is not None and not \
                    utils.matrix_is_equivalent(X, self.X):
                """
                If the same data is used, we can reuse existing kernel and
                diffusion matrices. Otherwise we have to recompute.
                """
                tasklogger.log_debug(
                    "Reset graph due to difference in input data")
                graph = None
            elif graph is not None:
                try:
                    graph.set_params(decay=self.decay,
                                     knn=self.knn,
                                     distance=self.knn_dist,
                                     n_jobs=self.n_jobs,
                                     verbose=self.verbose,
                                     n_pca=n_pca,
                                     thresh=1e-4,
                                     random_state=self.random_state)
                except ValueError as e:
                    # something changed that should have invalidated the graph
                    tasklogger.log_debug("Reset graph due to {}".format(
                        str(e)))
                    graph = None
        else:
            self.knn = graph.knn
            self.decay = graph.decay
            self.n_pca = graph.n_pca
            self.knn_dist = graph.distance

        self.X = X

        if utils.has_empty_columns(X):
            warnings.warn("Input matrix contains unexpressed genes. "
                          "Please remove them prior to running MAGIC.")

        if graph is not None:
            tasklogger.log_info(
                "Using precomputed graph and diffusion operator...")
            self.graph = graph
        else:
            # reset X_magic in case it was previously set
            self.X_magic = None
            tasklogger.log_start("graph and diffusion operator")
            self.graph = graphtools.Graph(X,
                                          n_pca=n_pca,
                                          knn=self.knn,
                                          decay=self.decay,
                                          thresh=1e-4,
                                          n_jobs=self.n_jobs,
                                          verbose=self.verbose,
                                          random_state=self.random_state)
            tasklogger.log_complete("graph and diffusion operator")

        return self
Example #13
    def fit(self, X):
        """Computes the diffusion operator

        Parameters
        ----------
        X : array, shape=[n_samples, n_features]
            input data with `n_samples` samples and `n_features`
            dimensions. Accepted data types: `numpy.ndarray`,
            `scipy.sparse.spmatrix`, `pd.DataFrame`, `anndata.AnnData`. If
            `knn_dist` is 'precomputed', `data` should be a n_samples x
            n_samples distance or affinity matrix

        Returns
        -------
        phate_operator : PHATE
            The estimator object
        """
        try:
            if isinstance(X, anndata.AnnData):
                X = X.X
        except NameError:
            # anndata not installed
            pass

        if self.knn_dist.startswith('precomputed'):
            if self.knn_dist == 'precomputed':
                # automatic detection
                if isinstance(X, sparse.coo_matrix):
                    X = X.tocsr()
                if X[0, 0] == 0:
                    precomputed = "distance"
                else:
                    precomputed = "affinity"
            elif self.knn_dist in [
                    'precomputed_affinity', 'precomputed_distance'
            ]:
                precomputed = self.knn_dist.split("_")[1]
            else:
                raise ValueError(
                    "knn_dist {} not recognized. Did you mean "
                    "'precomputed_distance', "
                    "'precomputed_affinity', or 'precomputed' "
                    "(automatically detects distance or affinity)?".format(
                        self.knn_dist))
            tasklogger.log_info(
                "Using precomputed {} matrix...".format(precomputed))
            n_pca = None
        else:
            precomputed = None
            if X.shape[1] <= self.n_pca:
                n_pca = None
            else:
                n_pca = self.n_pca
        if self.n_landmark is None or X.shape[0] <= self.n_landmark:
            n_landmark = None
        else:
            n_landmark = self.n_landmark

        if self.graph is not None:
            if self.X is not None and not matrix_is_equivalent(X, self.X):
                """
                If the same data is used, we can reuse existing kernel and
                diffusion matrices. Otherwise we have to recompute.
                """
                self._reset_graph()
            else:
                try:
                    self.graph.set_params(decay=self.a,
                                          knn=self.k + 1,
                                          distance=self.knn_dist,
                                          precomputed=precomputed,
                                          n_jobs=self.n_jobs,
                                          verbose=self.verbose,
                                          n_pca=n_pca,
                                          thresh=1e-4,
                                          n_landmark=n_landmark,
                                          random_state=self.random_state)
                    tasklogger.log_info(
                        "Using precomputed graph and diffusion operator...")
                except ValueError as e:
                    # something changed that should have invalidated the graph
                    tasklogger.log_debug("Reset graph due to {}".format(
                        str(e)))
                    self._reset_graph()

        self.X = X

        if self.graph is None:
            tasklogger.log_start("graph and diffusion operator")
            self.graph = graphtools.Graph(X,
                                          n_pca=n_pca,
                                          n_landmark=n_landmark,
                                          distance=self.knn_dist,
                                          precomputed=precomputed,
                                          knn=self.k + 1,
                                          decay=self.a,
                                          thresh=1e-4,
                                          n_jobs=self.n_jobs,
                                          verbose=self.verbose,
                                          random_state=self.random_state)
            tasklogger.log_complete("graph and diffusion operator")

        # landmark op doesn't build unless forced
        self.diff_op  # accessing the property forces the build
        return self
Example #14
    def build_kernel_to_data(self, Y, knn=None):
        """Build a kernel from new input data `Y` to the `self.data`

        Parameters
        ----------

        Y: array-like, [n_samples_y, n_features]
            new data for which an affinity matrix is calculated
            to the existing data. `n_features` must match
            either the ambient or PCA dimensions

        knn : `int` or `None`, optional (default: `None`)
            If `None`, defaults to `self.knn`

        Returns
        -------

        K_yx: array-like, [n_samples_y, n_samples]
            kernel matrix where each row represents affinities of a single
            sample in `Y` to all samples in `self.data`.

        Raises
        ------

        ValueError: if the supplied data is the wrong shape
        """
        if knn is None:
            knn = self.knn
        if knn > self.data.shape[0]:
            warnings.warn("Cannot set knn ({k}) to be greater than "
                          "data.shape[0] ({n}). Setting knn={n}".format(
                              k=knn, n=self.data.shape[0]))
            knn = self.data.shape[0]

        Y = self._check_extension_shape(Y)
        tasklogger.log_start("KNN search")
        if self.decay is None or self.thresh == 1:
            # binary connectivity matrix
            K = self.knn_tree.kneighbors_graph(Y,
                                               n_neighbors=knn,
                                               mode='connectivity')
            tasklogger.log_complete("KNN search")
        else:
            # sparse fast alpha decay
            knn_tree = self.knn_tree
            search_knn = min(knn * 20, self.data_nu.shape[0])
            distances, indices = knn_tree.kneighbors(Y, n_neighbors=search_knn)
            if np.any(distances[:, 1] == 0):
                has_duplicates = distances[:, 1] == 0
                idx = np.argwhere((distances == 0) & has_duplicates[:, None])
                duplicate_ids = np.array([[indices[i[0], i[1]], i[0]]
                                          for i in idx
                                          if indices[i[0], i[1]] < i[0]])
                duplicate_ids = duplicate_ids[np.argsort(duplicate_ids[:, 0])]
                duplicate_names = ", ".join(
                    ["{} and {}".format(i[0], i[1]) for i in duplicate_ids])
                warnings.warn(
                    "Detected zero distance between samples {}. "
                    "Consider removing duplicates to avoid errors in "
                    "downstream processing.".format(duplicate_names),
                    RuntimeWarning)
            tasklogger.log_complete("KNN search")
            tasklogger.log_start("affinities")
            bandwidth = distances[:, knn - 1]
            radius = bandwidth * np.power(-1 * np.log(self.thresh),
                                          1 / self.decay)
            update_idx = np.argwhere(
                np.max(distances, axis=1) < radius).reshape(-1)
            tasklogger.log_debug("search_knn = {}; {} remaining".format(
                search_knn, len(update_idx)))
            if len(update_idx) > 0:
                distances = [d for d in distances]
                indices = [i for i in indices]
            while len(update_idx) > Y.shape[0] // 10 and \
                    search_knn < self.data_nu.shape[0] / 2:
                # increase the knn search
                search_knn = min(search_knn * 20, self.data_nu.shape[0])
                dist_new, ind_new = knn_tree.kneighbors(Y[update_idx],
                                                        n_neighbors=search_knn)
                for i, idx in enumerate(update_idx):
                    distances[idx] = dist_new[i]
                    indices[idx] = ind_new[i]
                update_idx = [
                    i for i, d in enumerate(distances) if np.max(d) < radius[i]
                ]
                tasklogger.log_debug("search_knn = {}; {} remaining".format(
                    search_knn, len(update_idx)))
            if search_knn > self.data_nu.shape[0] / 2:
                knn_tree = NearestNeighbors(n_neighbors=knn,
                                            algorithm='brute',
                                            n_jobs=self.n_jobs).fit(
                                                self.data_nu)
            if len(update_idx) > 0:
                tasklogger.log_debug("radius search on {}".format(
                    len(update_idx)))
                # give up - radius search
                dist_new, ind_new = knn_tree.radius_neighbors(
                    Y[update_idx, :], radius=np.max(radius[update_idx]))
                for i, idx in enumerate(update_idx):
                    distances[idx] = dist_new[i]
                    indices[idx] = ind_new[i]
            data = np.concatenate(
                [distances[i] / bandwidth[i] for i in range(len(distances))])
            indices = np.concatenate(indices)
            indptr = np.concatenate([[0],
                                     np.cumsum([len(d) for d in distances])])
            K = sparse.csr_matrix((data, indices, indptr),
                                  shape=(Y.shape[0], self.data_nu.shape[0]))
            K.data = np.exp(-1 * np.power(K.data, self.decay))
            # handle nan
            K.data = np.where(np.isnan(K.data), 1, K.data)
            # TODO: should we zero values that are below thresh?
            K.data[K.data < self.thresh] = 0
            K = K.tocoo()
            K.eliminate_zeros()
            K = K.tocsr()
            tasklogger.log_complete("affinities")
        return K
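
The affinity computation at the end reduces to an alpha-decay kernel, K = exp(-(d / bandwidth)**decay), thresholded to keep the matrix sparse; a dense sketch on hypothetical distances:

import numpy as np

distances = np.array([[0.0, 0.5, 1.0, 2.0]])   # distances to 4 neighbors
knn, decay, thresh = 2, 10, 1e-4
bandwidth = distances[:, knn - 1]              # distance to the knn-th neighbor
K = np.exp(-1 * np.power(distances / bandwidth[:, None], decay))
K[K < thresh] = 0                              # drop negligible affinities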
Example #15
def Graph(data,
          n_pca=None,
          sample_idx=None,
          adaptive_k='sqrt',
          precomputed=None,
          knn=5,
          decay=10,
          distance='euclidean',
          thresh=1e-4,
          kernel_symm='+',
          gamma=None,
          n_landmark=None,
          n_svd=100,
          beta=1,
          n_jobs=-1,
          verbose=False,
          random_state=None,
          graphtype='auto',
          use_pygsp=False,
          initialize=True,
          **kwargs):
    """Create a graph built on data.

    Automatically selects the appropriate DataGraph subclass based on
    chosen parameters.
    Selection criteria:
    - if `graphtype` is given, this will be respected
    - otherwise:
    -- if `sample_idx` is given, an MNNGraph will be created
    -- if `precomputed` is not given, and either `decay` is `None` or `thresh`
    is given, a kNNGraph will be created
    -- otherwise, a TraditionalGraph will be created.

    Incompatibilities:
    - MNNGraph and kNNGraph cannot be precomputed
    - kNNGraph and TraditionalGraph do not accept sample indices

    Parameters
    ----------
    data : array-like, shape=[n_samples,n_features]
        accepted types: `numpy.ndarray`, `scipy.sparse.spmatrix`.
        TODO: accept pandas dataframes

    n_pca : `int` or `None`, optional (default: `None`)
        number of PC dimensions to retain for graph building.
        If `None`, uses the original data.
        Note: if data is sparse, uses SVD instead of PCA
        TODO: should we subtract and store the mean?

    knn : `int`, optional (default: 5)
        Number of nearest neighbors (including self) to use to build the graph

    decay : `int` or `None`, optional (default: 10)
        Rate of alpha decay to use. If `None`, alpha decay is not used.

    distance : `str`, optional (default: `'euclidean'`)
        Any metric from `scipy.spatial.distance` can be used
        distance metric for building kNN graph.
        TODO: actually sklearn.neighbors has even more choices

    thresh : `float`, optional (default: `1e-4`)
        Threshold above which to calculate alpha decay kernel.
        All affinities below `thresh` will be set to zero in order to save
        on time and memory constraints.

    kernel_symm : string, optional (default: '+')
        Defines method of MNN symmetrization.
        '+'  : additive
        '*'  : multiplicative
        'gamma' : min-max
        'none' : no symmetrization

    gamma: float (default: None)
        Min-max symmetrization constant or matrix. Only used if kernel_symm='gamma'.
        K = `gamma * min(K, K.T) + (1 - gamma) * max(K, K.T)`

    precomputed : {'distance', 'affinity', 'adjacency', `None`}, optional (default: `None`)
        If the graph is precomputed, this variable denotes which graph
        matrix is provided as `data`.
        Only one of `precomputed` and `n_pca` can be set.

    beta : float, optional (default: 1)
        Multiply within-batch connections by (1 - beta)

    sample_idx: array-like
        Batch index for MNN kernel

    adaptive_k : `{'min', 'mean', 'sqrt', 'none'}` (default: 'sqrt')
        Weights MNN kernel adaptively using the number of cells in
        each sample according to the selected method.

    n_landmark : `int`, optional (default: `None`)
        number of landmarks to use

    n_svd : `int`, optional (default: 100)
        number of SVD components to use for spectral clustering

    random_state : `int` or `None`, optional (default: `None`)
        Random state for random PCA

    verbose : `bool`, optional (default: `False`)
        Verbosity.
        TODO: should this be an integer instead to allow multiple
        levels of verbosity?

    n_jobs : `int`, optional (default: -1)
        The number of jobs to use for the computation.
        If -1 all CPUs are used. If 1 is given, no parallel computing code is
        used at all, which is useful for debugging.
        For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for
        n_jobs = -2, all CPUs but one are used

    graphtype : {'exact', 'knn', 'mnn', 'auto'} (Default: 'auto')
        Manually selects graph type. Only recommended for expert users

    use_pygsp : `bool` (Default: `False`)
        If true, inherits from `pygsp.graphs.Graph`.

    initialize : `bool` (Default: `True`)
        If True, initialize the kernel matrix on instantiation

    **kwargs : extra arguments for `pygsp.graphs.Graph`

    Returns
    -------
    G : `DataGraph`

    Raises
    ------
    ValueError : if selected parameters are incompatible.
    """
    tasklogger.set_level(verbose)
    if sample_idx is not None and len(np.unique(sample_idx)) == 1:
        warnings.warn("Only one unique sample. " "Not using MNNGraph")
        sample_idx = None
        if graphtype == 'mnn':
            graphtype = 'auto'
    if graphtype == 'auto':
        # automatic graph selection
        if sample_idx is not None:
            # only mnn does batch correction
            graphtype = "mnn"
        elif precomputed is None and (decay is None or thresh > 0):
            # precomputed requires exact graph
            # no decay or threshold decay require knngraph
            graphtype = "knn"
        else:
            graphtype = "exact"

    # set base graph type
    if graphtype == "knn":
        basegraph = graphs.kNNGraph
        if precomputed is not None:
            raise ValueError("kNNGraph does not support precomputed "
                             "values. Use `graphtype='exact'` or "
                             "`precomputed=None`")
        if sample_idx is not None:
            raise ValueError("kNNGraph does not support batch "
                             "correction. Use `graphtype='mnn'` or "
                             "`sample_idx=None`")

    elif graphtype == "mnn":
        basegraph = graphs.MNNGraph
        if precomputed is not None:
            raise ValueError("MNNGraph does not support precomputed "
                             "values. Use `graphtype='exact'` and "
                             "`sample_idx=None` or `precomputed=None`")
    elif graphtype == "exact":
        basegraph = graphs.TraditionalGraph
        if sample_idx is not None:
            raise ValueError("TraditionalGraph does not support batch "
                             "correction. Use `graphtype='mnn'` or "
                             "`sample_idx=None`")
    else:
        raise ValueError("graphtype '{}' not recognized. Choose from "
                         "['knn', 'mnn', 'exact', 'auto']")

    # add landmarks if necessary
    parent_classes = [basegraph]
    msg = "Building {} graph".format(graphtype)
    if n_landmark is not None:
        parent_classes.append(graphs.LandmarkGraph)
        msg = msg + " with landmarks"
    if use_pygsp:
        parent_classes.append(base.PyGSPGraph)
        if len(parent_classes) > 2:
            msg = msg + " with PyGSP inheritance"
        else:
            msg = msg + " and PyGSP inheritance"

    tasklogger.log_debug(msg)

    class_names = [p.__name__.replace("Graph", "") for p in parent_classes]
    try:
        Graph = eval("graphs." + "".join(class_names) + "Graph")
    except NameError:
        raise RuntimeError("unknown graph classes {}".format(parent_classes))

    params = kwargs
    for parent_class in parent_classes:
        for param in parent_class._get_param_names():
            try:
                params[param] = eval(param)
            except NameError:
                # keyword argument not specified above - no problem
                pass

    # build graph and return
    tasklogger.log_debug("Initializing {} with arguments {}".format(
        parent_classes, ", ".join([
            "{}='{}'".format(key, value) for key, value in params.items()
            if key != "data"
        ])))
    return Graph(**params)
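
Hedged usage sketches for the factory on random data (sample_idx marks two hypothetical batches):

import numpy as np

data = np.random.normal(size=(100, 50))

# thresh > 0 and no sample_idx, so graphtype resolves to 'knn'
G = Graph(data, n_pca=20, knn=5, decay=10, thresh=1e-4)

# sample_idx given, so graphtype resolves to 'mnn'
sample_idx = np.repeat([0, 1], 50)
G_mnn = Graph(data, n_pca=20, knn=5, decay=10, sample_idx=sample_idx)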
Example #16
    def transform(self, X=None, t_max=100, plot_optimal_t=False, ax=None):
        """Computes the position of the cells in the embedding space

        Parameters
        ----------
        X : array, optional, shape=[n_samples, n_features]
            input data with `n_samples` samples and `n_features`
            dimensions. Not required, since PHATE does not currently embed
            cells not given in the input matrix to `PHATE.fit()`.
            Accepted data types: `numpy.ndarray`,
            `scipy.sparse.spmatrix`, `pd.DataFrame`, `anndata.AnnData`. If
            `knn_dist` is 'precomputed', `data` should be a n_samples x
            n_samples distance or affinity matrix

        t_max : int, optional, default: 100
            maximum t to test if `t` is set to 'auto'

        plot_optimal_t : boolean, optional, default: False
            If true and `t` is set to 'auto', plot the Von Neumann
            entropy used to select t

        ax : matplotlib.axes.Axes, optional
            If given and `plot_optimal_t` is true, plot will be drawn
            on the given axis.

        Returns
        -------
        embedding : array, shape=[n_samples, n_dimensions]
            The cells embedded in a lower dimensional space using PHATE
        """
        if self.graph is None:
            raise NotFittedError("This PHATE instance is not fitted yet. Call "
                                 "'fit' with appropriate arguments before "
                                 "using this method.")
        elif X is not None and not utils.matrix_is_equivalent(X, self.X):
            # fit to external data
            warnings.warn(
                "Pre-fit PHATE should not be used to transform a "
                "new data matrix. Please fit PHATE to the new"
                " data by running 'fit' with the new data.", RuntimeWarning)
            if isinstance(self.graph, graphtools.graphs.TraditionalGraph) and \
                    self.graph.precomputed is not None:
                raise ValueError("Cannot transform additional data using a "
                                 "precomputed distance matrix.")
            else:
                if self.embedding is None:
                    self.transform()
                transitions = self.graph.extend_to_data(X)
                return self.graph.interpolate(self.embedding, transitions)
        else:
            diff_potential = self._calculate_potential(
                t_max=t_max, plot_optimal_t=plot_optimal_t, ax=ax)
            if self.embedding is None:
                tasklogger.log_start("{} MDS".format(self.mds))
                self.embedding = mds.embed_MDS(diff_potential,
                                               ndim=self.n_components,
                                               how=self.mds,
                                               distance_metric=self.mds_dist,
                                               n_jobs=self.n_jobs,
                                               seed=self.random_state,
                                               verbose=max(
                                                   self.verbose - 1, 0))
                tasklogger.log_complete("{} MDS".format(self.mds))
            if isinstance(self.graph, graphtools.graphs.LandmarkGraph):
                tasklogger.log_debug("Extending to original data...")
                return self.graph.interpolate(self.embedding)
            else:
                return self.embedding
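
A usage sketch of the fit/transform flow, assuming op is an instance of the estimator defined above:

import numpy as np

X = np.random.normal(size=(500, 100))
op.fit(X)              # builds the graph and diffusion operator
Y = op.transform()     # embeds the fitted data, shape (500, n_components)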
Example #17
def test_log():
    tasklogger.log_debug('debug')
    tasklogger.log_info('info')
    tasklogger.log_warning('warning')
    tasklogger.log_error('error')
    tasklogger.log_critical('critical')
Example #18
            args.metadata_channels = None
        else:
            parser.error(
                "Cannot handle --metadata-channels with {} file".format(
                    filetype))

    # check for inappropriately set parameters
    if not args.transform == 'log':
        if '--pseudocount' in sys.argv:
            parser.error(
                "Cannot handle --pseudocount with --transform {}".format(
                    args.transform))
        else:
            args.pseudocount = None
    if not args.transform == 'arcsinh':
        if '--cofactor' in sys.argv:
            parser.error("Cannot handle --cofactor with --transform {}".format(
                args.transform))
        else:
            args.cofactor = None

    return args


if __name__ == "__main__":
    args = parse_args()
    tasklogger.set_level(args.verbose)
    tasklogger.log_debug("Running MAGIC with arguments {}".format(
        args.__dict__))
    run_magic_from_file(**(args.__dict__))