Example #1
def measure_splat_range(load_fn,
                        var_name,
                        var_range,
                        n_jobs=1,
                        seed=None,
                        **load_kwargs):
    truth_kwargs = {}
    truth_kwargs["dropout"] = 0
    truth_kwargs["bcv"] = 0
    truth_kwargs.update(load_kwargs)
    tasklogger.log_info("Generating ground truth data...", logger="demap")
    data_truth = load_fn(seed=seed, **truth_kwargs)
    results = pd.concat([
        measure_all_methods(
            data_truth,
            load_fn,
            load_params=dict(**{var_name: var_value}, seed=seed,
                             **load_kwargs),
            n_jobs=n_jobs,
            seed=seed,
        ) for var_value in var_range
    ])
    results.to_csv("../results/{}_{}_{}_{}_{}.csv".format(
        load_fn.__name__, var_name, var_range.min(), var_range.max(), seed))
    results_agg = (results.groupby("method").agg({
        "demap": [np.mean, np.std]
    }).sort_values(("demap", "mean"), ascending=False))
    print(results_agg)
    return results_agg
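A minimal usage sketch for the function above. `toy_loader` is a stand-in for a Splatter-style simulator (the real `load_fn` in demap); its name and the swept range are illustrative, not part of the demap API.

import numpy as np

def toy_loader(seed=None, dropout=0, bcv=0, return_groups=False, **kwargs):
    # Stand-in loader: random data, with `dropout` zeroing entries at random.
    rng = np.random.RandomState(seed)
    data = rng.normal(size=(100, 50)) * (1 + (bcv or 0))
    data[rng.uniform(size=data.shape) < (dropout or 0)] = 0
    return (data, rng.randint(3, size=100)) if return_groups else data

results_agg = measure_splat_range(
    toy_loader, var_name="dropout",
    var_range=np.linspace(0, 0.5, 6), n_jobs=1, seed=42)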
Example #2
def get_levels(grad):
    """Short summary.

    Parameters
    ----------
    grad : type
        Description of parameter `Xs`.

    Returns
    -------
    type
        Description of returned object.


    """
    tasklogger.log_info("Identifying salient levels of resolution...")
    minimum = np.max(grad)
    levels = []
    levels.append(0)

    for i in range(1, len(grad) - 1):
        if grad[i] <= minimum and grad[i] < grad[i + 1]:
            levels.append(i)
            minimum = grad[i]
    return levels
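A quick check of the selection rule on a toy gradient (assuming `tasklogger` and `numpy` are imported as in the module above). Level 0 is always kept; a later level is kept only when its gradient sets a new minimum and is lower than the next value, so the final entry is never eligible.

import numpy as np

grad = np.array([5, 3, 4, 2, 2, 6, 1])
print(get_levels(grad))  # [0, 1, 4]: indices 1 and 4 each set a new minimum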
Example #3
def compute_gradient(Xs, merges):
    """Short summary.

    Parameters
    ----------
    Xs : type
        Description of parameter `Xs`.
    merges : type
        Description of parameter `merges`.

    Returns
    -------
    type
        Description of returned object.

    """
    tasklogger.log_info("Computing gradient...")
    gradient = []
    m = 0
    X = Xs[0]

    for l in range(0, len(Xs) - 1):
        if X.shape[0] != Xs[l + 1].shape[0]:
            X_1 = condense_visualization(merges[m], X)
            m = m + 1
            while X_1.shape[0] != Xs[l + 1].shape[0]:
                X_1 = condense_visualization(merges[m], X_1)
                m = m + 1
        else:
            X_1 = X
        gradient.append(np.sum(np.abs(X_1 - Xs[l + 1])))
        X = Xs[l + 1]
    return np.array(gradient)
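The two helpers chain naturally. A minimal sketch with three equal-sized levels, in which case `compute_gradient` never touches `merges` and an empty list suffices:

import numpy as np

rng = np.random.RandomState(0)
Xs = [rng.normal(size=(20, 2)) for _ in range(3)]  # same shape at every level
grad = compute_gradient(Xs, merges=[])             # length len(Xs) - 1
levels = get_levels(grad)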
Example #4
    def fit(self, X):
        """Computes the diffusion operator

        Parameters
        ----------
        X : array, shape=[n_samples, n_features]
            input data with `n_samples` samples and `n_features`
            dimensions. Accepted data types: `numpy.ndarray`,
            `scipy.sparse.spmatrix`, `pd.DataFrame`, `anndata.AnnData`. If
            `knn_dist` is 'precomputed', `data` should be an n_samples x
            n_samples distance or affinity matrix

        Returns
        -------
        phate_operator : PHATE
            The estimator object
        """
        X, n_pca, precomputed, update_graph = self._parse_input(X)

        if precomputed is None:
            tasklogger.log_info(
                "Running PHATE on {} cells and {} genes.".format(
                    X.shape[0], X.shape[1]))
        else:
            tasklogger.log_info(
                "Running PHATE on precomputed {} matrix with {} cells.".format(
                    precomputed, X.shape[0]))

        if self.n_landmark is None or X.shape[0] <= self.n_landmark:
            n_landmark = None
        else:
            n_landmark = self.n_landmark

        if self.graph is not None and update_graph:
            self._update_graph(X, precomputed, n_pca, n_landmark)

        self.X = X

        if self.graph is None:
            tasklogger.log_start("graph and diffusion operator")
            self.graph = graphtools.Graph(
                X,
                n_pca=n_pca,
                n_landmark=n_landmark,
                distance=self.knn_dist,
                precomputed=precomputed,
                knn=self.knn,
                decay=self.decay,
                thresh=1e-4,
                n_jobs=self.n_jobs,
                verbose=self.verbose,
                random_state=self.random_state,
                **(self.kwargs))
            tasklogger.log_complete("graph and diffusion operator")

        # landmark op doesn't build unless forced
        self.diff_op
        return self
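Typical usage of the public API, which wraps the `fit` shown above (assuming a recent `phate` release in which the parameter is named `knn` rather than the older `k`):

import numpy as np
import phate

X = np.random.normal(size=(200, 100))           # 200 cells x 100 genes
phate_op = phate.PHATE(n_components=2, knn=5, random_state=42)
phate_op.fit(X)                                 # builds graph + diffusion operator
embedding = phate_op.transform()                # shape (200, 2)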
Example #5
def measure_all_methods(load_fn,
                        n_cells=None,
                        n_jobs=1,
                        load_params=None,
                        seed=None):
    if load_params is None:
        load_params = {}
    if "n_cells" in load_params:
        n_cells = load_params["n_cells"]
        del load_params["n_cells"]
    tasklogger.log_info("Generating noisy data with {}...".format(load_params),
                        logger="demap")
    data_noised, labels = load_fn(return_groups=True, **load_params)
    data_name = load_fn.__name__
    if n_cells is not None:
        subsample_idx = np.random.choice(data_noised.shape[0],
                                         n_cells,
                                         replace=False)
    else:
        subsample_idx = None
    measure = partial(
        measure_method,
        labels=labels,
        data_noised=data_noised,
        data_name=data_name,
        subsample_idx=subsample_idx,
    )
    if n_jobs == 1:
        results = [
            measure(method=method) for method in demap.embed.all_methods
        ]
    else:
        results = Parallel(n_jobs=n_jobs)(
            delayed(measure)(method=method)
            for method in demap.embed.parallel_methods)
        results = results + [
            measure(method=method)
            for method in demap.embed.non_parallel_methods
        ]
    df = pd.concat(results)
    df = df.sort_values("ARI", ascending=False)
    for key, value in load_params.items():
        df[key] = value
    if n_cells is not None:
        df["n_cells"] = n_cells
    print(df)
    return df
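An illustrative call, reusing `toy_loader` from the sketch under Example #1; the `demap` package must be importable for the body above to run:

df = measure_all_methods(
    toy_loader,
    n_jobs=1,
    load_params={"dropout": 0.2, "bcv": 0.1},
    seed=42)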
Example #6
    def _find_optimal_t(self, t_max=100, plot=False, ax=None):
        """Find the optimal value of t

        Selects the optimal value of t based on the knee point of the
        Von Neumann Entropy of the diffusion operator.

        Parameters
        ----------
        t_max : int, default: 100
            Maximum value of t to test

        plot : boolean, default: False
            If true, plots the Von Neumann Entropy and knee point

        ax : matplotlib.Axes, default: None
            If plot=True and ax is not None, plots the VNE on the given axis
            Otherwise, creates a new axis and displays the plot

        Returns
        -------
        t_opt : int
            The optimal value of t
        """
        tasklogger.log_start("optimal t")
        t, h = self._von_neumann_entropy(t_max=t_max)
        t_opt = vne.find_knee_point(y=h, x=t)
        tasklogger.log_info("Automatically selected t = {}".format(t_opt))
        tasklogger.log_complete("optimal t")

        if plot:
            if ax is None:
                fig, ax = plt.subplots()
                show = True
            else:
                show = False
            ax.plot(t, h)
            ax.scatter(t_opt, h[t == t_opt], marker='*', c='k', s=50)
            ax.set_xlabel("t")
            ax.set_ylabel("Von Neumann Entropy")
            ax.set_title("Optimal t = {}".format(t_opt))
            if show:
                plt.show()

        self.optimal_t = t_opt

        return t_opt
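The knee-point selection can be exercised directly on a synthetic entropy-like curve, using the same `vne.find_knee_point(y=h, x=t)` call as the method above (assuming `phate.vne` is importable):

import numpy as np
from phate import vne

t = np.arange(1, 101)
h = 5 * np.exp(-t / 10) + 0.01 * (100 - t)  # fast decay, then a slow plateau
t_knee = vne.find_knee_point(y=h, x=t)      # t at the bend of the curve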
Example #7
    def fit(self, X, Y, q=None):
        if hasattr(self, "phi_X"):
            tasklogger.log_info("Using precomputed diffusion coordinates.")
        else:
            tasklogger.log_start("diffusion coordinates")
            if q is None:
                with parallel.ParallelQueue(n_jobs=min(2, self.n_jobs)) as q:
                    return self.fit(X, Y, q)
            else:
                q.queue(
                    math.diffusionCoordinates,
                    X,
                    decay=self.decay_X,
                    knn=self.knn_X,
                    n_pca=self.n_pca_X if self.n_pca_X is not None
                    and self.n_pca_X < min(X.shape) else None,
                    n_eigenvectors=self.n_eigenvectors,
                    n_jobs=max(self.n_jobs // 2, 1),
                    verbose=self.verbose,
                    random_state=self.random_state,
                )
                q.queue(
                    math.diffusionCoordinates,
                    Y,
                    decay=self.decay_Y,
                    knn=self.knn_Y,
                    n_pca=self.n_pca_Y if self.n_pca_Y is not None
                    and self.n_pca_Y < min(Y.shape) else None,
                    n_eigenvectors=self.n_eigenvectors,
                    n_jobs=max(self.n_jobs // 2, 1),
                    verbose=self.verbose,
                    random_state=self.random_state,
                )
            (phi_X, lambda_X), (phi_Y, lambda_Y) = q.run()
            self.phi_X = phi_X
            self.lambda_X = lambda_X
            self.phi_Y = phi_Y
            self.lambda_Y = lambda_Y
            tasklogger.log_complete("diffusion coordinates")
        return self
Example #8
def compute_condensation_param(X, granularity):
    """Short summary.

    Parameters
    ----------
    X : type
        Description of parameter `X`.
    granularity : type
        Description of parameter `granularity`.

    Returns
    -------
    type
        Description of returned object.

    """
    epsilon = granularity * (0.1 * np.mean(np.std(X))) / (X.shape[0] ** (-1 / 5))
    D = scipy.spatial.distance.pdist(X, metric="euclidean")
    merge_threshold = np.percentile(D, 0.001) + 0.001
    tasklogger.log_info("Setting epsilon to " + str(round(epsilon, 4)))
    tasklogger.log_info("Setting merge threshold to " + str(round(merge_threshold, 4)))
    return epsilon, merge_threshold
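A quick sanity check on random data (assuming `numpy`, `scipy`, and `tasklogger` are imported as in the module): `epsilon` scales with the granularity and the spread of the data, while the merge threshold comes from the 0.001th percentile of pairwise distances.

import numpy as np

rng = np.random.RandomState(0)
X = rng.normal(size=(500, 10))
epsilon, merge_threshold = compute_condensation_param(X, granularity=0.1)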
Example #9
    def _update_graph(self, X, precomputed, n_pca, n_landmark):
        if self.X is not None and not utils.matrix_is_equivalent(
                X, self.X):
            """
            If the same data is used, we can reuse existing kernel and
            diffusion matrices. Otherwise we have to recompute.
            """
            self._reset_graph()
        else:
            try:
                self.graph.set_params(
                    decay=self.decay, knn=self.knn, distance=self.knn_dist,
                    precomputed=precomputed,
                    n_jobs=self.n_jobs, verbose=self.verbose, n_pca=n_pca,
                    n_landmark=n_landmark,
                    random_state=self.random_state)
                tasklogger.log_info(
                    "Using precomputed graph and diffusion operator...")
            except ValueError as e:
                # something changed that should have invalidated the graph
                tasklogger.log_debug("Reset graph due to {}".format(
                    str(e)))
                self._reset_graph()
Example #10
    def impute(self, data):
        """Main function of I-Impute

        Parameters
        ----------
        data : matrix, shape (m x n)
            The raw reads count matrix

        Returns
        -------
        imputed_data: matrix, shape (m x n)
            The imputed matrix, pandas Dataframe object
        """
        tasklogger.log_start('I-Impute')
        imputed_data = None
        if self.iteration:
            exp_mse = 1
            mse = 100
            previous_imputed_data = data
            iteration = 1
            while mse > exp_mse:
                tasklogger.log_info(
                    'iteratively impute for the {0}th time'.format(iteration))
                current_imputed_data = self._cimpute(previous_imputed_data)
                dist_matrix = (current_imputed_data - previous_imputed_data)**2
                n_values = data.shape[0] * data.shape[1]
                mse = np.sqrt(dist_matrix.values.sum() / n_values)
                previous_imputed_data = current_imputed_data
                iteration += 1

            imputed_data = previous_imputed_data
        else:
            imputed_data = self._cimpute(data)

        tasklogger.log_complete('I-Impute')
        return imputed_data
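A hypothetical usage sketch; the exact import path depends on how the I-Impute package is laid out, so the commented lines below are indicative only:

import numpy as np
import pandas as pd
# from iimpute import IImpute   # hypothetical import path

counts = pd.DataFrame(np.random.poisson(2, size=(100, 50)))
# ii = IImpute(iteration=True)  # iterate until the RMSE between rounds converges
# imputed = ii.impute(counts)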
Example #11
def measure_all_methods(load_fn,
                        dropout=None,
                        bcv=None,
                        n_cells=None,
                        n_genes=None,
                        n_jobs=6,
                        seed=None):
    dataset = load_fn(seed=seed, dropout=dropout, bcv=bcv,
                      n_genes=n_genes)
    data_truth = dataset.X_true
    tasklogger.log_info("Calculating geodesic distances...")
    geodesic_dist = quantify.geodesic_distance(data_truth)
    data_noised = dataset.X
    if n_cells is not None and n_cells < Splatter.N_CELLS:
        subsample_idx = np.random.choice(
            data_noised.shape[0], n_cells, replace=False)
    else:
        subsample_idx = None
    # embed
    tasklogger.log_info("Embedding...")
    methods = [m for m in embed.__all__ if m.__name__ not in ('MDS', 'PHATE')]
    embeddings = Parallel(n_jobs=n_jobs)(
        delayed(method)(data_noised, seed=seed) for method in methods)
    methods.append(embed.PHATE)
    embeddings.append(embed.PHATE(data_noised, seed=seed, n_jobs=10))
    methods.append(embed.MDS)
    embeddings.append(embed.MDS(data_noised, seed=seed, n_jobs=10))
    # plot
    tasklogger.log_info("Plotting...")
    fig, axes = plt.subplots(1, len(embeddings), figsize=(4*len(embeddings), 4))
    for embedding, ax, method in zip(embeddings, axes, methods):
        scprep.plot.scatter2d(embedding, ax=ax, label_prefix=method.__name__, 
                              ticks=False, c=dataset.c, legend=False)
    plt.tight_layout()
    fig.savefig(IMG_PATH.format(dataset.name, seed))
    # evaluate
    tasklogger.log_info("Evaluating...")
    results = [measure_method(embedding=embedding, data=data_truth, data_noised=data_noised,
                                    name=method.__name__, subsample_idx=subsample_idx,
                                    geodesic_dist=geodesic_dist, labels=dataset.c, seed=seed)
            for embedding, method in zip(embeddings, methods)]
    df = pd.concat(results)
    df = df.sort_values('DEMaP', ascending=False)
    print(df)
    return df
Example #12
    def fit(self, X):
        """Computes the diffusion operator

        Parameters
        ----------
        X : array, shape=[n_samples, n_features]
            input data with `n_samples` samples and `n_features`
            dimensions. Accepted data types: `numpy.ndarray`,
            `scipy.sparse.spmatrix`, `pd.DataFrame`, `anndata.AnnData`. If
            `knn_dist` is 'precomputed', `data` should be an n_samples x
            n_samples distance or affinity matrix

        Returns
        -------
        phate_operator : PHATE
            The estimator object
        """
        try:
            if isinstance(X, anndata.AnnData):
                X = X.X
        except NameError:
            # anndata not installed
            pass

        if self.knn_dist.startswith('precomputed'):
            if self.knn_dist == 'precomputed':
                # automatic detection
                if isinstance(X, sparse.coo_matrix):
                    X = X.tocsr()
                if X[0, 0] == 0:
                    precomputed = "distance"
                else:
                    precomputed = "affinity"
            elif self.knn_dist in [
                    'precomputed_affinity', 'precomputed_distance'
            ]:
                precomputed = self.knn_dist.split("_")[1]
            else:
                raise ValueError(
                    "knn_dist {} not recognized. Did you mean "
                    "'precomputed_distance', "
                    "'precomputed_affinity', or 'precomputed' "
                    "(automatically detects distance or affinity)?".format(
                        self.knn_dist))
            tasklogger.log_info(
                "Using precomputed {} matrix...".format(precomputed))
            n_pca = None
        else:
            precomputed = None
            if X.shape[1] <= self.n_pca:
                n_pca = None
            else:
                n_pca = self.n_pca
        if self.n_landmark is None or X.shape[0] <= self.n_landmark:
            n_landmark = None
        else:
            n_landmark = self.n_landmark

        if self.graph is not None:
            if self.X is not None and not matrix_is_equivalent(X, self.X):
                """
                If the same data is used, we can reuse existing kernel and
                diffusion matrices. Otherwise we have to recompute.
                """
                self._reset_graph()
            else:
                try:
                    self.graph.set_params(decay=self.a,
                                          knn=self.k + 1,
                                          distance=self.knn_dist,
                                          precomputed=precomputed,
                                          n_jobs=self.n_jobs,
                                          verbose=self.verbose,
                                          n_pca=n_pca,
                                          thresh=1e-4,
                                          n_landmark=n_landmark,
                                          random_state=self.random_state)
                    tasklogger.log_info(
                        "Using precomputed graph and diffusion operator...")
                except ValueError as e:
                    # something changed that should have invalidated the graph
                    tasklogger.log_debug("Reset graph due to {}".format(
                        str(e)))
                    self._reset_graph()

        self.X = X

        if self.graph is None:
            tasklogger.log_start("graph and diffusion operator")
            self.graph = graphtools.Graph(X,
                                          n_pca=n_pca,
                                          n_landmark=n_landmark,
                                          distance=self.knn_dist,
                                          precomputed=precomputed,
                                          knn=self.k + 1,
                                          decay=self.a,
                                          thresh=1e-4,
                                          n_jobs=self.n_jobs,
                                          verbose=self.verbose,
                                          random_state=self.random_state)
            tasklogger.log_complete("graph and diffusion operator")

        # landmark op doesn't build unless forced
        self.diff_op
        return self
Example #13
def run_magic_from_file(
        filename,
        # data loading params
        sparse=True,
        gene_names=None,
        cell_names=None,
        cell_axis=None,
        gene_labels=None,
        allow_duplicates=None,
        genome=None,
        metadata_channels=None,
        # filtering params
        min_library_size=2000,
        min_cells_per_gene=10,
        # normalization params
        library_size_normalize=True,
        transform='sqrt',
        pseudocount=None,
        cofactor=None,
        # kernel params
        knn=5,
        decay=15,
        n_pca=100,
        knn_dist='euclidean',
        n_jobs=1,
        random_state=42,
        verbose=1,
        # magic params
        t_magic='auto',
        genes=None,
        # output params
        output='magic.csv',
        validate=False):
    """Run MAGIC on a file

    Parameters
    ----------
    filename : str
        Allowed types: csv, tsv, mtx, hdf5/h5 (10X format),
        directory/zip (10X format)
    sparse : bool (recommended: True for scRNAseq, False for CyTOF)
        Force data sparsity. If `None`, sparsity is determined by data type.
    gene_names : str, list or bool
        Allowed values:
        - if filetype is csv or fcs, `True` says gene names are data
        headers, `str` gives a path to a separate csv or tsv file containing
        gene names, list gives an array of gene names, `False` means
        no gene names are given
        - if filetype is mtx, `str` gives a path to a separate csv or tsv file
        containing gene names, list gives an array of gene names, or `False`
        means no gene names are given
        - if filetype is hdf5, h5, directory or zip, must be `None`.
    cell_names : str, list or bool
        Allowed values:
        - if filetype is csv or fcs, `True` says cell names are data
        headers, `str` gives a path to a separate csv or tsv file containing
        cell names, list gives an array of cell names, `False` means
        no cell names are given
        - if filetype is mtx, `str` gives a path to a separate csv or tsv file
        containing cell names, list gives an array of cell names, or `False`
        means no cell names are given
        - if filetype is hdf5, h5, directory or zip, must be `None`.
    cell_axis : {'row', 'column'}
        States whether cells are on rows or columns. If cell_axis=='row',
        data is of shape [n_cells, n_genes]. If cell_axis=='column', data is of
        shape [n_genes, n_cells]. Only valid for filetype mtx and csv
    gene_labels : {'symbol', 'id', 'both'}
        Choice of gene labels for 10X data. Recommended: 'both'
        Only valid for directory, zip, hdf5, h5
    allow_duplicates : bool
        Allow duplicate gene names in 10X data. Recommended: True
        Only valid for directory, zip, hdf5, h5
    genome : str
        Genome name. Only valid for hdf5, h5
    metadata_channels : list of str (recommended: ['Time', 'Event_length', 'DNA1', 'DNA2', 'Cisplatin', 'beadDist', 'bead1'])
        Names of channels in fcs data which are not real measurements.
        Only valid if datatype is fcs.
    min_library_size : int or `None`, optional (default: 2000)
        Cutoff for library size normalization. If `None`,
        library size filtering is not used
    min_cells_per_gene : int or `None`, optional (default: 10)
        Minimum non-zero cells for a gene to be used. If `None`,
        genes are not removed
    library_size_normalize : `bool`, optional (default: True)
        Use library size normalization
    transform : {'sqrt', 'log', 'arcsinh', None}
        How to transform the data. If `None`, no transformation is done
    pseudocount : float (recommended: 1)
        Number of pseudocounts to add to genes prior to log transformation
    cofactor : float (recommended: 5)
        Factor by which to divide genes prior to arcsinh transformation
    knn : int, optional, default: 5
        number of nearest neighbors on which to build kernel
    decay : int, optional, default: 15
        sets decay rate of kernel tails.
        If None, alpha decaying kernel is not used
    n_pca : int, optional, default: 100
        Number of principal components to use for calculating
        neighborhoods. For extremely large datasets, using
        n_pca < 20 allows neighborhoods to be calculated in
        roughly log(n_samples) time.
    knn_dist : string, optional, default: 'euclidean'
        recommended values: 'euclidean', 'cosine'
        Any metric from `scipy.spatial.distance` can be used
        distance metric for building kNN graph.
    n_jobs : integer, optional, default: 1
        The number of jobs to use for the computation.
        If -1 all CPUs are used. If 1 is given, no parallel computing code is
        used at all, which is useful for debugging.
        For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for
        n_jobs = -2, all CPUs but one are used
    random_state : integer or numpy.RandomState, optional, default: 42
        The generator used to initialize random PCA
        If an integer is given, it fixes the seed
        Defaults to the global `numpy` random number generator
    verbose : `int` or `boolean`, optional (default: 1)
        If `True` or `> 0`, print status messages
    t_magic : int, optional, default: 'auto'
        power to which the diffusion operator is powered for MAGIC.
        This sets the level of diffusion. If 'auto', t is selected
        according to the Procrustes disparity of the diffused data
    genes : list or {"all_genes", "pca_only"}, optional (default: None)
        List of genes to return from MAGIC,
        either as integer indices or column names
        if input data is a pandas DataFrame. If "all_genes", the entire
        smoothed matrix is returned. If "pca_only", PCA on the smoothed
        data is returned. If None, the entire matrix is also
        returned, but a warning may be raised if the resultant matrix
        is very large.
    output : str, optional (default: 'magic.csv')
        Output CSV file to save smoothed data matrix
    validate : bool, optional (default: False)
        If True, compare the output against a precomputed expected
        result to check the installation
    """
    # check arguments
    filetype = check_filetype(filename)
    load_fn, load_kws = check_load_args(filetype,
                                        sparse=sparse,
                                        gene_names=gene_names,
                                        cell_names=cell_names,
                                        cell_axis=cell_axis,
                                        gene_labels=gene_labels,
                                        allow_duplicates=allow_duplicates,
                                        genome=genome,
                                        metadata_channels=metadata_channels)
    transform_fn, transform_kws = check_transform_args(transform=transform,
                                                       pseudocount=pseudocount,
                                                       cofactor=cofactor)

    # set up logging
    # https://github.com/scottgigante/tasklogger
    tasklogger.set_level(verbose)

    # load data
    # example: scprep.io.load_csv("data.csv")
    # https://scprep.readthedocs.io/en/stable/reference.html#module-scprep.io
    tasklogger.log_info("Loading data from {}...".format(filename))
    data = load_fn(filename, **load_kws)
    data = scprep.sanitize.check_numeric(data, copy=True)
    tasklogger.log_info("Loaded {} cells and {} genes.".format(
        data.shape[0], data.shape[1]))

    # filter data
    # https://scprep.readthedocs.io/en/stable/reference.html#module-scprep.filter
    if min_library_size is not None:
        tasklogger.log_info("Filtering cells by library size >= {}...".format(
            min_library_size))
        data = scprep.filter.filter_library_size(data, cutoff=min_library_size)
        tasklogger.log_info("Retained {} cells.".format(data.shape[0]))
    if min_cells_per_gene is not None:
        tasklogger.log_info(
            "Filtering genes by min cells >= {}...".format(min_cells_per_gene))
        data = scprep.filter.filter_rare_genes(data,
                                               min_cells=min_cells_per_gene)
        tasklogger.log_info("Retained {} genes.".format(data.shape[1]))

    # normalize data
    # https://scprep.readthedocs.io/en/stable/reference.html#module-scprep.normalize
    if library_size_normalize:
        tasklogger.log_info("Library size normalizing data...")
        data = scprep.normalize.library_size_normalize(data)

    # transform data
    # example: data = scprep.transform.sqrt(data)
    # https://scprep.readthedocs.io/en/stable/reference.html#module-scprep.transform
    if transform is not None:
        tasklogger.log_info("Applying {} transform...".format(transform))
        data = transform_fn(data, **transform_kws)

    # run MAGIC
    # https://magic.readthedocs.io/
    magic_op = magic.MAGIC(knn=knn,
                           decay=decay,
                           t=t_magic,
                           n_pca=n_pca,
                           knn_dist=knn_dist,
                           n_jobs=n_jobs,
                           random_state=random_state,
                           verbose=verbose)
    magic_data = magic_op.fit_transform(data, genes=genes)

    # save as csv
    magic_data = pd.DataFrame(magic_data)
    if cell_axis in ['col', 'column']:
        magic_data = magic_data.T
    tasklogger.log_info("Saving data to {}...".format(output))
    magic_data.to_csv(output)
    tasklogger.log_info("Complete.".format(output))
    if validate:
        correct_magic_data = scprep.io.load_csv(
            'https://raw.githubusercontent.com/KrishnaswamyLab/magic-docker/'
            'master/magic-validate.csv',
            sparse=False)
        try:
            np.testing.assert_equal(scprep.utils.toarray(magic_data),
                                    scprep.utils.toarray(correct_magic_data))
            tasklogger.log_debug(
                "Validation complete, output is equal to expected")
        except AssertionError:
            np.testing.assert_allclose(
                scprep.utils.toarray(magic_data),
                scprep.utils.toarray(correct_magic_data),
                atol=1e-14)
            tasklogger.log_debug(
                "Validation complete, output is numerically equivalent to expected"
            )
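A representative call (the file path is illustrative; all keyword arguments below appear in the signature above):

run_magic_from_file(
    "data.csv",
    gene_names=True,
    cell_axis="row",
    min_library_size=2000,
    transform="sqrt",
    knn=5,
    n_jobs=4,
    output="magic_smoothed.csv")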
Example #14
def test_log():
    tasklogger.log_debug("debug")
    tasklogger.log_info("info")
    tasklogger.log_warning("warning")
    tasklogger.log_error("error")
    tasklogger.log_critical("critical")
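Whether these messages appear depends on the logging level. A short sketch with `tasklogger.set_level`, which the MAGIC script above also uses (level 0 shows warnings and above, 1 adds info, 2 adds debug):

import tasklogger

tasklogger.set_level(2)   # debug and up
tasklogger.log_debug("visible at level 2")
tasklogger.log_info("visible at level 1 and above")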
Example #15
    def impute(self,
               data,
               t_max=20,
               plot=False,
               ax=None,
               max_genes_compute_t=500,
               threshold=0.001):
        """Peform MAGIC imputation

        Parameters
        ----------
        data : graphtools.Graph, graphtools.Data or array-like
            Input data
        t_max : int, optional (default: 20)
            Maximum value of t to consider for optimal t selection
        plot : bool, optional (default: False)
            Plot the optimal t selection graph
        ax : matplotlib.Axes, optional (default: None)
            Axis on which to plot. If None, a new axis is created
        max_genes_compute_t : int, optional (default: 500)
            Above this number, genes will be subsampled for
            optimal t selection
        threshold : float, optional (default: 0.001)
            Threshold after which Procrustes disparity is considered
            to have converged for optimal t selection

        Returns
        -------
        X_magic : array-like, shape=[n_samples, n_pca]
            Imputed data
        """
        if not isinstance(data, graphtools.base.Data):
            data = graphtools.base.Data(data, n_pca=self.n_pca)
        data_imputed = data.data_nu

        if data_imputed.shape[1] > max_genes_compute_t:
            subsample_genes = np.random.choice(data_imputed.shape[1],
                                               max_genes_compute_t,
                                               replace=False)
        else:
            subsample_genes = None
        if hasattr(data, "data_pca"):
            weights = None  # data.data_pca.explained_variance_ratio_
        else:
            weights = None
        if self.t == 'auto':
            _, data_prev = self.calculate_error(
                data_imputed,
                data_prev=None,
                weights=weights,
                subsample_genes=subsample_genes)
            error_vec = []
            t_opt = None
        else:
            t_opt = self.t

        tasklogger.log_start("imputation")

        # classic magic
        # the diffusion matrix is powered when t has been specified by
        # the user, and the dimensions of the diffusion matrix are smaller
        # than those of the data matrix. (M^t) * D
        if (t_opt is not None) and \
                (self.diff_op.shape[1] < data_imputed.shape[1]):
            diff_op_t = np.linalg.matrix_power(self.diff_op, t_opt)
            data_imputed = diff_op_t.dot(data_imputed)

        # fast magic
        # a while loop is used when the dimensions of the diffusion matrix
        # are greater than those of the data matrix, or when t is not specified
        # (so as to allow for the calculation of the optimal t value)
        else:
            i = 0
            while (t_opt is None and i < t_max) or \
                    (t_opt is not None and i < t_opt):
                i += 1
                data_imputed = self.diff_op.dot(data_imputed)
                if self.t == 'auto':
                    error, data_prev = self.calculate_error(
                        data_imputed,
                        data_prev,
                        weights=weights,
                        subsample_genes=subsample_genes)
                    error_vec.append(error)
                    tasklogger.log_debug("{}: {}".format(i, error_vec))
                    if error < threshold and t_opt is None:
                        t_opt = i + 1
                        tasklogger.log_info(
                            "Automatically selected t = {}".format(t_opt))

        tasklogger.log_complete("imputation")

        if plot:
            # continue to t_max
            tasklogger.log_start("optimal t plot")
            if t_opt is None:
                # never converged
                warnings.warn("optimal t > t_max ({})".format(t_max),
                              RuntimeWarning)
            else:
                data_overimputed = data_imputed
                while i < t_max:
                    i += 1
                    data_overimputed = self.diff_op.dot(data_overimputed)
                    error, data_prev = self.calculate_error(
                        data_overimputed,
                        data_prev,
                        weights=weights,
                        subsample_genes=subsample_genes)
                    error_vec.append(error)

            # create axis
            if ax is None:
                fig, ax = plt.subplots()
                show = True
            else:
                show = False

            # plot
            x = np.arange(len(error_vec)) + 1
            ax.plot(x, error_vec)
            if t_opt is not None:
                ax.plot(
                    t_opt,
                    error_vec[t_opt - 1],
                    'ro',
                    markersize=10,
                )
            ax.plot(x, np.full(len(error_vec), threshold), 'k--')
            ax.set_xlabel('t')
            ax.set_ylabel('disparity(data_{t}, data_{t-1})')
            ax.set_xlim([1, len(error_vec)])
            plt.tight_layout()
            tasklogger.log_complete("optimal t plot")
            if show:
                plt.show(block=False)

        return data_imputed
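In recent MAGIC releases this imputation loop sits behind the public `fit_transform`; a minimal sketch (assuming the current parameter names `knn` and `t`, rather than the older `k`/`a`):

import numpy as np
import magic

X = np.random.poisson(1, size=(200, 100)).astype(float)
magic_op = magic.MAGIC(t="auto", knn=5, random_state=42)
X_magic = magic_op.fit_transform(X)   # graph build, then diffusion to optimal t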
Example #16
    def fit(self, X):
        """Computes the diffusion operator

        Parameters
        ----------
        X : array, shape=[n_samples, n_features]
            input data with `n_samples` samples and `n_features`
            dimensions. Accepted data types: `numpy.ndarray`,
            `scipy.sparse.spmatrix`, `pd.DataFrame`, `anndata.AnnData`.

        Returns
        -------
        magic_operator : MAGIC
            The estimator object
        """
        if self.knn_dist == 'precomputed':
            if isinstance(X, sparse.coo_matrix):
                X = X.tocsr()
            if X[0, 0] == 0:
                precomputed = "distance"
            else:
                precomputed = "affinity"
            tasklogger.log_info(
                "Using precomputed {} matrix...".format(precomputed))
            n_pca = None
        else:
            precomputed = None
            if self.n_pca is None or X.shape[1] <= self.n_pca:
                n_pca = None
            else:
                n_pca = self.n_pca

        if self.graph is not None:
            if self.X is not None and not \
                    utils.matrix_is_equivalent(X, self.X):
                """
                If the same data is used, we can reuse existing kernel and
                diffusion matrices. Otherwise we have to recompute.
                """
                self.graph = None
            else:
                try:
                    self.graph.set_params(decay=self.a,
                                          knn=self.k + 1,
                                          distance=self.knn_dist,
                                          precomputed=precomputed,
                                          n_jobs=self.n_jobs,
                                          verbose=self.verbose,
                                          n_pca=n_pca,
                                          thresh=1e-4,
                                          random_state=self.random_state)
                    tasklogger.log_info(
                        "Using precomputed graph and diffusion operator...")
                except ValueError as e:
                    # something changed that should have invalidated the graph
                    tasklogger.log_debug("Reset graph due to {}".format(
                        str(e)))
                    self.graph = None

        self.X = X

        if utils.has_empty_columns(X):
            warnings.warn("Input matrix contains unexpressed genes. "
                          "Please remove them prior to running MAGIC.")

        if self.graph is None:
            # reset X_magic in case it was previously set
            self.X_magic = None
            tasklogger.log_start("graph and diffusion operator")
            self.graph = graphtools.Graph(X,
                                          n_pca=n_pca,
                                          knn=self.k + 1,
                                          decay=self.a,
                                          thresh=1e-4,
                                          n_jobs=self.n_jobs,
                                          verbose=self.verbose,
                                          random_state=self.random_state)
            tasklogger.log_complete("graph and diffusion operator")

        return self
Example #17
def test_log():
    tasklogger.log_debug('debug')
    tasklogger.log_info('info')
    tasklogger.log_warning('warning')
    tasklogger.log_error('error')
    tasklogger.log_critical('critical')
Example #18
File: magic.py Project: akv84/MAGIC
    def fit(self, X, graph=None):
        """Computes the diffusion operator

        Parameters
        ----------
        X : array, shape=[n_samples, n_features]
            input data with `n_samples` samples and `n_features`
            dimensions. Accepted data types: `numpy.ndarray`,
            `scipy.sparse.spmatrix`, `pd.DataFrame`, `anndata.AnnData`.
        graph : `graphtools.Graph`, optional (default: None)
            If given, provides a precomputed kernel matrix with which to
            perform diffusion.

        Returns
        -------
        magic_operator : MAGIC
            The estimator object
        """
        if self.n_pca is None or X.shape[1] <= self.n_pca:
            n_pca = None
        else:
            n_pca = self.n_pca

        tasklogger.log_info("Running MAGIC on {} cells and {} genes.".format(
            X.shape[0], X.shape[1]))

        if graph is None:
            graph = self.graph
            if self.X is not None and not \
                    utils.matrix_is_equivalent(X, self.X):
                """
                If the same data is used, we can reuse existing kernel and
                diffusion matrices. Otherwise we have to recompute.
                """
                tasklogger.log_debug(
                    "Reset graph due to difference in input data")
                graph = None
            elif graph is not None:
                try:
                    graph.set_params(decay=self.decay,
                                     knn=self.knn,
                                     distance=self.knn_dist,
                                     n_jobs=self.n_jobs,
                                     verbose=self.verbose,
                                     n_pca=n_pca,
                                     thresh=1e-4,
                                     random_state=self.random_state)
                except ValueError as e:
                    # something changed that should have invalidated the graph
                    tasklogger.log_debug("Reset graph due to {}".format(
                        str(e)))
                    graph = None
        else:
            self.knn = graph.knn
            self.alpha = graph.decay
            self.n_pca = graph.n_pca
            self.knn_dist = graph.distance

        self.X = X

        if utils.has_empty_columns(X):
            warnings.warn("Input matrix contains unexpressed genes. "
                          "Please remove them prior to running MAGIC.")

        if graph is not None:
            tasklogger.log_info(
                "Using precomputed graph and diffusion operator...")
            self.graph = graph
        else:
            # reset X_magic in case it was previously set
            self.X_magic = None
            tasklogger.log_start("graph and diffusion operator")
            self.graph = graphtools.Graph(X,
                                          n_pca=n_pca,
                                          knn=self.knn,
                                          decay=self.decay,
                                          thresh=1e-4,
                                          n_jobs=self.n_jobs,
                                          verbose=self.verbose,
                                          random_state=self.random_state)
            tasklogger.log_complete("graph and diffusion operator")

        return self
Example #19
def online_update_tree(
    data_1,
    data_2,
    pca_centroid,
    pca_op,
    partitions,
    diff_operator,
    diff_pca_op,
    Xs,
    NxTs,
    Ks,
    Merges,
    Ps,
    scale,
    n_jobs=10,
    random_state=None,
):
    """Short summary.

    Parameters
    ----------
    data_1 : type
        Description of parameter `data_1`.
    data_2 : type
        Description of parameter `data_2`.
    pca_centroid : type
        Description of parameter `pca_centroid`.
    pca_op : type
        Description of parameter `pca_op`.
    partitions : type
        Description of parameter `partitions`.
    diff_operator : type
        Description of parameter `diff_operator`.
    diff_pca_op : type
        Description of parameter `diff_pca_op`.
    Xs : type
        Description of parameter `Xs`.
    NxTs : type
        Description of parameter `NxTs`.
    Ks : type
        Description of parameter `Ks`.
    Merges : type
        Description of parameter `Merges`.
    Ps : type
        Description of parameter `Ps`.
    scale : type
        Description of parameter `scale`.
    n_jobs : type
        Description of parameter `n_jobs`.
    random_state : integer or numpy.RandomState, optional, default: None
        The random number generator.
        If an integer is given, it fixes the seed.
        Defaults to the global `numpy` random number generator

    Returns
    -------
    type
        Description of returned object.

    """
    with tasklogger.log_task("Multiscale PHATE tree mapping"):
        if data_1.shape[0] != len(np.unique(partitions)):
            tasklogger.log_info("PCA compressing new data...")
            data_pca_1 = pca_op.transform(np.array(data_1))
            data_pca_2 = pca_op.transform(np.array(data_2))

            # Mapping new data to partitions
            partition_assignments = compress.map_update_data(pca_centroid,
                                                             data_pca_1,
                                                             data_pca_2,
                                                             partitions,
                                                             nn=5,
                                                             n_jobs=n_jobs)
            tasklogger.log_info("Points not mapped to partitions: " +
                                str(sum(partition_assignments == -1)))

            # creating new joint partition mapping
            new_partition_clusters = list(partitions)

            new_partition_clusters.extend(partition_assignments)
            new_partition_clusters = np.asarray(new_partition_clusters)

            update_idx = np.where(new_partition_clusters == -1)[0]

            max_partition = max(new_partition_clusters)

            for i in range(len(update_idx)):
                new_partition_clusters[update_idx[i]] = max_partition + 1
                max_partition += 1

            if sum(partition_assignments == -1) > 0:
                diff_pot_1 = diffuse.online_update_diffusion_potential(
                    data_pca_2[partition_assignments == -1, :],
                    diff_operator,
                    diff_pca_op,
                )
                epsilon, merge_threshold = condense.compute_condensation_param(
                    diff_pot_1, granularity=0.1)  # change to granularity

                pca_total = np.concatenate(
                    [pca_centroid, data_pca_2[partition_assignments == -1, :]])

                NxTs_n, Xs_n, Ks_n, Merges_n, Ps_n = condense.condense(
                    diff_pot_1,
                    new_partition_clusters,
                    scale,
                    epsilon,
                    merge_threshold,
                    n_jobs=n_jobs,
                    random_state=random_state,
                )
                return NxTs_n, Xs_n, Ks_n, Merges_n, Ps_n, pca_total

            else:
                clusters = new_partition_clusters
                tasklogger.log_info("Rebuilding condensation tree...")
                clusters_idx = []

                for c in clusters:
                    clusters_idx.append(np.where(NxTs[0] == c)[0][0])

                NxTs_l = []

                for l in range(len(NxTs)):
                    NxTs_l.append(NxTs[l][clusters_idx])
                return NxTs_l, Xs, Ks, Merges, Ps, pca_centroid

        else:
            tasklogger.log_info("PCA compressing new data...")
            data_pca_2 = pca_op.transform(np.array(data_2))
            diff_pot_1 = diffuse.online_update_diffusion_potential(
                data_pca_2, diff_operator, diff_pca_op)
            clusters = np.arange(diff_pot_1.shape[0])

            epsilon, merge_threshold = condense.compute_condensation_param(
                diff_pot_1, granularity=0.1)  # change to granularity

            NxTs_n, Xs_n, Ks_n, Merges_n, Ps_n = condense.condense(
                diff_pot_1,
                clusters,
                scale,
                epsilon,
                merge_threshold,
                n_jobs=n_jobs,
                random_state=random_state,
            )
            return (
                NxTs_n,
                Xs_n,
                Ks_n,
                Merges_n,
                Ps_n,
                np.concatenate([pca_centroid, data_pca_2]),
            )
Example #20
    def transform(self, X=None, t_max=100, plot_optimal_t=False, ax=None):
        """Computes the position of the cells in the embedding space

        Parameters
        ----------
        X : array, optional, shape=[n_samples, n_features]
            input data with `n_samples` samples and `n_features`
            dimensions. Not required, since PHATE does not currently embed
            cells not given in the input matrix to `PHATE.fit()`.
            Accepted data types: `numpy.ndarray`,
            `scipy.sparse.spmatrix`, `pd.DataFrame`, `anndata.AnnData`. If
            `knn_dist` is 'precomputed', `data` should be an n_samples x
            n_samples distance or affinity matrix

        t_max : int, optional, default: 100
            maximum t to test if `t` is set to 'auto'

        plot_optimal_t : boolean, optional, default: False
            If true and `t` is set to 'auto', plot the Von Neumann
            entropy used to select t

        ax : matplotlib.axes.Axes, optional
            If given and `plot_optimal_t` is true, plot will be drawn
            on the given axis.

        Returns
        -------
        embedding : array, shape=[n_samples, n_dimensions]
            The cells embedded in a lower dimensional space using PHATE
        """
        if self.graph is None:
            raise NotFittedError("This PHATE instance is not fitted yet. Call "
                                 "'fit' with appropriate arguments before "
                                 "using this method.")
        elif X is not None and not matrix_is_equivalent(X, self.X):
            # fit to external data
            warnings.warn(
                "Pre-fit PHATE cannot be used to transform a "
                "new data matrix. Please fit PHATE to the new"
                " data by running 'fit' with the new data.", RuntimeWarning)
            if isinstance(self.graph, graphtools.graphs.TraditionalGraph) and \
                    self.graph.precomputed is not None:
                raise ValueError("Cannot transform additional data using a "
                                 "precomputed distance matrix.")
            else:
                transitions = self.graph.extend_to_data(X)
                return self.graph.interpolate(self.embedding, transitions)
        else:
            if self.diff_potential is None:
                if self.t == 'auto':
                    t = self.optimal_t(t_max=t_max, plot=plot_optimal_t, ax=ax)
                    tasklogger.log_info(
                        "Automatically selected t = {}".format(t))
                else:
                    t = self.t
                self.diff_potential = self.calculate_potential(self.diff_op, t)
            elif plot_optimal_t:
                self.optimal_t(t_max=t_max, plot=plot_optimal_t, ax=ax)
            if self.embedding is None:
                tasklogger.log_start("{} MDS".format(self.mds))
                self.embedding = embed_MDS(self.diff_potential,
                                           ndim=self.n_components,
                                           how=self.mds,
                                           distance_metric=self.mds_dist,
                                           n_jobs=self.n_jobs,
                                           seed=self.random_state,
                                           verbose=self.verbose - 1)
                tasklogger.log_complete("{} MDS".format(self.mds))
            if isinstance(self.graph, graphtools.graphs.LandmarkGraph):
                tasklogger.log_debug("Extending to original data...")
                return self.graph.interpolate(self.embedding)
            else:
                return self.embedding
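Continuing the sketch from Example #4: once fitted, `transform` reuses the cached potential and embedding, and can re-draw the VNE curve used for automatic t selection:

embedding = phate_op.transform(plot_optimal_t=True, t_max=50)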
Example #21
def parse_args():
    parser = argparse.ArgumentParser(
        description='Run MAGIC for imputation of '
        'high-dimensional data.',
        epilog='For help, visit magic.readthedocs.io or '
        'krishnaswamylab.org/get-help',
        add_help=True,
        allow_abbrev=True)

    io_group = parser.add_argument_group('Data IO')
    filename = io_group.add_mutually_exclusive_group(required=True)
    filename.add_argument('--filename',
                          type=str,
                          default=None,
                          help='Input data. Allowed types: csv, tsv, mtx, '
                          'hdf5/h5 (10X format), directory/zip (10X format)')
    filename.add_argument('--validate',
                          action='store_true',
                          default=False,
                          help='Run MAGIC on a test dataset to ensure '
                          'output is correct.')
    sparse = io_group.add_mutually_exclusive_group()
    sparse.add_argument('--sparse',
                        action='store_true',
                        help='Use sparse data format',
                        dest='sparse',
                        default=None)
    sparse.add_argument('--dense',
                        action='store_false',
                        help='Use dense data format',
                        dest='sparse',
                        default=None)
    gene_names = io_group.add_mutually_exclusive_group()
    gene_names.add_argument('--gene-names',
                            action='store_true',
                            help='Use gene name headers in data file'
                            ' (csv, tsv, fcs)',
                            dest='gene_names',
                            default=True)
    gene_names.add_argument('--no-gene-names',
                            action='store_false',
                            help='Do not use gene names'
                            ' (csv, tsv, fcs, mtx)',
                            dest='gene_names',
                            default=True)
    gene_names.add_argument('--gene-name-file',
                            type=str,
                            help='Use gene name headers in FILE'
                            ' (csv, tsv, fcs, mtx)',
                            metavar='FILE',
                            dest='gene_names',
                            default=True)
    cell_names = io_group.add_mutually_exclusive_group()
    cell_names.add_argument('--cell-names',
                            action='store_true',
                            help='Use cell name headers in data file'
                            ' (csv, tsv, fcs)',
                            dest='cell_names',
                            default=True)
    cell_names.add_argument('--no-cell-names',
                            action='store_false',
                            help='Do not use cell names'
                            ' (csv, tsv, fcs, mtx)',
                            dest='cell_names',
                            default=True)
    cell_names.add_argument('--cell-name-file',
                            type=str,
                            help='Use cell name headers in FILE'
                            ' (csv, tsv, fcs, mtx)',
                            metavar='FILE',
                            dest='cell_names',
                            default=True)
    io_group.add_argument('--cell-axis',
                          type=str,
                          choices=['row', 'column'],
                          default='row',
                          help='States whether cells are on rows or columns '
                          '(csv, tsv, mtx)')
    io_group.add_argument('--gene-labels',
                          type=str,
                          default='both',
                          choices=['symbol', 'id', 'both'],
                          help='Choice of gene labels for 10X data'
                          ' (dir, zip, hdf5)')
    io_group.add_argument('--genome',
                          type=str,
                          default=None,
                          help='Genome name for 10X HDF5 data (hdf5)')
    io_group.add_argument(
        '--metadata-channels',
        type=str,
        nargs='+',
        default=[
            'Time', 'Event_length', 'DNA1', 'DNA2', 'Cisplatin', 'beadDist',
            'bead1'
        ],
        help='Names of channels to remove from fcs data (fcs)',
        metavar='CHANNEL')

    preprocess_group = parser.add_argument_group('Preprocessing')
    cell_filter = preprocess_group.add_mutually_exclusive_group()
    cell_filter.add_argument('--min-library-size',
                             type=int,
                             default=2000,
                             help='Filter cells with less than COUNTS counts',
                             dest='min_library_size',
                             metavar='COUNTS')
    cell_filter.add_argument('--no-cell-filter',
                             action='store_false',
                             default=2000,
                             dest='min_library_size',
                             help='Do not filter cells')
    gene_filter = preprocess_group.add_mutually_exclusive_group()
    gene_filter.add_argument(
        '--min-cells-per-gene',
        type=int,
        default=10,
        help='Filter genes with less than CELLS non-zero cells',
        dest='min_cells_per_gene',
        metavar='CELLS')
    gene_filter.add_argument('--no-gene-filter',
                             action='store_false',
                             default=10,
                             dest='min_cells_per_gene',
                             help='Do not filter genes')
    libnorm = preprocess_group.add_mutually_exclusive_group()
    libnorm.add_argument(
        '--normalize',
        action='store_true',
        default=True,
        dest='library_size_normalize',
        help='Normalize cells by total UMI count (library size)')
    libnorm.add_argument('--no-normalize',
                         action='store_false',
                         default=True,
                         dest='library_size_normalize',
                         help='Do not normalize cells')
    transform = preprocess_group.add_mutually_exclusive_group()
    transform.add_argument('--transform',
                           type=str,
                           default='sqrt',
                           choices=['sqrt', 'log', 'arcsinh'],
                           help='Sublinear data transformation function')
    transform.add_argument('--no-transform',
                           action='store_false',
                           default='sqrt',
                           dest='transform',
                           help='Do not transform data')
    preprocess_group.add_argument('--pseudocount',
                                  type=float,
                                  default=1,
                                  help='Pseudocount to add to genes prior '
                                  'to log transform',
                                  metavar='PCOUNT')
    preprocess_group.add_argument('--cofactor',
                                  type=float,
                                  default=5,
                                  help='Factor by which to divide genes prior '
                                  'to arcsinh transform')

    kernel_group = parser.add_argument_group('Kernel Computation')
    kernel_group.add_argument('-k',
                              '--knn',
                              type=int,
                              default=10,
                              dest='knn',
                              help='Number of nearest neighbors on which to '
                              'build kernel')
    decay = kernel_group.add_mutually_exclusive_group()
    decay.add_argument('-a',
                       '--decay',
                       type=int,
                       default=15,
                       dest='decay',
                       help='Sets decay rate of kernel tails')
    decay.add_argument('--no-decay',
                       action='store_false',
                       default=15,
                       dest='decay',
                       help='Do not use alpha decay')
    pca = kernel_group.add_mutually_exclusive_group()
    pca.add_argument('--pca',
                     type=int,
                     default=100,
                     dest='n_pca',
                     help='Number of principal components to use for '
                     'neighborhoods')
    pca.add_argument('--no-pca',
                     action='store_false',
                     default=100,
                     dest='n_pca',
                     help='Do not use PCA')
    kernel_group.add_argument('--knn-dist',
                              type=str,
                              default='euclidean',
                              help='Distance metric to use for calculating '
                              'neighborhoods. Recommended values are '
                              '"euclidean" and "cosine"',
                              metavar='DISTANCE')
    kernel_group.add_argument(
        '-t',
        '--threads',
        type=int,
        default=1,
        help='Use THREADS threads. If -1 all CPUs are used',
        metavar='THREADS',
        dest='n_jobs')
    kernel_group.add_argument('--seed',
                              type=int,
                              default=None,
                              help='Integer random seed',
                              metavar='SEED',
                              dest='random_state')
    verbose = kernel_group.add_mutually_exclusive_group()
    verbose.add_argument('-v',
                         '--verbose',
                         action='store_true',
                         default=True,
                         help='Print verbose output')
    verbose.add_argument('-q',
                         '--quiet',
                         action='store_false',
                         default=True,
                         help='Do not print verbose output',
                         dest='verbose')
    verbose.add_argument('-vv',
                         '--debug',
                         action='store_true',
                         default=False,
                         help='Print debugging output',
                         dest='debug')

    magic_group = parser.add_argument_group('MAGIC')
    magic_group.add_argument('--t-magic',
                             type=str,
                             default='auto',
                             help='Level of diffusion for MAGIC',
                             metavar='T')
    genes = magic_group.add_mutually_exclusive_group()
    genes.add_argument('--pca-only',
                       action='store_true',
                       default=False,
                       help='Return PCA on the smoothed matrix')
    genes.add_argument('--all-genes',
                       action='store_true',
                       default=False,
                       help='Return the entire smoothed matrix')
    genes.add_argument('--gene-list',
                       type=str,
                       nargs='+',
                       default=None,
                       help='List of genes to return from MAGIC, '
                       'either as integer indices or column names.',
                       metavar='GENE',
                       dest='genes')
    magic_group.add_argument('--output',
                             type=str,
                             default='magic.csv',
                             help='Output CSV file to save smoothed '
                             'data matrix',
                             metavar='FILE')

    args = parser.parse_args()

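    # hard-coded settings for the --validate smoke test: download a small
    # example dataset and run the full pipeline with fixed parameters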
    if args.validate:
        tasklogger.set_level(2)
        tasklogger.log_info("Running MAGIC validation.")
        args.filename = "https://github.com/KrishnaswamyLab/scprep/raw/master/data/test_data/test_small.csv"
        args.sparse = False
        args.gene_names = True
        args.cell_names = True
        args.cell_axis = "row"
        args.gene_labels = "both"
        args.genome = None
        args.metadata_channels = None
        args.min_library_size = 1
        args.min_cells_per_gene = 1
        args.library_size_normalize = True
        args.transform = 'sqrt'
        args.pseudocount = None
        args.cofactor = None
        args.knn = 3
        args.decay = 20
        args.n_pca = None
        args.knn_dist = "euclidean"
        args.n_jobs = 1
        args.random_state = 42
        args.verbose = True
        args.debug = True
        args.t_magic = "auto"
        args.all_genes = True
        args.output = "magic-validate.csv"

    # fix magic "genes" argument
    if args.all_genes:
        args.genes = "all_genes"
    elif args.pca_only:
        args.genes = "pca_only"
    else:
        try:
            args.genes = [int(g) for g in args.genes]
        except (TypeError, ValueError):
            # args.genes is None or contains string gene names
            pass
    del args.all_genes
    del args.pca_only
    # fix t argument
    if args.t_magic != 'auto':
        try:
            args.t_magic = int(args.t_magic)
        except ValueError:
            parser.error("argument --t-magic: invalid int value: '{}'".format(
                args.t_magic))
    # fix debug argument
    if args.debug:
        args.verbose = 2
    del args.debug

    # store None values where appropriate
    if args.decay is False:
        args.decay = None
    if args.n_pca is False:
        args.n_pca = None
    if args.min_library_size is False:
        args.min_library_size = None
    if args.min_cells_per_gene is False:
        args.min_cells_per_gene = None

    # check for inappropriately set defaults
    try:
        filetype = check_filetype(args.filename)
    except RuntimeError as e:
        parser.error(str(e))
    if filetype not in ['csv', 'tsv', 'csv.gz', 'tsv.gz', 'fcs']:
        if '--gene-names' not in sys.argv:
            args.gene_names = None
        else:
            parser.error(
                "Cannot handle --gene-names with {} file".format(filetype))
        if '--cell-names' not in sys.argv:
            args.cell_names = None
        else:
            parser.error(
                "Cannot handle --cell-names with {} file".format(filetype))
    if filetype not in ['csv', 'tsv', 'csv.gz', 'tsv.gz', 'mtx']:
        if '--cell-axis' not in sys.argv:
            args.cell_axis = None
        else:
            parser.error(
                "Cannot handle --cell-axis with {} file".format(filetype))
    if filetype not in ['dir', 'zip', 'hdf5', 'h5']:
        if '--gene-labels' not in sys.argv:
            args.gene_labels = None
        else:
            parser.error(
                "Cannot handle --gene-labels with {} file".format(filetype))
    if filetype not in ['hdf5', 'h5']:
        if '--genome' not in sys.argv:
            args.genome = None
        else:
            parser.error(
                "Cannot handle --genome with {} file".format(filetype))
    if filetype not in ['fcs']:
        if '--metadata-channels' not in sys.argv:
            args.metadata_channels = None
        else:
            parser.error(
                "Cannot handle --metadata-channels with {} file".format(
                    filetype))

    # check for inappropriately set parameters
    if args.transform != 'log':
        if '--pseudocount' in sys.argv:
            parser.error(
                "Cannot handle --pseudocount with --transform {}".format(
                    args.transform))
        else:
            args.pseudocount = None
    if args.transform != 'arcsinh':
        if '--cofactor' in sys.argv:
            parser.error("Cannot handle --cofactor with --transform {}".format(
                args.transform))
        else:
            args.cofactor = None

    return args
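
For context, here is a minimal sketch of how the namespace returned by this parser might drive MAGIC itself. The actual run function is not part of this snippet, so treat the wiring below as an assumption: the scprep and magic calls are real APIs, but run_magic_from_args is a hypothetical helper.

import magic
import scprep


def run_magic_from_args(args):
    # hypothetical driver: load, preprocess, and smooth using the parsed options
    data = scprep.io.load_csv(args.filename,
                              cell_names=args.cell_names,
                              gene_names=args.gene_names)
    if args.min_library_size is not None:
        data = scprep.filter.filter_library_size(data,
                                                 cutoff=args.min_library_size)
    if args.min_cells_per_gene is not None:
        data = scprep.filter.filter_rare_genes(data,
                                               min_cells=args.min_cells_per_gene)
    if args.library_size_normalize:
        data = scprep.normalize.library_size_normalize(data)
    if args.transform == 'sqrt':
        data = scprep.transform.sqrt(data)
    magic_op = magic.MAGIC(knn=args.knn,
                           decay=args.decay,
                           t=args.t_magic,
                           n_pca=args.n_pca,
                           n_jobs=args.n_jobs,
                           random_state=args.random_state,
                           verbose=args.verbose)
    smoothed = magic_op.fit_transform(data, genes=args.genes)
    smoothed.to_csv(args.output)
    return smoothed
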
Example No. 22
0
    def _cimpute(self, data):
        """Main function of C-Impute

        Parameters
        ----------
        data : matrix, shape (m x n)
            The raw read counts matrix (genes by cells)

        Returns
        -------
        imputed_data : matrix, shape (m x n)
            The imputed matrix (genes by cells)
        """
        # read data, replacing missing values with zeros
        raw_data = data.fillna(0)

        if self.normalize:
            tasklogger.log_info('normalizing data by library size...')
            # normalize each cell to counts per million, guarding against
            # division by zero for empty cells
            norm_data = (raw_data * np.power(10, 6) /
                         raw_data.sum().replace(0, 1)).values
        else:
            norm_data = raw_data.values

        tasklogger.log_info('preprocessing data...')
        # remove genes (rows) and cells (columns) whose counts are all zero
        filtered_rows_indexes = np.where(np.all(norm_data == 0, axis=1))
        filtered_rows_data = np.delete(norm_data,
                                       filtered_rows_indexes[0],
                                       axis=0)
        filtered_columns_indexes = np.where(
            np.all(filtered_rows_data == 0, axis=0))
        filtered_data = np.delete(filtered_rows_data,
                                  filtered_columns_indexes[0],
                                  axis=1)

        # log-transform: log10(x + 1.01); without normalization, only offset
        # the data by ZERO_VALUE
        if self.normalize:
            log_data = np.log10(filtered_data + 1.01)
        else:
            log_data = filtered_data + self.ZERO_VALUE

        tasklogger.log_info('performing pca...')
        # pca
        pca = PCA()
        pca_data = pca.fit_transform(log_data.T)
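        # keep only the leading principal components; the helper
        # _cal_explained_component_number (not shown in this snippet)
        # presumably picks how many PCs to retain from the explained
        # variance ratios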
        selected_pca_data = pca_data[:, :self._cal_explained_component_number(
            pca.explained_variance_ratio_)].T

        tasklogger.log_info('detecting outlier cells...')
        # remove outlier cells
        # 1. compute the pairwise distance matrix between cells
        dist_matrix = euclidean_distances(selected_pca_data.T,
                                          selected_pca_data.T)
        dist_matrix[dist_matrix == 0.0] = np.inf
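        # self-distances (and any exact duplicate cells) would otherwise be
        # the minimum, so exclude them by setting them to infinity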
        # 2. get min distance vector for each cell
        min_dist_vector = dist_matrix.min(axis=0)
        # 3. find outlier cells
        outlier_indexes = self._detect_outliers(min_dist_vector)
        tmp_remained_data = np.delete(log_data, outlier_indexes[0], axis=1)
        remained_pca_data = np.delete(selected_pca_data,
                                      outlier_indexes[0],
                                      axis=1)

        # remove rows (genes) in which all values equal ZERO_VALUE
        all_zeros_rows_indexes = np.where(
            np.all(tmp_remained_data == self.ZERO_VALUE, axis=1))
        remained_data = np.delete(tmp_remained_data,
                                  all_zeros_rows_indexes[0],
                                  axis=0)

        tasklogger.log_info('calculating the affinity matrix...')
        # compute the affinity matrix
        remained_dist_matrix = euclidean_distances(remained_pca_data.T,
                                                   remained_pca_data.T)
        remained_dist_matrix[remained_dist_matrix == 0.0] = np.inf
        if self.n >= int(remained_dist_matrix.shape[0] / 2):
            self.n = int(remained_dist_matrix.shape[0] / 2)
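        # per-cell kernel bandwidth: the distance to each cell's n-th nearest
        # neighbor (np.partition places the n-th smallest value at index n-1)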
        nth_smallest_dist_index = self.n - 1
        nth_smallest_dist_vector = np.partition(
            remained_dist_matrix, nth_smallest_dist_index,
            axis=1)[:, nth_smallest_dist_index]

        exp_matrix = np.exp(-remained_dist_matrix /
                            (2 * np.power(nth_smallest_dist_vector, 2)))
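        # note: the (N,) bandwidth vector broadcasts across columns, so each
        # kernel entry is scaled by the bandwidth of its column's cell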
        larged_indexes = np.column_stack(
            np.where(remained_dist_matrix <= nth_smallest_dist_vector))
        remained_dist_matrix[
            remained_dist_matrix > nth_smallest_dist_vector] = 0
        for index in larged_indexes:
            remained_dist_matrix[index[0]][index[1]] = exp_matrix[index[0]][
                index[1]]
        tasklogger.log_info(
            'calculating the dropout probability matrix using EM algorithm...'
        )
        # EM algorithm
        D = self._EM_algorithm(remained_data)

        tasklogger.log_info(
            'calculating the weight matrix using non-negative least squares lasso regression...'
        )
        imputed_data = np.zeros(remained_data.shape)
        for index, row in enumerate(remained_dist_matrix):
            tasklogger.log_info('imputing gene expression for cell ' +
                                str(index))
            D_j = D[:, index]
            X_j = remained_data[:, index]
            Y = (1 - D_j) * X_j
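            # the regression target down-weights likely dropouts: entries
            # with high dropout probability D_j contribute little to Y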
            non_self_dist = np.delete(row, index)  # N-1 X 1
            non_self_data = np.delete(remained_data, index, axis=1)  # M X N-1
            non_self_D = np.delete(D, index, axis=1)

            diag = np.diag(non_self_dist)  # (N-1) x (N-1)
            X = np.matmul(diag, non_self_data.T).T * (1 - non_self_D)  # M x (N-1)

            lasso = Lasso(alpha=self.alpha, positive=True, max_iter=3000)
            lasso.fit(X, Y)
            imputed_data[:, index] = lasso.predict(X)

        tasklogger.log_info('processing imputed data...')
        imputed_indexes = np.column_stack(np.where(D >= self.c_drop))
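        # only entries whose dropout probability reaches the threshold
        # self.c_drop are replaced; confident observations are kept as-is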

        for index in imputed_indexes:
            remained_data[index[0]][index[1]] = imputed_data[index[0]][
                index[1]]

        tasklogger.log_info('recovering unimputed data...')
        # recover deleted rows and columns
        # 1. insert rows
        for index in all_zeros_rows_indexes[0]:
            remained_data = np.insert(remained_data,
                                      index,
                                      tmp_remained_data[index],
                                      axis=0)
        # 2. insert columns
        for index in outlier_indexes[0]:
            remained_data = np.insert(remained_data,
                                      index,
                                      log_data[:, index],
                                      axis=1)
        # 3. clip to ZERO_VALUE, then undo the log transform (or the offset)
        remained_data[remained_data < self.ZERO_VALUE] = self.ZERO_VALUE
        if self.normalize:
            remained_data = np.power(10, remained_data) - 1.01
        else:
            remained_data = remained_data - self.ZERO_VALUE
        # 4. insert columns
        for index in filtered_columns_indexes[0]:
            remained_data = np.insert(remained_data,
                                      index,
                                      filtered_rows_data[:, index],
                                      axis=1)
        # 5. insert rows
        for index in filtered_rows_indexes[0]:
            remained_data = np.insert(remained_data,
                                      index,
                                      norm_data[index],
                                      axis=0)
        # 6. undo the library-size normalization
        if self.normalize:
            remained_data = (remained_data *
                             raw_data.sum().replace(0, 1).values /
                             np.power(10, 6))

        tasklogger.log_info('generating the final imputed matrix...')
        # restore the original index and column names
        final_imputed_data = pd.DataFrame(remained_data)
        final_imputed_data.index = raw_data.index
        final_imputed_data.columns = raw_data.columns

        # round to 3 decimal places
        final_imputed_data = round(final_imputed_data, 3)

        return final_imputed_data
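
To make the affinity construction above concrete, here is a self-contained re-run of the same steps on toy data. The sizes, seed, and neighborhood size are illustrative assumptions, not values from the original.

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

rng = np.random.default_rng(0)
pcs = rng.normal(size=(8, 30))  # 30 cells in an 8-dimensional PCA space

dist = euclidean_distances(pcs.T, pcs.T)
dist[dist == 0.0] = np.inf  # exclude self-distances

n = 5  # neighborhood size, playing the role of self.n above
# distance from each cell to its n-th nearest neighbor
sigma = np.partition(dist, n - 1, axis=1)[:, n - 1]

# thresholded Gaussian-style affinity, reproducing the broadcasting
# behavior of the snippet above (sigma applies column-wise)
affinity = np.where(dist <= sigma, np.exp(-dist / (2 * sigma ** 2)), 0.0)
print(affinity.shape)  # (30, 30)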