Code example #1
    def dca_impute(self, data):
        from dca.api import dca
        import pandas as pd
        import scanpy as sc

        # Wrap the cells-by-genes count matrix in an AnnData object.
        adata = sc.AnnData(data.values)
        adata.obs_names = data.index
        adata.var_names = data.columns

        # Denoise in place: adata.X is overwritten with the denoised values.
        dca(adata, threads=self.ncores)

        # Re-attach the original cell and gene labels.
        return pd.DataFrame(adata.X, index=data.index, columns=data.columns)
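
For reference, the same round trip outside the class: a minimal sketch on toy data, assuming a cells x genes pandas DataFrame of raw counts (all names and sizes here are illustrative).

import numpy as np
import pandas as pd
import scanpy as sc
from dca.api import dca

# Toy cells x genes count table; real data would be loaded from disk.
rng = np.random.default_rng(0)
counts = pd.DataFrame(rng.poisson(1.0, size=(300, 100)),
                      index=['cell%d' % i for i in range(300)],
                      columns=['gene%d' % j for j in range(100)])

adata = sc.AnnData(counts.values.astype('float32'))
dca(adata, threads=4, epochs=5)  # few epochs just to keep the sketch fast

# Re-attach the original labels to the denoised matrix.
denoised = pd.DataFrame(adata.X, index=counts.index, columns=counts.columns)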
Code example #2
import numpy
import scanpy as sc
from dca.api import dca


def DCATransform(sc_data_matrix):

    # Create a scanpy AnnData object (cells x genes), transposing
    # the genes x cells input DataFrame
    sc_data_matrix = sc.AnnData(numpy.transpose(sc_data_matrix.values))

    # Drop genes with zero total counts
    sc.pp.filter_genes(data=sc_data_matrix, min_counts=1)

    # Apply the DCA transform; adata.X is overwritten with denoised values
    dca(adata=sc_data_matrix, threads=4, epochs=10)

    print("DCA denoised data prepared")

    # Return genes x cells again
    return numpy.transpose(sc_data_matrix.X)
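
A usage sketch for the helper above, on toy data: the input is a genes x cells DataFrame of raw counts, and the returned array is genes x cells again (with zero-count genes removed by the filter).

import numpy
import pandas

# Toy genes x cells counts; a real matrix would come from a file.
rng = numpy.random.default_rng(0)
expr = pandas.DataFrame(rng.poisson(1.0, size=(200, 400)))

denoised = DCATransform(expr)
print(denoised.shape)  # genes (after filtering) x cells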
Code example #3
def dca(
        adata,
        mode='denoise',
        ae_type='zinb-conddisp',
        normalize_per_cell=True,
        scale=True,
        log1p=True,
        # network args
        hidden_size=(64, 32, 64),
        hidden_dropout=0.,
        batchnorm=True,
        activation='relu',
        init='glorot_uniform',
        network_kwds={},
        # training args
        epochs=300,
        reduce_lr=10,
        early_stop=15,
        batch_size=32,
        optimizer='rmsprop',
        random_state=0,
        threads=None,
        verbose=False,
        training_kwds={},
        return_model=False,
        return_info=False,
        copy=False):
    """Deep count autoencoder [Eraslan18]_.

    Fits a count autoencoder to the raw count data given in the anndata object
    in order to denoise the data and to capture a hidden representation of
    cells in low dimensions. The type of the autoencoder and the return values
    are determined by the parameters.
    
    .. note::
        More information and bug reports `here <https://github.com/theislab/dca>`__.

    Parameters
    ----------
    adata : :class:`~anndata.AnnData`
        An anndata object with a `.raw` attribute representing raw counts.
    mode : `str`, optional. `denoise` (default) or `latent`.
        `denoise` overwrites `adata.X` with denoised expression values.
        In `latent` mode DCA adds `adata.obsm['X_dca']` to the given adata
        object. This matrix represents the latent representation of cells via DCA.
    ae_type : `str`, optional. `zinb-conddisp` (default), `zinb`, `nb-conddisp` or `nb`.
        Type of the autoencoder. Return values and the architecture are
        determined by the type, e.g. `nb` does not provide dropout
        probabilities. Types that end with "-conddisp" assume that dispersion is mean-dependent.
    normalize_per_cell : `bool`, optional. Default: `True`.
        If true, library size normalization is performed using
        the `sc.pp.normalize_per_cell` function in Scanpy and saved into the adata
        object. The mean layer then re-introduces library size differences by
        scaling the mean value of each cell in the output layer. See the
        manuscript for more details.
    scale : `bool`, optional. Default: `True`.
        If true, the input of the autoencoder is centered using
        `sc.pp.scale` function of Scanpy. Note that the output is kept as raw
        counts as loss functions are designed for the count data.
    log1p : `bool`, optional. Default: `True`.
        If true, the input of the autoencoder is log transformed with a
        pseudocount of one using `sc.pp.log1p` function of Scanpy.
    hidden_size : `tuple` or `list`, optional. Default: (64, 32, 64).
        Width of hidden layers.
    hidden_dropout : `float`, `tuple` or `list`, optional. Default: 0.0.
        Probability of weight dropout in the autoencoder (per layer if list
        or tuple).
    batchnorm : `bool`, optional. Default: `True`.
        If true, batch normalization is performed.
    activation : `str`, optional. Default: `relu`.
        Activation function of hidden layers.
    init : `str`, optional. Default: `glorot_uniform`.
        Initialization method used to initialize weights.
    network_kwds : `dict`, optional.
        Additional keyword arguments for the autoencoder.
    epochs : `int`, optional. Default: 300.
        Number of total epochs in training.
    reduce_lr : `int`, optional. Default: 10.
        Reduces learning rate if validation loss does not improve in the given number of epochs.
    early_stop : `int`, optional. Default: 15.
        Stops training if validation loss does not improve in the given number of epochs.
    batch_size : `int`, optional. Default: 32.
        Number of samples in the batch used for SGD.
    optimizer : `str`, optional. Default: "rmsprop".
        Type of optimization method used for training.
    random_state : `int`, optional. Default: 0.
        Seed for python, numpy and tensorflow.
    threads : `int` or None, optional. Default: None
        Number of threads to use in training. All cores are used by default.
    verbose : `bool`, optional. Default: `False`.
        If true, prints additional information about training and architecture.
    training_kwds : `dict`, optional.
        Additional keyword arguments for the training process.
    return_model : `bool`, optional. Default: `False`.
        If true, trained autoencoder object is returned. See "Returns".
    return_info : `bool`, optional. Default: `False`.
        If true, all additional parameters of DCA are stored in `adata.obsm`, such as dropout
        probabilities (obsm['X_dca_dropout']) and estimated dispersion values
        (obsm['X_dca_dispersion']), in case the autoencoder is of type
        `zinb` or `zinb-conddisp`.
    copy : `bool`, optional. Default: `False`.
        If true, a copy of anndata is returned.

    Returns
    -------
    If `copy` is true and `return_model` is false, an AnnData object is returned.

    In "denoise" mode, `adata.X` is overwritten with the denoised values. In "latent" mode, latent\
    low dimensional representation of cells are stored in `adata.obsm['X_dca']` and `adata.X`\
    is not modified. Note that these values are not corrected for library size effects.

    If `return_info` is true, all estimated distribution parameters are stored in AnnData such as:

    - `.obsm["X_dca_dropout"]` which is the mixture coefficient (pi) of the zero component
      in ZINB, i.e. dropout probability (only if `ae_type` is `zinb` or `zinb-conddisp`).

    - `.obsm["X_dca_dispersion"]` which is the dispersion parameter of NB.

    - `.uns["dca_loss_history"]` which stores the loss history of the training. See the
      `.history` attribute of the Keras History class for more details.

    Finally, the raw counts are stored in `.raw` attribute of AnnData object.

    If `return_model` is true, the trained model is returned. When both `copy` and
    `return_model` are true, a tuple of anndata and model is returned, in that order.
    """

    try:
        from dca.api import dca
    except ImportError:
        raise ImportError(
            'Please install dca package (>= 0.2.1) via `pip install dca`')

    return dca(adata,
               mode=mode,
               ae_type=ae_type,
               normalize_per_cell=normalize_per_cell,
               scale=scale,
               log1p=log1p,
               hidden_size=hidden_size,
               hidden_dropout=hidden_dropout,
               batchnorm=batchnorm,
               activation=activation,
               init=init,
               network_kwds=network_kwds,
               epochs=epochs,
               reduce_lr=reduce_lr,
               early_stop=early_stop,
               batch_size=batch_size,
               optimizer=optimizer,
               random_state=random_state,
               threads=threads,
               verbose=verbose,
               training_kwds=training_kwds,
               return_model=return_model,
               return_info=return_info,
               copy=copy)
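
A minimal sketch of the `latent` mode described in the docstring, assuming scanpy's bundled pbmc3k dataset (downloaded on first use) as raw counts; the small epoch count is only to keep the example quick.

import scanpy as sc
from dca.api import dca

adata = sc.datasets.pbmc3k()             # raw 10x counts
sc.pp.filter_genes(adata, min_counts=1)  # drop all-zero genes before DCA

# 'latent' mode leaves adata.X untouched and writes the bottleneck
# representation to adata.obsm['X_dca'].
dca(adata, mode='latent', epochs=20)
print(adata.obsm['X_dca'].shape)  # (n_cells, 32) for the default (64, 32, 64) net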
Code example #4
File: _dca.py Project: scottgigante/scanpy
from types import MappingProxyType
from typing import Any, Literal, Mapping, Optional, Sequence, Union

from anndata import AnnData
from numpy.random import RandomState

# _AEType enumerates the accepted ae_type values (scanpy defines this alias
# in the same module).
_AEType = Literal['zinb-conddisp', 'zinb', 'nb-conddisp', 'nb']

def dca(
    adata: AnnData,
    mode: Literal['denoise', 'latent'] = 'denoise',
    ae_type: _AEType = 'zinb-conddisp',
    normalize_per_cell: bool = True,
    scale: bool = True,
    log1p: bool = True,
    # network args
    hidden_size: Sequence[int] = (64, 32, 64),
    hidden_dropout: Union[float, Sequence[float]] = 0.0,
    batchnorm: bool = True,
    activation: str = 'relu',
    init: str = 'glorot_uniform',
    network_kwds: Mapping[str, Any] = MappingProxyType({}),
    # training args
    epochs: int = 300,
    reduce_lr: int = 10,
    early_stop: int = 15,
    batch_size: int = 32,
    optimizer: str = 'rmsprop',
    random_state: Union[int, RandomState] = 0,
    threads: Optional[int] = None,
    learning_rate: Optional[float] = None,
    verbose: bool = False,
    training_kwds: Mapping[str, Any] = MappingProxyType({}),
    return_model: bool = False,
    return_info: bool = False,
    copy: bool = False,
) -> Optional[AnnData]:
    """\
    Deep count autoencoder [Eraslan18]_.

    Fits a count autoencoder to the raw count data given in the anndata object
    in order to denoise the data and to capture a hidden representation of
    cells in low dimensions. The type of the autoencoder and the return values
    are determined by the parameters.

    .. note::
        More information and bug reports `here <https://github.com/theislab/dca>`__.

    Parameters
    ----------
    adata
        An anndata object with a `.raw` attribute representing raw counts.
    mode
        `denoise` overwrites `adata.X` with denoised expression values.
        In `latent` mode DCA adds `adata.obsm['X_dca']` to the given adata
        object. This matrix represents the latent representation of cells via DCA.
    ae_type
        Type of the autoencoder. Return values and the architecture are
        determined by the type, e.g. `nb` does not provide dropout
        probabilities. Types that end with "-conddisp" assume that dispersion is mean-dependent.
    normalize_per_cell
        If true, library size normalization is performed using
        the `sc.pp.normalize_per_cell` function in Scanpy and saved into the adata
        object. The mean layer then re-introduces library size differences by
        scaling the mean value of each cell in the output layer. See the
        manuscript for more details.
    scale
        If true, the input of the autoencoder is centered using
        `sc.pp.scale` function of Scanpy. Note that the output is kept as raw
        counts as loss functions are designed for the count data.
    log1p
        If true, the input of the autoencoder is log transformed with a
        pseudocount of one using `sc.pp.log1p` function of Scanpy.
    hidden_size
        Width of hidden layers.
    hidden_dropout
        Probability of weight dropout in the autoencoder (per layer if list
        or tuple).
    batchnorm
        If true, batch normalization is performed.
    activation
        Activation function of hidden layers.
    init
        Initialization method used to initialize weights.
    network_kwds
        Additional keyword arguments for the autoencoder.
    epochs
        Number of total epochs in training.
    reduce_lr
        Reduces learning rate if validation loss does not improve in the given number of epochs.
    early_stop
        Stops training if validation loss does not improve in the given number of epochs.
    batch_size
        Number of samples in the batch used for SGD.
    optimizer
        Type of optimization method used for training.
    random_state
        Seed for python, numpy and tensorflow.
    threads
        Number of threads to use in training. All cores are used by default.
    learning_rate
        Learning rate to use in the training.
    verbose
        If true, prints additional information about training and architecture.
    training_kwds
        Additional keyword arguments for the training process.
    return_model
        If true, trained autoencoder object is returned. See "Returns".
    return_info
        If true, all additional parameters of DCA are stored in `adata.obsm`, such as dropout
        probabilities (obsm['X_dca_dropout']) and estimated dispersion values
        (obsm['X_dca_dispersion']), in case the autoencoder is of type
        `zinb` or `zinb-conddisp`.
    copy
        If true, a copy of anndata is returned.

    Returns
    -------
    If `copy` is true and `return_model` is false, an AnnData object is returned.

    In "denoise" mode, `adata.X` is overwritten with the denoised values.
    In "latent" mode, latent low dimensional representation of cells are stored
    in `adata.obsm['X_dca']` and `adata.X` is not modified.
    Note that these values are not corrected for library size effects.

    If `return_info` is true, all estimated distribution parameters are stored
    in AnnData like this:

    `.obsm["X_dca_dropout"]`
        The mixture coefficient (pi) of the zero component in ZINB,
        i.e. dropout probability (if `ae_type` is `zinb` or `zinb-conddisp`).
    `.obsm["X_dca_dispersion"]`
        The dispersion parameter of NB.
    `.uns["dca_loss_history"]`
        The loss history of the training.
        See the `.history` attribute of the Keras History class for more details.

    Finally, the raw counts are stored in `.raw` attribute of AnnData object.

    If `return_model` is true, the trained model is returned.
    When both `copy` and `return_model` are true,
    a tuple of anndata and model is returned in that order.
    """

    try:
        from dca.api import dca
    except ImportError:
        raise ImportError(
            'Please install dca package (>= 0.2.1) via `pip install dca`')

    return dca(
        adata,
        mode=mode,
        ae_type=ae_type,
        normalize_per_cell=normalize_per_cell,
        scale=scale,
        log1p=log1p,
        hidden_size=hidden_size,
        hidden_dropout=hidden_dropout,
        batchnorm=batchnorm,
        activation=activation,
        init=init,
        network_kwds=network_kwds,
        epochs=epochs,
        reduce_lr=reduce_lr,
        early_stop=early_stop,
        batch_size=batch_size,
        optimizer=optimizer,
        random_state=random_state,
        threads=threads,
        learning_rate=learning_rate,
        verbose=verbose,
        training_kwds=training_kwds,
        return_model=return_model,
        return_info=return_info,
        copy=copy,
    )
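
And a sketch of `denoise` mode with the extra outputs documented above, run on a copy of a toy count matrix so the original object is preserved; names and sizes are illustrative, and the wrapper is assumed importable as scanpy.external.pp.dca.

import numpy as np
import scanpy as sc
from scanpy.external.pp import dca

# Toy cells x genes counts, purely illustrative.
rng = np.random.default_rng(0)
adata = sc.AnnData(rng.poisson(1.0, size=(500, 200)).astype('float32'))

denoised = dca(adata, mode='denoise', return_info=True, copy=True, epochs=5)
print(denoised.obsm['X_dca_dropout'].shape)     # ZINB dropout probabilities (pi)
print(denoised.obsm['X_dca_dispersion'].shape)  # estimated dispersions
print(type(denoised.uns['dca_loss_history']))   # Keras History.history dict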
Code example #5
import argparse

import h5py
import numpy as np
import scanpy as sc
from dca.api import dca
from dca.io import read_dataset
from sklearn import metrics
from sklearn.cluster import KMeans

# `cluster_acc` (Hungarian-matched clustering accuracy) is defined elsewhere
# in the project; a sketch is given after this example.

parser = argparse.ArgumentParser(
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--n_clusters', default=3, type=int)
parser.add_argument('--data_file', default=None)
args = parser.parse_args()

# Open the HDF5 file read-only; 'X' holds counts, 'Y' ground-truth labels.
data_mat = h5py.File('./normalized_raw_data/' + str(args.data_file), 'r')
x = np.array(data_mat['X'])
y = np.array(data_mat['Y'])

adata = sc.AnnData(x)
adata.obs['Group'] = y

adata = read_dataset(adata, transpose=False, test_split=False, copy=True)

sc.pp.filter_genes(adata, min_counts=1)
dca(adata, threads=1)
sc.pp.normalize_per_cell(adata)
sc.pp.log1p(adata)
sc.pp.pca(adata)
print(adata)

# First two principal components of the denoised data.
dca_pca = adata.obsm['X_pca'][:, :2]

kmeans = KMeans(n_clusters=args.n_clusters, n_init=20)
y_pred = kmeans.fit_predict(dca_pca)

acc = np.round(cluster_acc(y, y_pred), 5)
nmi = np.round(metrics.normalized_mutual_info_score(y, y_pred), 5)
ari = np.round(metrics.adjusted_rand_score(y, y_pred), 5)
print('data: ' + str(args.data_file) +
      ' DCA+PCA+kmeans: ACC= %.4f, NMI= %.4f, ARI= %.4f' % (acc, nmi, ari))
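
`cluster_acc` is not defined in this excerpt. The metric it computes, unsupervised clustering accuracy under the best one-to-one label matching, is commonly implemented as below; this is a sketch of the standard approach, not necessarily this project's exact code.

import numpy as np
from scipy.optimize import linear_sum_assignment


def cluster_acc(y_true, y_pred):
    """Clustering accuracy via Hungarian matching of predicted to true labels."""
    y_true = np.asarray(y_true).astype(np.int64)
    y_pred = np.asarray(y_pred).astype(np.int64)
    n = max(y_pred.max(), y_true.max()) + 1
    # w[i, j] counts samples assigned to cluster i with true label j.
    w = np.zeros((n, n), dtype=np.int64)
    for p, t in zip(y_pred, y_true):
        w[p, t] += 1
    # Maximize matched counts (minimize the negated matrix).
    row_ind, col_ind = linear_sum_assignment(-w)
    return w[row_ind, col_ind].sum() / y_pred.size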
Code example #6
File: DCA.py Project: shikun1408/DISC
import os
import time

import h5py
import numpy as np
import scanpy as sc
from dca.api import dca

# `FLAGS`, `output_dir` and the `read_loom` helper (returning a genes x cells
# matrix plus cell and gene name arrays) are defined earlier in this script.

os.makedirs(output_dir, exist_ok=True)

starttime = time.time()
gene_bc_mat, cell_id, gene_name = read_loom(FLAGS["loom"])
min_expressed_cell = FLAGS["min_expressed_cell"]
min_expressed_cell_average_expression = FLAGS[
    "min_expressed_cell_average_expression"]

# Keep genes expressed in enough cells and, over those cells, with a high
# enough average expression.
expressed_cell = (gene_bc_mat > 0).sum(1)
gene_expression = gene_bc_mat.sum(1)
gene_filter = np.bitwise_and(
    expressed_cell >= min_expressed_cell,
    gene_expression > expressed_cell * min_expressed_cell_average_expression)
input_gene_bc_mat = gene_bc_mat[gene_filter, :]
print(input_gene_bc_mat.shape)

# DCA expects cells x genes, so transpose the genes x cells matrix.
filt_adata = sc.AnnData(input_gene_bc_mat.transpose())
dca(filt_adata)

input_loom_name = FLAGS["loom"].rsplit("/", 1)[1]
output_h5 = input_loom_name.replace(
    ".loom",
    "_DCA_mc_{}_mce_{}.hdf5".format(min_expressed_cell,
                                    min_expressed_cell_average_expression))
with h5py.File("{}/{}".format(output_dir, output_h5), "w") as f:
    f["cell_id"] = cell_id.astype(h5py.special_dtype(vlen=str))
    f["gene_name"] = gene_name[gene_filter].astype(
        h5py.special_dtype(vlen=str))
    if_dset_imputation = f.create_dataset("imputation",
                                          shape=(cell_id.size,
                                                 gene_filter.sum()),
                                          chunks=(1, gene_filter.sum()),
                                          dtype=np.float32)
    if_dset_imputation[...] = filt_adata.X
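
A short sketch of loading the file written above back into memory; the path is a placeholder built from the same FLAGS values, and h5py 3's asstr() accessor is assumed for the variable-length string datasets.

import h5py
import numpy as np

path = 'output/example_DCA_mc_10_mce_1.hdf5'  # placeholder output file name
with h5py.File(path, 'r') as f:
    cell_id = f['cell_id'].asstr()[...]      # cell barcodes
    gene_name = f['gene_name'].asstr()[...]  # names of the kept genes
    imputation = np.asarray(f['imputation'])  # cells x genes, float32
print(imputation.shape, cell_id.size, gene_name.size)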