Code example #1
def test_dataframe_reserved_columns(tmp_path, diskfmt):
    reserved = ("_index", "__categories")
    adata_pth = tmp_path / f"adata.{diskfmt}"
    orig = ad.AnnData(X=np.ones((5, 5)))
    for colname in reserved:
        to_write = orig.copy()
        to_write.obs[colname] = np.ones(5)
        with pytest.raises(ValueError) as e:
            getattr(to_write, f"write_{diskfmt}")(adata_pth)
        assert colname in str(e.value)
    for colname in reserved:
        to_write = orig.copy()
        to_write.varm["df"] = pd.DataFrame({colname: list("aabcd")},
                                           index=to_write.var_names)
        with pytest.raises(ValueError) as e:
            getattr(to_write, f"write_{diskfmt}")(adata_pth)
        assert colname in str(e.value)
Code example #2
 def __prep_predict_data(self, test_data):
     missing = set(self.genes).difference(test_data.var_names)
     if len(missing) > 0:
         data = pd.concat([
             pd.DataFrame(test_data.X,
                          index=test_data.obs_names,
                          columns=test_data.var_names),
             pd.DataFrame(0, index=test_data.obs_names, columns=missing)
         ],
                          axis=1)
         data = data[list(self.genes)]
         data_sc = anndata.AnnData(X=data.to_numpy())
         data_sc.var_names = data.columns
         data_sc.obs_names = data.index
         return data_sc
     else:
         return test_data
Code example #3
File: sparse2ann.py  Project: dawe/scatACC
def main():
  options = get_options()

  if not options.inverse_transform:

    loader = np.load(options.sparse)
    try:
      count_matrix = sp.csr_matrix((loader['data'], loader['indices'], loader['indptr']), shape=loader['shape'])
    except ValueError:
      count_matrix = sp.csc_matrix((loader['data'], loader['indices'], loader['indptr']), shape=loader['shape'])
    bc_list = loader['bc_list']

    if count_matrix.shape[1] == len(bc_list):
      # we probably have data in regions by cells, we need it the other way
      count_matrix = sp.csr_matrix(count_matrix.T) #convert

    if options.keep_all:
      mask = np.ones(count_matrix.shape[1], dtype=bool)
    else:
      mask = np.array(np.sum(count_matrix, axis=0) > 0).ravel()

    
    if options.peaks_file:
      regions = []
      for line in open(options.peaks_file):
        t = line.split()
        chrom, start, end = t[:3]
        r_id = "%s:%s-%s" % (chrom, start, end)
        regions.append(r_id)
      regions = np.array(regions)
    else:
      regions = np.arange(count_matrix.shape[1])

    regions = regions[mask]
    count_matrix = count_matrix[:, mask]
    n_cells = pd.DataFrame(np.array(np.sum(count_matrix > 0, axis=0)).ravel(), index=regions, columns=['n_cells'])
    n_regions = pd.DataFrame(np.array(np.sum(count_matrix > 0, axis=1)).ravel(), index=bc_list, columns=['n_regions'])
    adata = anndata.AnnData(count_matrix, obs=n_regions, var=n_cells)
    adata.write(options.anndata)

  else:
    adata = anndata.read(options.anndata)
    count_matrix = adata.X
    bc_list = np.array(adata.obs.index)
    np.savez(options.sparse, data=count_matrix.data, indices=count_matrix.indices,
             indptr=count_matrix.indptr, shape=count_matrix.shape, bc_list=bc_list)
Code example #4
def read_expression_from_archive(archive: ZipFile) -> anndata.AnnData:
    info = archive.infolist()
    assert len(info) == 3
    mtx_data_info = next(i for i in info if i.filename.endswith(".mtx"))
    mtx_rows_info = next(i for i in info if i.filename.endswith(".mtx_rows"))
    mtx_cols_info = next(i for i in info if i.filename.endswith(".mtx_cols"))
    with archive.open(mtx_data_info, "r") as f:
        expr = read_mtx_from_stream(f)
    with archive.open(mtx_rows_info, "r") as f:
        # TODO: Check what other value could be
        varname = pd.read_csv(f, sep="\t", header=None)[1]
    with archive.open(mtx_cols_info, "r") as f:
        obsname = pd.read_csv(f, sep="\t", header=None).iloc[:, 0]
    adata = anndata.AnnData(expr)
    adata.var_names = varname
    adata.obs_names = obsname
    return adata
Code example #5
File: test.py  Project: semir2/PHATE
def test_simple():
    tree_data, tree_clusters = phate.tree.gen_dla(n_branch=3)
    phate_operator = phate.PHATE(k=15, t=100)
    tree_phate = phate_operator.fit_transform(tree_data)
    assert tree_phate.shape == (tree_data.shape[0], 2)
    clusters = phate.cluster.kmeans(phate_operator, k=3)
    assert np.issubdtype(clusters.dtype, int)
    assert len(clusters.shape) == 1
    assert len(clusters) == tree_data.shape[0]
    phate_operator.fit(phate_operator.graph)
    G = graphtools.Graph(phate_operator.graph.kernel,
                         precomputed='affinity',
                         use_pygsp=True)
    phate_operator.fit(G)
    G = pygsp.graphs.Graph(G.W)
    phate_operator.fit(G)
    phate_operator.fit(anndata.AnnData(tree_data))
Code example #6
 def _compute_neighbors(self):
     # nearest neighbors graph
     adata = anndata.AnnData(
         X=None,
         obs=pd.DataFrame([], index=[f'obs{i}' for i in range(self.n_obs)]),
         var=pd.DataFrame([], index=[f'var{i}' for i in range(self.n_pcs)]))
     adata.obsm['X_pca'] = self.X
     # here neighbors should only use PCs
     self._neighbors = Neighbors(adata=adata)
     self._neighbors.compute_neighbors(n_neighbors=self.n_neighbors,
                                       knn=True,
                                       n_pcs=self.n_pcs,
                                       use_rep='X_pca',
                                       method='umap',
                                       metric=self.knn_metric,
                                       random_state=self.random_state)
     return
Code example #7
File: census.py  Project: sachitsaksena/wot
def main(args):
    tmap_model = wot.tmap.TransportMapModel.from_directory(args.tmap)
    cell_sets_matrix = wot.io.read_sets(args.cell_set)
    cell_sets = wot.io.convert_binary_dataset_to_dict(cell_sets_matrix)
    populations = tmap_model.population_from_cell_sets(cell_sets,
                                                       at_time=args.day)

    timepoints, census = tmap_model.ancestor_census(cell_sets_matrix,
                                                    *populations)

    obs = pd.DataFrame(index=timepoints)

    for i in range(len(census)):
        res = anndata.AnnData(census[i], obs, cell_sets_matrix.var)
        wot.io.write_dataset(res,
                             args.out + '_' + populations[i].name,
                             output_format='txt')
Code example #8
def get_cellbench():
    protocols = ['10x', 'CELseq2', 'Dropseq']
    adatas = []
    for protocol in protocols:
        #print(protocol)
        counts = pd.read_csv('data/CellBench/{}_counts.csv'.format(protocol), index_col=0).T
        counts = counts.loc[:, ~counts.columns.duplicated()]
        meta = pd.read_csv('data/CellBench/{}_meta.csv'.format(protocol), index_col=0)
        counts, meta = preprocessing.remove_doublets(counts, meta)
        counts, meta = preprocessing.clean_counts(counts, meta, FILTER_MIN_GENES, FILTER_MIN_READS, FILTER_MIN_DETECTED)
        adatas.append(anndata.AnnData(X=counts.values, obs=meta, var=pd.DataFrame(index=counts.columns)))
        # print(adatas[-1].shape)
        # print(np.unique(adatas[-1].obs['cell_line_demuxlet']))
    adata = anndata.AnnData.concatenate(*adatas, join='inner', batch_key='protocol', batch_categories=protocols)
    # print(adata.X.shape)
    # print(adata.obs.info())
    return adata
Code example #9
File: test_views.py  Project: yunpengl9071/anndata
def test_modify_view_component(matrix_type, mapping_name):
    adata = ad.AnnData(
        np.zeros((10, 10)),
        **{mapping_name: {
            "m": matrix_type(asarray(sparse.random(10, 10)))
        }},
    )
    init_hash = joblib.hash(adata)

    subset = adata[:5, :][:, :5]
    assert subset.isview
    m = getattr(subset, mapping_name)["m"]
    m[0, 0] = 100
    assert not subset.isview
    assert getattr(subset, mapping_name)["m"][0, 0] == 100

    assert init_hash == joblib.hash(adata)
Code example #10
File: test_repr.py  Project: BacemDataScience/anndata
def adata():
    return ad.AnnData(
        np.zeros((20, 10)),
        obs=pd.DataFrame(
            {"obs_key": list(ascii_letters[:20])},
            index=[f"cell{i}" for i in range(20)],
        ),
        var=pd.DataFrame(
            {"var_key": np.arange(10)}, index=[f"gene{i}" for i in range(10)]
        ),
        varm={"varm_key": np.zeros((10, 20))},
        obsm={"obsm_key": np.zeros((20, 20))},
        layers={"layers_key": np.zeros((20, 10))},
        obsp={"obsp_key": np.zeros((20, 20))},
        varp={"varp_key": np.zeros((10, 10))},
        uns={"uns_key": dict(zip("abc", range(3)))},
    )
Code example #11
def calculateExpressionRatio(adata, clusterby):
    """
    逐个计算adata中每个基因在每个cluster中的表达比例

    adata:
        需要含有raw
    clusterby:
        adata.obs中的某个列名
    """
    transformAdataRawToAd = lambda adata: anndata.AnnData(
        X=adata.raw.X, obs=adata.obs, var=adata.raw.var)
    rawAd = transformAdataRawToAd(adata)
    expressionOrNotdf = (rawAd.to_df() > 0).astype(int)
    expressionOrNotdf[clusterby] = rawAd.obs[clusterby]
    expressionRatioDf = expressionOrNotdf.groupby(clusterby).agg(
        "sum") / expressionOrNotdf.groupby(clusterby).agg("count")
    return expressionRatioDf
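
A minimal usage sketch for the function above on synthetic data (all names here are illustrative, not from the original project):

import numpy as np
import pandas as pd
import anndata

adata = anndata.AnnData(
    X=np.random.poisson(1.0, size=(6, 4)).astype(np.float32),
    obs=pd.DataFrame({"cluster": list("aaabbb")},
                     index=[f"cell{i}" for i in range(6)]),
    var=pd.DataFrame(index=[f"gene{j}" for j in range(4)]),
)
adata.raw = adata  # the function requires .raw to be populated
ratioDf = calculateExpressionRatio(adata, "cluster")
print(ratioDf)  # clusters x genes: fraction of cells expressing each gene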
Code example #12
    def predict(self,
                adata,
                encoder_labels,
                decoder_labels,
                return_adata=True):
        """
            Predicts cells of the cell type provided by the user in the stimulated condition.
            # Parameters
                adata: `~anndata.AnnData`
                    Annotated data matrix in the primary space.
                encoder_labels: numpy nd-array
                    `numpy nd-array` of labels to be fed as the CVAE encoder's condition array.
                decoder_labels: numpy nd-array
                    `numpy nd-array` of labels to be fed as the CVAE decoder's condition array.
                return_adata: boolean
                    if `True`, the prediction is returned as an `~anndata.AnnData` object.
            # Returns
                output: `~anndata.AnnData` or numpy nd-array
                    predicted cells in the primary space.
            # Example
            ```python
            import scanpy as sc
            import scgen
            train_data = sc.read("train_kang.h5ad")
            validation_data = sc.read("./data/validation.h5ad")
            network = scgen.CVAE(train_data=train_data, use_validation=True, validation_data=validation_data, model_path="./saved_models/", conditions={"ctrl": "control", "stim": "stimulated"})
            network.scripts(n_epochs=20)
            prediction = network.predict('CD4T', obs_key={"cell_type": ["CD8T", "NK"]})
            ```
        """
        adata = remove_sparsity(adata)

        encoder_labels = to_categorical(encoder_labels,
                                        num_classes=self.n_conditions)
        decoder_labels = to_categorical(decoder_labels,
                                        num_classes=self.n_conditions)

        reconstructed = self.cvae_model.predict(
            [adata.X, encoder_labels, decoder_labels])[0]
        reconstructed = np.nan_to_num(reconstructed)

        if return_adata:
            output = anndata.AnnData(X=reconstructed)
            output.obs = adata.obs.copy(deep=True)
            output.var_names = adata.var_names
        else:
            output = reconstructed

        return output
Code example #13
def ttest(filename: str):
    import anndata
    import diffxpy.api as de

    df = pandas.read_csv(filename, header=0, index_col=0).transpose()

    groupings = find_groupings(df, default_group='arpc')
    LOGGER.info(
        "group, filename=%s, groups=%d, groupings=%s",
        filename,
        len(groupings),
        [(x[0], y[0]) for x, y in groupings],
    )

    for grouping in groupings:
        tag_1, indices_1 = grouping[0]
        tag_2, indices_2 = grouping[1]
        indices_1 = set(indices_1)
        indices_2 = set(indices_2)
        remained_df = df.drop(index=[
            x for x in df.index if (x not in indices_1 and x not in indices_2)
        ])
        data = anndata.AnnData(remained_df)
        new_grouping = [
            f"1-{tag_1}" if x in indices_1 else f"2-{tag_2}"
            for x in data.obs.index.tolist()
        ]
        test = de.test.t_test(data, new_grouping)

        summary_output_file = get_output_filename(filename,
                                                  f".{tag_1}_{tag_2}.out.csv")
        test.summary().to_csv(summary_output_file)
        LOGGER.info("summary saved, output=%s", summary_output_file)

        volcano_output_file = get_output_filename(
            filename, f".{tag_1}_{tag_2}.volcano.jpg")
        test.plot_volcano(
            corrected_pval=True,
            alpha=0.05,
            size=20,
            show=False,
            save=volcano_output_file,
            # highlight_ids=["NPPA"],
        )
        LOGGER.info("volcano saved, output=%s", volcano_output_file)
Code example #14
File: gini.py  Project: rdong08/GiniClust3
def clusterGini(adataSC, **kwargs):
    """
    Cluster cell based on Gini Index value.
    Params
    ------
    adataSC: Anndata
        The annotated data matrix of shape `n_obs` × `n_vars`.
        Rows correspond to cells and columns to genes.
    neighbors: int, optional (Default=5)
        The size of the local neighborhood used for manifold approximation. Larger
        values result in more global views of the manifold, while smaller values
        result in more local data being preserved. For rare cell identification
        this value should be in the range 2 to 15. Recommended: neighbors = 5.
    resolution: float, optional (Default=0.1)
        A parameter value controlling the coarseness of the clustering. Higher
        values lead to more clusters.
    method: string, optional (Default: 'leiden')
        method='louvain' or method='leiden'.

    Returns
    -------
    Returns the Gini-clustered AnnData object; the cluster labels are also
    stored in adataSC.obs['rare'].
    """

    cluster_neighbors = kwargs.get('neighbors', 5)
    cluster_resolution = kwargs.get('resolution', 0.1)
    cluster_method = kwargs.get('method', "leiden")
    if (cluster_method != "louvain" and cluster_method != "leiden"):
        raise SystemExit(
            "Only leiden or louvain cluster method is allowed in this step.")
    adataGini = adataSC[:, adataSC.var['gini']]
    scaleMatrix = arctanTransform((adataGini.X))
    adataScaleGini = anndata.AnnData(X=scaleMatrix)

    ###calculate neighbor and clustering###
    sc.pp.neighbors(adataScaleGini, use_rep='X', n_neighbors=cluster_neighbors)
    giniClust = []
    if (cluster_method == "louvain"):
        sc.tl.louvain(adataScaleGini, resolution=cluster_resolution)
        giniClust = adataScaleGini.obs['louvain'].values.tolist()
    else:
        sc.tl.leiden(adataScaleGini, resolution=cluster_resolution)
        giniClust = adataScaleGini.obs['leiden'].values.tolist()
    adataSC.obs['rare'] = giniClust
    return (adataScaleGini)
Code example #15
    def to_anndata(
            self,
            cell_properties: Union[bool, Sequence[str]] = False,
            cell_channel_properties: Union[bool, Sequence[str]] = False,
            x_cell_channel_property: Optional[str] = None
    ) -> 'anndata.AnnData':
        """Returns an :class:`anndata.AnnData` representation of the current instance

        :param cell_properties: list of cell properties (e.g. regionprops) to include; set to ``True`` to include all
        :param cell_channel_properties: list of cell channel properties (e.g. intensity values) to include; set to
            ``True`` to include all
        :param x_cell_channel_property: cell channel property to use for the main AnnData data matrix (X)
        :return: AnnData object, in which cell channel properties (e.g. intensity values) are stored as layers and cell
            properties (e.g. regionprops) are stored as observations
        """
        if anndata is None:
            raise RuntimeError('anndata is not installed')
        obs_data = None
        if cell_properties:
            cell_property_dataset = self.to_dataset(
                cell_properties=cell_properties)
            obs_data = utils.to_table(
                xr.concat(cell_property_dataset.data_vars.values(),
                          'property'))
        layers = {}
        if cell_channel_properties:
            cell_channel_property_dataset = self.to_dataset(
                cell_channel_properties=cell_channel_properties)
            layers = {
                property_name: da.values
                for property_name, da in
                cell_channel_property_dataset.data_vars.items()
            }
        return anndata.AnnData(
            X=getattr(self, x_cell_channel_property).values
            if x_cell_channel_property is not None else None,
            obs=pd.DataFrame(index=pd.Index(data=self.cell_ids.astype(str),
                                            name='cell'),
                             data=obs_data),
            var=pd.DataFrame(
                index=pd.Index(data=self.channel_names, name='channel')),
            layers=layers or None,
            shape=(self.num_cells, self.num_channels)
            if x_cell_channel_property is None else None,
        )
Code example #16
File: test_readwrite.py  Project: tkisss/anndata
def test_readwrite_loom(typ, tmp_path):
    X = typ(X_list)
    adata_src = ad.AnnData(X, obs=obs_dict, var=var_dict, uns=uns_dict)
    adata_src.obsm['X_a'] = np.zeros((adata_src.n_obs, 2))
    adata_src.varm['X_b'] = np.zeros((adata_src.n_vars, 3))
    adata_src.write_loom(tmp_path / 'test.loom', write_obsm_varm=True)

    adata = ad.read_loom(tmp_path / 'test.loom', sparse=typ is csr_matrix, cleanup=True)
    if isinstance(X, np.ndarray):
        assert np.allclose(adata.X, X)
    else:
        # TODO: this should not be necessary
        assert np.allclose(adata.X.toarray(), X.toarray())
    assert 'X_a' in adata.obsm_keys() and adata.obsm['X_a'].shape[1] == 2
    assert 'X_b' in adata.varm_keys() and adata.varm['X_b'].shape[1] == 3
    # as we called with `cleanup=True`
    assert 'oanno1b' in adata.uns['loom-obs']
    assert 'vanno2' in adata.uns['loom-var']
Code example #17
    def predict(self, adata, target_label, condition_key, cell_type_key, cell_type_to_predict, source_condition,
                target_condition):
        adata = remove_sparsity(adata)

        cell_type_adata = adata[adata.obs[cell_type_key] == cell_type_to_predict]
        source_adata = cell_type_adata[cell_type_adata.obs[condition_key] == source_condition]

        y_test = np.zeros(source_adata.shape[0]) + target_label
        real_loader = Loader(source_adata.X, labels=y_test, shuffle=False)

        pred = self.model_backend.get_reconstruction(real_loader)
        pred = np.nan_to_num(pred[0])

        pred_adata = anndata.AnnData(X=pred)
        pred_adata.obs[condition_key] = f"{cell_type_to_predict}_pred_{target_condition}"
        pred_adata.var_names = adata.var_names

        return pred_adata
Code example #18
File: io.py  Project: robindar/wot
def read_grp(path, feature_ids=None):
    elements = []
    with open(path) as fp:
        for line in fp:
            line = line.strip()
            if line == '' or line[0] == '#' or line[0] == '>':
                continue
            elements.append(line.lower())
    if feature_ids is None:
        feature_ids = list(sorted(elements))
    x = np.zeros((len(feature_ids), 1), dtype=np.int8)
    for i in range(len(feature_ids)):
        if feature_ids[i] in elements:
            x[i, 0] = 1
    set_name, _ = get_filename_and_extension(os.path.basename(path))
    obs = pd.DataFrame(index=feature_ids)
    var = pd.DataFrame(index=[set_name])
    return anndata.AnnData(X=x, obs=obs, var=var)
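
A minimal usage sketch, assuming the module's other helpers (e.g. get_filename_and_extension) are importable; the file contents and path are illustrative. Note that read_grp lowercases set members, so the feature_ids here are lowercase:

import os
import tempfile

with tempfile.NamedTemporaryFile("w", suffix=".grp", delete=False) as fp:
    fp.write("# a comment line\nGeneA\nGeneB\n")
    grp_path = fp.name

gene_set = read_grp(grp_path, feature_ids=["genea", "geneb", "genec"])
print(gene_set.X.ravel())  # [1 1 0]: membership indicator per feature
os.unlink(grp_path)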
Code example #19
 def louvain_clustering(self, res=1):
     if (not self.analysis_performed):
         print("Please run the SAM analysis first using 'run' after "
               "loading the data.")
     else:
         import anndata
         import scanpy as sc
         adata = anndata.AnnData(self.D,
                                 var={'genes': self.gene_names},
                                 obs={'cells': self.cell_names})
         adata.obsm['X_pca'] = self.wPCA_data
         sc.pp.neighbors(adata,
                         n_neighbors=self.k,
                         metric='correlation',
                         method='umap')
         sc.tl.louvain(adata, resolution=res)
         self.cluster_labels = adata.obs['louvain'].values.astype('int')
         self.output_vars['louvain_cluster_labels'] = self.cluster_labels
Code example #20
def my_read_hdf(path):
    f = h5py.File(path, 'r')
    group_GRCh38 = f['/GRCh38']
    cell_names = group_GRCh38['barcodes'][:]
    mat_shape = group_GRCh38['shape'][:]
    indptr = np.array(group_GRCh38['indptr'])
    indices = np.array(group_GRCh38['indices'])
    data = group_GRCh38['data'][:]
    genes = group_GRCh38['genes'][:]
    # The on-disk matrix is genes x cells in CSC layout; reading the same
    # buffers as CSR yields its transpose, i.e. cells x genes.
    X = csr_matrix((data, indices, indptr),
                   shape=(mat_shape[1], mat_shape[0])).toarray()
    adata = anndata.AnnData(X,
                            pd.DataFrame(index=cell_names),
                            pd.DataFrame(index=genes),
                            dtype=X.dtype.name)
    f.close()
    return adata
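
A minimal usage sketch; the path is illustrative, and the function assumes a CellRanger-style HDF5 file with a top-level /GRCh38 genome group:

adata = my_read_hdf("filtered_gene_bc_matrices_h5.h5")
print(adata.shape)  # (n_cells, n_genes)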
Code example #21
def cluster_features(features: pd.DataFrame, like=None):
    """Calculate leiden clustering of features.

    Specify filter of features using `like`.
    """
    # filter features
    if like is not None:
        features = features.filter(like=like)
    # create temporary adata to calculate the clustering
    adata = ad.AnnData(features)
    # important - feature values are not scaled, so need to scale them before PCA
    sc.pp.scale(adata)
    # calculate leiden clustering
    sc.pp.pca(adata, n_comps=min(10, features.shape[1] - 1))
    sc.pp.neighbors(adata)
    sc.tl.leiden(adata)

    return adata.obs["leiden"]
Code example #22
def test_double_index():

    X = np.array(X_list)
    adata = ad.AnnData(X,
                       obs=obs_dict,
                       var=var_dict,
                       uns=uns_dict,
                       dtype='int32')

    adata.filename = './test.h5ad'

    from pytest import raises
    with raises(ValueError):
        # no view of view of backed object currently
        adata[:2][:, 0]

    # close backing file
    adata.write()
Code example #23
File: dataset_util.py  Project: sidiatig/wot
def merge_datasets(*args):
    datasets = list(args)
    merged_x = np.concatenate([d.X for d in datasets])
    row_columns = set(datasets[0].obs.columns)
    if not all([set(d.obs.columns) == row_columns for d in datasets]):
        raise ValueError(
            "Unable to merge: incompatible metadata between datasets")
    merged_row_meta = pd.concat([d.obs for d in datasets], sort=True)
    if merged_row_meta.index.duplicated().any():
        raise ValueError(
            "Unable to merge: duplicate rows between datasets, cannot lose information"
        )
    col_index = datasets[0].var.index
    if not all([d.var.index.equals(col_index) for d in datasets]):
        raise ValueError(
            "Unable to merge: incompatible genes between datasets")
    merged_col_meta = datasets[0].var
    return anndata.AnnData(merged_x, merged_row_meta, merged_col_meta)
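
A minimal usage sketch with two tiny synthetic datasets that share the same genes (names are illustrative):

import numpy as np
import pandas as pd
import anndata

genes = pd.DataFrame(index=["g1", "g2", "g3"])
a = anndata.AnnData(np.ones((2, 3), dtype=np.float32),
                    obs=pd.DataFrame(index=["c1", "c2"]), var=genes)
b = anndata.AnnData(np.zeros((2, 3), dtype=np.float32),
                    obs=pd.DataFrame(index=["c3", "c4"]), var=genes)
merged = merge_datasets(a, b)
print(merged.shape)  # (4, 3): rows are concatenated, genes are unchanged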
Code example #24
def test_readwrite_roundtrip(typ, tmp_path, diskfmt, diskfmt2):
    tmpdir = Path(tmp_path)
    pth1 = tmpdir / f"first.{diskfmt}"
    write1 = lambda x: getattr(x, f"write_{diskfmt}")(pth1)
    read1 = lambda: getattr(ad, f"read_{diskfmt}")(pth1)
    pth2 = tmpdir / f"second.{diskfmt2}"
    write2 = lambda x: getattr(x, f"write_{diskfmt2}")(pth2)
    read2 = lambda: getattr(ad, f"read_{diskfmt2}")(pth2)

    adata1 = ad.AnnData(typ(X_list), obs=obs_dict, var=var_dict, uns=uns_dict)
    write1(adata1)
    adata2 = read1()
    write2(adata2)
    adata3 = read2()

    assert_equal(adata2, adata1)
    assert_equal(adata3, adata1)
Code example #25
def creatAnndataFromDf(df, **layerInfoDt):
    """
    dataframe转换成anndata
    df,
    layerInfoDt:
        key为layer名
        value为mtx
    均行为barcode 列为feature 维度相同
    """
    transformedAd = anndata.AnnData(
        X=df.values,
        obs=pd.DataFrame(index=df.index),
        var=pd.DataFrame(index=df.columns),
    )
    for layerName, layerMtx in layerInfoDt.items():
        transformedAd.layers[layerName] = layerMtx

    return transformedAd
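
A minimal usage sketch with synthetic data (the layer name is illustrative):

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.rand(3, 4),
                  index=[f"barcode{i}" for i in range(3)],
                  columns=[f"feature{j}" for j in range(4)])
transformedAd = creatAnndataFromDf(df, rawCounts=(df.values * 10).round())
print(transformedAd)  # AnnData object with 3 obs x 4 vars
print(transformedAd.layers["rawCounts"].shape)  # (3, 4)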
Code example #26
    def trajectories(self, populations):
        """
        Computes a trajectory for each population

        Parameters
        ----------
        self : wot.TransportMapModel
            The TransportMapModel used to find ancestors and descendants of the population
        populations : list of wot.Population
            The target populations, such as ones from self.population_from_cell_sets. The populations must be from the same time point.

        Returns
        -------
        trajectories : anndata.AnnData
            Rows : all cells, Columns : populations index. At point (i, j) : the probability that cell i is an
            ancestor/descendant of population j
        """
        wot.tmap.unique_timepoint(*populations)  # check for unique timepoint
        trajectories = []

        populations = Population.copy(*populations,
                                      normalize=True,
                                      add_missing=False)
        population_names = [p.name for p in populations]
        initial_populations = populations

        def update(head, populations_to_update):
            idx = 0 if head else len(trajectories)
            trajectories.insert(
                idx,
                np.array([pop.p for pop in populations_to_update]).T)

        update(True, populations)
        while self.can_pull_back(*populations):
            populations = self.pull_back(*populations, as_list=True)
            update(True, populations)
        populations = initial_populations
        while self.can_push_forward(*populations):
            populations = self.push_forward(*populations, as_list=True)
            update(False, populations)

        return anndata.AnnData(X=np.concatenate(trajectories),
                               obs=self.meta.copy(),
                               var=pd.DataFrame(index=population_names))
Code example #27
def from_scanpy_dir(path, cell_type_identifier, covariate_key):
    """
    Creates a compositional analysis data set from all scanpy data sets in a directory.

    To use this function, all data sets need to have one common column in adata.obs that contains the cell type assignment.
    Also, the covariates need to be stored under the same key in adata.uns.

    Usage: data = from_scanpy_dir("./path/to/directory", cell_type_identifier="Louvain", covariate_key="covariates")

    Parameters
    ----------
    path -- str
        path to directory
    cell_type_identifier -- str
        column name in adata.obs that specifies the cell types
    covariate_key -- str
        key for adata.uns, where the covariate values are stored

    Returns
    -------
    data -- CompositionalData object
        A compositional analysis data set
    """

    count_data = pd.DataFrame()
    covariate_data = pd.DataFrame()

    filenames = os.listdir(path)
    for f in filenames:
        adata = ad.read_h5ad(os.path.join(path, f))

        cell_counts, covs = from_scanpy(adata, cell_type_identifier,
                                        covariate_key)
        count_data = count_data.append(cell_counts, ignore_index=True)
        covariate_data = covariate_data.append(pd.Series(covs),
                                               ignore_index=True)

    # Replace NaNs
    count_data = count_data.fillna(0)
    covariate_data = covariate_data.fillna(0)

    return ad.AnnData(X=count_data.values,
                      var=count_data.sum(axis=0).rename("n_cells").to_frame(),
                      obs=covariate_data)
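
A minimal usage sketch; the directory and keys are illustrative and depend on how the .h5ad files were annotated:

data = from_scanpy_dir("./h5ad_files",
                       cell_type_identifier="louvain",
                       covariate_key="covariates")
print(data.X.shape)  # samples x cell types
print(data.obs)      # one row of covariates per sample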
Code example #28
    def to_mmd_layer(self,
                     adata,
                     encoder_labels,
                     feed_fake=0,
                     return_adata=True):
        """
            Map `adata` in to the MMD layer of trVAE network. This function will compute output
            activation of MMD layer in trVAE.
            # Parameters
                adata: `~anndata.AnnData`
                    Annotated data matrix to be mapped to latent space. `data.X` has to be in shape [n_obs, n_vars].
                encoder_labels: numpy nd-array
                    `numpy nd-array` of labels to be fed as CVAE's condition array.
                feed_fake: int
                    if `feed_fake` is non-negative, `decoder_labels` will be filled with the `feed_fake` value;
                    otherwise, `decoder_labels` will be identical to `encoder_labels`.
                return_adata: boolean
                    if `True`, returns the result as an `anndata` object; otherwise returns the raw `numpy nd-array`.
            # Returns
                output: `~anndata.AnnData`
                    returns `anndata` object containing MMD latent space encoding of 'adata'
        """
        if feed_fake >= 0:
            decoder_labels = np.zeros(shape=encoder_labels.shape) + feed_fake
        else:
            decoder_labels = encoder_labels

        encoder_labels = to_categorical(encoder_labels,
                                        num_classes=self.n_conditions)
        decoder_labels = to_categorical(decoder_labels,
                                        num_classes=self.n_conditions)

        adata = remove_sparsity(adata)

        x = [adata.X, encoder_labels, decoder_labels]
        mmd_latent = self.cvae_model.predict(x)[1]
        mmd_latent = np.nan_to_num(mmd_latent)
        if return_adata:
            output = anndata.AnnData(X=mmd_latent)
            output.obs = adata.obs.copy(deep=True)
        else:
            output = mmd_latent

        return output
Code example #29
File: utils.py  Project: mbernste/MAGIC
def convert_to_same_format(data,
                           target_data,
                           columns=None,
                           prevent_sparse=False):
    """Convert data to same format as target data."""
    # create new data object
    if scprep.utils.is_sparse_dataframe(target_data):
        if prevent_sparse:
            data = pd.DataFrame(data)
        else:
            data = scprep.utils.SparseDataFrame(data)
        pandas = True
    elif isinstance(target_data, pd.DataFrame):
        data = pd.DataFrame(data)
        pandas = True
    elif is_anndata(target_data):
        data = anndata.AnnData(data)
        pandas = False
    else:
        # nothing to do
        return data
    # retrieve column names
    target_columns = target_data.columns if pandas else target_data.var
    # subset column names
    try:
        if columns is not None:
            if pandas:
                target_columns = target_columns[columns]
            else:
                target_columns = target_columns.iloc[columns]
    except (KeyError, IndexError, ValueError):
        # keep the original column names
        if pandas:
            target_columns = columns
        else:
            target_columns = pd.DataFrame(index=columns)
    # set column names on new data object
    if pandas:
        data.columns = target_columns
        data.index = target_data.index
    else:
        data.var = target_columns
        data.obs = target_data.obs
    return data
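
A minimal usage sketch with synthetic inputs (scprep and anndata must be importable, since the function probes the target type first):

import numpy as np
import pandas as pd

target = pd.DataFrame(np.ones((3, 2)), columns=["g1", "g2"],
                      index=["c1", "c2", "c3"])
raw = np.zeros((3, 2))
out = convert_to_same_format(raw, target)
print(type(out).__name__)    # DataFrame
print(out.columns.tolist())  # ['g1', 'g2'], inherited from target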
Code example #30
def generate_normal_uncorrelated(N, D, K, n_total, noise_std_true=1):
    """
    Scenario 1: Normally distributed, independent covariates

    Parameters
    ----------
    N -- int
        Number of samples
    D -- int
        Number of covariates
    K -- int
        Number of cell types
    n_total -- list
        Number of individual cells per sample
    noise_std_true -- float
        noise level. 0: No noise

    Returns
    -------
    data
        Anndata object
    """

    # Generate random composition parameters
    b_true = np.random.normal(0, 1, size=K).astype(np.float32)  # bias (alpha)
    w_true = np.random.normal(0, 1, size=(D, K)).astype(np.float32)  # weights (beta)

    # Generate random covariate matrix
    x = np.random.normal(0, 1, size=(N, D)).astype(np.float32)
    noise = noise_std_true * np.random.randn(N, 1).astype(np.float32)

    # Generate y
    y = np.zeros([N, K], dtype=np.float32)
    for i in range(N):
        # Concentration should sum to 1 for each sample
        concentration = softmax(x[i, :].T@w_true + b_true + noise[i, :]).astype(np.float32)
        y[i, :] = np.random.multinomial(n_total[i], concentration).astype(np.float32)

    x_names = ["x_" + str(n) for n in range(x.shape[1])]
    x_df = pd.DataFrame(x, columns=x_names)

    data = ad.AnnData(X=y, obs=x_df, uns={"b_true": b_true, "w_true": w_true})

    return data
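
A minimal usage sketch (assumes softmax in the enclosing module is, e.g., scipy.special.softmax):

data = generate_normal_uncorrelated(N=20, D=3, K=5, n_total=[1000] * 20)
print(data.X.shape)               # (20, 5): cell-type counts per sample
print(data.obs.columns.tolist())  # ['x_0', 'x_1', 'x_2']
print(data.uns["b_true"])         # ground-truth bias used in the simulation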