Example #1
def test_concat_size_0_dim():
    # https://github.com/theislab/anndata/issues/526
    a = gen_adata((5, 10))
    b = gen_adata((5, 0))

    assert concat([a, b], axis=0).shape == (10, 0)
    assert concat([a, b], axis=1).shape == (5, 10)
Example #2
def test_batch_key(axis):
    """Test that concat only adds a label if the key is provided"""

    def get_annot(adata):
        return getattr(adata, ("obs", "var")[axis])

    lhs = gen_adata((10, 10))
    rhs = gen_adata((10, 12))

    # There is probably a prettier way to do this
    annot = get_annot(concat([lhs, rhs], axis=axis))
    assert (
        list(
            annot.columns.difference(
                get_annot(lhs).columns.union(get_annot(rhs).columns)
            )
        )
        == []
    )

    batch_annot = get_annot(concat([lhs, rhs], axis=axis, label="batch"))
    assert list(
        batch_annot.columns.difference(
            get_annot(lhs).columns.union(get_annot(rhs).columns)
        )
    ) == ["batch"]
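
A quick sketch of what the test asserts: `concat` adds a source column only when `label` is passed. The keys "x"/"y" and the column name "batch" are illustrative:

import numpy as np
import anndata as ad

lhs = ad.AnnData(np.ones((2, 3)))
rhs = ad.AnnData(np.zeros((2, 3)))
merged = ad.concat({"x": lhs, "y": rhs}, label="batch", index_unique="-")
print(merged.obs["batch"].tolist())  # ['x', 'x', 'y', 'y']
# Without `label`, no extra column is added.
print("batch" in ad.concat({"x": lhs, "y": rhs}, index_unique="-").obs)  # False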
Example #3
def test_nan_merge(axis, join_type, array_type):
    # concat_dim = ("obs", "var")[axis]
    alt_dim = ("var", "obs")[axis]
    mapping_attr = f"{alt_dim}m"
    adata_shape = (20, 10)

    arr = array_type(
        sparse.random(adata_shape[1 - axis], 10, density=0.1, format="csr")
    )
    arr_nan = arr.copy()
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=sparse.SparseEfficiencyWarning)
        for _ in range(10):
            arr_nan[
                np.random.choice(arr.shape[0]), np.random.choice(arr.shape[1])
            ] = np.nan

    _data = {"X": sparse.csr_matrix(adata_shape), mapping_attr: {"arr": arr_nan}}
    orig1 = AnnData(**_data)
    orig2 = AnnData(**_data)
    result = concat([orig1, orig2], axis=axis, merge="same")

    assert_equal(getattr(orig1, mapping_attr), getattr(result, mapping_attr))

    orig_nonan = AnnData(
        **{"X": sparse.csr_matrix(adata_shape), mapping_attr: {"arr": arr}}
    )
    result_nonan = concat([orig1, orig_nonan], axis=axis, merge="same")

    assert len(getattr(result_nonan, mapping_attr)) == 0
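
A sketch of the `merge="same"` semantics the test relies on: an aligned element is kept only when it is equal across all inputs, with identical NaN patterns counting as equal. The toy arrays are illustrative:

import numpy as np
import anndata as ad

arr = np.array([[1.0, np.nan], [2.0, 3.0], [4.0, 5.0]])
a = ad.AnnData(np.zeros((2, 3)), varm={"arr": arr})
b = ad.AnnData(np.zeros((2, 3)), varm={"arr": arr.copy()})
print("arr" in ad.concat([a, b], merge="same").varm)  # True: identical, NaNs included
c = ad.AnnData(np.zeros((2, 3)), varm={"arr": np.nan_to_num(arr)})
print("arr" in ad.concat([a, c], merge="same").varm)  # False: NaN vs 0.0 differs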
Example #4
def test_concat_names(axis):
    def get_annot(adata):
        return getattr(adata, ("obs", "var")[axis])

    lhs = gen_adata((10, 10))
    rhs = gen_adata((10, 10))

    assert not get_annot(concat([lhs, rhs], axis=axis)).index.is_unique
    assert get_annot(concat([lhs, rhs], axis=axis, index_unique="-")).index.is_unique
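
The same behavior in miniature, using the default integer-string names; the "-" separator is the one the test passes:

import numpy as np
import anndata as ad

a = ad.AnnData(np.ones((2, 2)))  # obs names default to '0', '1'
b = ad.AnnData(np.ones((2, 2)))
print(ad.concat([a, b]).obs_names.is_unique)  # False
print(ad.concat([a, b], index_unique="-").obs_names.tolist())
# ['0-0', '1-0', '0-1', '1-1']: each input's key (here its position) is appended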
Example #5
def test_pairwise_concat(axis, array_type):
    dim_sizes = [[100, 200, 50], [50, 50, 50]]
    if axis:
        dim_sizes.reverse()
    Ms, Ns = dim_sizes
    dim = ("obs", "var")[axis]
    alt = ("var", "obs")[axis]
    dim_attr = f"{dim}p"
    alt_attr = f"{alt}p"

    def gen_dim_array(m):
        return array_type(sparse.random(m, m, format="csr", density=0.1))

    adatas = {
        k: AnnData(
            **{
                "X": sparse.csr_matrix((m, n)),
                "obsp": {
                    "arr": gen_dim_array(m)
                },
                "varp": {
                    "arr": gen_dim_array(n)
                },
            })
        for k, m, n in zip("abc", Ms, Ns)
    }

    w_pairwise = concat(adatas, axis=axis, label="orig", pairwise=True)
    wo_pairwise = concat(adatas, axis=axis, label="orig", pairwise=False)

    # Check that argument controls whether elements are included
    assert getattr(wo_pairwise, dim_attr) == {}
    assert getattr(w_pairwise, dim_attr) != {}

    # Check values of included elements
    full_inds = np.arange(w_pairwise.shape[axis])
    groups = getattr(w_pairwise, dim).groupby("orig").indices
    for k, inds in groups.items():
        orig_arr = getattr(adatas[k], dim_attr)["arr"]
        full_arr = getattr(w_pairwise, dim_attr)["arr"]

        # Check original values are intact
        assert_equal(orig_arr, _subset(full_arr, (inds, inds)))
        # Check that entries are filled with zeroes
        assert_equal(
            sparse.csr_matrix((len(inds), len(full_inds) - len(inds))),
            _subset(full_arr, (inds, np.setdiff1d(full_inds, inds))),
        )
        assert_equal(
            sparse.csr_matrix((len(full_inds) - len(inds), len(inds))),
            _subset(full_arr, (np.setdiff1d(full_inds, inds), inds)),
        )

    # Check that argument does not affect alternative axis
    assert "arr" in getattr(
        concat(adatas, axis=axis, pairwise=False, merge="first"), alt_attr)
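
A condensed sketch of the `pairwise` flag: by default pairwise elements (here `obsp`) are dropped, while `pairwise=True` assembles a block-diagonal result with zero-filled cross-dataset blocks. The toy matrices are illustrative:

import numpy as np
from scipy import sparse
import anndata as ad

a = ad.AnnData(sparse.csr_matrix((3, 2)), obsp={"arr": np.eye(3)})
b = ad.AnnData(sparse.csr_matrix((2, 2)), obsp={"arr": np.eye(2)})
print(len(ad.concat([a, b]).obsp))                         # 0: dropped by default
print(ad.concat([a, b], pairwise=True).obsp["arr"].shape)  # (5, 5)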
Example #6
def test_transposed_concat(array_type, axis, join_type, merge_strategy, fill_val):
    lhs = gen_adata((10, 10), X_type=array_type)
    rhs = gen_adata((10, 12), X_type=array_type)

    a = concat([lhs, rhs], axis=axis, join=join_type, merge=merge_strategy)
    b = concat(
        [lhs.T, rhs.T], axis=abs(axis - 1), join=join_type, merge=merge_strategy
    ).T

    assert_equal(a, b)
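
The invariant in miniature: concatenating along one axis should equal transposing, concatenating along the other axis, and transposing back. Toy data, default join:

import numpy as np
import anndata as ad

a = ad.AnnData(np.arange(6, dtype=np.float32).reshape(2, 3))
b = ad.AnnData(np.arange(6, 12, dtype=np.float32).reshape(2, 3))
left = ad.concat([a, b], axis=1, index_unique="-")
right = ad.concat([a.T, b.T], axis=0, index_unique="-").T
print(np.array_equal(left.X, right.X))  # True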
Example #7
def test_concat_null_X():
    adatas_orig = {k: gen_adata((20, 10)) for k in list("abc")}
    adatas_no_X = {}
    for k, v in adatas_orig.items():
        v = v.copy()
        del v.X
        adatas_no_X[k] = v

    orig = concat(adatas_orig, index_unique="-")
    no_X = concat(adatas_no_X, index_unique="-")
    del orig.X

    assert_equal(no_X, orig)
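
A sketch of concatenating X-less objects, as this test does after `del v.X`; the shape is carried by `obs`/`var` alone and the result's `.X` stays `None`:

import pandas as pd
import anndata as ad

a = ad.AnnData(obs=pd.DataFrame(index=["c1", "c2"]),
               var=pd.DataFrame(index=["g1", "g2"]))
b = ad.AnnData(obs=pd.DataFrame(index=["c3"]),
               var=pd.DataFrame(index=["g1", "g2"]))
merged = ad.concat([a, b])
print(merged.shape, merged.X)  # (3, 2) None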
Example #8
def make_raw_dataset(samples, path, name):
    """
    Function to load, preprocess and concatenate a dataset from multiple RNAseq
     samples
    Inputs:
     samples, dictionary of sample file prefixes as keys and timepoint metadata
      as values
     path, path to directory containing sample files
     name, dataset name for labeling AnnData object metadata
    Output: AnnData object of concatenated samples, annotated with dataset,
     timepoint, and sample id labels
    """
    anndata_dict = {}

    for sm in samples.keys():
        print(sm)

        # read in data from GEO file
        data = sc.read_10x_mtx(path, prefix=sm, cache=True)

        # add metadata information
        data.obs['dataset'] = name
        data.obs['timepoint'] = samples[sm]

        # add to dict for concatenation
        anndata_dict[sm] = data

    # concatenate samples
    data_full = ad.concat(anndata_dict,
                          join='outer',
                          label='sample id',
                          index_unique='_',
                          fill_value=0.0)
    return data_full
Example #9
def get_experiments_in_one_anndata(
    experiments_data_dir: Path, meta_data_path: Path,
    batch_filter_functions: List[Callable[[pd.DataFrame], pd.DataFrame]]
) -> ad.AnnData:
    # Read annotation file
    metadata = pd.read_table(meta_data_path)
    if config.DEBUG_MODE:
        batch_filter_functions.append(
            lambda df: df.head(config.DEBUG_N_BATCHES))
    for filter_func in batch_filter_functions:
        metadata = filter_func(metadata)

    # Read all plates into anndata and merge them
    col_names = metadata.columns
    adatas = process_map(partial(get_single_batch,
                                 col_names=col_names,
                                 experiments_data_dir=experiments_data_dir),
                         list(metadata.iterrows()),
                         max_workers=config.IO_N_WORKERS,
                         desc="loading relevant batches",
                         unit="batch")
    print("merging to single adata")
    adata = ad.concat(adatas, merge="same")
    print(f"converting adata to sparse matrix")
    adata.X = csr_matrix(adata.X)
    print("dropping Mouse columns, some bug with that column")
    adata.obs.drop(['Mouse'], axis='columns', inplace=True)
    return adata
Example #10
def test_full_selection(adatas):
    dat = AnnCollection(adatas, index_unique="_")
    adt_concat = ad.concat(adatas, index_unique="_")

    # sorted selection from one adata
    dat_view = dat[:2, :2]
    for adata in (adatas[0], adt_concat):
        adt_view = adata[:2, :2]
        np.testing.assert_allclose(_dense(dat_view.X), _dense(adt_view.X))
        np.testing.assert_allclose(dat_view.obsm["o_test"],
                                   adt_view.obsm["o_test"])
        np.testing.assert_array_equal(dat_view.obs["a_test"],
                                      adt_view.obs["a_test"])

    # sorted and unsorted selection from 2 adatas
    rand_idxs = np.random.choice(dat.shape[0], 4, replace=False)
    for select in (slice(2, 5), [4, 2, 3], rand_idxs):
        dat_view = dat[select, :2]
        adt_view = adt_concat[select, :2]
        np.testing.assert_allclose(_dense(dat_view.X), _dense(adt_view.X))
        np.testing.assert_allclose(dat_view.obsm["o_test"],
                                   adt_view.obsm["o_test"])
        np.testing.assert_array_equal(dat_view.obs["a_test"],
                                      adt_view.obs["a_test"])

    # test duplicate selection
    idxs = [1, 2, 4, 4]
    dat_view = dat[idxs, :2]
    np.testing.assert_allclose(_dense(dat_view.X),
                               np.array([[4, 5], [7, 8], [9, 8], [9, 8]]))
Example #11
    def merge(cls, cms, samplenames=None):
        """ Merge several countmatices.

        Matrices must have the same row dimensionality

        Parameters
        ----------
        cms : list(CountMatrix objects)
            List of count matrices
        samplenames : list(str) or None
            Associated sample labels. If None, a default sample name 'sample_x' is used.

        Returns
        -------
        CountMatrix object
        """

        for i, cm in enumerate(cms):
            if samplenames is not None:
                cm.adata.var.loc[:, 'sample'] = samplenames[i]
            if 'sample' not in cm.adata.var:
                cm.adata.var.loc[:, 'sample'] = f'sample_{i}'

        adata = ad.concat([cm.adata for cm in cms], axis=1)
        adata.obs = cms[0].adata.obs

        for i, cm in enumerate(cms):
            # Guard against samplenames=None, which would otherwise raise a
            # TypeError when indexed below; fall back to the default name.
            name = samplenames[i] if samplenames is not None else f'sample_{i}'
            for k in dict(cm.adata.obsm):
                adata.obsm[f'{k}_{name}'] = cm.adata.obsm[k]

        return cls(adata.X, adata.obs, adata.var, adata.uns, adata.obsm,
                   adata.varm)
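
A sketch of why `adata.obs` is reassigned above: by default, `concat` along `axis=1` does not merge the annotations of the non-concatenated axis, so `obs` comes back empty; `merge="same"` (or the manual reassignment) restores it. Toy data:

import numpy as np
import pandas as pd
import anndata as ad

a = ad.AnnData(np.ones((2, 1)), obs=pd.DataFrame({"region": ["r1", "r2"]}))
b = ad.AnnData(np.ones((2, 1)), obs=pd.DataFrame({"region": ["r1", "r2"]}))
print(ad.concat([a, b], axis=1, index_unique="-").obs.columns.tolist())
# []
print(ad.concat([a, b], axis=1, index_unique="-", merge="same").obs.columns.tolist())
# ['region']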
Example #12
def _load_spleen_lymph_cite_seq(
    save_path: str = "data/",
    protein_join: str = "inner",
    remove_outliers: bool = True,
):
    """
    Immune cells from the murine spleen and lymph nodes [GayosoSteier21]_.

    This dataset was used throughout the totalVI manuscript, where it was named SLN-all.

    Parameters
    ----------
    save_path
        Location to use when saving/loading the data.
    protein_join
        Whether to take an inner join or outer join of proteins
    remove_outliers
        Whether to remove clusters annotated as doublet or low quality

    Returns
    -------
    `AnnData` with `.obsm["protein_expression"]`

    Missing protein values are zero, and are identified during `AnnData` setup.
    """
    url = "https://github.com/YosefLab/scVI-data/raw/master/sln_111.h5ad?raw=true"
    save_fn = "sln_111.h5ad"
    _download(url, save_path, save_fn)
    dataset1 = anndata.read_h5ad(os.path.join(save_path, save_fn))
    dataset1.obsm["isotypes_htos"] = dataset1.obsm["htos"].copy()
    del dataset1.obsm["htos"]

    url = "https://github.com/YosefLab/scVI-data/raw/master/sln_208.h5ad?raw=true"
    save_fn = "sln_208.h5ad"
    _download(url, save_path, save_fn)
    dataset2 = anndata.read_h5ad(os.path.join(save_path, save_fn))

    common_genes = dataset1.var_names.intersection(dataset2.var_names)
    dataset1 = dataset1[:, common_genes]
    dataset2 = dataset2[:, common_genes]

    del dataset1.uns["protein_names"]
    del dataset2.uns["protein_names"]

    dataset = anndata.concat(
        [dataset1, dataset2],
        join=protein_join,
    )
    dataset.obsm["protein_expression"] = dataset.obsm[
        "protein_expression"].fillna(0)

    if remove_outliers:
        include_cells = [
            c not in [
                "16,0", "17", "19", "21", "23", "24,0", "24,2", "25", "29"
            ] for c in dataset.obs["leiden_subclusters"]
        ]
        dataset = dataset[include_cells].copy()

    return dataset
Example #13
    def add_promoter(self, file_promoter):
        if os.path.exists(self.file_peaks_sort):
            os.remove(self.file_peaks_sort)
        self.generate_peaks_file()
        file_peaks_promoter = os.path.join(self.path_process, 'peaks_promoter.txt')
        os.system(f"bedtools intersect -a {self.file_peaks_sort} -b {file_promoter} -wao "
                  f"> {file_peaks_promoter}")
        dict_promoter = defaultdict(list)
        with open(file_peaks_promoter, 'r') as w_pro:
            for line in w_pro:
                list_line = line.strip().split('\t')
                if list_line[4] == '.':
                    continue
                gene = list_line[7].strip().split('<-')[0]
                peak = list_line[3]
                dict_promoter[gene].append(peak)

        all_genes = dict_promoter.keys()
        list_peaks_1 = []
        list_genes_1 = []
        list_peaks_2 = []
        list_genes_2 = []
        for gene in all_genes:
            sub_peaks = dict_promoter[gene]
            if len(sub_peaks) == 1:
                list_peaks_1.extend(sub_peaks)
                list_genes_1.append(gene)
            else:
                list_genes_2.extend([gene for _ in range(len(sub_peaks))])
                list_peaks_2.extend(sub_peaks)
        adata_gene_1 = self.adata[:, list_peaks_1]
        df_gene_peak_1 = pd.DataFrame(adata_gene_1.X, index=adata_gene_1.obs.index,
                                      columns=list_genes_1)
        adata_gene_2 = self.adata[:, list_peaks_2]
        df_gene_peak_2 = pd.DataFrame(
            adata_gene_2.X,
            index=adata_gene_2.obs.index,
            columns=pd.MultiIndex.from_arrays([list_genes_2, list_peaks_2], names=['gene', 'peak']))
        df_gene_peak_2_t = df_gene_peak_2.T
        df_gene_peak_2_t_gene = df_gene_peak_2_t.groupby('gene').apply(lambda x: x.sum())
        df_gene_peak_2 = df_gene_peak_2_t_gene.T
        all_cols = set(list_peaks_1 + list_peaks_2)
        other_cols = set(self.adata.var.index).difference(all_cols)
        self.other_peaks = other_cols
        adata_other = self.adata[:, [one_peak for one_peak in self.adata.var.index
                                     if one_peak in other_cols]]
        adata_other.var['cRE_type'] = np.full(adata_other.n_vars, 'Other')

        df_gene = pd.concat([df_gene_peak_1, df_gene_peak_2], axis=1)
        adata_promoter = \
            ad.AnnData(X=df_gene,
                       var=pd.DataFrame(data={'cRE_type': np.full(df_gene.shape[1], 'Promoter')},
                                        index=df_gene.columns),
                       obs=pd.DataFrame(index=df_gene.index))
        self.all_genes = set(df_gene.columns)
        adata_merge = ad.concat([adata_promoter, adata_other], axis=1)
        self.adata_merge = adata_merge

        return
Example #14
def _load_pbmcs_10x_cite_seq(
    save_path: str = "data/",
    protein_join: str = "inner",
):
    """
    Filtered PBMCs from 10x Genomics profiled with RNA and protein.

    Datasets were filtered for doublets and other outliers as in
    https://github.com/YosefLab/totalVI_reproducibility/blob/master/data/data_filtering_scripts/pbmc_10k/pbmc_10k.py

    Parameters
    ----------
    save_path
        Location to use when saving/loading the data.
    protein_join
        Whether to take an inner join or outer join of proteins

    Returns
    -------
    `AnnData` with `.obsm["protein_expression"]`

    Missing protein values are zero, and are identified during `AnnData` setup.
    """
    url = "https://github.com/YosefLab/scVI-data/raw/master/pbmc_10k_protein_v3.h5ad?raw=true"
    save_fn = "pbmc_10k_protein_v3.h5ad"
    _download(url, save_path, save_fn)
    dataset1 = anndata.read_h5ad(os.path.join(save_path, save_fn))
    dataset1.obs["batch"] = "PBMC10k"

    url = "https://github.com/YosefLab/scVI-data/raw/master/pbmc_5k_protein_v3.h5ad?raw=true"
    save_fn = "pbmc_5k_protein_v3.h5ad"
    _download(url, save_path, save_fn)
    dataset2 = anndata.read_h5ad(os.path.join(save_path, save_fn))
    dataset2.obs["batch"] = "PBMC5k"

    common_genes = dataset1.var_names.intersection(dataset2.var_names)
    dataset1 = dataset1[:, common_genes]
    dataset2 = dataset2[:, common_genes]
    dataset1.obsm["protein_expression"] = pd.DataFrame(
        dataset1.obsm["protein_expression"],
        columns=dataset1.uns["protein_names"],
        index=dataset1.obs_names,
    )
    dataset2.obsm["protein_expression"] = pd.DataFrame(
        dataset2.obsm["protein_expression"],
        columns=dataset2.uns["protein_names"],
        index=dataset2.obs_names,
    )
    del dataset1.uns["protein_names"]
    del dataset2.uns["protein_names"]

    dataset = anndata.concat([dataset1, dataset2], join=protein_join)
    dataset.obsm["protein_expression"] = dataset.obsm[
        "protein_expression"].fillna(0)

    return dataset
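
A sketch of the `fillna(0)` step above: under an outer join, proteins absent from one dataset show up as NaN columns in the concatenated `.obsm["protein_expression"]` DataFrame. The protein names here are illustrative:

import numpy as np
import pandas as pd
import anndata as ad

d1 = ad.AnnData(np.ones((2, 3)))
d1.obsm["protein_expression"] = pd.DataFrame({"CD3": [1.0, 2.0]},
                                             index=d1.obs_names)
d2 = ad.AnnData(np.ones((2, 3)))
d2.obsm["protein_expression"] = pd.DataFrame({"CD3": [3.0, 4.0],
                                              "CD19": [5.0, 6.0]},
                                             index=d2.obs_names)
merged = ad.concat([d1, d2], join="outer", index_unique="-")
print(merged.obsm["protein_expression"]["CD19"].isna().sum())  # 2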
Example #15
def test_concat_outer_aligned_mapping(elem):
    a = gen_adata((5, 5))
    b = gen_adata((3, 5))
    del b.obsm[elem]

    concated = concat({"a": a, "b": b}, join="outer", label="group")
    result = concated.obsm[elem][concated.obs["group"] == "b"]

    check_filled_like(result, elem_name=f"obsm/{elem}")
Example #16
def test_concat_annot_join(obsm_adatas, join_type):
    adatas = [
        AnnData(sparse.csr_matrix(a.shape), obs=a.obsm["df"], var=a.var)
        for a in obsm_adatas
    ]
    pd.testing.assert_frame_equal(
        concat(adatas, join=join_type).obs,
        pd.concat([a.obs for a in adatas], join=join_type),
    )
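
The same invariant in miniature; with disjoint columns and an outer join, `.obs` of the result should match `pd.concat` of the inputs' `.obs`:

import pandas as pd
import anndata as ad
from scipy import sparse

a = ad.AnnData(sparse.csr_matrix((2, 2)),
               obs=pd.DataFrame({"x": [1, 2]}, index=["c1", "c2"]))
b = ad.AnnData(sparse.csr_matrix((2, 2)),
               obs=pd.DataFrame({"y": [3, 4]}, index=["c3", "c4"]))
pd.testing.assert_frame_equal(
    ad.concat([a, b], join="outer").obs,
    pd.concat([a.obs, b.obs], join="outer"),
)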
Example #17
def merge_datasets(
    datasets: Sequence[sc.AnnData],
    symbol_in_n_datasets: Union[int, None] = None,
    min_batch_size=25,
) -> sc.AnnData:
    """
    Concatenate the anndata objects in `datasets`. Keeps symbols that are
    present in at least `symbol_in_n_datasets` datasets. If
    `symbol_in_n_datasets` is None, only symbols present in all datasets are
    kept. Batches with fewer than `min_batch_size` cells are dropped.

    Only keeps X, obs, var of all datasets.

    Adds log-norm transformed values to adata.raw.
    """
    if symbol_in_n_datasets is None:
        symbol_in_n_datasets = len(datasets)
    gene_ids = [set(adata.var_names.values) for adata in datasets]
    symbol_count = Counter(itertools.chain.from_iterable(gene_ids))
    keep_symbols = set(
        [sym for sym, c in symbol_count.items() if c >= symbol_in_n_datasets]
    )

    datasets_subset = list()
    for dataset in datasets:
        tmp_sym = sorted(list(set(dataset.var_names.values) & keep_symbols))
        tmp_adata = dataset[:, tmp_sym]
        tmp_obs = tmp_adata.obs.loc[:, MANDATORY_COLS + ["cell_type"]]
        # get rid of everything except X, obs, var
        datasets_subset.append(
            sc.AnnData(X=tmp_adata.X, obs=tmp_obs, var=tmp_adata.var)
        )

    for dataset in datasets:
        validate_adata(dataset)

    adata_merged = anndata.concat(datasets_subset, index_unique="-", join="outer")

    # add log-norm values to `.raw`
    adata_merged_raw = adata_merged.copy()
    sc.pp.normalize_total(adata_merged_raw)
    sc.pp.log1p(adata_merged_raw)
    adata_merged.raw = adata_merged_raw

    # Exclude too small batches.
    adata_merged.obs["batch"] = [
        f"{dataset}_{sample}"
        for dataset, sample in zip(
            adata_merged.obs["dataset"], adata_merged.obs["sample"]
        )
    ]

    batch_size = adata_merged.obs.groupby("batch").size()
    # Use the `min_batch_size` parameter instead of the hardcoded 25.
    keep_batches = batch_size[batch_size >= min_batch_size].index.values
    adata_merged = adata_merged[adata_merged.obs["batch"].isin(keep_batches), :].copy()

    return adata_merged
Example #18
def test_concat_size_0_dim(axis, join_type, merge_strategy, shape):
    # https://github.com/theislab/anndata/issues/526
    a = gen_adata((5, 7))
    b = gen_adata(shape)
    alt_axis = 1 - axis
    dim = ("obs", "var")[axis]

    expected_size = expected_shape(a, b, axis=axis, join=join_type)
    result = concat(
        {
            "a": a,
            "b": b
        },
        axis=axis,
        join=join_type,
        merge=merge_strategy,
        pairwise=True,
        index_unique="-",
    )
    assert result.shape == expected_size

    if join_type == "outer":
        # Check new entries along axis of concatenation
        axis_new_inds = axis_labels(result, axis).str.endswith("-b")
        altaxis_new_inds = ~axis_labels(result, alt_axis).isin(
            axis_labels(a, alt_axis))
        axis_idx = make_idx_tuple(axis_new_inds, axis)
        altaxis_idx = make_idx_tuple(altaxis_new_inds, 1 - axis)

        check_filled_like(result.X[axis_idx], elem_name="X")
        check_filled_like(result.X[altaxis_idx], elem_name="X")
        for k, elem in getattr(result, "layers").items():
            check_filled_like(elem[axis_idx], elem_name=f"layers/{k}")
            check_filled_like(elem[altaxis_idx], elem_name=f"layers/{k}")

        if shape[axis] > 0:
            b_result = result[axis_idx].copy()
            mapping_elem = f"{dim}m"
            setattr(b_result, f"{dim}_names", getattr(b, f"{dim}_names"))
            for k, result_elem in getattr(b_result, mapping_elem).items():
                elem_name = f"{mapping_elem}/{k}"
                # pd.concat can have unintuitive return types; this is similar to numpy promotion
                if isinstance(result_elem, pd.DataFrame):
                    assert_equal(
                        getattr(b, mapping_elem)[k].astype(object),
                        result_elem.astype(object),
                        elem_name=elem_name,
                    )
                else:
                    assert_equal(
                        getattr(b, mapping_elem)[k],
                        result_elem,
                        elem_name=elem_name,
                    )
Example #19
    def add_promoter(self, file_promoter, num_threads=20):
        if not os.path.exists(self.file_peaks_sort):
            self.generate_peaks_file()
        file_peaks_promoter = os.path.join(self.path_process,
                                           'peaks_promoter.txt')
        os.system(
            f"bedtools intersect -a {self.file_peaks_sort} -b {file_promoter} -wao "
            f"> {file_peaks_promoter}")
        dict_promoter = defaultdict(list)
        all_peaks = set()
        with open(file_peaks_promoter, 'r') as w_pro:
            for line in w_pro:
                list_line = line.strip().split('\t')
                if list_line[4] == '.':
                    continue
                gene = list_line[7].strip().split('<-')[0]
                peak = list_line[3]
                dict_promoter[gene].append(peak)
                all_peaks.add(peak)

        all_genes = dict_promoter.keys()
        adata_gene = self.adata[:, [
            one_peak for one_peak in self.adata.var.index
            if one_peak in all_peaks
        ]]
        df_gene_peak = pd.DataFrame(adata_gene.X,
                                    index=adata_gene.obs.index,
                                    columns=adata_gene.var.index)
        all_cols = df_gene_peak.columns
        other_cols = set(self.adata.var.index).difference(all_cols)
        self.other_peaks = other_cols
        adata_other = self.adata[:, [
            one_peak for one_peak in self.adata.var.index
            if one_peak in other_cols
        ]]
        adata_other.var['cRE_type'] = np.full(adata_other.n_vars, 'Other')

        pool = Pool(num_threads)
        func_sum = partial(self.sum_peaks, df_gene_peak, dict_promoter)
        result = pool.map(func_sum, all_genes)
        pool.close()
        # result = [one_df for one_df in result if one_df is not None]
        df_gene = pd.concat(result, axis=1)
        adata_promoter = \
            ad.AnnData(X=df_gene,
                       var=pd.DataFrame(data={'cRE_type': np.full(df_gene.shape[1], 'Promoter')},
                                        index=df_gene.columns),
                       obs=pd.DataFrame(index=df_gene.index))
        self.all_genes = set(df_gene.columns)
        adata_merge = ad.concat([adata_promoter, adata_other], axis=1)
        self.adata_merge = adata_merge

        return
Example #20
def test_concat_X_dtype():
    adatas_orig = {
        k: AnnData(np.ones((20, 10), dtype=np.int8), dtype=np.int8)
        for k in list("abc")
    }
    for adata in adatas_orig.values():
        adata.raw = AnnData(np.ones((20, 30), dtype=np.float64),
                            dtype=np.float64)

    result = concat(adatas_orig, index_unique="-")

    assert result.X.dtype == np.int8
    assert result.raw.X.dtype == np.float64
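
A sketch of the dtype guarantee under test; on current anndata the constructor keeps the array's dtype, while older versions needed the explicit `dtype=` argument used above:

import numpy as np
import anndata as ad

a = ad.AnnData(np.ones((2, 2), dtype=np.int8))
b = ad.AnnData(np.ones((2, 2), dtype=np.int8))
print(ad.concat([a, b], index_unique="-").X.dtype)  # int8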
Example #21
def normalize(adata,
              filter_min_counts=True,
              size_factors=True,
              normalize_input=True,
              logtrans_input=True,
              var_order=None):

    if filter_min_counts:
        sc.pp.filter_genes(adata, min_counts=1)
        sc.pp.filter_cells(adata, min_counts=1)

    # add/reorder vars if needed
    if var_order is not None:
        obs = adata.obs
        a, b = set(var_order), set(adata.var.index.to_list())
        overlap = list(a.intersection(b))
        missing = list(a - set(overlap))
        logging.info(
            f'{len(overlap)} genes overlap with model after filtering')
        logging.info(
            f'{len(missing)} genes missing from dataset after filtering')

        new = adata[:, overlap]
        m = anndata.AnnData(X=np.zeros((adata.shape[0], len(missing))),
                            obs=adata.obs)
        m.var.index = missing
        new = anndata.concat((new, m), axis=1)

        adata = new[:, var_order]
        adata.obs = obs

    if size_factors or normalize_input or logtrans_input:
        adata.raw = adata.copy()
    else:
        adata.raw = adata

    if size_factors:
        sc.pp.normalize_per_cell(adata)
        adata.obs['size_factors'] = adata.obs.n_counts / np.median(
            adata.obs.n_counts)
    else:
        adata.obs['size_factors'] = 1.0

    if logtrans_input:
        sc.pp.log1p(adata)

    if normalize_input:
        sc.pp.scale(adata)

    return adata
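
The `var_order` branch above implements a general trick: to align a dataset to a fixed gene order, zero-filled columns are appended for the missing genes and the result is re-indexed. A minimal sketch with hypothetical gene names:

import numpy as np
import pandas as pd
import anndata as ad

adata = ad.AnnData(np.ones((2, 2)), var=pd.DataFrame(index=["g1", "g2"]))
var_order = ["g2", "g3", "g1"]
missing = [g for g in var_order if g not in adata.var_names]
# Pad with zero columns for the missing genes, then reorder.
pad = ad.AnnData(np.zeros((adata.n_obs, len(missing))),
                 obs=adata.obs, var=pd.DataFrame(index=missing))
aligned = ad.concat([adata, pad], axis=1)[:, var_order]
print(aligned.var_names.tolist())  # ['g2', 'g3', 'g1']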
Example #22
def test_concat_interface_errors():
    adatas = [gen_adata((5, 10)), gen_adata((5, 10))]

    with pytest.raises(ValueError):
        concat(adatas, axis=3)
    with pytest.raises(ValueError):
        concat(adatas, join="not implemented")
    with pytest.raises(ValueError):
        concat([])
Example #23
def make_all_raw_datasets(samples, paths, names, meta):
    """
    reads all datasets and performs integration
    :param samples: list of samples
    :param paths: list of paths
    :param names: list of names
    :param meta: list of metadata locations
    :return:
    """
    datasets = []
    for i in range(len(meta)):
        # make raw datasets using helper functions
        if meta[i] is None:
            dataset = make_raw_dataset(samples[i], paths[i], names[i])
            sc.pp.filter_genes(dataset, min_cells=10)
            run_normalization(dataset, n_top_genes=10000)
            datasets.append(dataset)
        else:
            dataset = make_raw_dataset_tsv(samples[i], meta[i], paths[i],
                                           names[i])
            sc.pp.filter_genes(dataset, min_cells=10)
            run_normalization(dataset, n_top_genes=10000)
            datasets.append(dataset)
    # concatenate data
    all_data = ad.concat(datasets,
                         join='outer',
                         label='sample id',
                         index_unique='_',
                         fill_value=0.0)

    # run harmony
    run_harmony_integration(all_data, normalize=False)

    # save data to reduce computation time
    with open('integrated/all_integrated', 'wb') as f:
        pickle.dump(all_data, f)

    datasets_integrated = []
    for name in names:
        dataset_int = all_data[np.equal(all_data.obs['dataset'], name), :]
        name_str = 'integrated/' + name + '_integrated'
        with open(name_str, 'wb') as f:
            pickle.dump(dataset_int, f)
        datasets_integrated.append(dataset_int)
    return all_data, datasets_integrated
Example #24
def test_de_4_groups(sparse):
    adata1 = get_example_data(sparse)
    adata2 = get_example_data(sparse)
    adata2.obs['sc_groups'] = adata2.obs['sc_groups'].replace({0: 2, 1: 3})
    adata = anndata.concat((adata1, adata2))
    adata.obs_names_make_unique()
    batch_size = 3
    obs_field = 'sc_groups'
    adata.obs[obs_field] = adata.obs[obs_field].astype('category')
    nfeatures = adata.shape[1]
    get_batch_fn = lambda i: adata[:, i:min(nfeatures, i + batch_size)]
    de = DE(series=adata.obs[obs_field],
            nfeatures=nfeatures,
            batch_size=batch_size,
            get_batch_fn=get_batch_fn,
            base=get_base(adata))
    for i in range(4):
        diff_results(adata, obs_field, de.pair2results[i], str(i))
Example #25
def make_raw_dataset_tsv(samples, meta, path, name):
    """
    Gets anndata object when samples are in tsv format
    :param samples: list of sample prefixes
    :param meta: metadata file path
    :param path: path to data
    :param name: name of dataset
    :return: full anndata object
    """
    anndata_dict = {}
    metadata = get_francesconi_metadata(meta)
    for sm in samples:
        print(sm)
        full_path = path + sm

        # read data from geo file
        data = sc.read(full_path, cache=True)
        data = data.transpose()

        # add metadata info
        data.obs['dataset'] = name
        with open(full_path, 'r') as f:
            line = f.readline().split()
            # get first name
            n = line[0]
            time = metadata.loc[metadata['title'] == n,
                                'time'].to_string(index=False)
            treatment = metadata.loc[metadata['title'] == n,
                                     'treatment'].to_string(index=False)
            time = time.replace('day ', 'D', 1)
        data.obs['timepoint'] = time
        if treatment == 'reprogramming' or time == '0h':
            anndata_dict[sm] = data

    # concatenate samples
    data_full = ad.concat(anndata_dict,
                          join='outer',
                          label='sample id',
                          index_unique='_',
                          fill_value=0.0)
    return rename_genes(data_full)
Example #26
    def load_scdata(self, data_directories, cell_types):
        # Read and merge 10X Genomics scRNA-seq data
        scdata = None
        print('Loading single cell dataset')
        for d, c in zip(tqdm(data_directories), cell_types):
            x = sc.read_10x_mtx(d)
            x.obs['celltype'] = [c] * len(x.obs.index)
            # Change each observation (cell) name to celltype + barcode
            x.obs.set_index(pd.Index([c + '_' + rn[:-2] for rn in x.obs.index]), inplace=True)
            if scdata is not None:
                scdata = ad.concat([scdata, x])
            else:
                scdata = x
        # Filter out cells and genes
        sc.pp.filter_cells(scdata, min_genes=200)
        sc.pp.filter_genes(scdata, min_cells=1)
        # Search for prefix "MT-" (mitochondrial genes) and make new column in variable annotations
        # Search for prefix "RPL/RPS" for ribosomal genes and "MRPL/MRPS" for mitochondrial ribosomal genes
        scdata.var['mito'] = scdata.var.index.str.match('^MT-')
        scdata.var['ribo'] = scdata.var.index.str.startswith(('RPL', 'RPS'))
        scdata.var['mribo'] = scdata.var.index.str.startswith(('MRPL', 'MRPS'))
        # Calculate QC metrics as per McCarthy et al., 2017 (Scater)
        sc.pp.calculate_qc_metrics(scdata, qc_vars=['mito', 'ribo', 'mribo'], inplace=True)
        # Plot QC metrics
        # sns.jointplot(x='total_counts', y='n_genes_by_counts', height=8, data=scdata.obs,
        #     kind='scatter', hue='celltype')
        # sns.jointplot(x='total_counts', y='pct_counts_mito', height=8, data=scdata.obs,
        #     kind='scatter', hue='celltype')
        # sns.jointplot(x='total_counts', y='pct_counts_ribo', height=8, data=scdata.obs,
        #     kind='scatter', hue='celltype')
        # sns.jointplot(x='total_counts', y='pct_counts_mribo', height=8, data=scdata.obs,
        #     kind='scatter', hue='celltype')
        # plt.show()
        # Filter out cells with >5% of counts from mitochondria and mitoribosome
        # scdata = scdata[scdata.obs.pct_counts_ribo > 30, :]
        scdata = scdata[scdata.obs.pct_counts_mito < 5, :]
        scdata = scdata[scdata.obs.pct_counts_mribo < 1, :]
        return scdata
Example #27
def states_across_time():
    """
    Makes tSNE plots across time for Babos and Shie
    :return:
    """
    # Analyze states across time
    states = [
        [({'GSM3964244_MEFs_': 'D0'}, 'Data/Babos/', 'babos'),
         ({'GSM2836267_D0.': 'D0'}, 'Data/Shiebinger/GSE106340_RAW/', 'shiebinger')],
        [({'GSM3964245_6F_P4_': 'D4'}, 'Data/Babos/', 'babos'),
         ({'GSM2836270_D4-1.': 'D4'}, 'Data/Shiebinger/GSE106340_RAW/', 'shiebinger')],
        [({'GSM3964247_6F_P8_': 'D8'}, 'Data/Babos/', 'babos'),
         ({'GSM2836274_D8-1.': 'D8'}, 'Data/Shiebinger/GSE106340_RAW/', 'shiebinger')],
        [({'GSM3964249_6F_iMN1_': 'D14'}, 'Data/Babos/', 'babos'),
         ({'GSM2836288_iPSCs-serum.': 'iPSCs'}, 'Data/Shiebinger/GSE106340_RAW/', 'shiebinger')],
    ]

    for state_data in states:
        raw_datasets = [make_raw_dataset(*sample) for sample in state_data]
        full_data = ad.concat(raw_datasets, join='outer', label='dataset')
        pca_df = run_harmony_integration(full_data)

        sc.tl.tsne(full_data, use_rep='X_pca_harmony')
        sc.pl.tsne(full_data, color='sample id')
Example #28
    def make_guide_count_tables(self):
        all_sgRNA_counts = []
        for lane in self.lanes:
            sgRNA_counts = sc.read_h5ad(lane.GEX_fns['sgRNA_counts_h5ad'])
            lane_num = lane.name[-1]
            sgRNA_counts.obs.index = [
                f'{cell_bc.rsplit("-", 1)[0]}-{lane_num}'
                for cell_bc in sgRNA_counts.obs_names
            ]
            all_sgRNA_counts.append(sgRNA_counts)

        sgRNA_data = ad.concat(all_sgRNA_counts)
        sgRNA_data.write(self.GEX_fns['sgRNA_counts_h5ad'])

        df = sgRNA_data.to_df().astype(int)
        df.index.name = 'cell_barcode'
        df.columns.name = 'guide_identity'
        df.to_csv(self.GEX_fns['sgRNA_counts_csv'])

        stacked = df.stack()
        stacked.name = 'UMI_count'
        stacked.index.names = ('cell_barcode', 'guide_identity')
        stacked.to_csv(self.GEX_fns['sgRNA_counts_list'])
Example #29
    return adata


os.chdir(
    r"C:/Users/USER/Documents/R/RNAseq/scientificProject/data/Scadden/VeloData"
)
csv_loc = r"C:/Users/USER/Documents/R/RNAseq/scientificProject/data/Scadden/R_references/subsetting_EC/"
scv.settings.set_figure_params('scvelo')
file_list = os.listdir()

con_dir = {}
for file in file_list:
    name = re.sub("_.+", "", file)
    con_dir[name] = subset_anndata(file, csv_loc + name)

concat = anndata.concat(con_dir, axis=0, label="dataset")
path = Path(
    r"C:/Users/USER/Documents/R/RNAseq/scientificProject/data/Scadden/" +
    r"Concat_raw.h5ad")
concat.write_h5ad(filename=path)
del concat
del con_dir
del file_list

loc = r"C:/Users/USER/Documents/R/RNAseq/scientificProject/data/Scadden/Raw_based/"
os.chdir(loc)
adata = scv.read(path)
adata.obs.dataset = [x for x in adata.obs.dataset]
new_index = []
for ob in range(len(adata.obs.index)):
    cell = adata.obs.index[ob]
Example #30
def merge_samples(adatalist):
    adata = ad.concat(adatalist, axis=0)
    adata.var = adatalist[0].var
    return adata
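
A sketch of why `.var` is restored from the first input: with the default `merge=None`, concatenation along `axis=0` drops all var annotations, even when they agree across inputs; `merge="same"` is the built-in alternative to the manual reassignment:

import numpy as np
import pandas as pd
import anndata as ad

a = ad.AnnData(np.ones((2, 2)), var=pd.DataFrame({"gene": ["g1", "g2"]}))
print(ad.concat([a, a.copy()], index_unique="-").var.columns.tolist())
# []
print(ad.concat([a, a.copy()], index_unique="-", merge="same").var.columns.tolist())
# ['gene']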