def test_concat_size_0_dim():
    # https://github.com/theislab/anndata/issues/526
    a = gen_adata((5, 10))
    b = gen_adata((5, 0))

    assert concat([a, b], axis=0).shape == (10, 0)
    assert concat([a, b], axis=1).shape == (5, 10)
def test_batch_key(axis):
    """Test that concat only adds a label if the key is provided"""

    def get_annot(adata):
        return getattr(adata, ("obs", "var")[axis])

    lhs = gen_adata((10, 10))
    rhs = gen_adata((10, 12))

    # There is probably a prettier way to do this
    annot = get_annot(concat([lhs, rhs], axis=axis))
    assert (
        list(
            annot.columns.difference(
                get_annot(lhs).columns.union(get_annot(rhs).columns)
            )
        )
        == []
    )

    batch_annot = get_annot(concat([lhs, rhs], axis=axis, label="batch"))
    assert list(
        batch_annot.columns.difference(
            get_annot(lhs).columns.union(get_annot(rhs).columns)
        )
    ) == ["batch"]
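# Minimal hedged sketch (not part of the test suite above) of the `label=`
# behaviour that test_batch_key exercises, using only anndata's public API.
# The tiny AnnData objects below are illustrative placeholders.
import anndata as ad
import numpy as np

a = ad.AnnData(np.ones((2, 3)))
b = ad.AnnData(np.zeros((2, 3)))

plain = ad.concat({"a": a, "b": b})                     # no extra obs column
labeled = ad.concat({"a": a, "b": b}, label="batch")    # adds obs["batch"] from the dict keys
assert "batch" not in plain.obs.columns
assert list(labeled.obs["batch"]) == ["a", "a", "b", "b"]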
def test_nan_merge(axis, join_type, array_type):
    # concat_dim = ("obs", "var")[axis]
    alt_dim = ("var", "obs")[axis]
    mapping_attr = f"{alt_dim}m"
    adata_shape = (20, 10)

    arr = array_type(
        sparse.random(adata_shape[1 - axis], 10, density=0.1, format="csr")
    )
    arr_nan = arr.copy()

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=sparse.SparseEfficiencyWarning)
        for _ in range(10):
            arr_nan[
                np.random.choice(arr.shape[0]), np.random.choice(arr.shape[1])
            ] = np.nan

    _data = {"X": sparse.csr_matrix(adata_shape), mapping_attr: {"arr": arr_nan}}
    orig1 = AnnData(**_data)
    orig2 = AnnData(**_data)
    result = concat([orig1, orig2], axis=axis, merge="same")

    assert_equal(getattr(orig1, mapping_attr), getattr(result, mapping_attr))

    orig_nonan = AnnData(
        **{"X": sparse.csr_matrix(adata_shape), mapping_attr: {"arr": arr}}
    )
    result_nonan = concat([orig1, orig_nonan], axis=axis, merge="same")
    assert len(getattr(result_nonan, mapping_attr)) == 0
def test_concat_names(axis):
    def get_annot(adata):
        return getattr(adata, ("obs", "var")[axis])

    lhs = gen_adata((10, 10))
    rhs = gen_adata((10, 10))

    assert not get_annot(concat([lhs, rhs], axis=axis)).index.is_unique
    assert get_annot(concat([lhs, rhs], axis=axis, index_unique="-")).index.is_unique
def test_pairwise_concat(axis, array_type):
    dim_sizes = [[100, 200, 50], [50, 50, 50]]
    if axis:
        dim_sizes.reverse()
    Ms, Ns = dim_sizes
    dim = ("obs", "var")[axis]
    alt = ("var", "obs")[axis]
    dim_attr = f"{dim}p"
    alt_attr = f"{alt}p"

    def gen_dim_array(m):
        return array_type(sparse.random(m, m, format="csr", density=0.1))

    adatas = {
        k: AnnData(
            **{
                "X": sparse.csr_matrix((m, n)),
                "obsp": {"arr": gen_dim_array(m)},
                "varp": {"arr": gen_dim_array(n)},
            }
        )
        for k, m, n in zip("abc", Ms, Ns)
    }

    w_pairwise = concat(adatas, axis=axis, label="orig", pairwise=True)
    wo_pairwise = concat(adatas, axis=axis, label="orig", pairwise=False)

    # Check that argument controls whether elements are included
    assert getattr(wo_pairwise, dim_attr) == {}
    assert getattr(w_pairwise, dim_attr) != {}

    # Check values of included elements
    full_inds = np.arange(w_pairwise.shape[axis])
    groups = getattr(w_pairwise, dim).groupby("orig").indices
    for k, inds in groups.items():
        orig_arr = getattr(adatas[k], dim_attr)["arr"]
        full_arr = getattr(w_pairwise, dim_attr)["arr"]

        # Check original values are intact
        assert_equal(orig_arr, _subset(full_arr, (inds, inds)))
        # Check that entries are filled with zeroes
        assert_equal(
            sparse.csr_matrix((len(inds), len(full_inds) - len(inds))),
            _subset(full_arr, (inds, np.setdiff1d(full_inds, inds))),
        )
        assert_equal(
            sparse.csr_matrix((len(full_inds) - len(inds), len(inds))),
            _subset(full_arr, (np.setdiff1d(full_inds, inds), inds)),
        )

    # Check that argument does not affect alternative axis
    assert "arr" in getattr(
        concat(adatas, axis=axis, pairwise=False, merge="first"), alt_attr
    )
def test_transposed_concat(array_type, axis, join_type, merge_strategy, fill_val):
    lhs = gen_adata((10, 10), X_type=array_type)
    rhs = gen_adata((10, 12), X_type=array_type)

    a = concat([lhs, rhs], axis=axis, join=join_type, merge=merge_strategy)
    b = concat(
        [lhs.T, rhs.T], axis=abs(axis - 1), join=join_type, merge=merge_strategy
    ).T

    assert_equal(a, b)
def test_concat_null_X():
    adatas_orig = {k: gen_adata((20, 10)) for k in list("abc")}
    adatas_no_X = {}
    for k, v in adatas_orig.items():
        v = v.copy()
        del v.X
        adatas_no_X[k] = v

    orig = concat(adatas_orig, index_unique="-")
    no_X = concat(adatas_no_X, index_unique="-")
    del orig.X

    assert_equal(no_X, orig)
def make_raw_dataset(samples, path, name):
    """
    Load, preprocess and concatenate a dataset from multiple RNA-seq samples.

    Inputs:
        samples: dictionary with sample file prefixes as keys and timepoint
            metadata as values
        path: path to directory containing sample files
        name: dataset name for labeling AnnData object metadata

    Output:
        AnnData object of concatenated samples, annotated with dataset,
        timepoint, and sample id labels
    """
    anndata_dict = {}
    for sm in samples.keys():
        print(sm)
        # read in data from GEO file
        data = sc.read_10x_mtx(path, prefix=sm, cache=True)
        # add metadata information
        data.obs['dataset'] = name
        data.obs['timepoint'] = samples[sm]
        # add to dict for concatenation
        anndata_dict[sm] = data

    # concatenate samples
    data_full = ad.concat(anndata_dict, join='outer', label='sample id',
                          index_unique='_', fill_value=0.0)
    return data_full
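# Hedged usage sketch for make_raw_dataset above. The GEO prefixes, directory
# and dataset name are hypothetical placeholders; a real call needs matching
# 10x mtx files on disk for sc.read_10x_mtx to load.
example_samples = {'GSM0000001_sampleA_': 'D0', 'GSM0000002_sampleB_': 'D4'}
example_adata = make_raw_dataset(example_samples, path='Data/ExampleDataset/', name='example')
# The result carries obs columns 'dataset', 'timepoint' and 'sample id'.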
def get_experiments_in_one_anndata(
        experiments_data_dir: Path, meta_data_path: Path,
        batch_filter_functions: List[Callable[[pd.DataFrame], pd.DataFrame]]
) -> ad.AnnData:
    # Read annotation file
    metadata = pd.read_table(meta_data_path)
    if config.DEBUG_MODE:
        batch_filter_functions.append(lambda df: df.head(config.DEBUG_N_BATCHES))
    for filter_func in batch_filter_functions:
        metadata = filter_func(metadata)

    # Read all plates into anndata and merge them
    col_names = metadata.columns
    adatas = process_map(
        partial(get_single_batch, col_names=col_names,
                experiments_data_dir=experiments_data_dir),
        list(metadata.iterrows()),
        max_workers=config.IO_N_WORKERS,
        desc="loading relevant batches",
        unit="batch",
    )
    print("merging to single adata")
    adata = ad.concat(adatas, merge="same")
    print("converting adata to sparse matrix")
    adata.X = csr_matrix(adata.X)
    print("dropping Mouse column, some bug with that column")
    adata.obs.drop(['Mouse'], axis='columns', inplace=True)
    return adata
def test_full_selection(adatas):
    dat = AnnCollection(adatas, index_unique="_")
    adt_concat = ad.concat(adatas, index_unique="_")

    # sorted selection from one adata
    dat_view = dat[:2, :2]
    for adata in (adatas[0], adt_concat):
        adt_view = adata[:2, :2]
        np.testing.assert_allclose(_dense(dat_view.X), _dense(adt_view.X))
        np.testing.assert_allclose(dat_view.obsm["o_test"], adt_view.obsm["o_test"])
        np.testing.assert_array_equal(dat_view.obs["a_test"], adt_view.obs["a_test"])

    # sorted and unsorted selection from 2 adatas
    rand_idxs = np.random.choice(dat.shape[0], 4, replace=False)
    for select in (slice(2, 5), [4, 2, 3], rand_idxs):
        dat_view = dat[select, :2]
        adt_view = adt_concat[select, :2]
        np.testing.assert_allclose(_dense(dat_view.X), _dense(adt_view.X))
        np.testing.assert_allclose(dat_view.obsm["o_test"], adt_view.obsm["o_test"])
        np.testing.assert_array_equal(dat_view.obs["a_test"], adt_view.obs["a_test"])

    # test duplicate selection
    idxs = [1, 2, 4, 4]
    dat_view = dat[idxs, :2]
    np.testing.assert_allclose(
        _dense(dat_view.X), np.array([[4, 5], [7, 8], [9, 8], [9, 8]])
    )
def merge(cls, cms, samplenames=None):
    """
    Merge several count matrices.

    Matrices must have the same row dimensionality.

    Parameters
    ----------
    cms : list(CountMatrix objects)
        List of count matrices.
    samplenames : list(str) or None
        Associated sample labels. If None, a default sample name
        'sample_<i>' is used.

    Returns
    -------
    CountMatrix object
    """
    # Resolve sample labels up front so they can also be used for the obsm keys
    # below (indexing samplenames directly would fail when it is None).
    names = samplenames if samplenames is not None \
        else [f'sample_{i}' for i in range(len(cms))]

    for i, cm in enumerate(cms):
        if samplenames is not None:
            cm.adata.var.loc[:, 'sample'] = samplenames[i]
        elif 'sample' not in cm.adata.var:
            cm.adata.var.loc[:, 'sample'] = names[i]

    adata = ad.concat([cm.adata for cm in cms], axis=1)
    adata.obs = cms[0].adata.obs

    for i, cm in enumerate(cms):
        for k in dict(cm.adata.obsm):
            adata.obsm[f'{k}_{names[i]}'] = cm.adata.obsm[k]

    return cls(adata.X, adata.obs, adata.var, adata.uns, adata.obsm, adata.varm)
def _load_spleen_lymph_cite_seq(
    save_path: str = "data/",
    protein_join: str = "inner",
    remove_outliers: bool = True,
):
    """
    Immune cells from the murine spleen and lymph nodes [GayosoSteier21]_.

    This dataset was used throughout the totalVI manuscript, and named SLN-all.

    Parameters
    ----------
    save_path
        Location to use when saving/loading the data.
    protein_join
        Whether to take an inner join or outer join of proteins
    remove_outliers
        Whether to remove clusters annotated as doublet or low quality

    Returns
    -------
    `AnnData` with `.obsm["protein_expression"]`

    Missing protein values are zero, and are identified during `AnnData` setup.
    """
    url = "https://github.com/YosefLab/scVI-data/raw/master/sln_111.h5ad?raw=true"
    save_fn = "sln_111.h5ad"
    _download(url, save_path, save_fn)
    dataset1 = anndata.read_h5ad(os.path.join(save_path, save_fn))
    dataset1.obsm["isotypes_htos"] = dataset1.obsm["htos"].copy()
    del dataset1.obsm["htos"]

    url = "https://github.com/YosefLab/scVI-data/raw/master/sln_208.h5ad?raw=true"
    save_fn = "sln_208.h5ad"
    _download(url, save_path, save_fn)
    dataset2 = anndata.read_h5ad(os.path.join(save_path, save_fn))

    common_genes = dataset1.var_names.intersection(dataset2.var_names)
    dataset1 = dataset1[:, common_genes]
    dataset2 = dataset2[:, common_genes]

    del dataset1.uns["protein_names"]
    del dataset2.uns["protein_names"]

    dataset = anndata.concat(
        [dataset1, dataset2],
        join=protein_join,
    )
    dataset.obsm["protein_expression"] = dataset.obsm["protein_expression"].fillna(0)

    if remove_outliers:
        include_cells = [
            c not in ["16,0", "17", "19", "21", "23", "24,0", "24,2", "25", "29"]
            for c in dataset.obs["leiden_subclusters"]
        ]
        dataset = dataset[include_cells].copy()

    return dataset
def add_promoter(self, file_promoter):
    if os.path.exists(self.file_peaks_sort):
        os.remove(self.file_peaks_sort)
    self.generate_peaks_file()

    file_peaks_promoter = os.path.join(self.path_process, 'peaks_promoter.txt')
    os.system(f"bedtools intersect -a {self.file_peaks_sort} -b {file_promoter} -wao "
              f"> {file_peaks_promoter}")

    dict_promoter = defaultdict(list)
    with open(file_peaks_promoter, 'r') as w_pro:
        for line in w_pro:
            list_line = line.strip().split('\t')
            if list_line[4] == '.':
                continue
            gene = list_line[7].strip().split('<-')[0]
            peak = list_line[3]
            dict_promoter[gene].append(peak)

    all_genes = dict_promoter.keys()
    list_peaks_1 = []
    list_genes_1 = []
    list_peaks_2 = []
    list_genes_2 = []
    for gene in all_genes:
        sub_peaks = dict_promoter[gene]
        if len(sub_peaks) == 1:
            list_peaks_1.extend(sub_peaks)
            list_genes_1.append(gene)
        else:
            list_genes_2.extend([gene for _ in range(len(sub_peaks))])
            list_peaks_2.extend(sub_peaks)

    # genes covered by a single promoter peak
    adata_gene_1 = self.adata[:, list_peaks_1]
    df_gene_peak_1 = pd.DataFrame(adata_gene_1.X,
                                  index=adata_gene_1.obs.index,
                                  columns=list_genes_1)

    # genes covered by several peaks: sum the peak counts per gene
    adata_gene_2 = self.adata[:, list_peaks_2]
    df_gene_peak_2 = pd.DataFrame(
        adata_gene_2.X,
        index=adata_gene_2.obs.index,
        columns=pd.MultiIndex.from_arrays([list_genes_2, list_peaks_2],
                                          names=['gene', 'peak']))
    df_gene_peak_2_t = df_gene_peak_2.T
    df_gene_peak_2_t_gene = df_gene_peak_2_t.groupby('gene').apply(lambda x: x.sum())
    df_gene_peak_2 = df_gene_peak_2_t_gene.T

    all_cols = set(list_peaks_1 + list_peaks_2)
    other_cols = set(self.adata.var.index).difference(all_cols)
    self.other_peaks = other_cols
    adata_other = self.adata[:, [one_peak for one_peak in self.adata.var.index
                                 if one_peak in other_cols]]
    adata_other.var['cRE_type'] = np.full(adata_other.n_vars, 'Other')

    df_gene = pd.concat([df_gene_peak_1, df_gene_peak_2], axis=1)
    adata_promoter = \
        ad.AnnData(X=df_gene,
                   var=pd.DataFrame(
                       data={'cRE_type': np.full(df_gene.shape[1], 'Promoter')},
                       index=df_gene.columns),
                   obs=pd.DataFrame(index=df_gene.index))
    self.all_genes = set(df_gene.columns)

    adata_merge = ad.concat([adata_promoter, adata_other], axis=1)
    self.adata_merge = adata_merge

    return
def _load_pbmcs_10x_cite_seq(
    save_path: str = "data/",
    protein_join: str = "inner",
):
    """
    Filtered PBMCs from 10x Genomics profiled with RNA and protein.

    Datasets were filtered for doublets and other outliers as in
    https://github.com/YosefLab/totalVI_reproducibility/blob/master/data/data_filtering_scripts/pbmc_10k/pbmc_10k.py

    Parameters
    ----------
    save_path
        Location to use when saving/loading the data.
    protein_join
        Whether to take an inner join or outer join of proteins

    Returns
    -------
    `AnnData` with `.obsm["protein_expression"]`

    Missing protein values are zero, and are identified during `AnnData` setup.
    """
    url = "https://github.com/YosefLab/scVI-data/raw/master/pbmc_10k_protein_v3.h5ad?raw=true"
    save_fn = "pbmc_10k_protein_v3.h5ad"
    _download(url, save_path, save_fn)
    dataset1 = anndata.read_h5ad(os.path.join(save_path, save_fn))
    dataset1.obs["batch"] = "PBMC10k"

    url = "https://github.com/YosefLab/scVI-data/raw/master/pbmc_5k_protein_v3.h5ad?raw=true"
    save_fn = "pbmc_5k_protein_v3.h5ad"
    _download(url, save_path, save_fn)
    dataset2 = anndata.read_h5ad(os.path.join(save_path, save_fn))
    dataset2.obs["batch"] = "PBMC5k"

    common_genes = dataset1.var_names.intersection(dataset2.var_names)
    dataset1 = dataset1[:, common_genes]
    dataset2 = dataset2[:, common_genes]

    dataset1.obsm["protein_expression"] = pd.DataFrame(
        dataset1.obsm["protein_expression"],
        columns=dataset1.uns["protein_names"],
        index=dataset1.obs_names,
    )
    dataset2.obsm["protein_expression"] = pd.DataFrame(
        dataset2.obsm["protein_expression"],
        columns=dataset2.uns["protein_names"],
        index=dataset2.obs_names,
    )
    del dataset1.uns["protein_names"]
    del dataset2.uns["protein_names"]

    dataset = anndata.concat([dataset1, dataset2], join=protein_join)
    dataset.obsm["protein_expression"] = dataset.obsm["protein_expression"].fillna(0)

    return dataset
def test_concat_outer_aligned_mapping(elem):
    a = gen_adata((5, 5))
    b = gen_adata((3, 5))
    del b.obsm[elem]

    concated = concat({"a": a, "b": b}, join="outer", label="group")
    result = concated.obsm[elem][concated.obs["group"] == "b"]

    check_filled_like(result, elem_name=f"obsm/{elem}")
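# Hedged illustration (assumed anndata public API, separate from the test above)
# of the outer-join fill behaviour that test_concat_outer_aligned_mapping checks:
# an obsm entry missing from one object is padded for that object's rows.
import anndata as ad
import numpy as np

a = ad.AnnData(np.ones((2, 3)), obsm={"emb": np.ones((2, 2))})
b = ad.AnnData(np.zeros((2, 3)))
out = ad.concat({"a": a, "b": b}, join="outer", label="group")
# rows that came from "b" hold fill values (NaN for dense float arrays)
assert np.isnan(out.obsm["emb"][(out.obs["group"] == "b").values]).all()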
def test_concat_annot_join(obsm_adatas, join_type):
    adatas = [
        AnnData(sparse.csr_matrix(a.shape), obs=a.obsm["df"], var=a.var)
        for a in obsm_adatas
    ]
    pd.testing.assert_frame_equal(
        concat(adatas, join=join_type).obs,
        pd.concat([a.obs for a in adatas], join=join_type),
    )
def merge_datasets(
    datasets: Sequence[sc.AnnData],
    symbol_in_n_datasets: Union[int, None] = None,
    min_batch_size=25,
) -> sc.AnnData:
    """
    Concatenate the anndata objects in `datasets`.

    Keeps symbols that are at least in `symbol_in_n_datasets` datasets. If
    `symbol_in_n_datasets` is None, it only keeps symbols that are in all
    datasets. Only keeps X, obs, var of all datasets. Adds log-norm transformed
    values to adata.raw.
    """
    if symbol_in_n_datasets is None:
        symbol_in_n_datasets = len(datasets)
    gene_ids = [set(adata.var_names.values) for adata in datasets]
    symbol_count = Counter(itertools.chain.from_iterable(gene_ids))
    keep_symbols = set(
        sym for sym, c in symbol_count.items() if c >= symbol_in_n_datasets
    )
    datasets_subset = list()
    for dataset in datasets:
        tmp_sym = sorted(list(set(dataset.var_names.values) & keep_symbols))
        tmp_adata = dataset[:, tmp_sym]
        tmp_obs = tmp_adata.obs.loc[:, MANDATORY_COLS + ["cell_type"]]
        # get rid of everything except X, obs, var
        datasets_subset.append(
            sc.AnnData(X=tmp_adata.X, obs=tmp_obs, var=tmp_adata.var)
        )

    for dataset in datasets:
        validate_adata(dataset)

    adata_merged = anndata.concat(datasets_subset, index_unique="-", join="outer")

    # add log-norm values to `.raw`
    adata_merged_raw = adata_merged.copy()
    sc.pp.normalize_total(adata_merged_raw)
    sc.pp.log1p(adata_merged_raw)
    adata_merged.raw = adata_merged_raw

    # Exclude too small batches.
    adata_merged.obs["batch"] = [
        f"{dataset}_{sample}"
        for dataset, sample in zip(
            adata_merged.obs["dataset"], adata_merged.obs["sample"]
        )
    ]
    batch_size = adata_merged.obs.groupby("batch").size()
    # use the min_batch_size parameter rather than a hard-coded threshold
    keep_batchs = batch_size[batch_size > min_batch_size].keys().values
    adata_merged = adata_merged[adata_merged.obs["batch"].isin(keep_batchs), :].copy()

    return adata_merged
def test_concat_size_0_dim(axis, join_type, merge_strategy, shape):
    # https://github.com/theislab/anndata/issues/526
    a = gen_adata((5, 7))
    b = gen_adata(shape)

    alt_axis = 1 - axis
    dim = ("obs", "var")[axis]

    expected_size = expected_shape(a, b, axis=axis, join=join_type)
    result = concat(
        {"a": a, "b": b},
        axis=axis,
        join=join_type,
        merge=merge_strategy,
        pairwise=True,
        index_unique="-",
    )
    assert result.shape == expected_size

    if join_type == "outer":
        # Check new entries along axis of concatenation
        axis_new_inds = axis_labels(result, axis).str.endswith("-b")
        altaxis_new_inds = ~axis_labels(result, alt_axis).isin(
            axis_labels(a, alt_axis))
        axis_idx = make_idx_tuple(axis_new_inds, axis)
        altaxis_idx = make_idx_tuple(altaxis_new_inds, 1 - axis)

        check_filled_like(result.X[axis_idx], elem_name="X")
        check_filled_like(result.X[altaxis_idx], elem_name="X")
        for k, elem in getattr(result, "layers").items():
            check_filled_like(elem[axis_idx], elem_name=f"layers/{k}")
            check_filled_like(elem[altaxis_idx], elem_name=f"layers/{k}")

        if shape[axis] > 0:
            b_result = result[axis_idx].copy()
            mapping_elem = f"{dim}m"
            setattr(b_result, f"{dim}_names", getattr(b, f"{dim}_names"))
            for k, result_elem in getattr(b_result, mapping_elem).items():
                elem_name = f"{mapping_elem}/{k}"
                # pd.concat can have unintuitive return types, similar to numpy promotion
                if isinstance(result_elem, pd.DataFrame):
                    assert_equal(
                        getattr(b, mapping_elem)[k].astype(object),
                        result_elem.astype(object),
                        elem_name=elem_name,
                    )
                else:
                    assert_equal(
                        getattr(b, mapping_elem)[k],
                        result_elem,
                        elem_name=elem_name,
                    )
def add_promoter(self, file_promoter, num_threads=20):
    if not os.path.exists(self.file_peaks_sort):
        self.generate_peaks_file()

    file_peaks_promoter = os.path.join(self.path_process, 'peaks_promoter.txt')
    os.system(
        f"bedtools intersect -a {self.file_peaks_sort} -b {file_promoter} -wao "
        f"> {file_peaks_promoter}")

    dict_promoter = defaultdict(list)
    all_peaks = set()
    with open(file_peaks_promoter, 'r') as w_pro:
        for line in w_pro:
            list_line = line.strip().split('\t')
            if list_line[4] == '.':
                continue
            gene = list_line[7].strip().split('<-')[0]
            peak = list_line[3]
            dict_promoter[gene].append(peak)
            all_peaks.add(peak)

    all_genes = dict_promoter.keys()
    adata_gene = self.adata[:, [
        one_peak for one_peak in self.adata.var.index if one_peak in all_peaks
    ]]
    df_gene_peak = pd.DataFrame(adata_gene.X,
                                index=adata_gene.obs.index,
                                columns=adata_gene.var.index)

    all_cols = df_gene_peak.columns
    other_cols = set(self.adata.var.index).difference(all_cols)
    self.other_peaks = other_cols
    adata_other = self.adata[:, [
        one_peak for one_peak in self.adata.var.index if one_peak in other_cols
    ]]
    adata_other.var['cRE_type'] = np.full(adata_other.n_vars, 'Other')

    pool = Pool(num_threads)
    func_sum = partial(self.sum_peaks, df_gene_peak, dict_promoter)
    result = pool.map(func_sum, all_genes)
    pool.close()
    # result = [one_df for one_df in result if one_df is not None]
    df_gene = pd.concat(result, axis=1)

    adata_promoter = \
        ad.AnnData(X=df_gene,
                   var=pd.DataFrame(
                       data={'cRE_type': np.full(df_gene.shape[1], 'Promoter')},
                       index=df_gene.columns),
                   obs=pd.DataFrame(index=df_gene.index))
    self.all_genes = set(df_gene.columns)

    adata_merge = ad.concat([adata_promoter, adata_other], axis=1)
    self.adata_merge = adata_merge

    return
def test_concat_X_dtype():
    adatas_orig = {
        k: AnnData(np.ones((20, 10), dtype=np.int8), dtype=np.int8) for k in list("abc")
    }
    for adata in adatas_orig.values():
        adata.raw = AnnData(np.ones((20, 30), dtype=np.float64), dtype=np.float64)

    result = concat(adatas_orig, index_unique="-")

    assert result.X.dtype == np.int8
    assert result.raw.X.dtype == np.float64
def normalize(adata, filter_min_counts=True, size_factors=True,
              normalize_input=True, logtrans_input=True, var_order=None):
    if filter_min_counts:
        sc.pp.filter_genes(adata, min_counts=1)
        sc.pp.filter_cells(adata, min_counts=1)

    # add/reorder vars if needed
    if var_order is not None:
        obs = adata.obs
        a, b = set(var_order), set(adata.var.index.to_list())
        overlap = list(a.intersection(b))
        missing = list(a - set(overlap))
        logging.info(f'{len(overlap)} genes overlap with model after filtering')
        logging.info(f'{len(missing)} genes missing from dataset after filtering')
        new = adata[:, overlap]
        m = anndata.AnnData(X=np.zeros((adata.shape[0], len(missing))), obs=adata.obs)
        m.var.index = missing
        new = anndata.concat((new, m), axis=1)
        adata = new[:, var_order]
        adata.obs = obs

    if size_factors or normalize_input or logtrans_input:
        adata.raw = adata.copy()
    else:
        adata.raw = adata

    if size_factors:
        sc.pp.normalize_per_cell(adata)
        adata.obs['size_factors'] = adata.obs.n_counts / np.median(adata.obs.n_counts)
    else:
        adata.obs['size_factors'] = 1.0

    if logtrans_input:
        sc.pp.log1p(adata)

    if normalize_input:
        sc.pp.scale(adata)

    return adata
def test_concat_interface_errors():
    adatas = [gen_adata((5, 10)), gen_adata((5, 10))]

    with pytest.raises(ValueError):
        concat(adatas, axis=3)
    with pytest.raises(ValueError):
        concat(adatas, join="not implemented")
    with pytest.raises(ValueError):
        concat([])
def make_all_raw_datasets(samples, paths, names, meta):
    """
    Reads all datasets and performs integration.

    :param samples: list of samples
    :param paths: list of paths
    :param names: list of names
    :param meta: list of metadata locations
    :return:
    """
    datasets = []
    for i in range(len(meta)):
        # make raw datasets using helper functions
        if meta[i] is None:
            dataset = make_raw_dataset(samples[i], paths[i], names[i])
        else:
            dataset = make_raw_dataset_tsv(samples[i], meta[i], paths[i], names[i])
        sc.pp.filter_genes(dataset, min_cells=10)
        run_normalization(dataset, n_top_genes=10000)
        datasets.append(dataset)

    # concatenate data
    all_data = ad.concat(datasets, join='outer', label='sample id',
                         index_unique='_', fill_value=0.0)
    # run harmony
    run_harmony_integration(all_data, normalize=False)

    # save data to reduce computation time
    with open('integrated/all_integrated', 'wb') as f:
        pickle.dump(all_data, f)

    datasets_integrated = []
    for name in names:
        dataset_int = all_data[np.equal(all_data.obs['dataset'], name), :]
        name_str = 'integrated/' + name + '_integrated'
        with open(name_str, 'wb') as f:
            pickle.dump(dataset_int, f)
        datasets_integrated.append(dataset_int)

    return all_data, datasets_integrated
def test_de_4_groups(sparse):
    adata1 = get_example_data(sparse)
    adata2 = get_example_data(sparse)
    adata2.obs['sc_groups'] = adata2.obs['sc_groups'].replace({0: 2, 1: 3})
    adata = anndata.concat((adata1, adata2))
    adata.obs_names_make_unique()

    batch_size = 3
    obs_field = 'sc_groups'
    adata.obs[obs_field] = adata.obs[obs_field].astype('category')
    nfeatures = adata.shape[1]
    get_batch_fn = lambda i: adata[:, i:min(nfeatures, i + batch_size)]

    de = DE(series=adata.obs[obs_field], nfeatures=nfeatures, batch_size=batch_size,
            get_batch_fn=get_batch_fn, base=get_base(adata))
    for i in range(4):
        diff_results(adata, obs_field, de.pair2results[i], str(i))
def make_raw_dataset_tsv(samples, meta, path, name):
    """
    Gets anndata object when samples are in tsv format.

    :param samples: list of sample prefixes
    :param meta: metadata file path
    :param path: path to data
    :param name: name of dataset
    :return: full anndata object
    """
    anndata_dict = {}
    metadata = get_francesconi_metadata(meta)
    for sm in samples:
        print(sm)
        full_path = path + sm
        # read data from geo file
        data = sc.read(full_path, cache=True)
        data = data.transpose()
        # add metadata info
        data.obs['dataset'] = name
        with open(full_path, 'r') as f:
            line = f.readline().split()
        # get first name
        n = line[0]
        time = metadata.loc[metadata['title'] == n, 'time'].to_string(index=False)
        treatment = metadata.loc[metadata['title'] == n, 'treatment'].to_string(index=False)
        time = time.replace('day ', 'D', 1)
        data.obs['timepoint'] = time
        if treatment == 'reprogramming' or time == '0h':
            anndata_dict[sm] = data

    # concatenate samples
    data_full = ad.concat(anndata_dict, join='outer', label='sample id',
                          index_unique='_', fill_value=0.0)
    return rename_genes(data_full)
def load_scdata(self, data_directories, cell_types):
    # Read and merge 10X Genomics scRNA-seq data
    scdata = None
    print('Loading single cell dataset')
    for d, c in zip(tqdm(data_directories), cell_types):
        x = sc.read_10x_mtx(d)
        x.obs['celltype'] = [c] * len(x.obs.index)
        # Change each observation (cell) name to celltype + barcode
        x.obs.set_index(pd.Index([c + '_' + rn[:-2] for rn in x.obs.index]), inplace=True)
        if scdata is not None:
            scdata = ad.concat([scdata, x])
        else:
            scdata = x

    # Filter out cells and genes
    sc.pp.filter_cells(scdata, min_genes=200)
    sc.pp.filter_genes(scdata, min_cells=1)

    # Search for prefix "MT-" (mitochondrial genes) and make new column in variable annotations
    # Search for prefix "RPL/RPS" for ribosomal genes and "MRPL/MRPS" for mitochondrial ribosomal genes
    scdata.var['mito'] = scdata.var.index.str.match('^MT-')
    scdata.var['ribo'] = scdata.var.index.str.startswith(('RPL', 'RPS'))
    scdata.var['mribo'] = scdata.var.index.str.startswith(('MRPL', 'MRPS'))

    # Calculate QC metrics as per McCarthy et al., 2017 (Scater)
    sc.pp.calculate_qc_metrics(scdata, qc_vars=['mito', 'ribo', 'mribo'], inplace=True)

    # Plot QC metrics
    # sns.jointplot(x='total_counts', y='n_genes_by_counts', height=8, data=scdata.obs,
    #               kind='scatter', hue='celltype')
    # sns.jointplot(x='total_counts', y='pct_counts_mito', height=8, data=scdata.obs,
    #               kind='scatter', hue='celltype')
    # sns.jointplot(x='total_counts', y='pct_counts_ribo', height=8, data=scdata.obs,
    #               kind='scatter', hue='celltype')
    # sns.jointplot(x='total_counts', y='pct_counts_mribo', height=8, data=scdata.obs,
    #               kind='scatter', hue='celltype')
    # plt.show()

    # Filter out cells with >5% of counts from mitochondria and mitoribosome
    # scdata = scdata[scdata.obs.pct_counts_ribo > 30, :]
    scdata = scdata[scdata.obs.pct_counts_mito < 5, :]
    scdata = scdata[scdata.obs.pct_counts_mribo < 1, :]

    return scdata
def states_across_time():
    """
    Makes tSNE plots across time for the Babos and Shiebinger datasets.

    :return:
    """
    # Analyze states across time
    states = [
        [({'GSM3964244_MEFs_': 'D0'}, 'Data/Babos/', 'babos'),
         ({'GSM2836267_D0.': 'D0'}, 'Data/Shiebinger/GSE106340_RAW/', 'shiebinger')],
        [({'GSM3964245_6F_P4_': 'D4'}, 'Data/Babos/', 'babos'),
         ({'GSM2836270_D4-1.': 'D4'}, 'Data/Shiebinger/GSE106340_RAW/', 'shiebinger')],
        [({'GSM3964247_6F_P8_': 'D8'}, 'Data/Babos/', 'babos'),
         ({'GSM2836274_D8-1.': 'D8'}, 'Data/Shiebinger/GSE106340_RAW/', 'shiebinger')],
        [({'GSM3964249_6F_iMN1_': 'D14'}, 'Data/Babos/', 'babos'),
         ({'GSM2836288_iPSCs-serum.': 'iPSCs'}, 'Data/Shiebinger/GSE106340_RAW/', 'shiebinger')],
    ]

    for state_data in states:
        raw_datasets = [make_raw_dataset(*sample) for sample in state_data]
        full_data = ad.concat(raw_datasets, join='outer', label='dataset')
        pca_df = run_harmony_integration(full_data)
        sc.tl.tsne(full_data, use_rep='X_pca_harmony')
        sc.pl.tsne(full_data, color='sample id')
def make_guide_count_tables(self):
    all_sgRNA_counts = []
    for lane in self.lanes:
        sgRNA_counts = sc.read_h5ad(lane.GEX_fns['sgRNA_counts_h5ad'])

        lane_num = lane.name[-1]
        sgRNA_counts.obs.index = [
            f'{cell_bc.rsplit("-", 1)[0]}-{lane_num}'
            for cell_bc in sgRNA_counts.obs_names
        ]

        all_sgRNA_counts.append(sgRNA_counts)

    sgRNA_data = ad.concat(all_sgRNA_counts)
    sgRNA_data.write(self.GEX_fns['sgRNA_counts_h5ad'])

    df = sgRNA_data.to_df().astype(int)
    df.index.name = 'cell_barcode'
    df.columns.name = 'guide_identity'
    df.to_csv(self.GEX_fns['sgRNA_counts_csv'])

    stacked = df.stack()
    stacked.name = 'UMI_count'
    stacked.index.names = ('cell_barcode', 'guide_identity')
    stacked.to_csv(self.GEX_fns['sgRNA_counts_list'])
    return adata


os.chdir(
    r"C:/Users/USER/Documents/R/RNAseq/scientificProject/data/Scadden/VeloData")
csv_loc = r"C:/Users/USER/Documents/R/RNAseq/scientificProject/data/Scadden/R_references/subsetting_EC/"
scv.settings.set_figure_params('scvelo')

file_list = os.listdir()
con_dir = {}
for file in file_list:
    name = re.sub("_.+", "", file)
    con_dir[name] = subset_anndata(file, csv_loc + name)

concat = anndata.concat(con_dir, axis=0, label="dataset")
path = Path(
    r"C:/Users/USER/Documents/R/RNAseq/scientificProject/data/Scadden/"
    + r"Concat_raw.h5ad")
concat.write_h5ad(filename=path)

del concat
del con_dir
del file_list

loc = r"C:/Users/USER/Documents/R/RNAseq/scientificProject/data/Scadden/Raw_based/"
os.chdir(loc)
adata = scv.read(path)
adata.obs.dataset = [x for x in adata.obs.dataset]

new_index = []
for ob in range(len(adata.obs.index)):
    cell = adata.obs.index[ob]
def merge_samples(adatalist):
    # concat drops per-gene annotations by default, so copy .var back from the first sample
    adata = ad.concat(adatalist, axis=0)
    adata.var = adatalist[0].var
    return adata
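# Hedged usage sketch for merge_samples above, with two tiny synthetic AnnData
# objects; real inputs would share the same var index so that copying
# adatalist[0].var onto the concatenated object is meaningful.
import anndata as ad
import numpy as np
import pandas as pd

var = pd.DataFrame(index=["gene1", "gene2"])
a = ad.AnnData(np.ones((2, 2)), var=var.copy())
b = ad.AnnData(np.zeros((3, 2)), var=var.copy())
merged = merge_samples([a, b])
print(merged.shape)  # (5, 2)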