def setUp(self):
    """Load the Haber count data and build the Salmonella subset used by the tests.

    Reads ./data/haber_counts.csv, keeps the six Salmonella-infection rows,
    converts them via dat.from_pandas with "Mouse" as covariate column, and
    derives a "Condition" covariate by stripping the trailing replicate
    number (e.g. "_1") from the mouse name. Result is stored in self.data.
    """
    # Get Haber count data
    data_raw = pd.read_csv("./data/haber_counts.csv")

    # Rows 0-3 and 8-9 are the Salmonella samples in this file
    salm_indices = [0, 1, 2, 3, 8, 9]
    salm_df = data_raw.iloc[salm_indices, :]

    data_salm = dat.from_pandas(salm_df, covariate_columns=["Mouse"])
    # regex=True is required: since pandas 2.0, str.replace matches the
    # pattern literally by default, which would leave the "_<digit>"
    # replicate suffix in place and break the Condition grouping.
    data_salm.obs["Condition"] = data_salm.obs["Mouse"].str.replace(
        r"_[0-9]", "", regex=True
    )
    self.data = data_salm
def test_from_pandas(self):
    """Check that dat.from_pandas produces the expected X and obs dimensions.

    Loads the Haber Salmonella subset (6 samples) and asserts that the count
    matrix X is 6x8 and the covariate frame obs is 6x2 (Mouse + Condition).
    """
    # Get Haber Salmonella data
    data_raw = pd.read_csv("./data/haber_counts.csv")
    salm_indices = [0, 1, 2, 3, 8, 9]
    salm_df = data_raw.iloc[salm_indices, :]

    data_salm = dat.from_pandas(salm_df, covariate_columns=["Mouse"])
    # regex=True: pandas >= 2.0 treats the pattern literally by default,
    # which would leave the replicate suffix (e.g. "_1") in place.
    data_salm.obs["Condition"] = data_salm.obs["Mouse"].str.replace(
        r"_[0-9]", "", regex=True
    )

    # Only check size of x, obs
    x_shape = (data_salm.X.shape == (6, 8))
    obs_shape = (data_salm.obs.shape == (6, 2))
    self.assertTrue(x_shape & obs_shape)
#%% Haber data on multiple categories

# NOTE(review): machine-specific absolute path — consider a relative path
# (e.g. "./data/haber_counts.csv") or a configurable data directory so the
# script runs on other machines.
cell_counts = pd.read_csv(
    "C:\\Users\\Johannes\\Documents\\Uni\\Master's_Thesis\\compositionalDiff-johannes_tests_2\\data\\haber_counts.csv"
)
print(cell_counts)

# Convert data to anndata object

# Filter out salmonella data
salm_indices = [0, 1, 2, 3, 8, 9]
salm_df = cell_counts.iloc[salm_indices, :]
data_salm = dat.from_pandas(salm_df, covariate_columns=["Mouse"])

# Extract condition from mouse name and add it as an extra column to the
# covariates. regex=True is required: since pandas 2.0 str.replace matches
# literally by default and would leave the "_<digit>" suffix untouched.
data_salm.obs["Condition"] = data_salm.obs["Mouse"].str.replace(
    r"_[0-9]", "", regex=True
)
print(data_salm.X)
print(data_salm.obs)

#%%
# Repeat the conversion with a plain integer index — presumably to check
# that from_pandas does not depend on the original row labels (confirm).
salm_df.index = pd.Series([0, 1, 2, 3, 4, 5])
print(salm_df.index)

data_salm_2 = dat.from_pandas(salm_df, covariate_columns=["Mouse"])
data_salm_2.obs["Condition"] = data_salm_2.obs["Mouse"].str.replace(
    r"_[0-9]", "", regex=True
)

#%%
# read metadata with open(data_path + "/sample-metadata.tsv", "rb") as f: metadata = pd.read_csv(f, sep="\t", index_col="sample-id").iloc[1:, :] metadata_columns = [ "subject", "reported-antibiotic-usage", "days-since-experiment-start", "body-site" ] # add subject to count data biom_data = pd.merge(biom_data, metadata[metadata_columns], left_index=True, right_index=True) data = dat.from_pandas(biom_data, metadata_columns) data.obs = data.obs.rename( columns={ "reported-antibiotic-usage": "antibiotic", "body-site": "site", "days-since-experiment-start": "days_since_start" }) print(data.obs) #%% importlib.reload(viz) sns.set(style="ticks", font_scale=2) args_swarmplot = {"hue": "subject", "size": 10, "palette": "Reds"} viz.boxplot_facets(data, feature="site")
# read metadata with open(data_path + "/sample-metadata.tsv", "rb") as f: metadata = pd.read_csv(f, sep="\t", index_col="sample-id").iloc[1:, :] metadata_columns = [ "subject", "reported-antibiotic-usage", "days-since-experiment-start", "body-site" ] # add subject to count data biom_data = pd.merge(biom_data, metadata[metadata_columns], left_index=True, right_index=True) data = dat.from_pandas(biom_data, metadata_columns) data.obs = data.obs.rename( columns={ "reported-antibiotic-usage": "antibiotic", "body-site": "site", "days-since-experiment-start": "days_since_start" }) print(data.obs) #%% def plot_one_stackbar(y, type_names, title, level_names): plt.figure(figsize=(20, 10))
# Pivot the long-format (sample_id, cell_type, counts) table into a wide
# sample x cell-type count matrix; missing combinations become 0.
# After unstack + reset_index the columns are a 2-level MultiIndex:
# ("sample_id", "") plus ("counts", <cell type>) for each cell type.
cell_counts = celltypes.loc[:, ["sample_id", "cell_type", "counts"]].\
    set_index(["sample_id", "cell_type"]).unstack(fill_value=0).fillna(0).reset_index()
print(cell_counts)

#%%
# BUG FIX: use .copy() — the original aliased cell_counts, so assigning
# dropleveled columns below also mutated cell_counts itself. That broke the
# last cell, which still relies on cell_counts having MultiIndex columns
# (cell_counts["sample_id"] raised KeyError, columns.droplevel(0) raised
# ValueError on the already-flattened index).
cell_counts_2 = cell_counts.copy()
cell_counts_2.columns = cell_counts_2.columns.droplevel(0)
cell_counts_2 = cell_counts_2.rename(columns={"": "sample_id"})
# regex=True: pandas >= 2.0 matches the pattern literally by default,
# which would leave the trailing digits in place.
cell_counts_2["Condition"] = cell_counts_2["sample_id"].str.replace(
    r"[0-9]", "", regex=True)
print(cell_counts_2)

#%%
importlib.reload(dat)
data = dat.from_pandas(cell_counts_2, ["sample_id", "Condition"])
print(data.X)
print(data.obs)
print(data.var)

#%%
# Manual construction of the same pieces (count matrix, obs, var) for
# comparison with the from_pandas result above.
cells = cell_counts.iloc[:, 1:].to_numpy().astype("int")
print(cells)

obs = pd.DataFrame(cell_counts["sample_id"])
obs["Condition"] = obs["sample_id"].str.replace(r"[0-9]", "", regex=True)
print(obs)

var = pd.DataFrame(index=cell_counts.iloc[:, 1:].columns.droplevel(0))
print(var)
# Remove otus with low counts (<100 total) --> Leaves 250 OTUs # Leaving in low expression OTUs leads to nonconvergence for shorter chains (1000 samples), # longer chains cant be done on my computer counts_bal = data_bal.iloc[:, :-33] counts_bal = counts_bal.loc[:, np.sum(counts_bal, axis=0) >= 100] data_bal_expr = pd.merge(counts_bal, data_bal.loc[:, col], right_index=True, left_index=True) print(data_bal_expr) data_scdcdm = dat.from_pandas(data_bal_expr, col) print(data_scdcdm.X.shape) # Free up some memory del ([counts_bal, data, data_bal, metadata, meta_rel, file, otus, split]) #%% # Experimental model configuration class NoBaselineModelExperimental(dm.CompositionalModel): """" implements statistical model for compositional differential change analysis without specification of a baseline cell type