Exemple #1
0
    def setUp(self):
        """Build the shared fixture from the Haber count table.

        Reads ``./data/haber_counts.csv``, keeps the rows listed in
        ``salm_indices``, converts them with ``dat.from_pandas`` using
        ``Mouse`` as the covariate column, and derives a ``Condition`` column
        by removing the ``_<digit>`` suffix from each ``Mouse`` value. The
        resulting object is stored on ``self.data``.
        """
        # Get Haber count data
        data_raw = pd.read_csv("./data/haber_counts.csv")

        salm_indices = [0, 1, 2, 3, 8, 9]
        salm_df = data_raw.iloc[salm_indices, :]

        data_salm = dat.from_pandas(salm_df, covariate_columns=["Mouse"])
        # regex=True must be explicit: since pandas 2.0 ``str.replace`` treats
        # the pattern as a literal string by default, which would leave the
        # "_<digit>" suffix untouched.
        data_salm.obs["Condition"] = data_salm.obs["Mouse"].str.replace(
            r"_[0-9]", "", regex=True
        )
        self.data = data_salm
Exemple #2
0
    def test_from_pandas(self):
        """Check the shapes produced by ``dat.from_pandas`` on the Haber subset.

        Loads ``./data/haber_counts.csv``, keeps the six rows listed in
        ``salm_indices``, converts them with ``Mouse`` as covariate and adds a
        ``Condition`` column. Only the shapes of ``X`` and ``obs`` are checked.
        """
        # Get Haber Salmonella data
        data_raw = pd.read_csv("./data/haber_counts.csv")

        salm_indices = [0, 1, 2, 3, 8, 9]
        salm_df = data_raw.iloc[salm_indices, :]

        data_salm = dat.from_pandas(salm_df, covariate_columns=["Mouse"])
        # regex=True must be explicit: since pandas 2.0 ``str.replace`` treats
        # the pattern as a literal string by default.
        data_salm.obs["Condition"] = data_salm.obs["Mouse"].str.replace(
            r"_[0-9]", "", regex=True
        )

        # Only check size of x, obs. Separate assertions report which shape is
        # wrong (and its actual value) instead of a bare "False is not true".
        self.assertEqual(data_salm.X.shape, (6, 8))
        self.assertEqual(data_salm.obs.shape, (6, 2))
Exemple #3
0
#%% Haber data on multiple categories

# NOTE(review): absolute, machine-specific path; this cell only runs where
# this exact file location exists.
cell_counts = pd.read_csv(
    "C:\\Users\\Johannes\\Documents\\Uni\\Master's_Thesis\\compositionalDiff-johannes_tests_2\\data\\haber_counts.csv"
)

print(cell_counts)

# Convert data to anndata object

# Filter out salmonella data
salm_indices = [0, 1, 2, 3, 8, 9]
salm_df = cell_counts.iloc[salm_indices, :]

data_salm = dat.from_pandas(salm_df, covariate_columns=["Mouse"])

# Extract condition from mouse name and add it as an extra column to the
# covariates. regex=True must be explicit: since pandas 2.0 ``str.replace``
# treats the pattern as a literal string by default, which would leave the
# "_<digit>" suffix in place.
data_salm.obs["Condition"] = data_salm.obs["Mouse"].str.replace(
    r"_[0-9]", "", regex=True
)
print(data_salm.X)
print(data_salm.obs)

#%%

# Give the selected rows a consecutive 0..5 index before converting again.
salm_df.index = pd.Series([0, 1, 2, 3, 4, 5])
print(salm_df.index)
data_salm_2 = dat.from_pandas(salm_df, covariate_columns=["Mouse"])
# regex=True must be explicit: since pandas 2.0 ``str.replace`` treats the
# pattern as a literal string by default, so "_<digit>" would not be removed.
data_salm_2.obs["Condition"] = data_salm_2.obs["Mouse"].str.replace(
    r"_[0-9]", "", regex=True
)

#%%
# Load the tab-separated sample metadata, indexed by "sample-id".
with open(data_path + "/sample-metadata.tsv", "rb") as f:
    metadata = pd.read_csv(f, sep="\t", index_col="sample-id")
# Drop the first data row of the metadata table.
metadata = metadata.iloc[1:, :]

metadata_columns = [
    "subject",
    "reported-antibiotic-usage",
    "days-since-experiment-start",
    "body-site",
]

# Attach the selected metadata columns to the count table by matching indices.
biom_data = pd.merge(
    biom_data,
    metadata.loc[:, metadata_columns],
    right_index=True,
    left_index=True,
)

data = dat.from_pandas(biom_data, metadata_columns)

# Shorten the covariate names.
data.obs = data.obs.rename(columns={
    "body-site": "site",
    "days-since-experiment-start": "days_since_start",
    "reported-antibiotic-usage": "antibiotic",
})

print(data.obs)

#%%
# Re-import the plotting module so that edits made to it are picked up.
importlib.reload(viz)

sns.set(font_scale=2, style="ticks")
# Keyword settings for a swarm plot; not consumed by the call below.
args_swarmplot = dict(hue="subject", size=10, palette="Reds")
viz.boxplot_facets(data, feature="site")
# Load the tab-separated sample metadata, indexed by "sample-id".
with open(data_path + "/sample-metadata.tsv", "rb") as f:
    metadata = pd.read_csv(f, sep="\t", index_col="sample-id")
# Drop the first data row of the metadata table.
metadata = metadata.iloc[1:, :]

metadata_columns = [
    "subject",
    "reported-antibiotic-usage",
    "days-since-experiment-start",
    "body-site",
]

# Attach the selected metadata columns to the count table by matching indices.
biom_data = pd.merge(
    biom_data,
    metadata.loc[:, metadata_columns],
    right_index=True,
    left_index=True,
)

data = dat.from_pandas(biom_data, metadata_columns)

# Shorten the covariate names.
data.obs = data.obs.rename(columns={
    "body-site": "site",
    "days-since-experiment-start": "days_since_start",
    "reported-antibiotic-usage": "antibiotic",
})

print(data.obs)

#%%


def plot_one_stackbar(y, type_names, title, level_names):
    """Open a new 20 x 10 matplotlib figure.

    NOTE(review): only the figure creation is visible in this excerpt; none
    of the parameters are used by the visible body. The name suggests a
    stacked bar plot is drawn afterwards -- confirm against the full source.
    """
    figure_size = (20, 10)
    plt.figure(figsize=figure_size)
# Pivot the long (sample_id, cell_type, counts) table into one row per sample
# with one column per cell type; missing combinations become 0.
cell_counts = (
    celltypes.loc[:, ["sample_id", "cell_type", "counts"]]
    .set_index(["sample_id", "cell_type"])
    .unstack(fill_value=0)
    .fillna(0)
    .reset_index()
)
print(cell_counts)

#%%

# Work on a copy: a plain assignment would alias ``cell_counts``, so the
# column flattening below would also rewrite the columns of ``cell_counts``
# and break later code that still expects its two-level columns.
cell_counts_2 = cell_counts.copy()
cell_counts_2.columns = cell_counts_2.columns.droplevel(0)
# After dropping the top level, the sample-id column is left with an empty name.
cell_counts_2 = cell_counts_2.rename(columns={"": "sample_id"})
# regex=True must be explicit: since pandas 2.0 ``str.replace`` treats the
# pattern as a literal string by default, so the digits would not be removed.
cell_counts_2["Condition"] = cell_counts_2["sample_id"].str.replace(
    r"[0-9]", "", regex=True
)
print(cell_counts_2)

#%%
# Re-import the data module so that edits made to it are picked up.
importlib.reload(dat)
data = dat.from_pandas(cell_counts_2, ["sample_id", "Condition"])
# Show the count matrix, the covariates and the variable table in turn.
for attr_name in ("X", "obs", "var"):
    print(getattr(data, attr_name))

#%%

# Count matrix: every column after the first, as integers.
cells = cell_counts.iloc[:, 1:].to_numpy().astype("int")
print(cells)

# Covariates: the sample id plus a condition derived from it.
obs = pd.DataFrame(cell_counts["sample_id"])
# regex=True must be explicit: since pandas 2.0 ``str.replace`` treats the
# pattern as a literal string by default, so the digits would not be removed.
obs["Condition"] = obs["sample_id"].str.replace(r"[0-9]", "", regex=True)
print(obs)

# Variable table: indexed by the count column labels with the top level dropped.
var = pd.DataFrame(index=cell_counts.iloc[:, 1:].columns.droplevel(0))
print(var)
# Remove OTUs with low counts (<100 total) --> leaves 250 OTUs.
# Keeping low-expression OTUs leads to non-convergence for shorter chains
# (1000 samples); longer chains can't be run on my computer.

# Everything except the last 33 columns is treated as count data.
counts_bal = data_bal.iloc[:, :-33]

# Keep only the columns whose total count reaches the threshold.
counts_bal = counts_bal.loc[:, np.sum(counts_bal, axis=0) >= 100]

# Re-attach the covariate column(s) in ``col`` by matching indices.
data_bal_expr = pd.merge(
    counts_bal,
    data_bal.loc[:, col],
    left_index=True,
    right_index=True,
)

print(data_bal_expr)

data_scdcdm = dat.from_pandas(data_bal_expr, col)

print(data_scdcdm.X.shape)

# Free up some memory

del counts_bal, data, data_bal, metadata, meta_rel, file, otus, split

#%%

# Experimental model configuration


class NoBaselineModelExperimental(dm.CompositionalModel):
    """"
    implements statistical model for compositional differential change analysis without specification of a baseline cell type