forked from fedarko/poisson-cat
-
Notifications
You must be signed in to change notification settings - Fork 0
/
run.py
executable file
·117 lines (109 loc) · 3.58 KB
/
run.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#!/usr/bin/env python3
import click
from biom import load_table
import pandas as pd
from poisson_cat import poisson_cat
# def get_row_indices_where_value_in_col(df, col_name, value):
# row_indices = []
# i = 0
# for row_val in df[col_name]:
# if row_val == value:
# row_indices.append(i)
# i += 1
# return row_indices
@click.command()
@click.option(
    "-t", "--table", required=True, help="BIOM table with count data"
)
@click.option("-m", "--metadata", required=True, help="Sample metadata file")
@click.option(
    "-c",
    "--category",
    required=True,
    help=(
        "Metadata category of interest; currently only binary categories "
        "(i.e. those containing only two unique values) are supported"
    ),
)
@click.option(
    "-r",
    "--reference-category",
    default=None,
    help=(
        "Reference metadata category of interest; if not "
        "specified, the first category will be picked"
    ),
)
@click.option(
    "-o",
    "--output-path",
    required=True,
    help="Output filepath to which differentials TSV will be written",
)
@click.option(
    "-f",
    "--filter-category-value",
    default=None,
    help=(
        "If passed, this will filter out all samples with a -c category value "
        'of this string. This will also afterwards filter out all "empty" '
        "features. This is useful if you have a "
        "category with three possible values that you'd like to make into a "
        "binary category, so that it can be used here."
    ),
)
def run_poisson_cat(
    table: str,
    metadata: str,
    category: str,
    reference_category: str,
    output_path: str,
    filter_category_value: str,
) -> None:
    """Run poisson_cat on a BIOM table and write the differentials as a TSV.

    \f
    Everything after the form feed above is hidden from ``--help`` (click
    truncates the help text there); it documents the function for readers.

    Args:
        table: Path of the BIOM table, read with ``biom.load_table``.
        metadata: Path of a tab-separated metadata file. Its first column
            becomes the DataFrame index, and that index is what is handed to
            the table when samples are filtered.
        category: Metadata column passed on to ``poisson_cat``.
        reference_category: Passed on to ``poisson_cat`` unchanged; ``None``
            when ``-r`` is not given.
        output_path: Where the result is written (tab-separated, index
            labelled ``FeatureID``, single column ``Differential``).
        filter_category_value: If given, and ``category`` has more than two
            unique values of which this is one, samples carrying this value
            are dropped from both the metadata and the table.

    Raises:
        click.BadParameter: If ``category`` is not a column of the metadata.
    """
    loaded_table = load_table(table)
    metadata_df = pd.read_csv(metadata, index_col=0, sep="\t")

    # Report a missing column as a CLI usage error instead of letting the
    # column lookup below fail with a bare KeyError traceback.
    if category not in metadata_df.columns:
        raise click.BadParameter(
            f"column {category!r} was not found in the sample metadata",
            param_hint="--category",
        )
    unique_cats = metadata_df[category].unique()

    if filter_category_value is not None:
        # NOTE(review): option values arrive as strings, whereas read_csv
        # infers column dtypes, so a numeric category column will presumably
        # never match here -- confirm whether that is intended.
        if unique_cats.shape[0] <= 2:
            # Filtering only makes sense for reducing >2 values; say so
            # rather than silently ignoring the option.
            click.echo(
                f"Ignoring --filter-category-value: {category!r} has only "
                f"{unique_cats.shape[0]} unique value(s); no samples were "
                "filtered.",
                err=True,
            )
        elif filter_category_value not in list(unique_cats):
            click.echo(
                "Ignoring --filter-category-value: "
                f"{filter_category_value!r} does not occur in {category!r}; "
                "no samples were filtered.",
                err=True,
            )
        else:
            # Based on https://stackoverflow.com/a/18173074/1073
            print(f"Number of samples pre-filtering: {metadata_df.shape[0]}")
            keep_rows = metadata_df[category] != filter_category_value
            metadata_df = metadata_df[keep_rows]
            # With biom's defaults this filters the sample axis in place,
            # keeping only the IDs that remain in the metadata.
            loaded_table.filter(metadata_df.index)
            print(f"Number of samples post-filtering: {metadata_df.shape[0]}")

    # NOTE(review): empty-feature removal is assumed to run on every
    # invocation (it is required after sample filtering either way) --
    # confirm it was not meant to be limited to the filtering branch.
    n_features = loaded_table.shape[0]
    print(
        f"Number of features pre-filtering those with 0 counts: {n_features}"
    )
    # remove features in table with 0 counts (to eliminate any features
    # that were only present in now-filtered-out samples)
    loaded_table.remove_empty(axis="observation")
    n_features = loaded_table.shape[0]
    print(
        f"Number of features post-filtering those with 0 counts: {n_features}"
    )
    # TODO remove samples in table without a certain amount of reads
    # supporting them?

    print("Running poisson_cat...")
    diff = poisson_cat(loaded_table, metadata_df, category, reference_category)
    print("Done.")
    diff.to_csv(
        output_path, sep="\t", header=["Differential"], index_label="FeatureID"
    )
# Script entry point: the click-decorated command is called without
# arguments, so its parameters come from the command-line options.
if __name__ == "__main__":
    run_poisson_cat()