-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
121 lines (108 loc) · 2.99 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import os, sys
import h5py
import biom
import pandas
import numpy
# Taxonomic rank name -> position passed to collapse_to() when load_b()
# collapses observations by their 'taxonomy' metadata.
# NOTE(review): there is no 'genus' entry and the positions start at 2;
# confirm these offsets against the taxonomy layout of the input tables.
ranks = {
    "phylum": 2,
    "class": 3,
    "order": 4,
    "family": 5,
    "species": 6,
}
# Short biopsy-location code -> list of spellings grouped under that code.
# NOTE(review): presumably the listed spellings are raw metadata values;
# verify against the code that consumes this mapping.
biopsy_loc_dict = {
    "TI": [
        "Terminal ileum",
        "Terminalileum",
    ],
    "Stool": [
        "stool",
    ],
    "Sigmoid": [
        "Sigmoid",
        "Recto-Sigmoid",
    ],
    "Rectum": [
        "Rectum",
    ],
    "Colon": [
        "Descending colon",
        "Ascending colon",
        "Transverse colon",
    ],
    "Cecum": [
        "Cecum",
    ],
}
# Label name -> {raw value: binary class (1 or 0)}.
# Raw values that are absent from an inner mapping have no encoding here.
label_map = {
    "cd": {"CD": 1, "no": 0},
    "sex": {"female": 1, "male": 0},
    "smoking": {"Current": 1, "Never": 0},
    "steroids": {"true": 1, "false": 0},
    "mesalamine": {"true": 1, "false": 0},
    "race": {"african": 1, "caucasian": 0},
    "immunosup": {"true": 1, "false": 0},
    # The key spelling is part of the runtime interface; keep it as is.
    "ileal_invovlement": {"true": 1, "false": 0},
    # Accepts both the string and the boolean form of the value.
    "antibiotics": {
        "true": 1,
        "false": 0,
        True: 1,
        False: 0,
    },
    "inflammationstatus": {"inflamed": 1, "non-inflamed": 0},
}
def collapse_to(lst, item, default='Unassigned'):
    """Return ``lst[item]``, or ``default`` when that position is unavailable.

    Args:
        lst: Sequence to index. ``None`` is treated as having no entries.
        item: Integer position to read. Negative positions count from the
            end of ``lst``, as with normal indexing.
        default: Value returned when ``lst`` is ``None`` or ``item`` falls
            outside ``lst``.

    Returns:
        The element at position ``item``, or ``default``.
    """
    if lst is None:
        return default
    # Bound the index on both sides: checking only ``len(lst) > item`` lets
    # an out-of-range negative index through and raises IndexError.
    if -len(lst) <= item < len(lst):
        return lst[item]
    return default
def load_b(files):
    """Load BIOM tables, merge them, and build one dense frame per rank.

    Every path in ``files`` is read with ``biom.load_table``. The tables are
    merged left to right, taking the union of samples and of observations,
    and the merged table is normalised along the sample axis. For each entry
    in the module-level ``ranks`` mapping the normalised table is collapsed
    over observations, grouping by the element of each observation's
    ``'taxonomy'`` metadata at that rank's position ('Unassigned' when the
    taxonomy is too short).

    Args:
        files: Sequence of BIOM file paths; must contain at least one path.

    Returns:
        dict mapping each rank name, plus ``'zotu'`` for the uncollapsed
        normalised table, to the result of ``to_dataframe(dense=True)``.

    Raises:
        IndexError: If ``files`` is empty.
    """
    tables = [biom.load_table(path) for path in files]

    merged = tables[0]
    for extra in tables[1:]:
        merged = merged.merge(extra, sample='union', observation='union')

    normalized = merged.norm(axis='sample', inplace=False)

    frames = {}
    for rank, depth in ranks.items():
        # norm=False: keep the collapsed values as produced, without
        # dividing by the size of each collapsed group.
        collapsed = normalized.collapse(
            lambda id_, md: collapse_to(md['taxonomy'], depth, 'Unassigned'),
            axis='observation',
            norm=False,
        )
        frames[rank] = collapsed.to_dataframe(dense=True)

    frames['zotu'] = normalized.to_dataframe(dense=True)
    return frames
def load_m(files):
    """Read tab-separated metadata files and return them as one DataFrame.

    Each file is parsed with its first column as the index. When several
    files are given, their rows are stacked in the order of ``files``.

    Args:
        files: Sequence of paths to tab-separated files; must contain at
            least one path.

    Returns:
        pandas.DataFrame with the rows of every file. With a single file the
        frame is returned exactly as parsed. With several files, columns
        are sorted when aligning the frames (``sort=True``).

    Raises:
        IndexError: If ``files`` is empty.
    """
    if not files:
        raise IndexError("load_m() requires at least one metadata file")

    frames = [pandas.read_csv(path, sep='\t', index_col=0) for path in files]
    if len(frames) == 1:
        return frames[0]
    # DataFrame.append returned a new frame (and was removed in pandas 2.0),
    # so calling it without keeping the result dropped every file after the
    # first. pandas.concat combines all frames in one pass.
    return pandas.concat(frames, sort=True)
# Label name -> metadata column that stores its raw value. Only "cd" is
# stored under a column with a different name.
_LABEL_COLUMNS = {
    "cd": "gastrointest_disord",
    "sex": "sex",
    "mesalamine": "mesalamine",
    "steroids": "steroids",
    "smoking": "smoking",
    "race": "race",
    "immunosup": "immunosup",
    "ileal_invovlement": "ileal_invovlement",
    "antibiotics": "antibiotics",
    "inflammationstatus": "inflammationstatus",
}


def get_label(label, m, ind):
    """Return the raw metadata value of ``label`` for sample ``ind``.

    Args:
        label: Label name; one of the keys of ``_LABEL_COLUMNS``.
        m: Metadata DataFrame with a ``sample_name`` column and the column
            that backs ``label``.
        ind: Sample name to look up in ``m.sample_name``.

    Returns:
        The value from the first row whose ``sample_name`` equals ``ind``,
        or ``None`` when ``label`` is not a known label name.

    Raises:
        IndexError: If no row of ``m`` has ``sample_name == ind``.
        AttributeError: If ``m`` lacks the required column.
    """
    column = _LABEL_COLUMNS.get(label)
    if column is None:
        # Unknown labels yield None, as callers may rely on that.
        return None

    # Attribute access (rather than m[column]) keeps AttributeError as the
    # failure mode when the column is missing.
    values = getattr(m[m.sample_name == ind], column).to_numpy()
    if values.size == 0:
        raise IndexError(
            "sample {!r} not found in metadata 'sample_name'".format(ind)
        )
    return values[0]