forked from redbluewater/NB_Distribution
-
Notifications
You must be signed in to change notification settings - Fork 0
/
fxn_gatherDetails.py
340 lines (309 loc) · 17.5 KB
/
fxn_gatherDetails.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
#make one py file with all the plotting functions and information gathering
#KLongnecker, 13 April 2016, updated 4/15/2016, updated 4/18/2016
import pandas as pd
import os
import re
import matplotlib.pyplot as plt
import palettable as pal
import glob
from Bio import SeqIO
from Bio.KEGG.REST import *
from Bio.KEGG.KGML import KGML_parser
from Bio.Graphics.KGML_vis import KGMLCanvas
from IPython.display import Image, HTML
def gatherDetails(makeNclusters,trimPath,forRelatedness,folderName,CO_fromMATLAB,KO_Norm2Mean,Insitu_TPM_DIA,Insitu_TPM_DIN,Insitu_TPM_Oth):
colLabel = ['nCpds','nGenes'] #starting with this is easiest - makes one list, no need to flatten
for item in range(makeNclusters):
colLabel.append('Km' + str(item) + '_cpd')
colLabel.append('Km' + str(item) + '_gene')
gatherCounts = pd.DataFrame(0, index = trimPath, columns = colLabel)
#setup the strings to match first
rnString = re.compile('(?:[rn:R])(\d+)$') #will return R00190
cpdString = re.compile('(?:[cpd:C])(\d+)$') #will return C00190
size = 20 #turns out I can increase the size of the compounds in the plots
for kn in range(makeNclusters):
fullSet = set(forRelatedness.KEGG)
oneK = forRelatedness[forRelatedness.kmeans == kn] #get gene & transcript information for one Kmeans group
getKm = 'Km' + str(kn)
#check if the directories exist, one for pathway files
directoryPDF = folderName + str(kn) + '/pathway_files'
if not os.path.exists(directoryPDF):
os.makedirs(directoryPDF)
else:
raise ValueError('Krista - be careful, this folder already exists')
#check if the directories exist, one for reaction files
directoryPNG = folderName + str(kn) + '/reaction_files'
if not os.path.exists(directoryPNG):
os.makedirs(directoryPNG)
else:
raise ValueError('Krista - be careful, this folder already exists')
#check if the directories exist, one for species
directorySpecies = folderName + str(kn) + '/species_files'
if not os.path.exists(directorySpecies):
os.makedirs(directorySpecies)
else:
raise ValueError('Krista - be careful, this folder already exists')
for item in trimPath: #searching within one pathway at a time
plotPathway = [] #gather up yes/no and will only plot if have linked genes/mtabs
genes = getKfrom_ko(item)
compounds = getCfrom_ko(item)
gatherCounts.loc[item,'nCpds'] = len(compounds)
gatherCounts.loc[item,'nGenes'] = len(genes)
#have to track genes and compounds differently for the biopython plotting later on
setG = set(genes)
setC = set(compounds)
setB = set(oneK.KEGG)
intGenes = setG.intersection(setB)
intCompounds = setC.intersection(setB)
gatherCounts.loc[item,(getKm + '_gene')] = len(intGenes)
gatherCounts.loc[item,(getKm + '_cpd')] = len(intCompounds)
for gen in intGenes: #go through each gene...one at a time
rnList = kegg_link('reaction',gen).read() #get the list of reactions for that gene
#can have cases where there is a gene and no reaction (K02906 for example). This returns rnList = '\n'
#since this is not actually empty...need a few way to filter those out
test = '\n'
if test != rnList:
for line in rnList.rstrip().split('\n'):
countCpd = []
countGene = []
m = rnString.search(line) #get the reaction number
cpdList = kegg_link('cpd',m.group(0)).read() #now go get the compounds for that reaction
#can have no compounds in a reaction (only glycans, begin with G, nothing I have matched)
if len(cpdList) > 1: #will be true if cpdList includes compounds
for line2 in cpdList.rstrip().split('\n'):
m2 = cpdString.search(line2).group(0)
#now that I have a compound, check if it is in intCompounds
if m2 in intCompounds:
countCpd.append(m2)
countGene.append(gen)
plotPathway.append('yes')
##Now, plot the PNG files (one for each reaction within a pathway)
if len(countCpd) > 0:
dayList = ['S1','S2','S3','S4','S5']
kData = pd.DataFrame(columns = dayList)
for k in set(countGene):
kData = kData.append(oneK.ix[k,dayList])
cData = pd.DataFrame(columns = dayList)
for co in set(countCpd):
#convert CO to RI, can have multiple options
j = findRInumber(oneK,co)
cData = cData.append(oneK.loc[j,dayList])
fig,ax = plt.subplots(1)
cData.T.plot(color = 'k',ax=ax)
kData.T.plot(color = 'r',ax=ax)
handles, labels = ax.get_legend_handles_labels()
#convert the RI numbers to COnumbers for the figure
for ia, a in enumerate(labels):
#add compound/gene name to the legend
if a[0]== 'R':
tLabel = convertRItoCO(CO_fromMATLAB,a)
fn = kegg_list(tLabel).read()
labels[ia] = fn
elif a[0] == 'K':
fn = kegg_list(a).read()
labels[ia] = fn
ax.legend(handles, labels, bbox_to_anchor = ([-1, 0.5]))
fig.suptitle('pathway ' + item + ', Kmeans grp ' + str(kn))
pngName = 'pathway' + item + '_' + m.group(0) + '.png'
fig.savefig(directoryPNG + '/' + pngName, bbox_inches = 'tight')
pngName = None #empty it in case that is where I am having issues
plt.close()
if len(plotPathway)>0:
## plot the pathway map for this pathway, get details from KEGG for plotting
useColors = pal.colorbrewer.qualitative.Set1_4.hex_colors
useColors.insert(0,'#f7f7f7') ## insert white at beginning
# order of colors: white, red, blue,green,purple
sd = 0 #not in dataset
sk = 1 #in K means group and pathway
sa = 2 #in pathway, in any K means (for genes, bc overlap in numbers)
sn = 3 #in pathway, not in K means group (compounds only)
su = 4 #unconnected gene or compound
line1 = useColors[sd] + ', not in dataset' + '\n'
line2 = useColors[sk] + ', in K means group and pathway' + '\n'
line3 = useColors[sa] + ', #in pathway, in any K means (for genes, bc overlap in numbers)' +'\n'
line4 = useColors[sn] + ', #in pathway, not in K means group (compounds only)' + '\n'
line5 = useColors[su] + ', #unconnected gene or compound' + '\n'
file = open("readme_colorsInPathways.txt", "w")
file.write(line1 + line2 + line3 + line4 + line5)
file.close()
pathway = KGML_parser.read(kegg_get(item, "kgml"))
for element in pathway.orthologs:
#print element.name
for graphic in element.graphics:
tg = element.name[3:9] #skip over the 'ko:'
if (tg in intGenes):
#in the pathway AND in the set for this particular K means group
graphic.bgcolor = useColors[sk] #
#if this is something in the pathway, plot up the species for the K number
if tg in Insitu_TPM_DIA.index.tolist():
Dk=Insitu_TPM_DIA.loc[tg]
else:
Dk = 0/Insitu_TPM_DIA.iloc[0] #make an empty frame
if tg in Insitu_TPM_DIN.index.tolist():
Nk=Insitu_TPM_DIN.loc[tg]
else:
Nk = 0/Insitu_TPM_DIN.iloc[0]
if tg in Insitu_TPM_Oth.index.tolist():
Ok=Insitu_TPM_Oth.loc[tg]
else:
Ok = 0/Insitu_TPM_Oth.iloc[0]
fig,ax=plt.subplots(1)
ax.stackplot(range(5), Dk, Nk, Ok, colors=pal.colorbrewer.qualitative.Set3_6_r.hex_colors, lw=0)
ax.set_xticks(range(5))
ax.set_xticklabels([1,2,3,4,5])
ax.set_ylabel('In situ TPM')
plt.title(tg + ', lt orange=diatoms, blue=dinos, dk orange=other')
fig.savefig(directorySpecies + '/' + tg + '_species.png',bbox_inches='tight')
plt.close()
elif (tg in fullSet) and (tg in genes) and (tg not in intGenes):
#in the pathway AND in the set of genes from RI, allow any Kmeans group for genes
graphic.bgcolor = useColors[sa] #
elif (tg not in fullSet) and (tg in genes) and (tg not in KO_Norm2Mean.index.tolist()):
#in the pathway, but *not* in anything from the RI samples
graphic.bgcolor = useColors[sd] #
elif (tg not in fullSet) and (tg in genes) and (tg in KO_Norm2Mean.index.tolist()):
#an unconnected gene in the RI data
graphic.bgcolor = useColors[su] #
# Change the colours of compounds (mostly same as genes
for element in pathway.compounds:
for graphic in element.graphics:
tc = element.name[4:10] #skip over the 'cpd:'
if (tc in intCompounds):
#in the pathway AND in the set for this particular K means group
graphic.bgcolor = useColors[sk] #
graphic.width = size
graphic.height = size
elif (tc in fullSet) and (tc in compounds) and (tc not in intCompounds):
#in the pathway AND in the set of compounds from RI, but *not* in this Kmeans group
graphic.bgcolor = useColors[sn] #
graphic.width = size
graphic.height = size
elif (tc not in fullSet) and (tc in compounds) and (tc not in CO_fromMATLAB.cNumber.values):
#in the pathway, but *not* in anything from the RI samples
graphic.bgcolor = useColors[sd] #
elif (tc not in fullSet) and (tc in compounds) and (tc in CO_fromMATLAB.cNumber.values): #seems like a hack
#unconnected compound in the RI data
graphic.bgcolor = useColors[su] #
graphic.width = size
graphic.height = size
canvas = KGMLCanvas(pathway, import_imagemap=True)
pdfName = 'mapWithColors_' + str(item) + '.pdf'
canvas.draw(directoryPDF + '/' + pdfName)
pdfName = None #empty it in case that is where I am having issues
#stick the pathway information into gatherCounts before I export...
#want to export gatherCounts, with the added pathway name as a new column
gatherCounts['pathwayInfo'] = ''
gatherCounts['pathwayGroup_A'] = ''
gatherCounts['pathwayGroup_B'] = ''
gatherCounts['pathwayGroup_C'] = ''
#go read in the file from KEGG
D = glob.glob('br08901.keg') #from http://www.genome.jp/kegg-bin/get_htext?br08901.keg; 3/15/2016
allBRITE=[]
for idx,nof in enumerate(D):
allBRITE = ReadBRITEfile(nof)
#put the pathway name and group into the data frame before exporting it
for item in gatherCounts.index:
#if this error appears: IndexError: index 0 is out of bounds for axis 0 with size 0
#KEGG has updated a pathway, but not the BRITE file (see below for work around)
pathstr = kegg_list(item).read()
#this next line splits the string at the '\t', then keeps the piece at index = 1, and strips off the '\n'
gatherCounts.loc[item,('pathwayInfo')] = pathstr.split('\t')[1].rstrip()
t = allBRITE.loc[allBRITE['map']==item[2:]]
#put in a check to see if t.empty ...will be empty if KEGG updated pathway and not BRITE file
if t.empty is False:
gatherCounts.set_value(item,'pathwayGroup_A',t['A'].values[0])
gatherCounts.set_value(item,'pathwayGroup_B',t['B'].values[0])
gatherCounts.set_value(item,'pathwayGroup_C',t['C'].values[0])
return gatherCounts
#set up a function to get the list of K orthologues for a given pathway (must be defined as ko00140 NOT map00140)
def getKfrom_ko(ko_id):
pathway_file = kegg_get(ko_id).read() # query and read the pathway
K_list = []
current_section = None
for line in pathway_file.rstrip().split("\n"):
section = line[:12].strip() # section names are within 12 columns
if not section == "":
current_section = section
if current_section == "ORTHOLOGY":
K_identifiers = line[12:].split("; ")
t = K_identifiers[0]
K_id = t[0:6]
if not K_id in K_list:
K_list.append(K_id)
return K_list
#set up a function to get the list of compounds for a given pathway (must be defined as ko00140 NOT map00140)
def getCfrom_ko(ko_id):
pathway_file = kegg_get(ko_id).read() # query and read the pathway
compound_list = []
current_section = None
for line in pathway_file.rstrip().split("\n"):
section = line[:12].strip() # section names are within 12 columns
if not section == "":
current_section = section
if current_section == "COMPOUND":
compound_identifiers = line[12:].split("; ")
t = compound_identifiers[0]
compound_id = t[0:6]
if not compound_id in compound_list:
compound_list.append(compound_id)
return compound_list
def findRInumber(dataIn,KEGGin):
#find possible RI numbers for a given KEGG number.
dataOut = []
for i,KEGG in enumerate(dataIn['KEGG']):
if KEGG == KEGGin:
t = dataIn.index[i]
dataOut.append(t)
return dataOut
def convertRItoCO(dataIn,RIin):
#do the reverse, given an RInumber find the cNumber
dataOut = dataIn.loc[RIin].loc['cNumber']
return dataOut
# A bit of code that will help us display the PDF output
def PDF(filename):
return HTML('<iframe src=%s width=700 height=350></iframe>' % filename)
# A bit of helper code to shorten long text
def head(text, lines=10):
""" Print the first lines lines of the passed text.
"""
print '\n'.join(text.split('\n')[:lines] + ['[...]'])
#organize pathways into the groups defined in the BRITE file
def ReadBRITEfile(briteFile):
forBrite = pd.DataFrame(columns = ['map','A','B','C','wholeThing'])
# set up the expressions to match each level in the BRITE hierarchy
textA = re.compile(r'(^A<b>)(.+)(</b>)\s*(.*)$')
textB = re.compile(r'(^B)\s*(.*)$')
textC = re.compile(r'(\d+)\s*(.*)$')
#this relies on the fact that the rows are in order: A, with B subheadings, then C subheadings
setA = []
idxA = []
setB = []
setC = []
with open(briteFile) as f:
for idx,line in enumerate(f):
if line[0] is not '#': #skip over the comments
mA = textA.search(line)
mB = textB.search(line)
mC = textC.search(line)
if mA:
setA = mA.group(2)
#house cleaning (probably c)
idxA = idx
forBrite.loc[idx,'A'] = setA
forBrite.loc[idx,'wholeThing'] = line #using this as a double check for now
#forBrite.loc[idx,'map'] = mC.group(1)
elif mB:
setB = mB.group(2)
forBrite.loc[idx,'A'] = setA
forBrite.loc[idx,'B'] = setB
forBrite.loc[idx,'wholeThing'] = line
#forBrite.loc[idx,'map'] = mC.group(1)
elif mC:
#Tracer()()
setC = mC.group(2)
forBrite.loc[idx,'A'] = setA
forBrite.loc[idx,'B'] = setB
forBrite.loc[idx,'C'] = setC
forBrite.loc[idx,'wholeThing'] = line
forBrite.loc[idx,'map'] = mC.group(1)
return forBrite