-
Notifications
You must be signed in to change notification settings - Fork 4
/
metatranscriptomics_bipype.py
855 lines (708 loc) · 28 KB
/
metatranscriptomics_bipype.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
import sqlite3
import cPickle
import subprocess
from bs4 import BeautifulSoup
from glob import glob
from urllib import urlopen
from multiprocessing import Process
from collections import Counter
from time import time
from os.path import exists as pexists
from os.path import join as pjoin
from os.path import dirname, realpath
from os import system, chdir, getcwd
from settings_bipype import *
def dicto_reduce(present, oversized):
    """Trim two dictionaries, in place, down to the keys they share.

    Every key that does not occur in the other dictionary is deleted, so
    both arguments are modified by this call.

    Args:
        present: dict
        oversized: dict

    Returns:
        (oversized, present):
            tuple holding the very same (now reduced) dict objects.

    Warning:
        The order of the results is the reverse of the parameter order.

    Example:
        >>> dict_1={'a':1,'c':3,'d':4}
        >>> dict_2={'a':3,'b':4,'c':4}
        >>> dicto_reduce(dict_1, dict_2)
        ({'a': 3, 'c': 4}, {'a': 1, 'c': 3})
    """
    shared = set(present) & set(oversized)
    for mapping in (oversized, present):
        # Collect the doomed keys first; a dict must not shrink while it
        # is being iterated.
        for key in [k for k in mapping if k not in shared]:
            del mapping[key]
    return (oversized, present)
def connect_db(db):
    """Open an SQLite3 database and return a cursor on it.

    The connection is configured so that fetched rows are ``sqlite3.Row``
    objects (addressable by index and by column name) and TEXT values are
    built with ``str``.

    Args:
        db: Path to SQL database

    Returns:
        Cursor object to database
    """
    connection = sqlite3.connect(db)
    connection.text_factory = str
    connection.row_factory = sqlite3.Row
    cursor = connection.cursor()
    return cursor
def get_tables(database):
    """Print the name of every table stored in an SQLite3 database.

    One table name is printed per line, as listed in ``sqlite_master``.

    Args:
        database: Cursor object to SQLite3 database.
    """
    database.execute("SELECT name FROM sqlite_master WHERE type='table';")
    for entry in database.fetchall():
        # A parenthesised single value prints the same with the Python 2
        # print statement as the bare form does.
        print(entry[0])
def auto_tax_read(db_loc):
    """Read a pickled ``{KEGG GENES number: set[KO identifiers]}`` dict.

    Args:
        db_loc: Path to the pickle file (opened in binary mode).

    Returns:
        The unpickled object.
    """
    # NOTE(security): unpickling can execute arbitrary code. Only point this
    # at pickle files produced by this pipeline, never at untrusted input.
    with open(db_loc, 'rb') as file_:
        # The ``with`` block closes the file; no explicit close() is needed.
        return cPickle.load(file_)
def pickle_or_db(pickle, db):
    """Load the gene -> KO mapping from a pickle, or build it from the db.

    If the file `pickle` exists it is read with :func:`auto_tax_read` and
    the reading time is printed. Otherwise the 'KoGenes' table is read
    through `db`, the mapping is built, written to `pickle` for later runs
    and returned.

    Args:
        pickle: Path to pickled dict in following format:
            ``{KEGG GENES identifier : set[KO identifiers]}``
        db: Cursor object to SQL database with 'KoGenes' table
            ``(KO identifier, KEGG GENES identifier)``

    Returns:
        Dict in ``{KEGG GENES identifier: set[KO identifiers]}`` format.
    """
    start_time = time()
    if pexists(pickle):
        multi_id = auto_tax_read(pickle)
        kogenes_time = time()
        print ('kogenes reading time', kogenes_time - start_time)
        return multi_id

    multi_id = {}
    db.execute('select * from KoGenes')
    for (koid, gid,) in db.fetchall():
        multi_id.setdefault(gid, set()).add(koid)
    # Pickles are read back in binary mode ('rb'), so they have to be
    # written in binary mode as well; text mode can corrupt the stream.
    with open(pickle, 'wb') as output:
        cPickle.dump(multi_id, output)
    return multi_id
def get_pathways(database):
    """Build a dictionary from the 'Pathways' table of an SQLite3 database.

    The first column of every row becomes a key and the second column its
    value.

    Args:
        database: Cursor object to SQLite3 database.

    Returns:
        dict: dictionary in following format::

            {KEGG_Pathway_id:Name}

        For example::

            {
                'ko04060': 'Cytokine-cytokine receptor interaction',
                'ko00910': 'Nitrogen metabolism'
            }
    """
    database.execute('select * from Pathways')
    return dict((row[0], row[1]) for row in database.fetchall())
def get_kopathways(database):
    """Build both lookup directions from the 'KoPathways' table.

    Each row contributes its first column (KO identifier) and second
    column (KEGG pathway id) to two dictionaries of sets.

    Args:
        database: Cursor object to SQLite3 database.

    Returns:
        Two dictionaries:

        id -> pathways::

            {KO identifier: set[KEGG_Pathway_ids]}

        For example::

            {
                'K01194': set(['ko00500','ko00600',...]),
                'K04501': set(['ko04390',...])
            }

        pathway -> ids mappings::

            {KEGG_Pathway_id: set[KO identifiers]}

        For example::

            {ko12345: set([K12345, K12346,...]),...}
    """
    database.execute('select * from KoPathways')
    ko_to_paths = {}
    path_to_kos = {}
    for row in database.fetchall():
        ko_id = row[0]
        pathway_id = row[1]
        ko_to_paths.setdefault(ko_id, set()).add(pathway_id)
        path_to_kos.setdefault(pathway_id, set()).add(ko_id)
    return (ko_to_paths, path_to_kos)
def m8_to_ko(file_, multi_id):
    """Count BLAST tabular hits per KO and write the counts to a file.

    The second tab-separated column of every non-comment line of `file_`
    is taken as a KEGG GENES identifier. Hits are counted per identifier,
    then the count of each identifier is added to every KO that
    `multi_id` assigns to it. Timing information is printed on the way.

    Args:
        `file_`: Path to BLAST Tabular (flag: -m 8) format file
        multi_id: Dict ``{KEGG GENES identifier : set[KO identifiers]}``.
            It is passed to :func:`dicto_reduce`, which removes, in place,
            every identifier that has no hit in `file_`.

    Output file (outname) has following name::

        outname = file_.replace('txt.m8', 'count')

    and one ``KO<TAB>count`` line per KO that received at least one hit::

        K00161  2
        K00382  11
    """
    start_time = time()
    tmp_ko_dict = {}
    outname = file_.replace('txt.m8', 'count')
    hit_gid = []
    # The context manager closes the input file once it has been read.
    with open(file_, 'r') as content:
        for line in content:
            # Skip comment lines and blank lines (which have no 2nd column).
            if line.startswith('#') or not line.strip():
                continue
            hit_gid.append(line.split('\t')[1])
    file_reading_time = time()
    print (file_, 'file_reading seconds', file_reading_time - start_time)
    gid_count = Counter(hit_gid)
    (multi_clean, gid_clean,) = dicto_reduce(gid_count, multi_id)
    cleaning_time = time()
    print (file_, 'cleaning time seconds', cleaning_time - file_reading_time)
    for gid, hits in gid_clean.items():
        for ko in multi_clean[gid]:
            tmp_ko_dict[ko] = tmp_ko_dict.get(ko, 0) + hits
    comparison_time = time()
    print (file_, 'comparing time seconds', comparison_time-cleaning_time)
    with open(outname, 'w') as out_file:
        for ko, hits in tmp_ko_dict.items():
            out_file.write('%s\t%i\n' % (ko, hits))
    writing_time = time()
    # This interval measures the writing step, so it is labelled as such.
    print (
        file_, 'writing time seconds', writing_time - comparison_time,
        'total time', writing_time - start_time
    )
def out_content(filelist, kopath_values, path_names, method='DESeq2'):
    """Write, per input file, the pathways that share KOs with that file.

    For every file in `filelist` the KO identifiers are read from the first
    tab-separated column (the first line is skipped). Then, for every
    pathway in `kopath_values`, the KOs common to the file and the pathway
    are written as one line of a semicolon-separated output file.

    Args:
        filelist:
            List of paths to tab-delimited .txt files, where
            first column is a KO identifier.
        kopath_values:
            ``{KEGG_Pathway_id:set[KO identifiers]}`` dict.
            For example::

                {ko12345:set([K12345, K12346,...]),...}
        path_names:
            Dictionary in ``{KEGG_Pathway_id:Name}`` format.
            For example::

                {
                    'ko04060': 'Cytokine-cytokine receptor interaction',
                    'ko00910': 'Nitrogen metabolism'
                }
        method:
            Argument used only as a part of output file name

    Output file is written to the current working directory under the
    name::

        (method+'_'+filename.replace('txt', 'path_counts.csv'))

    where filename is the part of the file path after the last backslash
    if the path contains one, otherwise the part after the last '/'.

    The headline is::

        ko_path_id;ko_path_name;percent common;common KOs

    Only pathways with at least one common KO are written. Commas in
    pathway names are replaced with ' _'.
    """
    headline = 'ko_path_id;ko_path_name;percent common;common KOs\n'
    for filepath in filelist:
        # A backslash anywhere in the path takes precedence as separator.
        separator = '\\' if '\\' in filepath else '/'
        filename = filepath.split(separator)[-1]
        outname = method + '_' + filename.replace('txt', 'path_counts.csv')
        with open(filepath, 'r') as file_:
            rows = file_.readlines()[1:]
        Kids = set(row.rstrip().split('\t')[0] for row in rows)
        with open(outname, 'w') as outfile:
            outfile.write(headline)
            for (path, Kset,) in kopath_values.items():
                common = Kids & Kset
                if not common:
                    continue
                fields = [
                    path,
                    path_names[path].replace(',', ' _'),
                    str(len(common) * 100.0 / len(Kset)),
                    ' '.join(common),
                ]
                outfile.write(';'.join(fields) + '\n')
def fastq_to_fasta(fastq):
    """Run the fastq_to_fasta program on `fastq`.

    The output name is `fastq` with everything after its last '.' replaced
    by 'fasta', reduced to the part after the last '/'; the result is
    therefore written relative to the current working directory.

    Args:
        fastq: Path to the input .fastq file.

    Raises:
        subprocess.CalledProcessError: if the program exits with a
            non-zero status.

    GLOBAL:
        - path to fastq_to_fasta program: PATH_FQ2FA
    """
    stem = fastq.rsplit('.', 1)[0]
    out_file = (stem + '.fasta').rsplit('/', 1)[-1]
    command = [PATH_FQ2FA, '-i', fastq, '-o', out_file, '-Q33']
    subprocess.check_call(command)
def rapsearch2(input_file, threads):
    """Run ``rapsearch2`` for `input_file` in fasta format.

    The output name handed to the program is `input_file` with
    'tmp.fasta' replaced by 'txt'.

    Args:
        input_file: Path to the query fasta file.
        threads: Value passed (as a string) to the '-z' option.

    Raises:
        subprocess.CalledProcessError: if the program exits with a
            non-zero status.

    GLOBALS:
        - path to RAPSearch2 program: PATH_RAPSEARCH
        - path to similarity search database: PATH_REF_PROT_KO
    """
    out_name = input_file.replace('tmp.fasta', 'txt')
    command = [PATH_RAPSEARCH, '-q', input_file, '-d', PATH_REF_PROT_KO]
    command += ['-o', out_name, '-z', str(threads)]
    command += ['-v', '20', '-b', '1', '-t', 'n', '-a', 't']
    subprocess.check_call(command)
def get_ko_fc(ko_dict, ref_cond, filepath, deseq=False):
    """Add the fold changes found in a table file (SARTools) to ko_dict.

    The compared conditions are taken from the file name: the part before
    the first '.' is split on 'vs', and the side that differs from
    `ref_cond` is the condition the fold changes are stored under.

    Values that cannot be parsed as a float are stored unchanged as
    strings. When `filepath` contains '.down.', numeric fold changes are
    stored as ``-1.0 / fold_change``.

    Args:
        ko_dict: ``{KO_id:{cond1:value1, cond2:value2...}...}`` dict;
            updated in place.
        ref_cond: reference condition (string)
        filepath: filepath to output table file from edgeR or DESeq2
        deseq: True, if filepath points to DESeq2 table file (column
            'FoldChange' is read);
            False, if filepath points to edgeR table file (column 'FC').

    Returns:
        ko_dict with added fold changes from table file

    Raises:
        ValueError: if the header has no fold change column.
        ZeroDivisionError: if a '.down.' table holds a fold change of 0.
    """
    fc_column = 'FoldChange' if deseq else 'FC'
    is_down_table = '.down.' in filepath
    with open(filepath) as _file:
        l_cond = filepath.split('/')[-1].split('.')[0].split('vs', 2)
        cond = l_cond[0] if l_cond[0] != ref_cond else l_cond[1]
        file_content = _file.readlines()
    fc_index = file_content[0].rstrip().split('\t').index(fc_column)
    for row in file_content[1:]:
        line = row.rstrip().split('\t')
        KO = line[0]
        base_fc = line[fc_index]
        try:
            fc = float(base_fc)
        except ValueError:
            # Non-numeric entries (e.g. 'NA') are kept as they are.
            fc = base_fc
        else:
            if is_down_table:
                fc = -1.0 / fc
        # Direct dict lookup; no intermediate list of keys is built.
        ko_dict.setdefault(KO, {})[cond] = fc
    return ko_dict
def low_change(ko_dict, all_conds):
    """Give every KO a value of 0.0 for each condition it is missing.

    The inner dictionaries are completed in place; existing values are
    left untouched.

    Args:
        ko_dict: ``{KO_id:{cond1:value1, cond2:value2...}...}`` dict
        all_conds: list of conditions (list of strings)

    Returns:
        The same, supplemented ko_dict

    For example::

        low_change(
            {
                'K12345': {'pH5': 1.41, 'pH6': 1.73},
                'K23456': {'pH6': 2.0, 'pH8': 2.24}
            },
            ['pH5', 'pH6', 'pH8']
        )

    gives::

        {
            'K12345': {'pH5': 1.41, 'pH6': 1.73, 'pH8': 0.0},
            'K23456': {'pH5': 0.0, 'pH6': 2.0, 'pH8': 2.24}
        }
    """
    for fold_changes in ko_dict.values():
        for cond in all_conds:
            fold_changes.setdefault(cond, 0.0)
    return ko_dict
def get_kegg_name(ko):
    """Look up the name assigned to a KO identifier on kegg.jp.

    The KO page is downloaded and parsed; inside the ``td`` element of
    class 'fr4', every table row whose ``th``/``nobr`` cell contains
    'Name' supplies the first text node of its ``td`` cell. If several
    rows match, the last one wins.

    Args:
        ko: KO identifier (string)

    Returns:
        name assigned to ko, or 'NA' when no matching row is found
    """
    url = 'http://www.kegg.jp/dbget-bin/www_bget?ko:' + ko
    name = 'NA'
    ko_kegg = urlopen(url).read().decode('utf-8')
    soup = BeautifulSoup(ko_kegg, 'html.parser')
    if not soup:
        return name
    table = soup.find('td', attrs={'class': 'fr4'})
    if not table:
        return name
    for tr in table.findAll('tr'):
        th = tr.find('th')
        if not th:
            continue
        colname = th.find('nobr')
        if not colname or 'Name' not in colname:
            continue
        content = tr.find('td')
        if content:
            name = content.find(text=True)
    return name
def mapper(ko_dict, ko_set):
    """Group the entries of ko_dict by the KEGG pathways they belong to.

    A pathway appears in the result only if at least one of its KOs is a
    key of `ko_dict`. The per-KO value dictionaries are shared with
    `ko_dict`, not copied.

    Args:
        ko_dict: ``{KO_id:{cond1:value1, cond2:value2...}...}`` dict
        ko_set: ``{KEGG_Pathway_id:set[KO identifiers]}`` dict

    Returns:
        dict: Dict with structure::

            {KEGG_Pathway_id:{KO_id:{cond1:value1, cond2:value2...}...}...}
    """
    known_kos = set(ko_dict)
    by_pathway = {}
    for (pathway_id, pathway_kos,) in ko_set.items():
        shared = known_kos & pathway_kos
        if not shared:
            continue
        by_pathway[pathway_id] = dict(
            (ko_id, ko_dict[ko_id]) for ko_id in shared
        )
    return by_pathway
def mapper_write(ko_path_dict, all_conds, out_dir):
    """Write one fold-change file per (condition, KEGG pathway) pair.

    Args:
        ko_path_dict:
            ``{KEGG_Pathway_id:{KO_id:{cond1:value1, cond2:value2...}...}...}``
        all_conds: list of conditions (list of strings)
        out_dir: relative output directory path

    Output file has following path::

        out_dir/condX/

    , following name::

        KEGG_Pathway_id.txt

    , following header::

        #KO KEGG_Pathway_id

    & following format (tab separated; every data line ends with a tab
    before the newline)::

        KO_id   corresponding_fold_change
    """
    # Imported locally: the module only imports selected names from ``os``.
    import os

    for cond in all_conds:
        cond_dir = pjoin(out_dir, cond)
        # Create the directory without a shell, so that unusual characters
        # in the path are harmless and an existing directory is accepted.
        if not os.path.isdir(cond_dir):
            os.makedirs(cond_dir)
        for pathway, ko_values in ko_path_dict.items():
            output_path = pjoin(cond_dir, pathway + '.txt')
            with open(output_path, 'w') as _file:
                # The header names the pathway, not the output directory.
                _file.write('#KO ' + pathway + '\n')
                for Kid, fold_changes in ko_values.items():
                    # Joining '\n' as a third field keeps the trailing tab
                    # that has always been part of the line format.
                    _file.write(
                        '\t'.join([Kid, str(fold_changes[cond]), '\n'])
                    )
def config_from_file(_file):
    """Read parameters from configuration `_file`.

    Prepares target.txt and templates for SARTools, all in the current
    working directory.

    The first whitespace-separated token of the first line is the
    reference condition. Every following non-blank line holds two fastq
    paths and a condition (group) name. For each of them a row is written
    to 'target.txt' with:

    - a label ``<group>_<n>_``, where n is the lowest number that makes
      the label unique,
    - the file name of the first fastq with 'R1_' removed and '.fastq'
      replaced by '.count',
    - the group.

    Finally the 'condRef' line of both R templates is set to the reference
    condition.

    Args:
        `_file`: configuration file for metatranscriptomic pipeline

    Returns:
        (ref_cond, all_conds, fastqs):
            - ref_cond: reference condition defined by user
            - all_conds: set of conditions (groups) from target.txt
            - fastqs: list of fastq files on which analysis will be done
    """
    with open(_file) as f:
        lines = f.readlines()
    ref_cond = lines[0].split()[0]
    all_conds = []
    fastqs = []
    idents = set()
    with open('target.txt', 'w') as f:
        f.write('label\tfiles\tgroup')
        for raw_line in lines[1:]:
            fields = raw_line.split()
            if not fields:
                # Blank lines (e.g. a trailing empty line) hold no sample.
                continue
            fastq_r1, fastq_r2, group = fields[0], fields[1], fields[2]
            all_conds.append(group)
            id_num = 1
            while group + '_' + str(id_num) in idents:
                id_num += 1
            hyp_ident = group + '_' + str(id_num)
            idents.add(hyp_ident)
            fastqs.extend([fastq_r1, fastq_r2])
            target_name = fastq_r1.rsplit('/', 1)[-1]
            target_name = target_name.replace('R1_', '')
            target_name = target_name.replace('.fastq', '.count')
            f.write('\n' + hyp_ident + '_\t' + target_name + '\t' + group)
    for template in ('template_script_DESeq2.r', 'template_script_edgeR.r'):
        _set_template_cond_ref(template, ref_cond)
    return (ref_cond, set(all_conds), fastqs)


def _set_template_cond_ref(template, ref_cond, line_index=24):
    """Overwrite line `line_index` of an R template with the condRef line."""
    with open(template) as f:
        lines = f.readlines()
    lines[line_index] = 'condRef <- "' + ref_cond + '"' + '\n'
    with open(template, 'w') as f:
        f.writelines(lines)
def run_fastq_to_fasta(fastqs):
    """Convert every .fastq listed in `fastqs` with :func:`fastq_to_fasta`.

    Args:
        fastqs: iterable of paths to .fastq files; they are processed one
            after another, in the given order.
    """
    for fastq_path in fastqs:
        fastq_to_fasta(fastq_path)
def run_cat_pairing():
    """Merge fasta files with paired-end reads in cwd.

    An R1 file (``*R1*fasta``) and an R2 file (``*R2*fasta``) form a pair
    when their names are equal once split on 'R1' and 'R2' respectively.
    For each pair the R1 content followed by the R2 content is written to
    a file named like the R1 file with 'R1_' removed and '.fasta' replaced
    by '.tmp.fasta'.
    """
    # Imported locally: the module does not import shutil at file level.
    from shutil import copyfileobj

    # List the R2 candidates once instead of once per R1 file.
    r2_files = glob('*R2*fasta')
    for file_R1 in glob('*R1*fasta'):
        r1_parts = file_R1.split('R1')
        for file_R2 in r2_files:
            if r1_parts != file_R2.split('R2'):
                continue
            outname = file_R1.replace('R1_', '')
            outname = outname.replace('.fasta', '.tmp.fasta')
            # Opening with 'w' creates the file; no external ``touch`` is
            # needed. The inputs are streamed, not loaded whole into memory.
            with open(outname, 'w') as out:
                for part in (file_R1, file_R2):
                    with open(part) as source:
                        copyfileobj(source, out)
def run_rapsearch(threads):
    """Run :func:`rapsearch2` for every file matching '*tmp.fasta' in cwd.

    Args:
        threads: forwarded unchanged to :func:`rapsearch2`.
    """
    for fasta_path in glob('*tmp.fasta'):
        rapsearch2(fasta_path, threads)
def run_ko_map():
    """Run :func:`m8_to_ko` for every file matching '*m8' in cwd.

    The gene -> KO mapping is loaded once with :func:`pickle_or_db`; then
    one process per file is started and all of them are awaited.

    GLOBALS:
        - path to KO database: PATH_KO_DB
        - pickle to dict from KO GENES table from KO database: PATH_KO_PCKL
    """
    cursor = connect_db(PATH_KO_DB)
    data = pickle_or_db(PATH_KO_PCKL, cursor)
    workers = []
    for m8_path in glob('*m8'):
        worker = Process(target=m8_to_ko, args=(m8_path, data))
        worker.start()
        workers.append(worker)
    for worker in workers:
        worker.join()
def run_SARTools():
    """Run the SARTools R scripts through ``Rscript``.

    Both scripts are looked up in the current working directory and run
    one after another; their exit statuses are not inspected.

    HARDCODED:
        R templates:
            - deseq: template_script_DESeq2.r
            - edger: template_script_edgeR.r
    """
    for command in ('Rscript template_script_DESeq2.r',
                    'Rscript template_script_edgeR.r'):
        system(command)
def run_pre_ko_remap():
    """Prepare args for :func:`run_ko_remap` or :func:`run_new_ko_remap`.

    Returns:
        A 5-tuple, in this order:

        path_names: ``{KEGG_Pathway_id:Name}`` dict
        kopath_keys: ``{KO identifier:set[KEGG_Pathway_ids]}`` dict
        kopath_values: ``{KEGG_Pathway_id:set[KO identifiers]}`` dict
        edger_files: list of edgeR outputs paths
        deseq_files: list of DESeq outputs paths

    HARDCODED:
        Paths to files from SARTools (relative to cwd):
            - edger: `'edger/tables/[pn]*.txt'`
            - deseq: `'deseq/tables/[pn]*.txt'`

    GLOBALS:
        - path to KO database: PATH_KO_DB
    """
    cursor = connect_db(PATH_KO_DB)
    path_names = get_pathways(cursor)
    both_mappings = get_kopathways(cursor)
    kopath_keys = both_mappings[0]
    kopath_values = both_mappings[1]
    edger_files = glob('edger/tables/[pn]*.txt')
    deseq_files = glob('deseq/tables/[pn]*.txt')
    return (path_names, kopath_keys, kopath_values, edger_files, deseq_files)
def run_ko_remap(deseq_files, edger_files, kopath_values, path_names):
    """Run :func:`out_content` for the DESeq2 and then the edgeR tables.

    The DESeq2 files are written under the method name 'DESeq2', the
    edgeR files under 'edgeR'.

    Args:
        deseq_files: list of DESeq outputs paths
        edger_files: list of edgeR outputs paths
        kopath_values: ``{KEGG_Pathway_id: set[KO identifiers]}`` dict
        path_names: ``{KEGG_Pathway_id: Name}`` dict
    """
    jobs = ((deseq_files, 'DESeq2'), (edger_files, 'edgeR'))
    for files, method in jobs:
        out_content(files, kopath_values, path_names, method)
def run_new_ko_remap(deseq_files, edger_files, kopath_values, all_conds, ref_cond):
    """Collect fold changes per KO and write them out per pathway.

    Chains :func:`get_ko_fc`, :func:`low_change`, :func:`mapper` and
    :func:`mapper_write` for the DESeq2 and the edgeR tables. Only files
    whose path contains `ref_cond` are read.

    Args:
        deseq_files: list of DESeq outputs paths
        edger_files: list of edgeR outputs paths
        kopath_values: ``{KEGG_Pathway_id:set[KO identifiers]}`` dict
        all_conds: list of conditions (list of strings)
        ref_cond: Reference condition (group) - string

    Returns:
        ko_dict_deseq: ``{KO_id:{cond1:value1, cond2:value2...}...}`` dict
        ko_dict_edger: ``{KO_id:{cond1:value1, cond2:value2...}...}`` dict

    HARDCODED:
        Output directories paths:
            - deseq: 'new_ko_remap/deseq/'
            - edger: 'new_ko_remap/edger/'
    """
    ko_dict_deseq = {}
    for table in (path for path in deseq_files if ref_cond in path):
        ko_dict_deseq = get_ko_fc(ko_dict_deseq, ref_cond, table, True)
    ko_dict_edger = {}
    for table in (path for path in edger_files if ref_cond in path):
        ko_dict_edger = get_ko_fc(ko_dict_edger, ref_cond, table)

    ko_dict_deseq = low_change(ko_dict_deseq, all_conds)
    ko_dict_edger = low_change(ko_dict_edger, all_conds)

    mapper_deseq = mapper(ko_dict_deseq, kopath_values)
    mapper_edger = mapper(ko_dict_edger, kopath_values)

    for command in ('mkdir new_ko_remap',
                    'mkdir new_ko_remap/deseq new_ko_remap/edger'):
        system(command)

    mapper_write(mapper_deseq, all_conds, 'new_ko_remap/deseq/')
    mapper_write(mapper_edger, all_conds, 'new_ko_remap/edger/')
    return (ko_dict_deseq, ko_dict_edger)
def run_ko_csv(ko_dict_deseq, ko_dict_edger, all_conds, kopath_keys, path_names, ref_cond):
    """Write CSV files with pathways and fold changes for both ko_dicts.

    For every KO one line is written with its identifier, its name as
    returned by :func:`get_kegg_name` (a network lookup), the ids and
    names of its pathways ('NA' twice when the KO is not in
    `kopath_keys`) and one fold change per condition.

    Args:
        ko_dict_deseq: ``{KO_id:{cond1:value1, cond2:value2...}...}`` dict
        ko_dict_edger: ``{KO_id:{cond1:value1, cond2:value2...}...}`` dict
        all_conds: list of conditions (list of strings)
        kopath_keys: ``{KO identifier:set[KEGG_Pathway_ids]}`` dict
        path_names: ``{KEGG_Pathway_id:Name}`` dict
        ref_cond: reference condition, used in the column headers

    Output files have following format (and header)::

        KO_id;Gene_name;paths ids;paths names;FC ref vs cond1;...;

    The header line ends with ';' while the data lines do not.

    HARDCODED:
        Output files paths:
            - deseq: 'deseq.csv'
            - edger: 'edger.csv'
    """
    l_header = ['KO_id', 'Gene_name', 'paths ids', 'paths names']
    l_header.extend('FC ' + ref_cond + ' vs ' + cond for cond in all_conds)
    header = ';'.join(l_header + ['\n'])
    # A KO usually occurs in both dictionaries; remember its name so the
    # remote lookup is done only once per KO.
    kegg_names = {}
    for ko_dict, filepath in ((ko_dict_deseq, 'deseq.csv'),
                              (ko_dict_edger, 'edger.csv')):
        with open(filepath, 'wb') as _file:
            _file.write(header)
            for Kid, fold_changes in ko_dict.items():
                if Kid not in kegg_names:
                    kegg_names[Kid] = get_kegg_name(Kid)
                to_write = [Kid, kegg_names[Kid]]
                # Direct dict membership test; no list of keys is built.
                if Kid in kopath_keys:
                    koids = list(kopath_keys[Kid])
                    to_write.append(','.join(koids))
                    to_write.append(
                        ','.join(path_names[path] for path in koids)
                    )
                else:
                    to_write.extend(['NA', 'NA'])
                to_write.extend(str(fold_changes[cond]) for cond in all_conds)
                _file.write(';'.join(to_write) + '\n')
def progress(what, estimated_percentage=None, done=True):
    """Print specially formatted information about progress.

    A line ``<what>: DONE`` or ``<what>: FAILED`` is always printed; a
    second line ``progress=<estimated_percentage>`` follows only when a
    percentage is given.

    Args:
        what:
            a string with name of operation which was just performed,
            and should be reported to standard output as done or failed,
        estimated_percentage: (int)
            Percent should be calculated as part of whole execution;
            first and last 5 percent should be reserved for programs
            which run 'metatranscriptomics', for pre- and postprocessing,
        done:
            informs whether the operation from 'what' argument failed or
            was successfully done.
    """
    if done:
        state = 'DONE'
    else:
        state = 'FAILED'
    print('{0}: {1}'.format(what, state))
    if estimated_percentage is None:
        return
    print('progress={0}'.format(estimated_percentage))
def metatranscriptomics(opts):
    """Run the whole metatranscriptomic workflow, stage by stage.

    All intermediate files are produced inside
    ``<opts.out_dir>/.meta_tmp_results``, which becomes the working
    directory for the duration of the call; the final CSV files and
    SARTools reports are copied to ``opts.out_dir`` at the end and the
    previous working directory is restored.

    Each stage is run when fewer of its output files than expected are
    found in the working directory, or when ``opts.e`` is falsy.

    Args:
        opts: object with the attributes read below: ``out_dir``, ``e``,
            ``metatr_config``, ``threads`` and ``metatr_output_type``
            (compared against 'new' and 'old').

    See Also:
        For more information please refer to:
            - :func:`run_fastq_to_fasta`
            - :func:`run_rapsearch`
            - :func:`run_ko_map`
            - :func:`run_SARTools`
            - :func:`run_pre_ko_remap`
            - :func:`run_ko_remap`
            - :func:`run_new_ko_remap`
            - :func:`run_ko_csv`
    """
    # NOTE(review): the nesting below was reconstructed from a copy of this
    # file that had lost its indentation -- confirm against the upstream
    # source, especially the placement of the progress() calls.
    assert opts.out_dir != 'in_situ'
    before_cwd = getcwd()
    tmp_dir = pjoin(opts.out_dir, '.meta_tmp_results')
    if not pexists(tmp_dir) or not opts.e:
        system('mkdir ' + tmp_dir)
    # NOTE(review): after this chdir, relative values of opts.out_dir and
    # opts.metatr_config resolve against tmp_dir -- presumably callers pass
    # absolute paths; verify.
    chdir(tmp_dir)
    # Copy the R templates that live next to this module into the cwd.
    for i in ('template_script_DESeq2.r', 'template_script_edgeR.r'):
        system('cp ' + pjoin(dirname(realpath(__file__)), i) + ' .')
    ref_cond, all_conds, fastqs = config_from_file(opts.metatr_config)
    progress('configuration file reading', 15)
    if (len(fastqs) > len(glob('*_R[12]_*.fasta'))) or not opts.e:
        run_fastq_to_fasta(fastqs)
    progress('fastq_to_fasta', 25)
    # fastqs holds two entries (R1 and R2) per sample, hence the division.
    if (len(fastqs)/2 > len(glob('*.tmp.fasta'))) or not opts.e:
        run_cat_pairing()
    progress('cat', 35)
    if (len(fastqs)/2 > len(glob('*.txt.m8'))) or not opts.e:
        run_rapsearch(opts.threads)
    progress('rapsearch', 45)
    if (len(fastqs)/2 > len(glob('*.count'))) or not opts.e:
        run_ko_map()
    progress('KO mapping', 55)
    if ('edger' not in glob('*')) or not opts.e:
        system('mkdir deseq edger')
        run_SARTools()
    progress('SARTools', 65)
    path_names, kopath_keys, kopath_values, edger_files, deseq_files = run_pre_ko_remap()
    # Output written by run_ko_remap ('*.path_counts.csv' files); skipped
    # only when the output type is 'new'.
    if opts.metatr_output_type != 'new':
        if (
            len(all_conds) * (len(all_conds) - 1) > len(glob('*.path_counts.csv')) or
            not opts.e
        ):
            run_ko_remap(deseq_files, edger_files, kopath_values, path_names)
    # Output written by run_new_ko_remap and run_ko_csv; skipped only when
    # the output type is 'old'.
    if opts.metatr_output_type != 'old':
        if ('edger.csv' not in glob('*')) or not opts.e:
            ko_dict_deseq, ko_dict_edger = run_new_ko_remap(
                deseq_files, edger_files, kopath_values, all_conds, ref_cond)
            progress('pathway mapping', 75)
            # Kept inside this branch: the ko_dict_* names only exist when
            # run_new_ko_remap was called above.
            run_ko_csv(ko_dict_deseq, ko_dict_edger, all_conds, kopath_keys,
                       path_names, ref_cond)
            progress('generating summative CSV', 85)
    # Copy the results out of the temporary directory.
    if opts.metatr_output_type != 'new':
        old_path = opts.out_dir + '/old'
        system('mkdir ' + old_path)
        system('cp *path_counts.csv ' + old_path)
    if opts.metatr_output_type != 'old':
        system('cp deseq.csv edger.csv ' + opts.out_dir)
    system('cp edger/_report.html ' + opts.out_dir + '/edger_report.html')
    system('cp deseq/_report.html ' + opts.out_dir + '/deseq_report.html')
    chdir(before_cwd)
    progress('METATRANSCRIPTOMIC WORKFLOW', 95)