#! /usr/bin/env python2
# -*- coding: utf-8 -*-
#
# This file is part of the Bacterial and Archaeal Genome Analyser
# Copyright (C) 2015 David Williams
# david.williams.at.liv.d-dub.org.uk
# License GPLv3+: GNU GPL version 3 or later
# This is free software: you are free to change and redistribute it
# There is NO WARRANTY, to the extent permitted by law
#
# Work on this software was started at The University of Liverpool, UK
# with funding from The Wellcome Trust (093306/Z/10) awarded to:
# Dr Steve Paterson (The University of Liverpool, UK)
# Dr Craig Winstanley (The University of Liverpool, UK)
# Dr Michael A Brockhurst (The University of York, UK)
#
'''
PrepareReads module from the Bacterial and Archaeal Genome Analyser (BAGA).
This module contains functions to subsample large read sets, and wrappers
around tools that remove adaptor sequences and trim low-quality positions.
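
A typical workflow (a sketch; `collected` stands for a baga.CollectData.Reads
object with a "read_files" attribute):

    reads = Reads(reads = collected)
    reads.subsample(genome_size = 6601757)
    reads.cutAdaptors()
    reads.trim()
    reads.saveLocal('mysample')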
'''
# stdlib
from time import sleep as _sleep
from baga import _subprocess
from baga import _os
from baga import _multiprocessing
from baga import _cPickle
from baga import _gzip
from baga import _time
from baga import _re
from cStringIO import StringIO as _StringIO
from random import sample as _sample
# external Python modules
from Bio import SeqIO as _SeqIO
# package functions
from baga import decide_max_processes as _decide_max_processes
from baga import get_exe_path as _get_exe_path
from baga import report_time as _report_time
def main():
pass
def cpu_count():
try:
return _multiprocessing.cpu_count()
except (ImportError, NotImplementedError):
pass
# Linux
try:
res = open('/proc/cpuinfo').read().count('processor\t:')
if res > 0:
return res
except IOError:
pass
# Windows
try:
res = int(_os.environ['NUMBER_OF_PROCESSORS'])
if res > 0:
return res
except (KeyError, ValueError):
        pass
    # all detection methods failed: raise rather than silently return None
    raise NotImplementedError('cannot determine the number of CPUs on this system')
def insert_suffix(path, suffix, known_suffixes = ['.fastq.gz','.fq.gz','.fastq','.fq']):
    '''Insert a suffix into a read file name, just before its fastq extension'''
    this_suffix = False
    for known_suffix in known_suffixes:
        thismatch = _re.findall('('+known_suffix+')$', path)
        if thismatch:
            this_suffix = thismatch[0]
            break
    assert this_suffix, 'Could not find any known suffixes in {} . . . please report this as a bug.'.format(path)
    processed_path = _re.sub(this_suffix+'$', suffix+this_suffix, path)
    return(processed_path)
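
# For example (illustrative):
#     insert_suffix('reads_1.fastq.gz', '_subsmp')
#     # ==> 'reads_1_subsmp.fastq.gz'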
class Reads:
'''
    Prepare reads for alignment to a genome sequence by removing adaptor
    sequences and trimming by position-specific quality scores. Align reads
    to a reference genome sequence.
'''
def __init__(self, reads = False, path_to_baga = False):
'''
Initialise with a baga.CollectData.Reads object
or
        path to a previously saved baga object of this class
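
        Example (a sketch; `collected` stands for a baga.CollectData.Reads object):
            reads = Reads(reads = collected)
        or, to reload a previously saved object:
            reads = Reads(path_to_baga = 'baga.PrepareReads.Reads-mysample.baga')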
'''
assert bool(reads) ^ bool(path_to_baga), 'Instantiate with baga.CollectData.Reads or '\
'the path to a previously saved baga.PreparedReads.Reads object' # xor
if reads:
try:
self.read_files = reads.read_files
except AttributeError:
print('baga.PrepareReads.Reads needs a baga.CollectData.Reads object '\
'with a "read_files" attribute. This can be obtained with the '\
'"getFromENA()" or "getFromPath()" methods.')
else:
try:
loaded_baga = _cPickle.load(_gzip.open(path_to_baga,'rb'))
for attribute_name in dir(loaded_baga):
if attribute_name[0] != '_':
setattr(self, attribute_name, getattr(loaded_baga, attribute_name))
except IOError:
                print('Could not access {}'.format(path_to_baga))
def saveLocal(self, name):
'''
Save processed read info to a local compressed pickle file.
'name' can exclude extension: .baga will be added
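        e.g., saveLocal('mysample') writes baga.PrepareReads.Reads-mysample.baga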
'''
fileout = 'baga.PrepareReads.Reads-%s.baga' % name
print('Saving to %s' % fileout)
_cPickle.dump(self, _gzip.open(fileout, 'wb'))
def subsample(self, genome_size = 6601757,
read_cov_depth = 80,
pc_loss = 0.2,
force = False,
cov_closeness = 5):
'''
        Given the size in base pairs of a genome sequence, downsample fastq files to
        a desired average read coverage depth predicted after read alignment. Read
        length is taken from the first read of each file. By default, 20% of reads
        (pc_loss = 0.2) are assumed to be lost at downstream quality control stages
        (e.g. quality score based trimming), and this loss is factored into the
        coverage depth estimate. cov_closeness (default 5) prevents subsampling when
        the estimated coverage is within that many fold of the requested depth,
        avoiding time consuming subsampling that would make little difference.
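
        For example, with 150 bp paired reads, 2,000,000 read pairs, a 6,601,757 bp
        genome and 20% loss, the estimated depth is 150 * 2 * 2000000 * 0.8 / 6601757
        ~= 72.7x; with the default 80x target, no reads would be discarded.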
'''
subsampled_read_files = {}
start_time = _time.time()
for cnum,(pairname,files) in enumerate(self.read_files.items()):
processed_path_1 = insert_suffix(files[1], '_subsmp')
processed_path_2 = insert_suffix(files[2], '_subsmp')
if not all([_os.path.exists(processed_path_1),
_os.path.exists(processed_path_2)]) \
or force:
if files[1][-2:] == 'gz':
fh1 = _gzip.open(files[1])
else:
fh1 = open(files[1])
aread = _SeqIO.parse(fh1, 'fastq').next()
read_len = len(aread.seq)
print('Counting reads in %s' % files[1])
fh1.seek(0)
lines = 0
                # report progress every half million reads
                # (fastq records are four lines each, so every 2,000,000 lines)
                interval = 2000000
nextreport = interval
for line in fh1:
lines += 1
if lines == nextreport:
print('{:,} reads'.format(lines/4))
nextreport += interval
totalreads = lines / 4.0
print('Found %s reads' % totalreads)
full_depth_coverage = read_len * 2 * totalreads * (1 - pc_loss) / genome_size
print('These paired read files would provide approximately {:.1f}x coverage depth'.format(full_depth_coverage))
numreads2keep = int( round(genome_size * read_cov_depth / (read_len * 2) / (1 - pc_loss), 0) )
if numreads2keep >= totalreads:
print('This pair of read files is estimated to provide only {:.1f}x coverage, but {}x requested.'.format(full_depth_coverage, read_cov_depth))
print('No sampling performed. Original files will be used')
# pass original files over with subsampled
subsampled_read_files[pairname] = {}
subsampled_read_files[pairname][1] = files[1]
subsampled_read_files[pairname][2] = files[2]
fh1.close()
if len(self.read_files) > 1:
# report durations, time left etc
_report_time(start_time, cnum, len(self.read_files))
continue
elif full_depth_coverage < read_cov_depth + cov_closeness:
print('This pair of read files is estimated to provide {:.1f}x coverage which is within {}x of {}x requested.'.format(full_depth_coverage, cov_closeness, read_cov_depth))
print('No sampling performed. Original files will be used')
# pass original files over with subsampled
subsampled_read_files[pairname] = {}
subsampled_read_files[pairname][1] = files[1]
subsampled_read_files[pairname][2] = files[2]
fh1.close()
if len(self.read_files) > 1:
# report durations, time left etc
_report_time(start_time, cnum, len(self.read_files))
continue
else:
print('For approximately {}x read coverage, will retain {} of {} {}bp read pairs'.format(
read_cov_depth, numreads2keep, totalreads, read_len))
fh1.seek(0)
if files[2][-2:] == 'gz':
fh2 = _gzip.open(files[2])
else:
fh2 = open(files[2])
fout1 = _gzip.open(processed_path_1, 'wb')
fout2 = _gzip.open(processed_path_2, 'wb')
batch_size = 200000
keep_per_pop = int(numreads2keep / float(totalreads) * batch_size) + 1
nextwrite = batch_size
written = 0
n1 = 0
n2 = 0
these_lines1 = []
these_lines2 = []
reportfreq = 10
thisreport = 0
print('Subsampling . . .')
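                    # Batch subsampling strategy: buffer batch_size reads (4
                    # fastq lines each) from file 1, randomly choose
                    # keep_per_pop of them to write out, then advance through
                    # file 2 in step and write the mates at the same indices,
                    # preserving read pairing.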
for line in fh1:
these_lines1 += [line]
if len(these_lines1) % 4 == 0:
n1 += 1
if n1 == nextwrite:
keep_indices = sorted(_sample(xrange(batch_size), keep_per_pop))
keep_these = []
for i in keep_indices:
i1 = i * 4
i2 = i * 4 + 4
keep_these += these_lines1[i1:i2]
# try parsing a read for QC
assert _SeqIO.read(_StringIO(''.join(keep_these[:4])), 'fastq')
fout1.write(''.join(keep_these))
these_lines1 = []
written += keep_per_pop
thisreport += 1
if thisreport == reportfreq or written == keep_per_pop:
# report first time and at intevals
print('Written {:,} reads ({:.1%}) to {}'.format(written,
written/float(numreads2keep),
processed_path_1))
for line2 in fh2:
these_lines2 += [line2]
if len(these_lines2) % 4 == 0:
n2 += 1
if n2 == nextwrite:
keep_these = []
for i in keep_indices:
i1 = i * 4
i2 = i * 4 + 4
keep_these += these_lines2[i1:i2]
assert _SeqIO.read(_StringIO(''.join(keep_these[:4])), 'fastq')
fout2.write(''.join(keep_these))
these_lines2 = []
if thisreport == reportfreq or written == keep_per_pop:
thisreport = 0
print('Written {:,} reads ({:.1%}) to {}'.format(written,
written/float(numreads2keep),
processed_path_2))
nextwrite += batch_size
break
                    # write reads remaining in the buffer: n1 reads were seen in
                    # total, of which (nextwrite - batch_size) went out in full
                    # batches
                    remainder = n1 - (nextwrite - batch_size)
                    keep_in_remainder = min(remainder,
                            int(keep_per_pop * (remainder / float(batch_size))) + 1)
                    keep_indices = sorted(_sample(xrange(remainder), keep_in_remainder))
                    keep_these = []
                    for i in keep_indices:
                        i1 = i * 4
                        i2 = i * 4 + 4
                        keep_these += these_lines1[i1:i2]
                    if keep_these:
                        # try parsing a read for QC
                        assert _SeqIO.read(_StringIO(''.join(keep_these[:4])), 'fastq')
                        fout1.write(''.join(keep_these))
                    written += keep_in_remainder
print('Written {:,} reads ({:.1%}) to {}'.format(written,
written/float(numreads2keep),
processed_path_1))
# get remainder
for line2 in fh2:
these_lines2 += [line2]
# write remainder
keep_these = []
for i in keep_indices:
i1 = i * 4
i2 = i * 4 + 4
keep_these += these_lines2[i1:i2]
                    if keep_these:
                        # keep_these is empty when the total read count is an
                        # exact multiple of batch_size (nothing left to write)
                        assert _SeqIO.read(_StringIO(''.join(keep_these[:4])), 'fastq')
                        fout2.write(''.join(keep_these))
print('Written {:,} reads ({:.1%}) to {}'.format(written,
written/float(numreads2keep),
processed_path_2))
# not sure if this is quicker/slower (more calls to .join())
# this_read = []
# for line in fh1:
# this_read += [line]
# if len(this_read) == 4:
# these_reads1 += [''.join(this_read)]
# #these_reads1 += this_read
# this_read = []
# n1 += 1
# if n1 == nextwrite:
# keep_indices = sorted(_sample(xrange(batch_size), keep_per_pop))
# # try parsing a read for QC
# assert _SeqIO.read(_StringIO(these_reads1[0]), 'fastq')
# fout1.write(''.join([these_reads1[i] for i in keep_indices]))
# these_reads1 = []
# written += keep_per_pop
# print('Written {:,} reads ({:.2%}) to {}'.format(written,
# written/float(numreads2keep),
# processed_path_1))
# for line2 in fh2:
# this_read += [line2]
# if len(this_read) == 4:
# these_reads2 += [''.join(this_read)]
# this_read = []
# n2 += 1
# if n2 == nextwrite:
# assert _SeqIO.read(_StringIO(these_reads2[0]), 'fastq')
# fout2.write(''.join([these_reads2[i] for i in keep_indices]))
# these_reads2 = []
# print('Written {:,} reads ({:.2%}) to {}'.format(written,
# written/float(numreads2keep),
# processed_path_2))
# nextwrite += batch_size
# break
fout1.close()
fout2.close()
fh1.close()
fh2.close()
else:
print('Found:')
print(processed_path_1)
print(processed_path_2)
print('use "force = True" to overwrite')
if len(self.read_files) > 1:
# report durations, time left etc
_report_time(start_time, cnum, len(self.read_files))
subsampled_read_files[pairname] = {}
subsampled_read_files[pairname][1] = processed_path_1
subsampled_read_files[pairname][2] = processed_path_2
        # retain the original full-sized files (this subsampling step is
        # optional) before replacing read_files with the subsampled versions
        self.fullsized_read_files = dict(self.read_files)
self.read_files = subsampled_read_files
def cutAdaptors(self, path_to_exe = False, force = False, max_cpus = -1):
if not path_to_exe:
path_to_exe = _get_exe_path('cutadapt')
adaptorcut_read_files = {}
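        # Illumina adaptor sequences to remove; the AGATCGGAAGAGC prefix shared
        # by several of these is the common Illumina adaptor start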
adaptor_seqs = [
'AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC',
'AGATCGGAAGAGCACACGTCT',
'AGATCGGAAGAGC',
'GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG',
'ACACTCTTTCCCTACACGACGCTCTTCCGATCT',
]
cmds = []
processed_paths_to_do = []
for cnum,(pairname,files) in enumerate(self.read_files.items()):
processed_path_1 = insert_suffix(files[1], '_adpt')
processed_path_2 = insert_suffix(files[2], '_adpt')
# print(files[1], processed_path_1)
# print(files[2], processed_path_2)
            # single end version, kept for reference but unused (the paired end
            # command below is the one issued):
            # cmd = [path_to_exe] + \
            #     [a for b in [('-a', a) for a in adaptor_seqs] for a in b] + \
            #     ['-o', processed_path_1, files[1]]
            # paired end: adaptors are passed via -a (read 1) and -A (read 2);
            # the nested comprehensions flatten [('-a', seq), ...] tuples into
            # a flat argument list
            cmd = [path_to_exe] + \
                [a for b in [('-a', a) for a in adaptor_seqs] for a in b] + \
                [a for b in [('-A', a) for a in adaptor_seqs] for a in b] + \
                ['-o', processed_path_1, '-p', processed_path_2] + \
                [files[1], files[2]]
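            # the resulting call has the form:
            #   cutadapt -a AD1 -a AD2 ... -A AD1 -A AD2 ... \
            #       -o out_1.fastq.gz -p out_2.fastq.gz in_1.fastq.gz in_2.fastq.gz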
if not all([_os.path.exists(processed_path_1),
_os.path.exists(processed_path_2)]) \
or force:
# collect expected outputs
processed_paths_to_do += [(processed_path_1,processed_path_2)]
# collect all the commands to be issued
cmds += [(pairname,cmd)]
else:
print('Found:')
print(processed_path_1)
print(processed_path_2)
print('use "force = True" to overwrite')
adaptorcut_read_files[pairname] = {}
adaptorcut_read_files[pairname][1] = processed_path_1
adaptorcut_read_files[pairname][2] = processed_path_2
if len(cmds):
max_processes = _decide_max_processes(max_cpus)
processes = {}
            # _os.wait() below blocks until any child process exits, which is
            # what throttles concurrency to max_processes
for pairname,cmd in cmds:
print('Called: "%s"' % ' '.join(cmd))
# process is key, open file being piped to is value
# baga CollectReads currently includes path in pairname
this_stdout_file = open(pairname+'_cutadapt.log',"w")
thisprocess = _subprocess.Popen(cmd, shell=False, stdout = this_stdout_file)
processes[thisprocess] = this_stdout_file
if len(processes) >= max_processes:
_os.wait()
finished = dict([(p,f) for p,f in processes.items() if p.poll() is not None])
# close files for finished processes
for process,stdout_file in finished.items():
stdout_file.close()
# update active processes
del processes[process]
# Check if all the child processes were closed
for p in processes:
if p.poll() is None:
p.wait()
fails = []
for (pairname,cmd),(processed_path_1,processed_path_2) in zip(cmds,processed_paths_to_do):
if _os.path.exists(processed_path_1) and _os.path.exists(processed_path_2):
print('Found:')
print(processed_path_1)
print(processed_path_2)
adaptorcut_read_files[pairname] = {}
adaptorcut_read_files[pairname][1] = processed_path_1
adaptorcut_read_files[pairname][2] = processed_path_2
else:
print('Processing of the following pair seems to have failed')
print(processed_path_1)
print(processed_path_2)
fails += [(processed_path_1,processed_path_2)]
        assert len(fails) == 0, 'There was a problem finding all of the output from cutadapt. Try repeating this or an earlier step with the --force option to overwrite previous, possibly incomplete, files'
self.adaptorcut_read_files = adaptorcut_read_files
def trim(self, path_to_exe = False,
force = False,
max_cpus = -1):
if not path_to_exe:
exe_sickle = _get_exe_path('sickle')
else:
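            # note: joined with the path separator, so path_to_exe is expected
            # here as a list of path components rather than a string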
exe_sickle = _os.path.sep.join(path_to_exe)
        e1 = ('Could not find "adaptorcut_read_files" attribute. Before '
                'quality score trimming, reads must be cleaned of library '
                'preparation sequences. Please run the cutAdaptors() method '
                'on this Reads instance.')
        assert hasattr(self, 'adaptorcut_read_files'), e1
        e2 = 'Could not find %s. Either run cutAdaptors() again or ensure the file exists'
for pairname, files in self.adaptorcut_read_files.items():
            assert _os.path.exists(files[1]), e2 % files[1]
            assert _os.path.exists(files[2]), e2 % files[2]
trimmed_read_files = {}
print(sorted(self.adaptorcut_read_files))
cmds = []
processed_paths_to_do = []
for pairname,files in self.adaptorcut_read_files.items():
processed_path_1 = insert_suffix(files[1], '_qual')
processed_path_2 = insert_suffix(files[2], '_qual')
processed_path_s = insert_suffix(files[2], '_singletons_qual')
# Illumina quality using CASAVA >= 1.8 is Sanger encoded
QSscore_scale = 'sanger'
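            # sickle pe options: -f/-r input pair, -t quality encoding, -o/-p
            # trimmed output pair, -s singletons whose mate was discarded,
            # -q minimum quality, -l minimum length after trimming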
cmd = [exe_sickle, 'pe',
'-f', files[1] ,'-r', files[2],
'-t', QSscore_scale,
'-o', processed_path_1,
'-p', processed_path_2,
'-s', processed_path_s,
# quality 25, length 50 (of 150)
'-q','25','-l','50']
if not all([_os.path.exists(processed_path_1),
_os.path.exists(processed_path_2),
_os.path.exists(processed_path_s)]) \
or force:
# collect expected outputs
processed_paths_to_do += [(processed_path_1,processed_path_2,processed_path_s)]
# collect all the commands to be issued
cmds += [(pairname,cmd)]
else:
print('Found:')
print(processed_path_1)
print(processed_path_2)
print(processed_path_s)
print('use "force = True" to overwrite')
trimmed_read_files[pairname] = {}
trimmed_read_files[pairname][1] = processed_path_1
trimmed_read_files[pairname][2] = processed_path_2
if len(cmds):
max_processes = _decide_max_processes(max_cpus)
processes = {}
            # _os.wait() below blocks until any child process exits, which is
            # what throttles concurrency to max_processes
for pairname,cmd in cmds:
print('Called: "%s"' % ' '.join(cmd))
# process is key, open file being piped to is value
# baga CollectReads currently includes path in pairname
this_stdout_file = open(pairname+'_sickle.log',"w")
thisprocess = _subprocess.Popen(cmd, shell = False, stdout = this_stdout_file)
processes[thisprocess] = this_stdout_file
if len(processes) >= max_processes:
_os.wait()
finished = dict([(p,f) for p,f in processes.items() if p.poll() is not None])
# close files for finished processes
for process,stdout_file in finished.items():
stdout_file.close()
# update active processes
del processes[process]
# Check if all the child processes were closed
for p in processes:
if p.poll() is None:
p.wait()
fails = []
for (pairname,cmd),(processed_path_1,processed_path_2,processed_path_s) in zip(cmds,processed_paths_to_do):
if _os.path.exists(processed_path_1) and _os.path.exists(processed_path_2):
print('Found:')
print(processed_path_1)
print(processed_path_2)
trimmed_read_files[pairname] = {}
trimmed_read_files[pairname][1] = processed_path_1
trimmed_read_files[pairname][2] = processed_path_2
else:
print('Processing of the following pair seems to have failed')
print(processed_path_1)
print(processed_path_2)
fails += [(processed_path_1,processed_path_2)]
        assert len(fails) == 0, 'There was a problem finding all of the output from sickle. Try repeating this or an earlier step with the --force option to overwrite previous, possibly incomplete, files'
self.trimmed_read_files = trimmed_read_files
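

# The throttled-subprocess pattern used in cutAdaptors() and trim() above is
# duplicated between the two methods. A minimal standalone sketch of the same
# idea follows (illustrative only, not wired into the class; the names
# _run_throttled and cmds_with_logs are invented here):
def _run_throttled(cmds_with_logs, max_processes):
    '''Run (cmd, log_path) pairs with at most max_processes running at once'''
    processes = {}
    for cmd, log_path in cmds_with_logs:
        log_file = open(log_path, 'w')
        p = _subprocess.Popen(cmd, shell = False, stdout = log_file)
        processes[p] = log_file
        if len(processes) >= max_processes:
            # block until any child exits, then reap all finished children
            _os.wait()
            for proc in list(processes):
                if proc.poll() is not None:
                    processes.pop(proc).close()
    # wait for and clean up any stragglers
    for proc, log_file in processes.items():
        if proc.poll() is None:
            proc.wait()
        log_file.close()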
if __name__ == '__main__':
main()