# forked from daveuu/baga -- AlignReads.py (437 lines / 17.6 KB)
#! /usr/bin/env python2
# -*- coding: utf-8 -*-
#
# This file is part of the Bacterial and Archaeal Genome Analyser
# Copyright (C) 2015-16 David Williams
# david.williams.at.liv.d-dub.org.uk
# License GPLv3+: GNU GPL version 3 or later
# This is free software: you are free to change and redistribute it
# There is NO WARRANTY, to the extent permitted by law
#
# Work on this software was started at The University of Liverpool, UK
# with funding from The Wellcome Trust (093306/Z/10) awarded to:
# Dr Steve Paterson (The University of Liverpool, UK)
# Dr Craig Winstanley (The University of Liverpool, UK)
# Dr Michael A Brockhurst (The University of York, UK)
#
'''
AlignReads module from the Bacterial and Archaeal Genome Analyzer (BAGA).
This module contains wrappers around tools to align reads to a reference
genome, including realignment around possible insertions and deletions.
'''
# stdlib
from time import sleep as _sleep
from baga import _subprocess
from baga import _os
from baga import _multiprocessing
from baga import _cPickle
from baga import _gzip
from baga import _StringIO
from baga import _tarfile
from baga import _array
from baga import _json
# external Python modules
from Bio.Seq import Seq as _Seq
from Bio.SeqRecord import SeqRecord as _SeqRecord
from Bio import SeqIO as _SeqIO
from random import sample as _sample
# import pysam
# from Bio.Seq import Seq
# from Bio.SeqRecord import SeqRecord
# package functions
from baga import decide_max_processes as _decide_max_processes
from baga import get_exe_path as _get_exe_path
from baga import get_jar_path as _get_jar_path
def main():
    '''Command-line entry point; intentionally a no-op in this module.'''
    return
class SAMs:
    '''
    A collection of short read datasets that have been aligned to the same genome
    sequence in Sequence Alignment/Map (SAM) format for variant calling using GATK.

    Typical pipeline order (each method stores its output paths on self for the
    next step): align() -> toBAMs() -> removeDuplicates() -> sortIndexBAMs()
    -> IndelRealignGATK().
    '''
    def __init__(self, reads = False, genome = False, baga = False):
        '''
        Initialise with:
        a baga.PrepareReads.Reads object and,
        a baga.CollectData.Genome object.
        OR
        a path to baga.AlignReads.SAMs (like this one) object that
        was previously saved.

        Raises NameError if neither (reads and genome) nor baga is supplied,
        or if both are supplied.
        '''
        if (reads and genome) and not baga:
            # Fresh instantiation: prefer quality-trimmed reads, fall back to
            # adaptor-cut reads, then to raw read files, warning at each step.
            try:
                self.read_files = reads.trimmed_read_files
            except AttributeError:
                text = 'WARNING: baga was not used to quality-score trim these reads. Read trimming is recommended for most types of analysis. This can be achieved with the "trim()" method of the Reads class in the PrepareReads module.'
                print(text)
                try:
                    self.read_files = reads.adaptorcut_read_files
                except AttributeError:
                    text = 'WARNING: baga was not used to remove library preparation adaptor sequences from these reads. Adaptor removal is highly recommended so hopefully you already removed adaptor sequences! This can be achieved with the "cutAdaptors()" method of the Reads class in the PrepareReads module.'
                    self.read_files = reads.read_files
                    print(text)
                print('continuing with these reads . . .')
            # currently baga CollectData includes path to reads in pairname keys to read file pair values
            # check and remove here
            # NOTE(review): the dict is rebound/deleted while iterating; this is
            # safe under Python 2 where .items() returns a list snapshot, but
            # would raise under Python 3's dict views -- confirm if porting.
            for pairname, files in self.read_files.items():
                if _os.path.sep in pairname:
                    self.read_files[pairname.split(_os.path.sep)[-1]] = files
                    del self.read_files[pairname]
            self.genome_sequence = genome.sequence
            self.genome_id = genome.id
        elif baga and not (reads and genome):
            # for reloading a previous instantiation:
            # each tar member's name is the attribute name it restores.
            with _tarfile.open(baga, "r:gz") as tar:
                for member in tar:
                    contents = _StringIO(tar.extractfile(member).read())
                    try:
                        # either json serialised conventional objects
                        contents = _json.loads(contents.getvalue())
                    except ValueError:
                        #print('json failed: {}'.format(member.name))
                        # or longer python array.array objects stored as raw
                        # bytes ('c' typecode exists in Python 2 only)
                        contents = _array('c', contents.getvalue())
                    setattr(self, member.name, contents)
        else:
            raise NameError('instantiate baga.AlignReads.SAMs with either loaded baga.PrepareReads.Reads-*.baga and baga.CollectData.Genome-*.baga objects or previous saved alignments (baga.AlignReads.SAMs-*.baga)')
    def saveLocal(self, name):
        '''
        Save processed SAM file info to a local compressed pickle file.
        'name' can exclude extension: .baga will be added

        Each attribute of self becomes one member of a gzipped tar archive:
        array.array attributes are stored as raw bytes, everything else is
        JSON-serialised; non-JSON-serialisable attributes are silently omitted.
        '''
        fileout = 'baga.AlignReads.SAMs-{}.baga'.format(name)
        with _tarfile.open(fileout, "w:gz") as tar:
            print('Writing to {} . . . '.format(fileout))
            for att_name, att in self.__dict__.items():
                if isinstance(att, _array):
                    # store array contents as raw bytes (tostring() is the
                    # Python 2 spelling of tobytes())
                    io = _StringIO(att.tostring())
                    # measure the payload so TarInfo.size is correct
                    io.seek(0, _os.SEEK_END)
                    length = io.tell()
                    io.seek(0)
                    thisone = _tarfile.TarInfo(name = att_name)
                    thisone.size = length
                    tar.addfile(tarinfo = thisone, fileobj = io)
                else:
                    # try saving everything else here by jsoning
                    try:
                        io = _StringIO()
                        _json.dump(att, io)
                        io.seek(0, _os.SEEK_END)
                        length = io.tell()
                        io.seek(0)
                        thisone = _tarfile.TarInfo(name = att_name)
                        thisone.size = length
                        tar.addfile(tarinfo = thisone, fileobj = io)
                    except TypeError:
                        # ignore non-jsonable things like functions
                        # include unicodes, strings, lists etc etc
                        #print('omitting {}'.format(att_name))
                        pass
    def align(self, insert_size = False,
            path_to_exe = False,
            local_alns_path = ['alignments'],
            force = False,
            max_cpus = -1):
        '''
        Align each read pair to the genome with "bwa mem", writing one SAM per
        pairname under <local_alns_path>/<genome_id>/. Existing SAMs are kept
        unless force=True. Sets self.aligned_read_files (pairname -> SAM path).

        insert_size: passed to bwa mem -I if given, otherwise bwa estimates it.
        '''
        if not path_to_exe:
            path_to_exe = _get_exe_path('bwa')

        # write genome sequence to a fasta file
        try:
            _os.makedirs('genome_sequences')
        except OSError:
            # directory already exists
            pass

        genome_fna = 'genome_sequences/%s.fna' % self.genome_id
        # genome_sequence is an array; tostring() yields the raw sequence bytes
        _SeqIO.write(_SeqRecord(_Seq(self.genome_sequence.tostring()), id = self.genome_id),
                genome_fna,
                'fasta')

        # make folder for alignments (BAMs)
        local_alns_path = _os.path.sep.join(local_alns_path)
        if not _os.path.exists(local_alns_path):
            _os.makedirs(local_alns_path)
        # make a subdir for this genome
        local_alns_path_genome = _os.path.sep.join([
                local_alns_path,
                self.genome_id])
        if not _os.path.exists(local_alns_path_genome):
            _os.makedirs(local_alns_path_genome)

        max_processes = _decide_max_processes( max_cpus )

        e1 = 'Could not find "read_files" attribute. Before aligning to genome, reads must be quality score trimmed. Please run trim() method on this Reads instance.'
        assert hasattr(self, 'read_files'), e1
        e2 = 'Could not find %s. Either run trim() again or ensure file exists'
        # read pair files are keyed 1 and 2 (forward/reverse)
        for pairname, files in self.read_files.items():
            assert _os.path.exists(files[1]), e2 % files[1]
            assert _os.path.exists(files[2]), e2 % files[2]

        # always (re)index in case of upstream changes in data
        print('Writing BWA index files for %s' % genome_fna)
        _subprocess.call([path_to_exe, 'index', genome_fna])

        aligned_read_files = {}
        for pairname,files in self.read_files.items():
            # read-group header line required downstream by GATK
            RGinfo = r"@RG\tID:%s\tSM:%s\tPL:ILLUMINA" % (pairname,pairname)
            if insert_size:
                cmd = [path_to_exe, 'mem', '-t', str(max_processes), '-M', '-a', '-I', insert_size, '-R', RGinfo, genome_fna, files[1], files[2]]
            else:
                # BWA can estimate on-the-fly
                cmd = [path_to_exe, 'mem', '-t', str(max_processes), '-M', '-a', '-R', RGinfo, genome_fna, files[1], files[2]]
            out_sam = _os.path.sep.join([local_alns_path_genome, '%s__%s.sam' % (pairname, self.genome_id)])
            if not _os.path.exists(out_sam) or force:
                print('Called: "%s"' % ' '.join(cmd))
                # bwa writes SAM to stdout; redirect it to the output file
                with open(out_sam, "wb") as out:
                    _subprocess.call(cmd, stdout = out)
            else:
                print('Found:')
                print(out_sam)
                print('use "force = True" to overwrite')
                print(' '.join(cmd))
            aligned_read_files[pairname] = out_sam

        self.aligned_read_files = aligned_read_files
    def toBAMs(self, path_to_exe = False, force = False, max_cpus = -1):
        '''
        Convert each aligned SAM to a sorted, indexed BAM via samtools, running
        up to max_processes conversions concurrently. Sets self.paths_to_BAMs.

        NOTE(review): "samtools sort - <prefix>" is the legacy (pre-1.0)
        out-prefix syntax; modern samtools needs "-o <file>" -- confirm the
        bundled samtools version.
        '''
        if not path_to_exe:
            path_to_exe = _get_exe_path('samtools')

        processes = set()
        max_processes = _decide_max_processes( max_cpus )

        paths_to_BAMs = []
        for pairname,SAM in self.aligned_read_files.items():
            BAM_out = SAM[:-4] + '.bam'
            if not _os.path.exists(BAM_out) or force:
                # shell pipeline: view -> sort -> index
                cmd = '{0} view -buh {1} | {0} sort - {2}; {0} index {2}.bam'.format(path_to_exe, SAM, SAM[:-4])
                print('Called: %s' % cmd)
                processes.add( _subprocess.Popen(cmd, shell=True) )
                # throttle: once the pool is full, block until any child exits,
                # then reap every finished process from the set
                if len(processes) >= max_processes:
                    (pid, exit_status) = _os.wait()
                    processes.difference_update(
                        [p for p in processes if p.poll() is not None])
            else:
                print('Found:')
                print(BAM_out)
                print('use "force = True" to overwrite')
            paths_to_BAMs += [BAM_out]

        # Check if all the child processes were closed
        for p in processes:
            if p.poll() is None:
                p.wait()

        self.paths_to_BAMs = paths_to_BAMs
    def removeDuplicates(self, path_to_jar = False, force = False, mem_num_gigs = 2, max_cpus = -1):
        '''
        Mark PCR/optical duplicates in each BAM with Picard MarkDuplicates,
        writing <name>_dd.bam files in parallel. Sets self.paths_to_BAMs_dd.
        '''
        if not path_to_jar:
            path_to_jar = _get_jar_path('picard')

        processes = set()
        max_processes = _decide_max_processes( max_cpus )
        print('Print: will use %s cpus for picard' % max_processes)

        paths_to_BAMs_dd = []
        for BAM in self.paths_to_BAMs:
            BAM_out = BAM[:-4] + '_dd.bam'
            if not _os.path.exists(BAM_out) or force:
                # an alternative parallel code with polling not waiting
                # while len(processes) >= max_processes:
                    # _sleep(1)
                    # print('processes: %s' % (len(processes)))
                    # processes.difference_update(
                        # [p for p in processes if p.poll() is not None]
                    # )
                # NOTE(review): 'I=' and its value are separate argv tokens
                # here (likewise 'O=' and 'M='); Picard conventionally expects
                # 'I=file' as a single token -- verify against the Picard
                # version in use.
                picard_command = ['MarkDuplicates', 'I=', BAM, 'O=', BAM_out, 'M=', BAM[:-4] + '_dd.log'] #, 'VALIDATION_STRINGENCY=','LENIENT']
                cmd = ['java', '-Xmx%sg' % mem_num_gigs, '-jar', path_to_jar] + picard_command
                processes.add( _subprocess.Popen(cmd, shell=False) )
                print('Called: %s' % (' '.join(map(str, cmd))))
                #_subprocess.call(cmd, shell=False)
                print('processes: %s, max_processes: %s' % (len(processes),max_processes))
                # throttle: block until a child exits, then reap finished ones
                if len(processes) >= max_processes:
                    (pid, exit_status) = _os.wait()
                    processes.difference_update(
                        [p for p in processes if p.poll() is not None])
            else:
                print('Found:')
                print(BAM_out)
                print('use "force = True" to overwrite')
            paths_to_BAMs_dd += [BAM_out]

        # Check if all the child processes were closed
        for p in processes:
            if p.poll() is None:
                p.wait()

        self.paths_to_BAMs_dd = paths_to_BAMs_dd
    def sortIndexBAMs(self, path_to_exe = False, force = False, max_cpus = -1):
        '''
        Sort and index each deduplicated BAM with samtools, writing
        <name>_si.bam files in parallel. Sets self.paths_to_BAMs_dd_si.

        NOTE(review): uses the legacy "samtools sort <in> <prefix>" syntax,
        like toBAMs() -- confirm samtools version compatibility.
        '''
        if not path_to_exe:
            path_to_exe = _get_exe_path('samtools')

        processes = set()
        max_processes = _decide_max_processes( max_cpus )

        paths_to_BAMs_dd_si = []
        # loop variable is named SAM but holds deduplicated BAM paths
        for SAM in self.paths_to_BAMs_dd:
            BAM_out = SAM[:-4] + '_si.bam'
            if not _os.path.exists(BAM_out) or force:
                cmd = '{0} sort {1} {2}_si; {0} index {2}_si.bam'.format(path_to_exe, SAM, SAM[:-4])
                print('Called: %s' % cmd)
                processes.add( _subprocess.Popen(cmd, shell=True) )
                # throttle: block until a child exits, then reap finished ones
                if len(processes) >= max_processes:
                    (pid, exit_status) = _os.wait()
                    processes.difference_update(
                        [p for p in processes if p.poll() is not None])
            else:
                print('Found:')
                print(BAM_out)
                print('use "force = True" to overwrite')
            paths_to_BAMs_dd_si += [BAM_out]

        # Check if all the child processes were closed
        for p in processes:
            if p.poll() is None:
                p.wait()

        self.paths_to_BAMs_dd_si = paths_to_BAMs_dd_si
    def IndelRealignGATK(self,
            jar = ['external_programs', 'GenomeAnalysisTK', 'GenomeAnalysisTK.jar'],
            picard_jar = False,
            samtools_exe = False,
            use_java = 'java',
            force = False,
            mem_num_gigs = 2,
            max_cpus = -1):
        '''
        Realign reads around indels with GATK in two passes per BAM:
        RealignerTargetCreator (writes .intervals) then IndelRealigner
        (writes <name>_realn.bam). Requires sorted, indexed, deduplicated
        BAMs from sortIndexBAMs(). Sets self.ready_BAMs.
        '''
        # GATK is manually downloaded by user and placed in folder of their choice
        jar = _os.path.sep.join(jar)
        if not picard_jar:
            picard_jar = _get_jar_path('picard')
        if not samtools_exe:
            samtools_exe = _get_exe_path('samtools')

        genome_fna = 'genome_sequences/{}.fna'.format(self.genome_id)

        e1 = 'Could not find "paths_to_BAMs_dd_si" attribute. Before starting GATK analysis, read alignments must have duplicates removed. Please run: .toBAMS(), .removeDuplicates(), .sortIndexBAMs() methods on this SAMs instance, or --deduplicate if using baga_cli.py.'
        assert hasattr(self, 'paths_to_BAMs_dd_si'), e1
        e2 = 'Could not find %s. Please ensure file exists'
        for BAM in self.paths_to_BAMs_dd_si:
            assert _os.path.exists(BAM), e2 % BAM

        # always (re)generate dict in case of upstream changes in data
        print('Creating sequence dictionary for %s' % genome_fna)
        # NOTE(review): 'R='/'O=' and their values are separate argv tokens --
        # same concern as removeDuplicates(); verify against the Picard in use.
        _subprocess.call([use_java, '-jar', picard_jar, 'CreateSequenceDictionary', 'R=', genome_fna, 'O=', genome_fna[:-4] + '.dict'])

        # always (re)index in case of upstream changes in data
        print('Writing index files for %s' % genome_fna)
        _subprocess.call([samtools_exe, 'faidx', genome_fna])

        processes = set()
        max_processes = _decide_max_processes( max_cpus )

        # pass 1: RealignerTargetCreator writes a .intervals file per BAM
        for BAM in self.paths_to_BAMs_dd_si:
            intervals = BAM[:-4] + '.intervals'
            if not _os.path.exists(intervals) or force:
                cmd = [use_java, '-Xmx%sg' % mem_num_gigs, '-jar', jar,
                        '-T', 'RealignerTargetCreator',
                        '-R', genome_fna,
                        '-I', BAM,
                        '-o', intervals] #, '--validation_strictness', 'LENIENT']
                print(' '.join(map(str, cmd)))
                processes.add( _subprocess.Popen(cmd, shell=False) )
                # throttle: block until a child exits, then reap finished ones
                if len(processes) >= max_processes:
                    (pid, exit_status) = _os.wait()
                    processes.difference_update(
                        [p for p in processes if p.poll() is not None])
            else:
                print('Found:')
                print(intervals)
                print('use "force = True" to overwrite')

        # Check if all the child processes were closed
        for p in processes:
            if p.poll() is None:
                p.wait()

        # pass 2: IndelRealigner consumes the .intervals files
        # (processes set is reused; all pass-1 children have been waited on)
        paths_to_BAMs_dd_si_ra = []
        for BAM in self.paths_to_BAMs_dd_si:
            intervals = BAM[:-4] + '.intervals'
            bam_out = BAM[:-4] + '_realn.bam'
            if not _os.path.exists(bam_out) or force:
                # NOTE(review): heap fixed at 4g here rather than using
                # mem_num_gigs as in pass 1 -- confirm this is intentional.
                cmd = [use_java, '-Xmx4g', '-jar', jar,
                        '-T', 'IndelRealigner',
                        '-R', genome_fna,
                        '-I', BAM,
                        '-targetIntervals', intervals,
                        '-o', bam_out,
                        '--filter_bases_not_stored']
                print(' '.join(map(str, cmd)))
                processes.add( _subprocess.Popen(cmd, shell=False) )
                if len(processes) >= max_processes:
                    _os.wait()
                    processes.difference_update(
                        [p for p in processes if p.poll() is not None])
            else:
                print('Found:')
                print(bam_out)
                print('use "force = True" to overwrite')
            paths_to_BAMs_dd_si_ra += [bam_out]

        for p in processes:
            if p.poll() is None:
                p.wait()

        # the last list of BAMs in ready_BAMs is input for CallgVCFsGATK
        # both IndelRealignGATK and recalibBaseScoresGATK put here
        self.ready_BAMs = [paths_to_BAMs_dd_si_ra]
# Standard script guard: run main() only on direct execution, not on import.
if __name__ == '__main__':
    main()