/
calculate_mismatch_probs.py
289 lines (205 loc) · 11.8 KB
/
calculate_mismatch_probs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
#! /usr/bin/python
# -*- coding: utf-8 -*-
from __future__ import print_function
"""
Created on Mon Jun 23 16:21:32 2014
@author: Peter Edge
"""
desc = '''
This file performs analysis of two SAM files in order to estimate the number
of RNAseq reads with errors to the original sequence. This will play a role in
the SPARTA software by modifying the original quality scores to reflect the
actual observed scores at each phred score.
'''
import argparse
from Bio.Seq import MutableSeq
import os
import pysam
import re
import sys
import time
from util import compatibility_dict
from util import fix_read_mate_order
from util import izip
# default arguments
from sparta import default_output_dir
from sparta import default_pileup_height
from sparta import default_sample_every
# regex for the MD string that specifies errors from the reference.
# more information about the MD string: page 7 of http://samtools.github.io/hts-specs/SAMv1.pdf
MD_REGEX = re.compile("([0-9]+)([A-Z]|\^[A-Z]+)")
def parseargs():
parser = argparse.ArgumentParser(description=desc)
parser.add_argument('samfiles', nargs='+', type = str, help='input samfiles', default=[])
parser.add_argument('-o', '--output_dir', nargs='?', type = str, help='directory to write output to', default=default_output_dir)
# default to help option. credit to unutbu: http://stackoverflow.com/questions/4042452/display-help-message-with-python-argparse-when-script-is-called-without-any-argu
if len(sys.argv) < 3:
parser.print_help()
sys.exit(1)
args = parser.parse_args()
return args
# take an aligned read, return the genomic seq EXCEPT for deletions (^)
def create_genome_seq(aligned):
aligned_seq = aligned.seq if type(aligned.seq) == str else aligned.seq.decode('UTF-8')
genome_seq = MutableSeq(aligned_seq)
# see samtools documentation for MD string
err = re.findall(MD_REGEX, aligned.opt("MD"))
seq_ix = 0
# step through sequence
for matched_bases, curr_err in err:
seq_ix += int(matched_bases)
assert '^' not in curr_err
assert curr_err != genome_seq[seq_ix]
genome_seq[seq_ix] = curr_err
seq_ix += 1
if aligned.is_reverse:
genome_seq.reverse_complement()
return genome_seq
def add_to_pileup_dict(sams, aligned_read_set, pileup_dict):
# sanity check that all the qnames (RNA read IDs) are the same
for read in aligned_read_set:
assert read.qname == aligned_read_set[0].qname
if not True in [read.is_unmapped for read in aligned_read_set]:
# all alignments mapped
for read in aligned_read_set:
for op, op_len in read.cigar:
if op > 0 and op < 7:
# do not sample reads where there are insertions or deletions
return
assert len(read.seq) == len(aligned_read_set[0].seq)
# if aligned reads are reversed, we reverse them and hold on to that info.
pos_dicts = [dict(read.aligned_pairs) for read in aligned_read_set]
genome_seqs = [create_genome_seq(read) for read in aligned_read_set]
qual = bytearray(aligned_read_set[0].qual, 'utf-8')
seq = MutableSeq(aligned_read_set[0].seq if type(aligned_read_set[0].seq) == str else aligned_read_set[0].seq.decode('UTF-8'))
if aligned_read_set[0].is_reverse:
seq.reverse_complement()
qual = qual[::-1]
for genome_seq in genome_seqs:
assert len(genome_seq) == len(seq)
for i in range(0, len(seq)):
# need (chrom, pos, genome_seq[i]) tuples for each aligned_read
chroms = [sam.getrname(a.tid) for sam, a in izip(sams, aligned_read_set)]
positions = [d[i] if not a.is_reverse else d[len(seq) - i - 1] for d, a in zip(pos_dicts, aligned_read_set)]
genome_seq_i = [g[i] for g in genome_seqs]
genomic_locs = tuple(zip(chroms, positions, genome_seq_i))
pileup_dict[genomic_locs][seq[i]][qual[i]] += 1
def create_mismatch_prob_dict(samfiles, output_dir = default_output_dir, paired_end=False, pileup_height=default_pileup_height, sample_every=default_sample_every):
if not os.path.exists(output_dir):
os.makedirs(output_dir)
sams = tuple(pysam.Samfile(i) for i in samfiles)
pileup_dict = compatibility_dict(lambda: compatibility_dict(lambda: compatibility_dict(int)))
if not paired_end:
for i, aligned_read_set in enumerate(izip(*sams)):
# sample every 10th read
if i % sample_every != 0:
continue
add_to_pileup_dict(sams, aligned_read_set, pileup_dict)
else:
# the main difference with paired end reads is that bowtie should output
# a read-mate pair, followed by another read-mate pair, in the same order
# regardless of whether it is the genome1 mapping or the genome2 mapping
# but, for a given read-mate pair we don't necessarily know if we are
# getting the read we want or its mate.
# so we have to check, and switch them if necessary
zipped_samfiles = izip(*sams)
for i, aligned_read_set in enumerate(zipped_samfiles):
# take aligned pairs in sets of 2 to also get the mate
# don't forget to assign aligned_pair to next tuple before end of iter
aligned_read_mate_set = next(zipped_samfiles)
# sample rate
if i % sample_every != 0:
aligned_read_set = aligned_read_mate_set
continue
aligned_read_set, aligned_read_mate_set = fix_read_mate_order(aligned_read_set, aligned_read_mate_set)
for aligned_read_set_generic in [aligned_read_set,aligned_read_mate_set]:
add_to_pileup_dict(sams, aligned_read_set_generic, pileup_dict)
# skip the next pair because we already grabbed it as the mate
aligned_read_set = aligned_read_mate_set
for sam in sams:
sam.close()
quality_score_match_counter = compatibility_dict(int)
quality_score_mismatch_counter = compatibility_dict(int)
# transition matrix at [qual][X][Y] stores, for a given quality score qual,
# the number of observed transions from X to Y
transition_matrix = compatibility_dict(lambda: compatibility_dict(int))
with open(os.path.join(output_dir, 'pileup_counts'), 'w') as outfile:
print('genomic_coordinates\tbase_count[A]\tbase_count[C]\tbase_count[G]\tbase_count[T]\tbase_count[N]', file=outfile)
for genomic_locs, nuc_to_qual_dict in pileup_dict.items():
# iterate through genome coordinates and their nested dictionaries
# want to know: what is the consensus base at this coord pair?
# keep a count of how many bases seen at this coord pair
base_count = compatibility_dict(int)
for nuc, qual_dict in nuc_to_qual_dict.items():
# for a given coordinate pair, iterate through its nucleotides and
# the dictionary that pairs qualities to number of matches
# iterate through nucleotides and their quality dictionaries
for qual, num in qual_dict.items():
# iterate over qualities and number of matches
# AT THIS POINT we have:
# a coordinate pair, a nucleotide at that coordinate pair,
# a quality score, and the number of matches at that quality score
base_count[nuc] += 1
total_bases = sum([v for k,v in base_count.items()])
cutoff = 0.75
consensus = 'N'
if total_bases >= pileup_height:
# if more than 75% of reads agree on a base, and if both genomic sequences
# in that position is that same base, then it is the consensus.
for base, count in base_count.items():
if count > cutoff * total_bases and False not in [base == loc[2] for loc in genomic_locs]:
consensus = base
log_msg = '{}\t{}\t{}\t{}\t{}\t{}'.format(genomic_locs, base_count['A']+base_count['a'], base_count['C']+base_count['c'],
base_count['G']+base_count['g'],base_count['T']+base_count['t'], base_count['N']+base_count['n'])
print(log_msg, file=outfile)
for nuc, qual_dict in nuc_to_qual_dict.items():
# for a given coordinate pair, iterate through its nucleotides and
# the dictionary that pairs qualities to number of matches
# iterate through nucleotides and their quality dictionaries
for qual, num in qual_dict.items():
# iterate over qualities and number of matches
# AT THIS POINT we have:
# a coordinate pair, a nucleotide at that coordinate pair,
# a quality score, and the number of matches at that quality score
if consensus != 'N':
if nuc == consensus:
quality_score_match_counter[qual] += num
else:
quality_score_mismatch_counter[qual] += num
transition_matrix[consensus][nuc] += num
mismatch_prob_dict = {}
mismatch_prob_total_values = {}
transition_prob_dict = {}
for qual, mismatch_count in quality_score_mismatch_counter.items():
mismatch_prob = mismatch_count * 1.0 / ((mismatch_count + quality_score_match_counter[qual])*1.0)
mismatch_prob_dict[qual] = mismatch_prob
mismatch_prob_total_values[qual] = mismatch_count + quality_score_match_counter[qual]
for base1, base_to_count_dict in transition_matrix.items():
for base2, num_trans_base1_base2 in base_to_count_dict.items():
# we have a qual, base1, base2, and a count of transitions
# from base1 to base2 at that qual
total_trans_from_base1 = sum(transition_matrix[base1].values())
transition_prob_dict[(base1, base2)] = num_trans_base1_base2 / (total_trans_from_base1 * 1.0)
# For each phred, print observed probability of mismatch and number of bases observed in creating that probability
if mismatch_prob_dict and mismatch_prob_total_values:
with open(os.path.join(output_dir, 'mismatch_prob_info.txt'), 'w') as outputfile:
for k in mismatch_prob_dict.keys():
print ('{}\t{}\t{}'.format(k,mismatch_prob_dict[k],mismatch_prob_total_values[k]), file=outputfile)
with open(os.path.join(output_dir, 'transition_prob_info.txt'), 'w') as outputfile:
for k, v in transition_prob_dict.items():
base1, base2 = k
print ('{}\t{}\t{}'.format(base1, base2, v), file=outputfile)
return mismatch_prob_dict, mismatch_prob_total_values, transition_prob_dict
# main logic
# call compare_mappings() on samfile1 and samfile2 from standard input
def main():
t1 = time.time()
# get command line args
args = parseargs()
samfiles = args.samfiles
output_dir = args.output_dir
mismatch_prob_dict, mismatch_prob_total_values, transition_prob_dict = create_mismatch_prob_dict(samfiles, output_dir)
t2 = time.time()
print('TOTAL TIME: {}'.format(t2-t1))
if __name__ == '__main__':
main()