def bioseq_to_bwa_seq(self):
    # Round-trip check for bwa.bioseq_to_bwa_seq: for every supported
    # quality encoding, with and without unknown bases, build a SeqRecord
    # from random reads, convert it and compare name, sequence, reverse
    # complement and quality string against the expected values.
    nseq = 100
    for fmt in ("fastq-sanger", "fastq-illumina", "fastq-solexa"):
        is_solexa = fmt == "fastq-solexa"
        if is_solexa:
            qkey = "solexa_quality"
        else:
            qkey = "phred_quality"
        for unknown in (False, True):
            reads = u.random_reads_generator(nseq, fmt=fmt, unknown=unknown)
            for idx, (bases, quals) in enumerate(reads):
                name = "foo-%d" % idx
                record = SeqRecord(
                    Seq("".join(bases), single_letter_alphabet),
                    id=name,
                    name=name,
                    description=name,
                )
                record.letter_annotations[qkey] = quals
                seq_len = len(record)
                name_len = len(record.name)
                bwseq = bwa.alloc_seq(1, seq_len, name_len)[0]
                bwa.bioseq_to_bwa_seq(record, bwseq, seq_len, name_len, fmt)
                self.assertEqual(record.name, bwseq.get_name())
                # get_seq() is expected to hold the read reversed.
                self.assertEqual(record.seq.data[::-1], bwseq.get_seq())
                self.assertEqual(
                    reverse_complement(record.seq.data), bwseq.get_rseq()
                )
                # The quality string must come back sanger-encoded; solexa
                # scores are mapped to phred scores before the comparison.
                if is_solexa:
                    expected_q = [
                        int(round(x + 10 * math.log10(1 + 10 ** (-x / 10.))))
                        for x in quals
                    ]
                else:
                    expected_q = quals
                expected_qstr = "".join(
                    chr(x + sg.Q_OFFSET["fastq-sanger"]) for x in expected_q
                )
                self.assertEqual(bwseq.get_qual(), expected_qstr)
def bioseq_to_bwa_seq(self):
    # Convert randomly generated reads (wrapped in SeqRecord objects) with
    # bwa.bioseq_to_bwa_seq and verify every field of the resulting BWA
    # sequence, for each quality format and for both values of `unknown`.

    def solexa_to_phred(score):
        # Expected phred value for a solexa score, rounded to an integer.
        return int(round(score + 10 * math.log10(1 + 10 ** (-score / 10.))))

    nseq = 100
    for fmt in ("fastq-sanger", "fastq-illumina", "fastq-solexa"):
        solexa = fmt == "fastq-solexa"
        qkey = "solexa_quality" if solexa else "phred_quality"
        for unknown in (False, True):
            generator = u.random_reads_generator(
                nseq, fmt=fmt, unknown=unknown
            )
            for count, (letters, scores) in enumerate(generator):
                label = "foo-%d" % count
                sequence = Seq("".join(letters), single_letter_alphabet)
                bioseq = SeqRecord(
                    sequence, id=label, name=label, description=label
                )
                bioseq.letter_annotations[qkey] = scores
                n = len(bioseq)
                m = len(bioseq.name)
                bwseq = bwa.alloc_seq(1, n, m)[0]
                bwa.bioseq_to_bwa_seq(bioseq, bwseq, n, m, fmt)

                raw = bioseq.seq.data
                self.assertEqual(bioseq.name, bwseq.get_name())
                self.assertEqual(raw[::-1], bwseq.get_seq())
                self.assertEqual(reverse_complement(raw), bwseq.get_rseq())

                # check that quality has been converted to sanger
                if solexa:
                    exp_q = [solexa_to_phred(x) for x in scores]
                else:
                    exp_q = scores
                exp_chars = [
                    chr(x + sg.Q_OFFSET["fastq-sanger"]) for x in exp_q
                ]
                self.assertEqual(bwseq.get_qual(), "".join(exp_chars))
def build_bws_array(self):
    # FIXME: plenty of UGLY code
    #
    # Build paired-end fixtures in two shapes -- SeqRecord pairs ("bioseq")
    # and flat [base_name, seq1, qual1, seq2, qual2] rows ("qseq") -- feed
    # each to bwa.build_bws_array and check the type of the two returned
    # arrays plus the length and name length of every sequence in them.
    nseq = 100
    mates = (0, 1)
    for fmt in ("fastq-sanger", "fastq-illumina", "fastq-solexa"):
        if fmt == "fastq-solexa":
            qkey = "solexa_quality"
        else:
            qkey = "phred_quality"
        for unknown in (False, True):
            reads = u.random_reads_generator(
                nseq, fmt=fmt, unknown=unknown, pe=True
            )
            bioseq_pairs = []
            qseq_pairs = []
            for idx, pair in enumerate(reads):
                base_name = "foo-%d" % idx
                seqs = ["".join(pair[k][0]) for k in mates]
                quals = [
                    "".join(chr(v + sg.Q_OFFSET[fmt]) for v in pair[k][1])
                    for k in mates
                ]
                records = []
                for k in mates:
                    # Mates are named "<base>/1" and "<base>/2".
                    mate_name = "%s/%d" % (base_name, k + 1)
                    record = SeqRecord(
                        Seq(seqs[k], single_letter_alphabet),
                        id=mate_name,
                        name=mate_name,
                        description=mate_name,
                    )
                    record.letter_annotations[qkey] = pair[k][1]
                    records.append(record)
                bioseq_pairs.append(records)
                qseq_pairs.append(
                    [base_name, seqs[0], quals[0], seqs[1], quals[1]]
                )
            n = len(bioseq_pairs[0][0])
            assert len(qseq_pairs[0][1]) == n
            for src in ("bioseq", "qseq"):
                from_bioseq = src == "bioseq"
                if from_bioseq:
                    seq_pairs = bioseq_pairs
                else:
                    seq_pairs = qseq_pairs
                bwsa = bwa.build_bws_array(seq_pairs, qtype=fmt, src=src)
                for j in mates:
                    self.assertTrue(type(bwsa[j]) is bwa.bwa_seq_p_t)
                    for i in xrange(nseq):
                        bwseq = bwsa[j][i]
                        self.assertEqual(bwseq.len, n)
                        if from_bioseq:
                            # NOTE(review): always reads mate 0's name; only
                            # the length is compared and both mate names are
                            # equally long, so the check is unaffected.
                            exp_name = bioseq_pairs[i][0].name
                        else:
                            exp_name = "%s/%d" % (qseq_pairs[i][0], (j + 1))
                        self.assertEqual(
                            len(bwseq.get_name()), len(exp_name)
                        )
def build_bws_array(self):
    # FIXME: plenty of UGLY code
    #
    # For each quality format and each `unknown` setting, generate paired
    # reads, express them both as SeqRecord pairs and as qseq-style rows,
    # and verify what bwa.build_bws_array returns for either input kind.
    nseq = 100
    both = (0, 1)
    for fmt in ("fastq-sanger", "fastq-illumina", "fastq-solexa"):
        qkey = "phred_quality"
        if fmt == "fastq-solexa":
            qkey = "solexa_quality"
        for unknown in (False, True):
            bioseq_pairs, qseq_pairs = [], []
            pair_stream = u.random_reads_generator(
                nseq, fmt=fmt, unknown=unknown, pe=True
            )
            for number, read_pair in enumerate(pair_stream):
                base_name = "foo-%d" % number
                names = ["%s/%d" % (base_name, mate) for mate in (1, 2)]
                seq_strings = ["".join(read_pair[j][0]) for j in both]
                q_strings = []
                for j in both:
                    encoded = [chr(x + sg.Q_OFFSET[fmt]) for x in read_pair[j][1]]
                    q_strings.append("".join(encoded))
                bioseq_p = [
                    SeqRecord(
                        Seq(seq_strings[j], single_letter_alphabet),
                        id=names[j],
                        name=names[j],
                        description=names[j],
                    )
                    for j in both
                ]
                for j, record in enumerate(bioseq_p):
                    record.letter_annotations[qkey] = read_pair[j][1]
                bioseq_pairs.append(bioseq_p)
                # Row layout: base name, then (sequence, quality) per mate.
                row = [base_name]
                for j in both:
                    row.extend([seq_strings[j], q_strings[j]])
                qseq_pairs.append(row)
            n = len(bioseq_pairs[0][0])
            assert len(qseq_pairs[0][1]) == n
            sources = (("bioseq", bioseq_pairs), ("qseq", qseq_pairs))
            for src, seq_pairs in sources:
                bwsa = bwa.build_bws_array(seq_pairs, qtype=fmt, src=src)
                for j in both:
                    self.assertTrue(type(bwsa[j]) is bwa.bwa_seq_p_t)
                    for i in xrange(nseq):
                        bwseq = bwsa[j][i]
                        self.assertEqual(bwseq.len, n)
                        if src == "bioseq":
                            # NOTE(review): mate 0's name is used for both
                            # mates; harmless here because only name lengths
                            # are compared and they are equal.
                            exp_name = bioseq_pairs[i][0].name
                        else:
                            exp_name = "%s/%d" % (qseq_pairs[i][0], (j + 1))
                        self.assertEqual(
                            len(bwseq.get_name()), len(exp_name)
                        )