def test_get_next_from_file(self):
    '''get_next_from_file() should read seqs from a good file, and raise an error on badly formatted files'''
    bad_files = [
        'fastn_unittest_fail_no_AT.fq',
        'fastn_unittest_fail_no_seq.fq',
        'fastn_unittest_fail_no_plus.fq',
        'fastn_unittest_fail_no_qual.fq',
    ]

    for fname in bad_files:
        f_in = utils.open_file_read(fname)
        # Close the handle even when the assertion below fails, so one
        # failing fixture does not leak a file handle.
        try:
            fq = fastn.Fastq()
            with self.assertRaises(fastn.Error):
                while fq.get_next_from_file(f_in):
                    pass
        finally:
            utils.close(f_in)

    fname = 'fastn_unittest_good_file.fq'
    try:
        f_in = open(fname)
    except IOError:
        # Report a missing fixture as a test failure; calling sys.exit()
        # here would raise SystemExit inside the test instead.
        self.fail("Error opening '" + fname + "'")

    records_read = 0
    try:
        fq = fastn.Fastq()
        while fq.get_next_from_file(f_in):
            self.assertEqual(fq, fastn.Fastq('ID', 'ACGTA', 'IIIII'))
            records_read += 1
    finally:
        utils.close(f_in)

    # Without this check the loop above passes vacuously if nothing is read.
    self.assertGreater(records_read, 0, "No records read from '" + fname + "'")
def test_to_fastn(self):
    '''to_fastn() should convert each SAM record to the expected Fastq/Fasta'''
    sam_lines = [
        'ID\t0\tref\t1\t47\t2S73M\t=\t362\t438\tACGTA\tIIIII',
        'ID\t16\tref\t1\t47\t2S73M\t=\t362\t438\tACGTA\tIIIII',
        'ID\t65\tref\t1\t47\t2S73M\t=\t362\t438\tACGTA\tIIIII',
        'ID\t129\tref\t1\t47\t2S73M\t=\t362\t438\tACGTA\tIIIII',
        'ID\t0\tref\t1\t47\t2S73M\t=\t362\t438\tACGTA\t*',
        'ID\t16\tref\t1\t47\t2S73M\t=\t362\t438\tACGTA\t*',
    ]
    records = [sam.SamRecord(line) for line in sam_lines]

    # One expected sequence per SAM line above, in the same order.
    expected_seqs = [
        fastn.Fastq('ID', 'ACGTA', 'IIIII'),
        fastn.Fastq('ID', 'TACGT', 'IIIII'),
        fastn.Fastq('ID/1', 'ACGTA', 'IIIII'),
        fastn.Fastq('ID/2', 'ACGTA', 'IIIII'),
        fastn.Fasta('ID', 'ACGTA'),
        fastn.Fasta('ID', 'TACGT'),
    ]

    for expected, record in zip(expected_seqs, records):
        self.assertEqual(expected, record.to_fastn())
def test_translate(self):
    '''Test nucleotide -> amino acid conversion works on Fastq'''
    # 64 codons, grouped in the input so that identical amino acids are
    # adjacent in the expected translation.
    nucleotides = 'GCAGCCGCGGCTAGAAGGCGACGCCGGCGTAACAATGACGATTGCTGTGAAGAGCAACAGGGAGGCGGGGGTCACCATATAATCATTTTATTGCTACTCCTGCTTAAAAAGATGTTCTTTCCACCCCCGCCTAGCAGTTCATCCTCGTCTACAACCACGACTTGGTACTATGTAGTCGTGGTTTAATAGTGA'
    amino_acids = 'AAAARRRRRRNNDDCCEEQQGGGGHHIIILLLLLLKKMFFPPPPSSSSSSTTTTWYYVVVV***'

    # Quality strings are derived from the sequence lengths rather than
    # hand-typed, so they cannot drift out of step with the sequences.
    fq = fastn.Fastq('ID', nucleotides, 'I' * len(nucleotides))
    expected = fastn.Fastq('ID', amino_acids, 'I' * len(amino_acids))

    self.assertEqual(expected, fq.translate())
def test_trim(self):
    '''trim() should remove the requested number of bases from each end'''
    untrimmed = '1234567890'

    # (bases to trim from start, bases to trim from end, expected remainder)
    cases = [
        (0, 0, '1234567890'),
        (1, 0, '234567890'),
        (0, 1, '123456789'),
        (2, 2, '345678'),
    ]

    for from_start, from_end, remainder in cases:
        record = fastn.Fastq('ID', untrimmed, untrimmed)
        record.trim(from_start, from_end)
        self.assertEqual(record, fastn.Fastq('ID', remainder, remainder))
def to_fastn(self):
    '''Return this record as a fastn sequence.

    A quality of '*' gives a fastn.Fasta, anything else a fastn.Fastq.
    The sequence is reverse complemented when query_strand() is '-', and
    '/1' or '/2' is appended to the id for first/second of pair.
    '''
    has_quality = self.qual != '*'
    if has_quality:
        record = fastn.Fastq(self.id, self.seq, self.qual)
    else:
        record = fastn.Fasta(self.id, self.seq)

    if self.query_strand() == '-':
        record.revcomp()

    # Unpaired records keep their id unchanged.
    if self.is_first_of_pair():
        record.id += '/1'
    elif self.is_second_of_pair():
        record.id += '/2'

    return record
def test_trim_Ns(self):
    '''trim_Ns() should reduce every input below to the same trimmed fastq sequence'''
    expected = fastn.Fastq('ID', 'ANNANA', '111111')

    # (sequence, quality) pairs; each must trim down to the expected record.
    untrimmed = [
        ('ANNANA', '111111'),
        ('NANNANA', '1111111'),
        ('NANNANAN', '11111111'),
        ('ANNANAN', '1111111'),
        ('NNNNNNANNANAN', '1111111111111'),
        ('NNANNANANn', '1111111111'),
    ]

    for bases, quals in untrimmed:
        record = fastn.Fastq('ID', bases, quals)
        record.trim_Ns()
        self.assertEqual(expected, record)
def test_file_reader_fastq(self):
    '''file_reader should iterate through a fastq file correctly'''
    fname = 'fastn_unittest_good_file.fq'
    expected = fastn.Fastq('ID', 'ACGTA', 'IIIII')

    records_read = 0
    for seq in fastn.file_reader(fname):
        self.assertEqual(seq, expected)
        records_read += 1

    # Without this check the test passes vacuously if the reader yields nothing.
    self.assertGreater(records_read, 0, "No records read from '" + fname + "'")
def test_to_Fasta_and_qual(self):
    '''to_Fasta_and_qual() should return the Fasta plus a list of integer quality scores'''
    record = fastn.Fastq('ID', 'ACGT', '>ADI')

    fasta, scores = record.to_Fasta_and_qual()

    self.assertEqual(fasta, fastn.Fasta('ID', 'ACGT'))
    self.assertListEqual(scores, [29, 32, 35, 40])
def test_revcomp(self):
    '''revcomp() should reverse complement the sequence and reverse the quality string'''
    record = fastn.Fastq('ID', 'ACGTNacgtn', '1234567890')
    record.revcomp()

    expected = fastn.Fastq('ID', 'nacgtNACGT', '0987654321')
    self.assertEqual(record, expected)
def test_init_length_mismatch(self):
    '''Fastq() should raise fastn.Error when seq and quality lengths differ'''
    # One base but two quality characters.
    self.assertRaises(fastn.Error, fastn.Fastq, 'X', 'A', 'II')
def setUp(self):
    '''Build a reference Fastq record and store it on self.fastq'''
    name, bases, quals = 'ID', 'ACGTA', 'IIIII'
    self.fastq = fastn.Fastq(name, bases, quals)
def test_to_Fastq(self):
    '''to_Fastq() should turn a list of scores into a quality string, including out of range scores'''
    # -1 and 94 are expected to produce the same characters as 0 and 93.
    scores = [-1, 0, 40, 93, 94]
    converted = fastn.Fasta('X', 'AAAAA').to_Fastq(scores)

    self.assertEqual(fastn.Fastq('X', 'AAAAA', '!!I~~'), converted)
options = parser.parse_args() ref_seqs = {} fastn.file_to_dict(options.ref_in, ref_seqs) sam_reader = sam.file_reader(options.bam_in) reads = {} f1 = utils.open_file_write(options.outprefix + '_1.fq') f2 = utils.open_file_write(options.outprefix + '_2.fq') for s in sam_reader: if s.is_mapped() and s.is_mate_mapped(): end = min(s.pos + options.length - 1, len(ref_seqs[s.rname]) - 1) start = max(end - options.length + 1, 0) read = fastn.Fastq(s.id, ref_seqs[s.rname][start:end + 1], 'I' * (end - start + 1)) if not s.is_forward_strand(): read.revcomp() if read.id in reads: if s.is_first_of_pair(): read.id += '/1' mate = reads[s.id] mate.id += '/2' print(read, file=f1) print(mate, file=f2) else: read.id += '/2' mate = reads[s.id] mate.id += '/1'
if len(seq) < options.read_length: end_range = 1 for i in range(0, end_range, options.read_step): if len(seq) <= options.read_length: start = 0 end = len(seq) - 1 else: start = i end = start + options.read_length - 1 if end > len(seq) - 1: end = len(seq) - 1 start = end - options.read_length + 1 read = fastn.Fastq( options.read_prefix + ':' + seq.id + ':' + str(start + 1) + ':' + str(end + 1), seq[start:end + 1], 'I' * (end - start + 1)) print('\t'.join([ read.id, '0', seq.id, str(start + 1), '60', str(len(read)) + 'M', '*', '*', '*', read.seq, read.qual, 'RG:Z:' + options.read_group ]), file=f) if end == len(seq) - 1: break f.close()
while isize > len(ref) and isize > options.readlength: isize = int(random.normalvariate(options.mean_insert, options.insert_std)) middle_pos = random.randint(ceil(0.5 *isize), floor(len(ref) - 0.5 * isize)) read_start1 = int(middle_pos - ceil(0.5 * isize)) read_start2 = read_start1 + isize - options.readlength readname = ':'.join([ref.id, str(pair_counter), str(read_start1+1), str(read_start2+1)]) fragment = (middle_pos, isize) if fragment in used_fragments: used_fragments[fragment] += 1 readname += '.dup.' + str(used_fragments[fragment]) else: used_fragments[fragment] = 1 read1 = fastn.Fastq(readname + '/1', ref.seq[read_start1:read_start1 + options.readlength], 'I' * options.readlength) read2 = fastn.Fastq(readname + '/2', ref.seq[read_start2:read_start2 + options.readlength], 'I' * options.readlength) if options.no_n and ('n' in read1.seq or 'N' in read1.seq or 'n' in read2.seq or 'N' in read2.seq): continue read2.revcomp() print(read1, file=fout) print(read2, file=fout) pair_counter += 1 x += 1 utils.close(fout)