Esempio n. 1
0
    def test_get_next_from_file(self):
        '''get_next_from_file() should read seqs from a good file, and raise an error on a badly formatted file'''
        bad_files = [
            'fastn_unittest_fail_no_AT.fq', 'fastn_unittest_fail_no_seq.fq',
            'fastn_unittest_fail_no_plus.fq', 'fastn_unittest_fail_no_qual.fq'
        ]

        for fname in bad_files:
            f_in = utils.open_file_read(fname)
            # Close the handle even when the assertion below fails, so one
            # failing fixture does not leak a file descriptor.
            try:
                fq = fastn.Fastq()
                with self.assertRaises(fastn.Error):
                    while fq.get_next_from_file(f_in):
                        pass
            finally:
                utils.close(f_in)

        fname = 'fastn_unittest_good_file.fq'
        try:
            f_in = open(fname)
        except IOError:
            # Report a missing fixture as a failure of this test; calling
            # sys.exit() here would abort the whole test run instead.
            self.fail("Error opening '" + fname + "'")

        try:
            fq = fastn.Fastq()
            # Every record read from the good file must equal the expected one.
            while fq.get_next_from_file(f_in):
                self.assertEqual(fq, fastn.Fastq('ID', 'ACGTA', 'IIIII'))
        finally:
            utils.close(f_in)
Esempio n. 2
0
    def test_to_fastn(self):
        '''to_fastn() should turn each SAM record into the expected Fasta/Fastq'''
        # Each case pairs a SAM record with the sequence to_fastn() must return.
        cases = [
            (sam.SamRecord(
                'ID\t0\tref\t1\t47\t2S73M\t=\t362\t438\tACGTA\tIIIII'),
             fastn.Fastq('ID', 'ACGTA', 'IIIII')),
            (sam.SamRecord(
                'ID\t16\tref\t1\t47\t2S73M\t=\t362\t438\tACGTA\tIIIII'),
             fastn.Fastq('ID', 'TACGT', 'IIIII')),
            (sam.SamRecord(
                'ID\t65\tref\t1\t47\t2S73M\t=\t362\t438\tACGTA\tIIIII'),
             fastn.Fastq('ID/1', 'ACGTA', 'IIIII')),
            (sam.SamRecord(
                'ID\t129\tref\t1\t47\t2S73M\t=\t362\t438\tACGTA\tIIIII'),
             fastn.Fastq('ID/2', 'ACGTA', 'IIIII')),
            (sam.SamRecord('ID\t0\tref\t1\t47\t2S73M\t=\t362\t438\tACGTA\t*'),
             fastn.Fasta('ID', 'ACGTA')),
            (sam.SamRecord('ID\t16\tref\t1\t47\t2S73M\t=\t362\t438\tACGTA\t*'),
             fastn.Fasta('ID', 'TACGT')),
        ]

        for record, expected in cases:
            self.assertEqual(expected, record.to_fastn())
Esempio n. 3
0
    def test_translate(self):
        '''Test nucleotide -> amino acid conversion works on Fastq'''
        nucleotides = 'GCAGCCGCGGCTAGAAGGCGACGCCGGCGTAACAATGACGATTGCTGTGAAGAGCAACAGGGAGGCGGGGGTCACCATATAATCATTTTATTGCTACTCCTGCTTAAAAAGATGTTCTTTCCACCCCCGCCTAGCAGTTCATCCTCGTCTACAACCACGACTTGGTACTATGTAGTCGTGGTTTAATAGTGA'
        amino_acids = 'AAAARRRRRRNNDDCCEEQQGGGGHHIIILLLLLLKKMFFPPPPSSSSSSTTTTWYYVVVV***'

        # Quality strings are one 'I' per base/residue; deriving them from the
        # sequence length keeps them in step with the sequences above.
        fq = fastn.Fastq('ID', nucleotides, 'I' * len(nucleotides))
        expected = fastn.Fastq('ID', amino_acids, 'I' * len(amino_acids))

        self.assertEqual(expected, fq.translate())
Esempio n. 4
0
    def test_trim(self):
        '''trim() removes the requested number of bases from the start and the end'''
        # ((bases off start, bases off end), what is left of '1234567890')
        expectations = (
            ((0, 0), '1234567890'),
            ((1, 0), '234567890'),
            ((0, 1), '123456789'),
            ((2, 2), '345678'),
        )

        for (from_start, from_end), remaining in expectations:
            record = fastn.Fastq('ID', '1234567890', '1234567890')
            record.trim(from_start, from_end)
            self.assertEqual(record, fastn.Fastq('ID', remaining, remaining))
Esempio n. 5
0
    def to_fastn(self):
        '''Return this record as a fastn.Fastq, or a fastn.Fasta when the quality is '*'.

        The sequence is reverse complemented when query_strand() is '-', and
        '/1' or '/2' is appended to the id for the first or second read of a pair.
        '''
        has_quality = self.qual != '*'
        if has_quality:
            record = fastn.Fastq(self.id, self.seq, self.qual)
        else:
            record = fastn.Fasta(self.id, self.seq)

        if self.query_strand() == '-':
            record.revcomp()

        # First-of-pair is checked before second-of-pair; a record that is
        # neither keeps its id unchanged.
        pair_suffix = None
        if self.is_first_of_pair():
            pair_suffix = '/1'
        elif self.is_second_of_pair():
            pair_suffix = '/2'

        if pair_suffix is not None:
            record.id += pair_suffix

        return record
Esempio n. 6
0
    def test_trim_Ns(self):
        '''trim_Ns() should reduce every test sequence to the same trimmed fastq'''
        expected = fastn.Fastq('ID', 'ANNANA', '111111')
        untrimmed = [
            ('ANNANA', '111111'),
            ('NANNANA', '1111111'),
            ('NANNANAN', '11111111'),
            ('ANNANAN', '1111111'),
            ('NNNNNNANNANAN', '1111111111111'),
            ('NNANNANANn', '1111111111'),
        ]

        for bases, quals in untrimmed:
            record = fastn.Fastq('ID', bases, quals)
            record.trim_Ns()
            self.assertEqual(expected, record)
Esempio n. 7
0
 def test_file_reader_fastq(self):
     '''Every record yielded by file_reader for the good fastq file should match the expected one'''
     expected = fastn.Fastq('ID', 'ACGTA', 'IIIII')
     for record in fastn.file_reader('fastn_unittest_good_file.fq'):
         self.assertEqual(record, expected)
Esempio n. 8
0
 def test_to_Fasta_and_qual(self):
     '''to_Fasta_and_qual() should return the Fasta plus a list of numeric quality scores'''
     fasta_part, scores = fastn.Fastq('ID', 'ACGT', '>ADI').to_Fasta_and_qual()
     self.assertEqual(fasta_part, fastn.Fasta('ID', 'ACGT'))
     # Each expected score is the quality character's ASCII code minus 33.
     self.assertListEqual(scores, [29, 32, 35, 40])
Esempio n. 9
0
 def test_revcomp(self):
     '''revcomp() should reverse complement the bases and reverse the qualities'''
     record = fastn.Fastq('ID', 'ACGTNacgtn', '1234567890')
     record.revcomp()
     expected = fastn.Fastq('ID', 'nacgtNACGT', '0987654321')
     self.assertEqual(record, expected)
Esempio n. 10
0
 def test_init_length_mismatch(self):
     '''Constructing a Fastq whose sequence and quality lengths differ should raise fastn.Error'''
     self.assertRaises(fastn.Error, fastn.Fastq, 'X', 'A', 'II')
Esempio n. 11
0
 def setUp(self):
     '''Create the Fastq fixture stored on self.fastq'''
     name, bases, quals = 'ID', 'ACGTA', 'IIIII'
     self.fastq = fastn.Fastq(name, bases, quals)
Esempio n. 12
0
 def test_to_Fastq(self):
     '''to_Fastq() should convert numeric scores to a Fastq, including out of range scores'''
     # -1 and 94 are expected to give the same characters as 0 and 93.
     scores = [-1, 0, 40, 93, 94]
     expected = fastn.Fastq('X', 'AAAAA', '!!I~~')
     source = fastn.Fasta('X', 'AAAAA')
     self.assertEqual(expected, source.to_Fastq(scores))
Esempio n. 13
0
# Script fragment: for each record whose read and mate are both mapped, build
# a fixed-quality Fastq read from the reference sequence at the mapped
# position, and write pairs to <outprefix>_1.fq / <outprefix>_2.fq.
# NOTE(review): `parser` is defined outside the visible lines, and the
# fragment appears to be cut off before the end of the loop body.
options = parser.parse_args()

# Filled in by fastn.file_to_dict; indexed by s.rname below.
ref_seqs = {}
fastn.file_to_dict(options.ref_in, ref_seqs)

sam_reader = sam.file_reader(options.bam_in)
# Looked up by read id below. NOTE(review): nothing in the visible lines ever
# stores into this dict -- presumably the first-seen mate is added in the
# part of the script that is missing; confirm.
reads = {}

f1 = utils.open_file_write(options.outprefix + '_1.fq')
f2 = utils.open_file_write(options.outprefix + '_2.fq')

for s in sam_reader:
    if s.is_mapped() and s.is_mate_mapped():
        # Window of up to options.length bases starting at s.pos, clamped so
        # that `end` never passes the last index of the reference and `start`
        # never goes below 0.
        end = min(s.pos + options.length - 1, len(ref_seqs[s.rname]) - 1)
        start = max(end - options.length + 1, 0)
        # The bases come from the reference slice, not from the record's own
        # sequence; every quality character is 'I'.
        read = fastn.Fastq(s.id, ref_seqs[s.rname][start:end + 1],
                           'I' * (end - start + 1))

        if not s.is_forward_strand():
            read.revcomp()

        if read.id in reads:
            # An entry with this id already exists: name the two reads /1 and
            # /2 according to which one the current record is.
            if s.is_first_of_pair():
                read.id += '/1'
                mate = reads[s.id]
                mate.id += '/2'
                print(read, file=f1)
                print(mate, file=f2)
            else:
                read.id += '/2'
                mate = reads[s.id]
                mate.id += '/1'
                # NOTE(review): the visible fragment ends here, before this
                # branch writes anything to f1/f2.
    # NOTE(review): this looks like the tail of a different script whose
    # beginning is missing -- `seq`, `end_range` and `f` are all defined
    # outside the visible lines.
    # It emits reads of options.read_length bases, stepping options.read_step
    # along `seq`, as tab-separated lines written to `f`.
    if len(seq) < options.read_length:
        # Sequence shorter than one read: limit the range below to the single
        # start position 0.
        end_range = 1
    for i in range(0, end_range, options.read_step):
        if len(seq) <= options.read_length:
            # The whole sequence becomes one read.
            start = 0
            end = len(seq) - 1
        else:
            start = i
            end = start + options.read_length - 1

            if end > len(seq) - 1:
                # Window runs past the end: shift it back so that it finishes
                # on the last base and keeps the full read length.
                end = len(seq) - 1
                start = end - options.read_length + 1

        # Read id is <prefix>:<seq id>:<start>:<end> with 1-based coordinates;
        # every quality character is 'I'.
        read = fastn.Fastq(
            options.read_prefix + ':' + seq.id + ':' + str(start + 1) + ':' +
            str(end + 1), seq[start:end + 1], 'I' * (end - start + 1))

        # One tab-separated line per read (the field layout looks like a SAM
        # alignment record -- confirm): id, '0', sequence id, 1-based start,
        # '60', '<read length>M', three '*' fields, bases, qualities and an
        # RG:Z: tag carrying options.read_group.
        print('\t'.join([
            read.id, '0', seq.id,
            str(start + 1), '60',
            str(len(read)) + 'M', '*', '*', '*', read.seq, read.qual,
            'RG:Z:' + options.read_group
        ]),
              file=f)

        # Stop once a read that reaches the last base has been written.
        if end == len(seq) - 1:
            break

f.close()
Esempio n. 15
0
        # NOTE(review): fragment from inside a loop whose header is not
        # visible -- `isize`, `ref`, `used_fragments`, `pair_counter`, `x` and
        # `fout` are all defined outside the visible lines.
        # Re-draw the insert size from a normal distribution while it is larger
        # than both the reference length and the read length.
        # NOTE(review): because the condition uses `and`, an insert size
        # shorter than options.readlength is accepted without re-drawing --
        # confirm that this is intended.
        while isize > len(ref) and isize > options.readlength:
            isize = int(random.normalvariate(options.mean_insert, options.insert_std))
        # Pick the fragment's middle position at random so that half an insert
        # fits on either side, then derive the 0-based start of each read: read
        # 1 starts at the fragment start, read 2 ends at the fragment end.
        middle_pos = random.randint(ceil(0.5 *isize), floor(len(ref) - 0.5 * isize))
        read_start1 = int(middle_pos - ceil(0.5 * isize))
        read_start2 = read_start1 + isize - options.readlength

        # Read name: <ref id>:<pair number>:<start of read 1>:<start of read 2>,
        # with 1-based start coordinates.
        readname = ':'.join([ref.id, str(pair_counter), str(read_start1+1), str(read_start2+1)])

        # Count how often each (middle position, insert size) has been drawn;
        # repeats get a '.dup.<count>' suffix on the read name.
        fragment = (middle_pos, isize)
        if fragment in used_fragments:
            used_fragments[fragment] += 1
            readname += '.dup.' + str(used_fragments[fragment])
        else:
            used_fragments[fragment] = 1

        read1 = fastn.Fastq(readname + '/1', ref.seq[read_start1:read_start1 + options.readlength], 'I' * options.readlength)
        read2 = fastn.Fastq(readname + '/2', ref.seq[read_start2:read_start2 + options.readlength], 'I' * options.readlength)

        # With options.no_n set, discard pairs containing an N (either case).
        # The `continue` also skips the pair_counter and x increments below.
        if options.no_n and ('n' in read1.seq or 'N' in read1.seq or 'n' in read2.seq or 'N' in read2.seq):
            continue

        # Read 2 is written as the reverse complement of its reference slice.
        read2.revcomp()

        # Both reads of the pair go to the same output, read 1 first.
        print(read1, file=fout)
        print(read2, file=fout)

        pair_counter += 1
        x += 1

utils.close(fout)