Esempio n. 1
0
    def test_expand_nucleotides(self):
        '''expand_nucleotides() should list every concrete sequence encoded by IUPAC ambiguity codes'''
        # Each row: (record id, input sequence, expected expanded sequences, in order).
        # The expanded records are expected to be named '<id>.<n>', n counting from 1.
        fasta_cases = [
            ('1', 'A', ['A']),
            ('2', 'C', ['C']),
            ('3', 'G', ['G']),
            ('4', 'T', ['T']),
            ('6', 'R', ['A', 'G']),
            ('7', 'Y', ['C', 'T']),
            ('8', 'S', ['C', 'G']),
            ('9', 'W', ['A', 'T']),
            ('10', 'K', ['G', 'T']),
            ('11', 'M', ['A', 'C']),
            ('12', 'B', ['C', 'G', 'T']),
            ('13', 'D', ['A', 'G', 'T']),
            ('14', 'H', ['A', 'C', 'T']),
            ('15', 'V', ['A', 'C', 'G']),
            ('16', 'N', ['A', 'C', 'G', 'T']),
            ('17', 'ART', ['AAT', 'AGT']),
            ('18', 'ARRT', ['AAAT', 'AAGT', 'AGAT', 'AGGT']),
            ('19', 'ARTR', ['AATA', 'AATG', 'AGTA', 'AGTG']),
        ]

        for name, ambiguous, concrete in fasta_cases:
            expected = [
                sequences.Fasta(name + '.' + str(number), bases)
                for number, bases in enumerate(concrete, start=1)
            ]
            self.assertListEqual(sequences.Fasta(name, ambiguous).expand_nucleotides(), expected)

        # Fastq input: every expanded record is expected to keep the original quality string
        fastq_in = sequences.Fastq('20', 'ART', 'GHI')
        fastq_expected = [
            sequences.Fastq('20.1', 'AAT', 'GHI'),
            sequences.Fastq('20.2', 'AGT', 'GHI'),
        ]
        self.assertListEqual(fastq_in.expand_nucleotides(), fastq_expected)
Esempio n. 2
0
    def test_get_next_from_file(self):
        '''get_next_from_file() should read seqs from OK, and raise error at badly formatted file'''
        bad_files = ['sequences_test_fail_no_AT.fq',
                     'sequences_test_fail_no_seq.fq',
                     'sequences_test_fail_no_plus.fq',
                     'sequences_test_fail_no_qual.fq']

        for basename in bad_files:
            f_in = utils.open_file_read(os.path.join(data_dir, basename))
            # try/finally so the handle is released even when the assertion fails
            try:
                fq = sequences.Fastq()
                with self.assertRaises(sequences.Error):
                    while fq.get_next_from_file(f_in):
                        pass
            finally:
                utils.close(f_in)

        fname = os.path.join(data_dir, 'sequences_test_good_file.fq')
        # A missing fixture now surfaces as an ordinary test error (the IOError
        # propagates) instead of calling sys.exit() from inside the test run.
        with open(fname) as f_in:
            fq = sequences.Fastq()
            # NOTE(review): this passes vacuously if the file yields no records;
            # the number of records in the fixture is not visible from here.
            while fq.get_next_from_file(f_in):
                self.assertEqual(fq, sequences.Fastq('ID', 'ACGTA', 'IIIII'))
Esempio n. 3
0
 def test_subseq(self):
     '''subseq() should slice the sequence and the quality string together'''
     record = sequences.Fastq('name', 'ACGTA', 'FGHIJ')
     # (start, end, expected sequence, expected quality);
     # a None bound is expected to leave that end of the slice open
     cases = [
         (1, 4, 'CGT', 'GHI'),
         (None, 4, 'ACGT', 'FGHI'),
         (1, None, 'CGTA', 'GHIJ'),
     ]
     for start, end, bases, quals in cases:
         self.assertEqual(record.subseq(start, end),
                          sequences.Fastq('name', bases, quals))
Esempio n. 4
0
    def test_translate(self):
        '''Test nucleotide -> amino acid conversion works on Fastq'''
        # The input is written as one chunk per expected amino acid (shown in
        # the trailing comment); adjacent literals are joined into one string.
        nucleotides = (
            'GCAGCCGCGGCT'          # A
            'AGAAGGCGACGCCGGCGT'    # R
            'AACAAT'                # N
            'GACGAT'                # D
            'TGCTGT'                # C
            'GAAGAG'                # E
            'CAACAG'                # Q
            'GGAGGCGGGGGT'          # G
            'CACCAT'                # H
            'ATAATCATT'             # I
            'TTATTGCTACTCCTGCTT'    # L
            'AAAAAG'                # K
            'ATG'                   # M
            'TTCTTT'                # F
            'CCACCCCCGCCT'          # P
            'AGCAGTTCATCCTCGTCT'    # S
            'ACAACCACGACT'          # T
            'TGG'                   # W
            'TACTAT'                # Y
            'GTAGTCGTGGTT'          # V
            'TAATAGTGA'             # *
        )
        amino_acids = 'AAAARRRRRRNNDDCCEEQQGGGGHHIIILLLLLLKKMFFPPPPSSSSSSTTTTWYYVVVV***'

        # Every base and every residue carries quality 'I', so each quality
        # string is just 'I' repeated to the matching length.
        fq = sequences.Fastq('ID', nucleotides, 'I' * len(nucleotides))
        expected = sequences.Fastq('ID', amino_acids, 'I' * len(amino_acids))

        self.assertEqual(expected, fq.translate())
Esempio n. 5
0
 def test_to_Fastq(self):
     '''to_Fastq() should convert numeric scores to a quality string, including out-of-range scores'''
     fasta = sequences.Fasta('X', 'AAAAA')

     # -1 and 94 lie outside the printable range; the expected string shows
     # them coming out the same as 0 ('!') and 93 ('~') respectively
     expected = sequences.Fastq('X', 'AAAAA', '!!I~~')
     converted = fasta.to_Fastq([-1, 0, 40, 93, 94])
     self.assertEqual(expected, converted)

     # 13 quality values for a 5-base sequence must be rejected
     self.assertRaises(sequences.Error, fasta.to_Fastq, 'AAAAAAAAAAAAA')
Esempio n. 6
0
    def test_trim_Ns(self):
        '''trim_Ns() should strip N/n runs from both ends of a fastq sequence only'''
        expected = sequences.Fastq('ID', 'ANNANA', '111111')

        # Inputs with leading and/or trailing Ns (one ends in lower-case n);
        # interior Ns must survive the trim.
        untrimmed = [
            'ANNANA',
            'NANNANA',
            'NANNANAN',
            'ANNANAN',
            'NNNNNNANNANAN',
            'NNANNANANn',
        ]

        for bases in untrimmed:
            # quality is '1' for every base, so build it from the sequence length
            record = sequences.Fastq('ID', bases, '1' * len(bases))
            record.trim_Ns()
            self.assertEqual(expected, record)
Esempio n. 7
0
    def test_trim(self):
        '''trim() should remove the requested number of bases from each end'''
        # (bases cut from start, bases cut from end, what should remain)
        cases = [
            (0, 0, '1234567890'),
            (1, 0, '234567890'),
            (0, 1, '123456789'),
            (2, 2, '345678'),
        ]

        for from_start, from_end, remaining in cases:
            # sequence and quality use the same digits so both get checked at once
            record = sequences.Fastq('ID', '1234567890', '1234567890')
            record.trim(from_start, from_end)
            self.assertEqual(record, sequences.Fastq('ID', remaining, remaining))
Esempio n. 8
0
def merge_to_one_seq(infile, outfile, seqname='union'):
    '''Takes a multi fasta or fastq file and writes a new file that contains just one sequence, with the original sequences catted together, preserving their order.

    infile:  name of the input sequence file
    outfile: name of the file the single merged record is written to
    seqname: name given to the merged record

    The output is a Fastq record when the first input record is a Fastq
    (qualities are concatenated as well), otherwise a Fasta record.

    Raises sequences.Error if the input file contains no sequences.
    '''
    seq_reader = sequences.file_reader(infile)
    is_fastq = None  # decided by the first record; None means nothing read yet
    seq_parts = []
    qual_parts = []

    # Only the strings are kept, so the records themselves need not be copied
    for seq in seq_reader:
        if is_fastq is None:
            is_fastq = isinstance(seq, sequences.Fastq)
        seq_parts.append(seq.seq)
        if is_fastq:
            qual_parts.append(seq.qual)

    if is_fastq is None:
        # previously this case surfaced as an IndexError on seqs[0]
        # NOTE(review): assumes sequences.Error accepts a message like a normal Exception
        raise sequences.Error('No sequences found in file ' + infile + '. Cannot merge')

    if is_fastq:
        merged = sequences.Fastq(seqname, ''.join(seq_parts), ''.join(qual_parts))
    else:
        merged = sequences.Fasta(seqname, ''.join(seq_parts))

    f = utils.open_file_write(outfile)
    try:
        print(merged, file=f)
    finally:
        utils.close(f)
Esempio n. 9
0
    def test_replace_interval(self):
        '''replace_interval() should swap an inclusive interval for new sequence (and quality)'''
        # ((start, end), expected sequence) after replacing that interval of 'ACGTA' with 'NEW'
        fasta_cases = [
            ((0, 0), 'NEWCGTA'),
            ((4, 4), 'ACGTNEW'),
            ((2, 3), 'ACNEWA'),
        ]
        for (start, end), expected_seq in fasta_cases:
            fa = sequences.Fasta('ID', 'ACGTA')
            fa.replace_interval(start, end, 'NEW')
            self.assertEqual(fa, sequences.Fasta('ID', expected_seq))

        # start after end, end past the sequence, and interval entirely past the sequence
        fa = sequences.Fasta('ID', 'ACGTA')
        for start, end in [(3, 2), (1, 5), (5, 10)]:
            with self.assertRaises(sequences.Error):
                fa.replace_interval(start, end, 'x')

        # ((start, end), expected sequence, expected quality) for the Fastq variant
        fastq_cases = [
            ((0, 0), 'NEWCGTA', 'IIIBCDE'),
            ((4, 4), 'ACGTNEW', 'ABCDIII'),
            ((2, 3), 'ACNEWA', 'ABIIIE'),
        ]
        for (start, end), expected_seq, expected_qual in fastq_cases:
            fq = sequences.Fastq('ID', 'ACGTA', 'ABCDE')
            fq.replace_interval(start, end, 'NEW', 'III')
            self.assertEqual(fq, sequences.Fastq('ID', expected_seq, expected_qual))

        # fq is still the last record built above; replacement sequence and
        # quality of different lengths must be rejected
        with self.assertRaises(sequences.Error):
            fq.replace_interval(1, 1, 'x', 'xx')
Esempio n. 10
0
 def test_to_Fasta_and_qual(self):
     '''to_Fasta_and_qual() should split a Fastq into a Fasta and a list of numeric scores'''
     fasta, scores = sequences.Fastq('ID', 'ACGT', '>ADI').to_Fasta_and_qual()

     self.assertEqual(fasta, sequences.Fasta('ID', 'ACGT'))
     # expected scores are ord(char) - 33 for '>', 'A', 'D', 'I'
     self.assertListEqual(scores, [29, 32, 35, 40])
Esempio n. 11
0
 def test_revcomp(self):
     '''revcomp() should reverse complement the bases in place and reverse the qualities'''
     record = sequences.Fastq('ID', 'ACGTNacgtn', '1234567890')
     # case is expected to be preserved, and N/n to map to themselves
     expected = sequences.Fastq('ID', 'nacgtNACGT', '0987654321')

     record.revcomp()

     self.assertEqual(record, expected)
Esempio n. 12
0
 def test_init_length_mismatch(self):
     '''Constructing a Fastq whose sequence and quality lengths differ should raise an error'''
     # one base, two quality characters
     self.assertRaises(sequences.Error, sequences.Fastq, 'X', 'A', 'II')
Esempio n. 13
0
    def test_make_into_gene_fastq(self):
        '''Test make_into_gene fastq'''
        # Each row: (input sequence, input quality, expected result).
        # The expected result is None, or (gene sequence, gene quality, strand,
        # third value), where the third value is 0-2 (presumably the reading
        # frame). The debug print of sequences.genetic_code that used to sit
        # here was removed: it only added noise to the test output.
        cases = [
            ('T', '1', None),
            ('TT', '12', None),
            ('TTT', '123', None),
            ('TTG', '123', None),
            ('TAA', '123', None),
            ('TTGAAATAA', '123456789', ('TTGAAATAA', '123456789', '+', 0)),
            ('TTGAAATAT', '123456789', None),
            ('TTGTAA', '123456', ('TTGTAA', '123456', '+', 0)),
            ('TTGTAAA', '1234567', ('TTGTAA', '123456', '+', 0)),
            ('TTGTAAAA', '12345678', ('TTGTAA', '123456', '+', 0)),
            ('TTGTAAAAA', '123456789', None),
            ('ATTGTAA', '1234567', ('TTGTAA', '234567', '+', 1)),
            ('ATTGTAAA', '12345678', ('TTGTAA', '234567', '+', 1)),
            ('ATTGTAAAA', '123456789', ('TTGTAA', '234567', '+', 1)),
            ('ATTGTAAAAA', '123456789A', None),
            ('AATTGTAA', '12345678', ('TTGTAA', '345678', '+', 2)),
            ('AATTGTAAA', '123456789', ('TTGTAA', '345678', '+', 2)),
            ('AATTGTAAAA', '123456789A', ('TTGTAA', '345678', '+', 2)),
            ('AATTGTAAAAA', '123456789AB', None),
            ('TTACAA', '123456', ('TTGTAA', '654321', '-', 0)),
            ('ATTACAA', '1234567', ('TTGTAA', '765432', '-', 0)),
            ('AATTACAA', '12345678', ('TTGTAA', '876543', '-', 0)),
            ('AAATTACAA', '123456789', None),
            ('TTACAAA', '1234567', ('TTGTAA', '654321', '-', 1)),
            ('ATTACAAA', '12345678', ('TTGTAA', '765432', '-', 1)),
            ('AATTACAAA', '123456789', ('TTGTAA', '876543', '-', 1)),
            ('AAATTACAAA', '123456789A', None),
            ('TTACAAAA', '12345678', ('TTGTAA', '654321', '-', 2)),
            ('ATTACAAAA', '123456789', ('TTGTAA', '765432', '-', 2)),
            ('AATTACAAAA', '123456789A', ('TTGTAA', '876543', '-', 2)),
            ('AAATTACAAAA', '123456789AB', None),
        ]

        for bases, quals, expected in cases:
            if expected is not None:
                gene_bases, gene_quals, strand, frame = expected
                expected = (sequences.Fastq('ID', gene_bases, gene_quals), strand, frame)
            self.assertEqual(sequences.Fastq('ID', bases, quals).make_into_gene(), expected)
Esempio n. 14
0
def run(description):
    '''Command-line entry point for "fastaq to_tiling_bam".

    Parses the command line, then pipes SAM records for perfect, unpaired
    reads tiling every sequence of the input file into
    "samtools view -hbS" to produce a BAM file. samtools must be in the path.

    description: accepted for interface compatibility; it is not used, the
                 help text is hard-coded below.

    Exits with status 1 if a samtools pipe cannot be opened or if the
    samtools process writing the BAM reports a non-zero exit status.
    '''
    import shlex  # function-scope import: only needed to quote filenames for the shell

    parser = argparse.ArgumentParser(
        description=
        'Takes a sequence file. Makes a BAM file containing perfect (unpaired) reads tiling the whole genome',
        usage=
        'fastaq to_tiling_bam [options] <infile> <read_length> <read_step> <read_prefix> <outfile>',
        epilog='Important: assumes that samtools is in your path')
    parser.add_argument('infile', help='Name of input fasta/q file')
    parser.add_argument('read_length', type=int, help='Length of reads')
    parser.add_argument('read_step',
                        type=int,
                        help='Distance between start of each read')
    parser.add_argument('read_prefix', help='Prefix of read names')
    parser.add_argument('outfile', help='Name of output BAM file')
    parser.add_argument(
        '--read_group',
        help='Add the given read group ID to all reads [%(default)s]',
        default='42')
    options = parser.parse_args()

    # A zero step makes range() raise ValueError, and non-positive values
    # cannot describe a tiling, so reject them up front with a usage error.
    if options.read_length < 1 or options.read_step < 1:
        parser.error('read_length and read_step must both be at least 1')

    # make a header first  - we need to add the @RG line to the default header made by samtools
    tmp_empty_file = options.outfile + '.tmp.empty'
    f = utils.open_file_write(tmp_empty_file)
    utils.close(f)
    try:
        try:
            # filenames are quoted so that spaces or shell metacharacters in
            # them cannot break (or inject into) the command line
            f = os.popen('samtools view -H -T ' + shlex.quote(options.infile) +
                         ' ' + shlex.quote(tmp_empty_file))
        except IOError:
            print('Error making tmp header file', file=sys.stderr)
            sys.exit(1)

        try:
            header_lines = f.readlines()
        finally:
            f.close()
    finally:
        # always remove the temporary file, including on the error paths
        os.unlink(tmp_empty_file)

    header_lines.append('@RG\tID:' + options.read_group + '\tSM:FAKE')

    seq_reader = sequences.file_reader(options.infile)
    try:
        f = os.popen('samtools view -hbS - > ' + shlex.quote(options.outfile),
                     'w')
    except IOError:
        print("Error opening for writing BAM file '" + options.outfile + "'",
              file=sys.stderr)
        sys.exit(1)

    try:
        print(''.join(header_lines), file=f)

        for seq in seq_reader:
            seq_length = len(seq)
            last_pos = seq_length - 1
            # a sequence shorter than one read still yields exactly one read
            end_range = 1 if seq_length < options.read_length else seq_length

            for i in range(0, end_range, options.read_step):
                if seq_length <= options.read_length:
                    start = 0
                    end = last_pos
                else:
                    start = i
                    end = start + options.read_length - 1

                    # final read: shift it back so it ends on the last base
                    # and keeps the full read length
                    if end > last_pos:
                        end = last_pos
                        start = end - options.read_length + 1

                # coordinates in the read name and the POS column are 1-based
                read = sequences.Fastq(
                    options.read_prefix + ':' + seq.id + ':' + str(start + 1) +
                    ':' + str(end + 1), seq[start:end + 1],
                    'I' * (end - start + 1))

                print('\t'.join([
                    read.id, '0', seq.id,
                    str(start + 1), '60',
                    str(len(read)) + 'M', '*', '*', '*', read.seq, read.qual,
                    'RG:Z:' + options.read_group
                ]),
                      file=f)

                # once a read reaches the end of the sequence, we are done with it
                if end == last_pos:
                    break
    finally:
        # close() on an os.popen pipe returns None on success, otherwise the
        # exit status of the command
        exit_status = f.close()

    if exit_status:
        print("Error writing BAM file '" + options.outfile + "'",
              file=sys.stderr)
        sys.exit(1)
Esempio n. 15
0
 def test_file_reader_fastq(self):
     '''file_reader should yield the expected record for every entry of a fastq file'''
     good_file = os.path.join(data_dir, 'sequences_test_good_file.fq')
     expected = sequences.Fastq('ID', 'ACGTA', 'IIIII')

     # every record yielded from the file is compared against the same expected record
     for record in sequences.file_reader(good_file):
         self.assertEqual(record, expected)
Esempio n. 16
0
def run(description):
    '''Command-line entry point for "fastaq to_perfect_reads".

    Parses the command line, then for each sequence in the input file
    simulates error-free read pairs: the insert size is drawn from a normal
    distribution and the position from a uniform distribution. Pairs are
    written interleaved to the output file, with the second read of each pair
    reverse complemented. With --fragments, the whole fragment of each pair is
    also written to a second file.

    description: accepted for interface compatibility; it is not used, the
                 help text is hard-coded below.
    '''
    parser = argparse.ArgumentParser(
        description = 'Makes perfect paired end fastq reads from a sequence file, with insert sizes sampled from a normal distribution. Read orientation is innies. Output is an interleaved FASTQ file.',
        usage = 'fastaq to_perfect_reads [options] <infile> <outfile> <mean insert size> <insert std deviation> <mean coverage> <read length>')
    parser.add_argument('infile', help='Name of input file')
    parser.add_argument('outfile', help='Name of output file')
    parser.add_argument('mean_insert', type=int, help='Mean insert size of read pairs', metavar='mean insert size')
    parser.add_argument('insert_std', type=float, help='Standard devation of insert size', metavar='insert std deviation')
    parser.add_argument('coverage', type=float, help='Mean coverage of the reads', metavar='mean coverage')
    parser.add_argument('readlength', type=int, help='Length of each read', metavar='read length')
    parser.add_argument('--fragments', help='Write FASTA sequences of fragments (i.e. read pairs plus sequences in between them) to the given filename', metavar='FILENAME')
    parser.add_argument('--no_n', action='store_true', help='Don\'t allow any N or n characters in the reads')
    parser.add_argument('--seed', type=int, help='Seed for random number generator. Default is to use python\'s default', default=None, metavar='INT')
    options = parser.parse_args()

    random.seed(a=options.seed)

    seq_reader = sequences.file_reader(options.infile)
    fout = utils.open_file_write(options.outfile)
    fout_frags = None
    pair_counter = 1

    # try/finally so that both output files are closed even if simulation fails
    try:
        if options.fragments:
            fout_frags = utils.open_file_write(options.fragments)

        for ref in seq_reader:
            # check if current seq is long enough. A sequence shorter than one
            # read can never satisfy readlength <= insert size <= len(ref), so
            # the insert-size loop below would never terminate for it.
            if len(ref) < options.mean_insert + 4 * options.insert_std or len(ref) < options.readlength:
                print('Warning, sequence ', ref.id, ' too short.  Skipping it...', file=sys.stderr)
                continue

            # work out how many reads to simulate
            read_pairs = int(0.5 * options.coverage * len(ref) / options.readlength)

            # it's possible that we pick the same fragment twice, in which case the
            # reads would get the same name. So remember the frag coords
            used_fragments = {}  # (middle_position, length) => count

            # do the simulation:  pick insert size from normal distribution, and
            # position in genome from uniform distribution
            x = 0
            while x < read_pairs:
                isize = int(random.normalvariate(options.mean_insert, options.insert_std))
                while isize > len(ref) or isize < options.readlength:
                    isize = int(random.normalvariate(options.mean_insert, options.insert_std))

                first_middle = ceil(0.5 * isize)
                last_middle = floor(len(ref) - 0.5 * isize)
                # When isize is odd and equal to len(ref), last_middle is one
                # less than first_middle and randint() would raise ValueError.
                # The only possible placement then starts at position 0, which
                # is what clamping the upper bound gives. All other cases are
                # sampled exactly as before.
                middle_pos = random.randint(first_middle, max(first_middle, last_middle))
                read_start1 = int(middle_pos - first_middle)
                read_start2 = read_start1 + isize - options.readlength

                # positions in the read name are 1-based
                readname = ':'.join([ref.id, str(pair_counter), str(read_start1+1), str(read_start2+1)])

                fragment = (middle_pos, isize)
                if fragment in used_fragments:
                    used_fragments[fragment] += 1
                    readname += '.dup.' + str(used_fragments[fragment])
                else:
                    used_fragments[fragment] = 1

                read1 = sequences.Fastq(readname + '/1', ref.seq[read_start1:read_start1 + options.readlength], 'I' * options.readlength)
                read2 = sequences.Fastq(readname + '/2', ref.seq[read_start2:read_start2 + options.readlength], 'I' * options.readlength)

                # NOTE(review): with --no_n a rejected pair is retried without
                # counting it, so a sequence where no N-free pair exists makes
                # this loop run forever -- consider a retry limit.
                if options.no_n and ('n' in read1.seq or 'N' in read1.seq or 'n' in read2.seq or 'N' in read2.seq):
                    continue

                read2.revcomp()

                print(read1, file=fout)
                print(read2, file=fout)

                if fout_frags is not None:
                    frag = sequences.Fasta(readname, ref.seq[read_start1:read_start2 + options.readlength])
                    print(frag, file=fout_frags)

                pair_counter += 1
                x += 1
    finally:
        utils.close(fout)
        if fout_frags is not None:
            utils.close(fout_frags)
Esempio n. 17
0
 def setUp(self):
     '''Create the Fastq record stored on the test case as self.fastq'''
     record = sequences.Fastq('ID', 'ACGTA', 'IIIII')
     self.fastq = record