def test_expand_nucleotides(self):
    '''expand_nucleotides() should list every unambiguous version of a sequence'''
    # (input name, input sequence, expected expanded sequences in output order).
    # Expanded records are expected to be named <input name>.<1-based index>.
    fasta_cases = [
        ('1', 'A', ['A']),
        ('2', 'C', ['C']),
        ('3', 'G', ['G']),
        ('4', 'T', ['T']),
        ('6', 'R', ['A', 'G']),
        ('7', 'Y', ['C', 'T']),
        ('8', 'S', ['C', 'G']),
        ('9', 'W', ['A', 'T']),
        ('10', 'K', ['G', 'T']),
        ('11', 'M', ['A', 'C']),
        ('12', 'B', ['C', 'G', 'T']),
        ('13', 'D', ['A', 'G', 'T']),
        ('14', 'H', ['A', 'C', 'T']),
        ('15', 'V', ['A', 'C', 'G']),
        ('16', 'N', ['A', 'C', 'G', 'T']),
        ('17', 'ART', ['AAT', 'AGT']),
        ('18', 'ARRT', ['AAAT', 'AAGT', 'AGAT', 'AGGT']),
        ('19', 'ARTR', ['AATA', 'AATG', 'AGTA', 'AGTG']),
    ]

    for name, bases, expansions in fasta_cases:
        expected = [
            sequences.Fasta(name + '.' + str(number), expanded)
            for number, expanded in enumerate(expansions, start=1)
        ]
        self.assertListEqual(sequences.Fasta(name, bases).expand_nucleotides(), expected)

    # Fastq input: each expanded record is expected to keep the quality string
    fastq_in = sequences.Fastq('20', 'ART', 'GHI')
    fastq_expected = [
        sequences.Fastq('20.1', 'AAT', 'GHI'),
        sequences.Fastq('20.2', 'AGT', 'GHI'),
    ]
    self.assertListEqual(fastq_in.expand_nucleotides(), fastq_expected)
def test_get_next_from_file(self):
    '''get_next_from_file() should read seqs from OK, and raise error at badly formatted file'''
    bad_files = [
        'sequences_test_fail_no_AT.fq',
        'sequences_test_fail_no_seq.fq',
        'sequences_test_fail_no_plus.fq',
        'sequences_test_fail_no_qual.fq',
    ]

    for basename in bad_files:
        f_in = utils.open_file_read(os.path.join(data_dir, basename))
        # try/finally so the handle is released even when the assertion fails
        try:
            fq = sequences.Fastq()
            with self.assertRaises(sequences.Error):
                while fq.get_next_from_file(f_in):
                    pass
        finally:
            utils.close(f_in)

    # A missing fixture now surfaces as an ordinary test error (with a
    # traceback) instead of calling sys.exit() from inside the test.
    fname = os.path.join(data_dir, 'sequences_test_good_file.fq')
    with open(fname) as f_in:
        fq = sequences.Fastq()
        # every record in the good file is expected to be the same read
        while fq.get_next_from_file(f_in):
            self.assertEqual(fq, sequences.Fastq('ID', 'ACGTA', 'IIIII'))
def test_subseq(self):
    '''subseq() should return the requested slice of both sequence and qualities'''
    fq = sequences.Fastq('name', 'ACGTA', 'FGHIJ')

    # (start, end, expected bases, expected qualities); None leaves that end open
    cases = [
        (1, 4, 'CGT', 'GHI'),
        (None, 4, 'ACGT', 'FGHI'),
        (1, None, 'CGTA', 'GHIJ'),
    ]
    for start, end, bases, quals in cases:
        self.assertEqual(fq.subseq(start, end), sequences.Fastq('name', bases, quals))
def test_translate(self):
    '''Test nucleotide -> amino acid conversion works on Fastq'''
    # One line of codons per amino acid, in the same order as the residues
    # of the expected protein below.
    dna = (
        'GCAGCCGCGGCT'          # A
        'AGAAGGCGACGCCGGCGT'    # R
        'AACAAT'                # N
        'GACGAT'                # D
        'TGCTGT'                # C
        'GAAGAG'                # E
        'CAACAG'                # Q
        'GGAGGCGGGGGT'          # G
        'CACCAT'                # H
        'ATAATCATT'             # I
        'TTATTGCTACTCCTGCTT'    # L
        'AAAAAG'                # K
        'ATG'                   # M
        'TTCTTT'                # F
        'CCACCCCCGCCT'          # P
        'AGCAGTTCATCCTCGTCT'    # S
        'ACAACCACGACT'          # T
        'TGG'                   # W
        'TACTAT'                # Y
        'GTAGTCGTGGTT'          # V
        'TAATAGTGA'             # stop
    )
    protein = 'AAAARRRRRRNNDDCCEEQQGGGGHHIIILLLLLLKKMFFPPPPSSSSSSTTTTWYYVVVV***'

    # quality strings are all 'I', one character per base / per residue
    fq = sequences.Fastq('ID', dna, 'I' * len(dna))
    expected = sequences.Fastq('ID', protein, 'I' * len(protein))
    self.assertEqual(expected, fq.translate())
def test_to_Fastq(self):
    '''to_Fastq() should encode quality scores, coping with out of range values'''
    fa = sequences.Fasta('X', 'AAAAA')

    # -1 is expected to encode like 0 ('!') and 94 like 93 ('~')
    scores = [-1, 0, 40, 93, 94]
    expected = sequences.Fastq('X', 'AAAAA', '!!I~~')
    self.assertEqual(expected, fa.to_Fastq(scores))

    # a quality argument longer than the sequence must be rejected
    with self.assertRaises(sequences.Error):
        fa.to_Fastq('AAAAAAAAAAAAA')
def test_trim_Ns(self):
    '''trim_Ns() should strip runs of N/n from both ends, leaving internal ones'''
    expected = sequences.Fastq('ID', 'ANNANA', '111111')

    untrimmed = [
        'ANNANA',
        'NANNANA',
        'NANNANAN',
        'ANNANAN',
        'NNNNNNANNANAN',
        'NNANNANANn',
    ]
    # each read gets one '1' quality character per base
    reads = [sequences.Fastq('ID', bases, '1' * len(bases)) for bases in untrimmed]

    for read in reads:
        read.trim_Ns()
        self.assertEqual(expected, read)
def test_trim(self):
    '''trim() should remove the requested number of bases from each end'''
    full = '1234567890'

    # (bases off the start, bases off the end, what should be left)
    cases = [
        (0, 0, '1234567890'),
        (1, 0, '234567890'),
        (0, 1, '123456789'),
        (2, 2, '345678'),
    ]
    for from_start, from_end, remaining in cases:
        fq = sequences.Fastq('ID', full, full)
        fq.trim(from_start, from_end)
        self.assertEqual(fq, sequences.Fastq('ID', remaining, remaining))
def merge_to_one_seq(infile, outfile, seqname='union'):
    '''Takes a multi fasta or fastq file and writes a new file that contains
    just one sequence, with the original sequences catted together,
    preserving their order.

    infile: name of the input fasta/fastq file
    outfile: name of the file the single merged record is written to
    seqname: name given to the merged sequence

    The output is a Fastq record if the first input record is a Fastq,
    otherwise a Fasta record.

    Raises sequences.Error if the input file contains no sequences.'''
    seq_parts = []
    qual_parts = []
    is_fastq = None  # decided by the first record

    # Only the sequence/quality strings are kept, so there is no need to
    # copy each record object.
    for seq in sequences.file_reader(infile):
        if is_fastq is None:
            is_fastq = isinstance(seq, sequences.Fastq)
        seq_parts.append(seq.seq)
        if is_fastq:
            qual_parts.append(seq.qual)

    if is_fastq is None:
        # previously this case died with an uninformative IndexError
        raise sequences.Error("No sequences found in file '" + str(infile) + "'. Cannot merge")

    merged_seq = ''.join(seq_parts)
    if is_fastq:
        merged = sequences.Fastq(seqname, merged_seq, ''.join(qual_parts))
    else:
        merged = sequences.Fasta(seqname, merged_seq)

    f = utils.open_file_write(outfile)
    try:
        print(merged, file=f)
    finally:
        # close the output even if writing fails
        utils.close(f)
def test_replace_interval(self):
    '''replace_interval() should swap the given (inclusive) interval for new sequence'''
    # (start, end, sequence expected after inserting 'NEW')
    fasta_cases = [
        (0, 0, 'NEWCGTA'),
        (4, 4, 'ACGTNEW'),
        (2, 3, 'ACNEWA'),
    ]
    for start, end, expected_seq in fasta_cases:
        fa = sequences.Fasta('ID', 'ACGTA')
        fa.replace_interval(start, end, 'NEW')
        self.assertEqual(fa, sequences.Fasta('ID', expected_seq))

    # start after end, end beyond the sequence, start beyond the sequence
    fa = sequences.Fasta('ID', 'ACGTA')
    for start, end in [(3, 2), (1, 5), (5, 10)]:
        with self.assertRaises(sequences.Error):
            fa.replace_interval(start, end, 'x')

    # (start, end, expected sequence, expected qualities after inserting 'NEW'/'III')
    fastq_cases = [
        (0, 0, 'NEWCGTA', 'IIIBCDE'),
        (4, 4, 'ACGTNEW', 'ABCDIII'),
        (2, 3, 'ACNEWA', 'ABIIIE'),
    ]
    for start, end, expected_seq, expected_qual in fastq_cases:
        fq = sequences.Fastq('ID', 'ACGTA', 'ABCDE')
        fq.replace_interval(start, end, 'NEW', 'III')
        self.assertEqual(fq, sequences.Fastq('ID', expected_seq, expected_qual))

    # fq is still the record from the last case above; new sequence and
    # new qualities of different lengths must be rejected
    with self.assertRaises(sequences.Error):
        fq.replace_interval(1, 1, 'x', 'xx')
def test_to_Fasta_and_qual(self):
    '''to_Fasta_and_qual() should return the Fasta plus a list of integer quality scores'''
    fasta, scores = sequences.Fastq('ID', 'ACGT', '>ADI').to_Fasta_and_qual()
    self.assertEqual(fasta, sequences.Fasta('ID', 'ACGT'))
    # expected scores are the ASCII codes of '>', 'A', 'D', 'I' minus 33
    self.assertListEqual(scores, [29, 32, 35, 40])
def test_revcomp(self):
    '''revcomp() should reverse complement the bases, keep case, and reverse the qualities'''
    record = sequences.Fastq('ID', 'ACGTNacgtn', '1234567890')
    expected = sequences.Fastq('ID', 'nacgtNACGT', '0987654321')
    record.revcomp()
    self.assertEqual(record, expected)
def test_init_length_mismatch(self):
    '''Constructing a Fastq whose sequence and quality lengths differ should raise an error'''
    # one base but two quality characters
    self.assertRaises(sequences.Error, sequences.Fastq, 'X', 'A', 'II')
def test_make_into_gene_fastq(self):
    '''make_into_gene() on a Fastq should return (gene, strand, frame), or None if no gene'''
    print('sequences.genetic_code', sequences.genetic_code)

    # (input bases, input qualities, expected result), where the expected
    # result is None or (gene bases, gene qualities, strand, frame)
    cases = [
        ('T', '1', None),
        ('TT', '12', None),
        ('TTT', '123', None),
        ('TTG', '123', None),
        ('TAA', '123', None),
        ('TTGAAATAA', '123456789', ('TTGAAATAA', '123456789', '+', 0)),
        ('TTGAAATAT', '123456789', None),
        ('TTGTAA', '123456', ('TTGTAA', '123456', '+', 0)),
        ('TTGTAAA', '1234567', ('TTGTAA', '123456', '+', 0)),
        ('TTGTAAAA', '12345678', ('TTGTAA', '123456', '+', 0)),
        ('TTGTAAAAA', '123456789', None),
        ('ATTGTAA', '1234567', ('TTGTAA', '234567', '+', 1)),
        ('ATTGTAAA', '12345678', ('TTGTAA', '234567', '+', 1)),
        ('ATTGTAAAA', '123456789', ('TTGTAA', '234567', '+', 1)),
        ('ATTGTAAAAA', '123456789A', None),
        ('AATTGTAA', '12345678', ('TTGTAA', '345678', '+', 2)),
        ('AATTGTAAA', '123456789', ('TTGTAA', '345678', '+', 2)),
        ('AATTGTAAAA', '123456789A', ('TTGTAA', '345678', '+', 2)),
        ('AATTGTAAAAA', '123456789AB', None),
        ('TTACAA', '123456', ('TTGTAA', '654321', '-', 0)),
        ('ATTACAA', '1234567', ('TTGTAA', '765432', '-', 0)),
        ('AATTACAA', '12345678', ('TTGTAA', '876543', '-', 0)),
        ('AAATTACAA', '123456789', None),
        ('TTACAAA', '1234567', ('TTGTAA', '654321', '-', 1)),
        ('ATTACAAA', '12345678', ('TTGTAA', '765432', '-', 1)),
        ('AATTACAAA', '123456789', ('TTGTAA', '876543', '-', 1)),
        ('AAATTACAAA', '123456789A', None),
        ('TTACAAAA', '12345678', ('TTGTAA', '654321', '-', 2)),
        ('ATTACAAAA', '123456789', ('TTGTAA', '765432', '-', 2)),
        ('AATTACAAAA', '123456789A', ('TTGTAA', '876543', '-', 2)),
        ('AAATTACAAAA', '123456789AB', None),
    ]

    for bases, quals, gene in cases:
        if gene is None:
            expected = None
        else:
            gene_bases, gene_quals, strand, frame = gene
            expected = (sequences.Fastq('ID', gene_bases, gene_quals), strand, frame)
        self.assertEqual(sequences.Fastq('ID', bases, quals).make_into_gene(), expected)
def _tiling_intervals(seq_length, read_length, read_step):
    '''Yields 0-based, inclusive (start, end) coordinates of reads tiling a
    sequence of length seq_length. Read starts are read_step apart. The final
    read is shifted back so that it ends on the last base; a sequence no
    longer than read_length gives a single read covering all of it, and an
    empty sequence gives no reads.'''
    if seq_length == 0:
        return

    last = seq_length - 1
    if seq_length <= read_length:
        yield 0, last
        return

    for start in range(0, seq_length, read_step):
        end = start + read_length - 1
        if end > last:
            # read would run off the end: slide it back to finish on the last base
            end = last
            start = end - read_length + 1
        yield start, end
        if end == last:
            return


def run(description):
    '''Command line task: makes a BAM file of perfect, unpaired reads that
    tile every sequence of the input file.

    Arguments are parsed from the command line. samtools must be in the path:
    it is used to make the header and to convert the reads (written as SAM
    to a pipe) into BAM.

    description: not used. The parser description is hard-coded below.

    Exits with an error message if read_length or read_step is not positive,
    or if samtools reports a failure when writing the BAM file.'''
    import shlex

    parser = argparse.ArgumentParser(
        description='Takes a sequence file. Makes a BAM file containing perfect (unpaired) reads tiling the whole genome',
        usage='fastaq to_tiling_bam [options] <infile> <read_length> <read_step> <read_prefix> <outfile>',
        epilog='Important: assumes that samtools is in your path')
    parser.add_argument('infile', help='Name of input fasta/q file')
    parser.add_argument('read_length', type=int, help='Length of reads')
    parser.add_argument('read_step', type=int, help='Distance between start of each read')
    parser.add_argument('read_prefix', help='Prefix of read names')
    parser.add_argument('outfile', help='Name of output BAM file')
    parser.add_argument('--read_group', help='Add the given read group ID to all reads [%(default)s]', default='42')
    options = parser.parse_args()

    if options.read_length < 1:
        parser.error('read_length must be at least 1')
    if options.read_step < 1:
        parser.error('read_step must be at least 1')

    # make a header first - we need to add the @RG line to the default header made by samtools
    tmp_empty_file = options.outfile + '.tmp.empty'
    f = utils.open_file_write(tmp_empty_file)
    utils.close(f)

    try:
        try:
            # filenames are quoted so that spaces or shell metacharacters in
            # them cannot break (or alter) the command
            f = os.popen('samtools view -H -T ' + shlex.quote(options.infile) + ' ' + shlex.quote(tmp_empty_file))
        except IOError:
            print('Error making tmp header file', file=sys.stderr)
            sys.exit(1)

        header_lines = f.readlines()
        f.close()
    finally:
        # always remove the temporary file, including on the error path
        os.unlink(tmp_empty_file)

    header_lines.append('@RG\tID:' + options.read_group + '\tSM:FAKE')

    seq_reader = sequences.file_reader(options.infile)

    try:
        f = os.popen('samtools view -hbS - > ' + shlex.quote(options.outfile), 'w')
    except IOError:
        print("Error opening for writing BAM file '" + options.outfile + "'", file=sys.stderr)
        sys.exit(1)

    print(''.join(header_lines), file=f)

    for seq in seq_reader:
        for start, end in _tiling_intervals(len(seq), options.read_length, options.read_step):
            # read names and SAM positions are 1-based
            read = sequences.Fastq(
                ':'.join([options.read_prefix, seq.id, str(start + 1), str(end + 1)]),
                seq[start:end + 1],
                'I' * (end - start + 1))

            print('\t'.join([
                read.id,
                '0',
                seq.id,
                str(start + 1),
                '60',
                str(len(read)) + 'M',
                '*',
                '*',
                '*',
                read.seq,
                read.qual,
                'RG:Z:' + options.read_group
            ]), file=f)

    # close() on an os.popen() pipe returns None on success, otherwise the
    # exit status of the command
    exit_status = f.close()
    if exit_status is not None:
        print("Error writing BAM file '" + options.outfile + "'. samtools exit status: " + str(exit_status), file=sys.stderr)
        sys.exit(1)
def test_file_reader_fastq(self):
    '''file_reader should yield the expected record for every entry of a fastq file'''
    expected = sequences.Fastq('ID', 'ACGTA', 'IIIII')
    fastq_path = os.path.join(data_dir, 'sequences_test_good_file.fq')
    for record in sequences.file_reader(fastq_path):
        self.assertEqual(record, expected)
def run(description):
    '''Command line task: simulates perfect paired end reads from each
    sequence of the input file and writes them as interleaved FASTQ.

    Arguments are parsed from the command line. For each input sequence the
    number of read pairs is 0.5 * coverage * sequence length / read length.
    Insert sizes are drawn from a normal distribution and fragment positions
    from a uniform distribution. The second read of each pair is reverse
    complemented. With --fragments, the whole fragment of each pair is also
    written, as FASTA, to the given file.

    description: not used. The parser description is hard-coded below.

    Exits with an error message if the read length is not positive.
    Sequences shorter than mean insert + 4 standard deviations, or shorter
    than the read length, are skipped with a warning on stderr.'''
    parser = argparse.ArgumentParser(
        description = 'Makes perfect paired end fastq reads from a sequence file, with insert sizes sampled from a normal distribution. Read orientation is innies. Output is an interleaved FASTQ file.',
        usage = 'fastaq to_perfect_reads [options] <infile> <outfile> <mean insert size> <insert std deviation> <mean coverage> <read length>')
    parser.add_argument('infile', help='Name of input file')
    parser.add_argument('outfile', help='Name of output file')
    parser.add_argument('mean_insert', type=int, help='Mean insert size of read pairs', metavar='mean insert size')
    parser.add_argument('insert_std', type=float, help='Standard devation of insert size', metavar='insert std deviation')
    parser.add_argument('coverage', type=float, help='Mean coverage of the reads', metavar='mean coverage')
    parser.add_argument('readlength', type=int, help='Length of each read', metavar='read length')
    parser.add_argument('--fragments', help='Write FASTA sequences of fragments (i.e. read pairs plus sequences in between them) to the given filename', metavar='FILENAME')
    parser.add_argument('--no_n', action='store_true', help='Don\'t allow any N or n characters in the reads')
    parser.add_argument('--seed', type=int, help='Seed for random number generator. Default is to use python\'s default', default=None, metavar='INT')
    options = parser.parse_args()

    # a read length of zero would divide by zero when working out the number
    # of read pairs
    if options.readlength < 1:
        parser.error('read length must be at least 1')

    random.seed(a=options.seed)

    seq_reader = sequences.file_reader(options.infile)
    fout = utils.open_file_write(options.outfile)
    fout_frags = None
    pair_counter = 1

    try:
        if options.fragments:
            fout_frags = utils.open_file_write(options.fragments)

        for ref in seq_reader:
            # check if current seq is long enough. It must also be able to
            # hold one read, otherwise no insert size can ever be accepted
            # below and the sampling loop would never finish
            too_short = (len(ref) < options.mean_insert + 4 * options.insert_std
                         or len(ref) < options.readlength)
            if too_short:
                print('Warning, sequence ', ref.id, ' too short. Skipping it...', file=sys.stderr)
                continue

            # work out how many reads to simulate
            read_pairs = int(0.5 * options.coverage * len(ref) / options.readlength)

            # it's possible that we pick the same fragment twice. Remember
            # the frag coords, so repeats can be flagged in the read name
            used_fragments = {}  # (middle_position, length) => count

            # do the simulation: pick insert size from normal distribution, and
            # position in genome from uniform distribution.
            # NOTE(review): with --no_n, a sequence where every candidate pair
            # contains an N is retried forever - consider a retry limit
            x = 0
            while x < read_pairs:
                isize = int(random.normalvariate(options.mean_insert, options.insert_std))
                # resample until the fragment fits in the sequence and can hold a read
                while isize > len(ref) or isize < options.readlength:
                    isize = int(random.normalvariate(options.mean_insert, options.insert_std))

                middle_pos = random.randint(ceil(0.5 * isize), floor(len(ref) - 0.5 * isize))
                read_start1 = int(middle_pos - ceil(0.5 * isize))
                read_start2 = read_start1 + isize - options.readlength

                # read names use 1-based start positions
                readname = ':'.join([ref.id, str(pair_counter), str(read_start1 + 1), str(read_start2 + 1)])

                fragment = (middle_pos, isize)
                times_used = used_fragments.get(fragment, 0) + 1
                used_fragments[fragment] = times_used
                if times_used > 1:
                    readname += '.dup.' + str(times_used)

                read1 = sequences.Fastq(readname + '/1', ref.seq[read_start1:read_start1 + options.readlength], 'I' * options.readlength)
                read2 = sequences.Fastq(readname + '/2', ref.seq[read_start2:read_start2 + options.readlength], 'I' * options.readlength)

                # x is not incremented here, so a rejected pair is replaced by a new one
                if options.no_n and ('n' in read1.seq or 'N' in read1.seq or 'n' in read2.seq or 'N' in read2.seq):
                    continue

                read2.revcomp()

                print(read1, file=fout)
                print(read2, file=fout)

                if fout_frags is not None:
                    frag = sequences.Fasta(readname, ref.seq[read_start1:read_start2 + options.readlength])
                    print(frag, file=fout_frags)

                pair_counter += 1
                x += 1
    finally:
        # close the outputs even if something above fails
        utils.close(fout)
        if fout_frags is not None:
            utils.close(fout_frags)
def setUp(self):
    '''Store a small Fastq record on self.fastq'''
    record = sequences.Fastq('ID', 'ACGTA', 'IIIII')
    self.fastq = record