def interleave(infile_1, infile_2, outfile):
    seq_reader_1 = sequences.file_reader(infile_1)
    seq_reader_2 = sequences.file_reader(infile_2)
    f_out = utils.open_file_write(outfile)

    for seq_1 in seq_reader_1:
        try:
            seq_2 = next(seq_reader_2)
        except StopIteration:
            utils.close(f_out)
            raise Error('Error getting mate for sequence', seq_1.id, ' ... cannot continue')

        print(seq_1, file=f_out)
        print(seq_2, file=f_out)

    # the second file should now be exhausted as well, otherwise the inputs were unpaired
    try:
        seq_2 = next(seq_reader_2)
    except StopIteration:
        seq_2 = None

    if seq_2 is not None:
        utils.close(f_out)
        raise Error('Error getting mate for sequence', seq_2.id, ' ... cannot continue')

    utils.close(f_out)
def interleave(infile_1, infile_2, outfile, suffix1=None, suffix2=None):
    '''Makes interleaved file from two sequence files. If used, will append
    suffix1 onto the end of every sequence name in infile_1, unless it already
    ends with suffix1. Similar for suffix2.'''
    seq_reader_1 = sequences.file_reader(infile_1)
    seq_reader_2 = sequences.file_reader(infile_2)
    f_out = utils.open_file_write(outfile)

    for seq_1 in seq_reader_1:
        try:
            seq_2 = next(seq_reader_2)
        except StopIteration:
            utils.close(f_out)
            raise Error('Error getting mate for sequence', seq_1.id, ' ... cannot continue')

        if suffix1 is not None and not seq_1.id.endswith(suffix1):
            seq_1.id += suffix1
        if suffix2 is not None and not seq_2.id.endswith(suffix2):
            seq_2.id += suffix2

        print(seq_1, file=f_out)
        print(seq_2, file=f_out)

    # the second file should now be exhausted as well, otherwise the inputs were unpaired
    try:
        seq_2 = next(seq_reader_2)
    except StopIteration:
        seq_2 = None

    if seq_2 is not None:
        utils.close(f_out)
        raise Error('Error getting mate for sequence', seq_2.id, ' ... cannot continue')

    utils.close(f_out)
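# Usage sketch (not from the source; file names are hypothetical): interleave two
# mate files, tagging read names with /1 and /2 unless they already carry them.
def _example_interleave():
    interleave('reads_1.fq', 'reads_2.fq', 'interleaved.fq', suffix1='/1', suffix2='/2')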
def test_file_reader_gff(self):
    '''Test read gff file'''
    good_files = [
        'sequences_test_gffv3.gff',
        'sequences_test_gffv3.no_FASTA_line.gff'
    ]
    good_files = [os.path.join(data_dir, x) for x in good_files]

    for f in good_files:
        reader = sequences.file_reader(f)
        counter = 1
        for seq in reader:
            self.assertEqual(seq, sequences.Fasta('seq' + str(counter), 'ACGTACGTAC'))
            counter += 1

    bad_files = [
        'sequences_test_gffv3.no_seq.gff',
        'sequences_test_gffv3.no_seq.2.gff'
    ]
    bad_files = [os.path.join(data_dir, x) for x in bad_files]

    for filename in bad_files:
        with self.assertRaises(sequences.Error):
            reader = sequences.file_reader(filename)
            for seq in reader:
                pass
def run(description):
    parser = argparse.ArgumentParser(
        description = 'Takes a random subset of reads from a sequence file and optionally the corresponding read ' +
                      'from a mates file. Output is interleaved if mates file given',
        usage = 'fastaq to_random_subset [options] <infile> <outfile> <percent>')
    parser.add_argument('--mate_file', help='Name of mates file')
    parser.add_argument('--seed', help='Seed for random number generator. If not given, python\'s default is used', metavar='INT')
    parser.add_argument('infile', help='Name of input file')
    parser.add_argument('outfile', help='Name of output file')
    parser.add_argument('percent', type=float, help='Per cent probability of keeping any given read (pair) in [0,100]', metavar='FLOAT')
    options = parser.parse_args()

    random.seed(a=options.seed)
    seq_reader = sequences.file_reader(options.infile)
    fout = utils.open_file_write(options.outfile)

    if options.mate_file:
        mate_seq_reader = sequences.file_reader(options.mate_file)

    for seq in seq_reader:
        if options.mate_file:
            try:
                mate_seq = next(mate_seq_reader)
            except StopIteration:
                print('Error! Didn\'t get mate for read', seq.id, file=sys.stderr)
                sys.exit(1)
        if 100 * random.random() <= options.percent:
            print(seq, file=fout)
            if options.mate_file:
                print(mate_seq, file=fout)

    utils.close(fout)
def filter(
      infile,
      outfile,
      minlength=0,
      maxlength=float('inf'),
      regex=None,
      ids_file=None,
      invert=False,
      mate_in=None,
      mate_out=None,
      both_mates_pass=True,
    ):
    ids_from_file = set()
    if ids_file is not None:
        f = utils.open_file_read(ids_file)
        for line in f:
            ids_from_file.add(line.rstrip())
        utils.close(f)

    if mate_in:
        if mate_out is None:
            raise Error('Error in filter! mate_in provided. Must also provide mate_out')
        seq_reader_mate = sequences.file_reader(mate_in)
        f_out_mate = utils.open_file_write(mate_out)

    seq_reader = sequences.file_reader(infile)
    f_out = utils.open_file_write(outfile)
    if regex is not None:
        r = re.compile(regex)

    def passes(seq):
        return minlength <= len(seq) <= maxlength \
            and (regex is None or r.search(seq.id) is not None) \
            and (ids_file is None or seq.id in ids_from_file)

    for seq in seq_reader:
        seq_passes = passes(seq)
        if mate_in:
            try:
                seq_mate = next(seq_reader_mate)
            except StopIteration:
                utils.close(f_out)
                raise Error('Error getting mate for sequence', seq.id, ' ... cannot continue')

            mate_passes = passes(seq_mate)
            want_the_pair = (seq_passes and mate_passes) \
                or ((seq_passes or mate_passes) and not both_mates_pass)
            if want_the_pair != invert:
                print(seq, file=f_out)
                print(seq_mate, file=f_out_mate)
        elif seq_passes != invert:
            print(seq, file=f_out)

    utils.close(f_out)
    if mate_in:
        utils.close(f_out_mate)
def fasta_to_fastq(fasta_in, qual_in, outfile):
    fa_reader = sequences.file_reader(fasta_in)
    qual_reader = sequences.file_reader(qual_in, read_quals=True)
    f_out = utils.open_file_write(outfile)

    for seq in fa_reader:
        qual = next(qual_reader)
        if seq.id != qual.id:
            utils.close(f_out)
            raise Error('Mismatch in names from fasta and qual file', seq.id, qual.id)

        qual.seq = [int(x) for x in qual.seq.split()]
        print(seq.to_Fastq(qual.seq), file=f_out)

    utils.close(f_out)
def trim_contigs(infile, outfile, trim):
    seq_reader = sequences.file_reader(infile)
    fout = utils.open_file_write(outfile)

    for seq in seq_reader:
        if len(seq) < 2 * trim:
            continue

        gaps = seq.gaps()
        bases = list(seq.seq)

        # extend the length of each gap
        for gap in gaps:
            left_start = max(gap.start - trim, 0)
            right_end = min(gap.end + trim + 1, len(seq))

            for i in range(left_start, gap.start):
                bases[i] = 'N'

            for i in range(gap.end, right_end):
                bases[i] = 'N'

        seq.seq = ''.join(bases)

        # trim start/end bases and tidy up any resulting Ns at either end of the trimmed seq
        seq.trim(trim, trim)
        seq.trim_Ns()

        # check that there is some non-N sequence left over
        regex = re.compile('[^nN]')
        if regex.search(seq.seq) is not None:
            print(seq, file=fout)

    utils.close(fout)
def count_sequences(infile):
    '''Returns the number of sequences in a file'''
    seq_reader = sequences.file_reader(infile)
    n = 0
    for seq in seq_reader:
        n += 1
    return n
def to_fasta(infile, outfile, line_length=60, strip_after_first_whitespace=False, check_unique=False):
    seq_reader = sequences.file_reader(infile)
    f_out = utils.open_file_write(outfile)
    original_line_length = sequences.Fasta.line_length
    sequences.Fasta.line_length = line_length
    if check_unique:
        used_names = {}

    for seq in seq_reader:
        if strip_after_first_whitespace:
            seq.strip_after_first_whitespace()

        if check_unique:
            used_names[seq.id] = used_names.get(seq.id, 0) + 1

        if type(seq) == sequences.Fastq:
            print(sequences.Fasta(seq.id, seq.seq), file=f_out)
        else:
            print(seq, file=f_out)

    utils.close(f_out)
    sequences.Fasta.line_length = original_line_length

    if check_unique:
        all_unique = True
        for name, count in used_names.items():
            if count > 1:
                print('Sequence name "' + name + '" not unique. Found', count, 'times', file=sys.stderr)
                all_unique = False
        if not all_unique:
            raise Error('Not all sequence names unique. Cannot continue')
def acgtn_only(infile, outfile):
    '''Replace every non-acgtn (case insensitive) character with an N'''
    f = utils.open_file_write(outfile)
    for seq in sequences.file_reader(infile):
        seq.replace_non_acgt()
        print(seq, file=f)
    utils.close(f)
def test_file_reader_fasta(self):
    '''file_reader should iterate through a fasta file correctly'''
    reader = sequences.file_reader(os.path.join(data_dir, 'sequences_test.fa'))
    counter = 1
    for seq in reader:
        self.assertEqual(seq, sequences.Fasta(str(counter), 'ACGTA'))
        counter += 1
def translate(infile, outfile, frame=0):
    seq_reader = sequences.file_reader(infile)
    fout = utils.open_file_write(outfile)
    for seq in seq_reader:
        print(seq.translate(frame=frame), file=fout)
    utils.close(fout)
def reverse_complement(infile, outfile):
    seq_reader = sequences.file_reader(infile)
    fout = utils.open_file_write(outfile)
    for seq in seq_reader:
        seq.revcomp()
        print(seq, file=fout)
    utils.close(fout)
def replace_bases(infile, outfile, old, new):
    seq_reader = sequences.file_reader(infile)
    f_out = utils.open_file_write(outfile)
    for seq in seq_reader:
        seq.replace_bases(old, new)
        print(seq, file=f_out)
    utils.close(f_out)
def strip_illumina_suffix(infile, outfile):
    seq_reader = sequences.file_reader(infile)
    f_out = utils.open_file_write(outfile)
    for seq in seq_reader:
        seq.strip_illumina_suffix()
        print(seq, file=f_out)
    utils.close(f_out)
def split_by_fixed_size(infile, outfiles_prefix, chunk_size, tolerance, skip_if_all_Ns=False):
    '''Splits fasta/q file into separate files, with up to (chunk_size + tolerance) bases in each file'''
    file_count = 1
    small_sequences = []  # sequences shorter than chunk_size
    seq_reader = sequences.file_reader(infile)
    f_coords = utils.open_file_write(outfiles_prefix + '.coords')

    for seq in seq_reader:
        if skip_if_all_Ns and seq.is_all_Ns():
            continue
        if len(seq) < chunk_size:
            small_sequences.append(copy.copy(seq))
        elif len(seq) <= chunk_size + tolerance:
            f = utils.open_file_write(outfiles_prefix + '.' + str(file_count))
            print(seq, file=f)
            utils.close(f)
            file_count += 1
        else:
            # make list of chunk coords
            chunks = [(x, x + chunk_size) for x in range(0, len(seq), chunk_size)]
            if chunks[-1][1] - 1 > len(seq):
                chunks[-1] = (chunks[-1][0], len(seq))
            if len(chunks) > 1 and (chunks[-1][1] - chunks[-1][0]) <= tolerance:
                chunks[-2] = (chunks[-2][0], chunks[-1][1])
                chunks.pop()

            # write one output file per chunk
            offset = 0
            for chunk in chunks:
                if not (skip_if_all_Ns and seq.is_all_Ns(start=chunk[0], end=chunk[1] - 1)):
                    f = utils.open_file_write(outfiles_prefix + '.' + str(file_count))
                    chunk_id = seq.id + ':' + str(chunk[0] + 1) + '-' + str(chunk[1])
                    print(sequences.Fasta(chunk_id, seq[chunk[0]:chunk[1]]), file=f)
                    print(chunk_id, seq.id, offset, sep='\t', file=f_coords)
                    utils.close(f)
                    file_count += 1
                offset += chunk[1] - chunk[0]

    # write files of small sequences
    if len(small_sequences):
        f = utils.open_file_write(outfiles_prefix + '.' + str(file_count))
        file_count += 1
        base_count = 0
        for seq in small_sequences:
            if base_count > 0 and base_count + len(seq) > chunk_size + tolerance:
                utils.close(f)
                f = utils.open_file_write(outfiles_prefix + '.' + str(file_count))
                file_count += 1
                base_count = 0
            print(seq, file=f)
            base_count += len(seq)
        utils.close(f)

    utils.close(f_coords)
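# Usage sketch (hypothetical file names): split an assembly into ~10 kb pieces,
# letting the last chunk of each sequence absorb up to 500 extra bases. Output
# goes to out.1, out.2, ..., plus out.coords mapping chunk names to originals.
def _example_split_by_fixed_size():
    split_by_fixed_size('assembly.fa', 'out', 10000, 500, skip_if_all_Ns=True)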
def search_for_seq(infile, outfile, search_string):
    seq_reader = sequences.file_reader(infile)
    fout = utils.open_file_write(outfile)
    for seq in seq_reader:
        hits = seq.search(search_string)
        for hit in hits:
            print(seq.id, hit[0] + 1, hit[1], sep='\t', file=fout)
    utils.close(fout)
def trim_Ns_at_end(infile, outfile):
    seq_reader = sequences.file_reader(infile)
    fout = utils.open_file_write(outfile)
    for seq in seq_reader:
        seq.trim_Ns()
        if len(seq):
            print(seq, file=fout)
    utils.close(fout)
def trim(infile, outfile, start, end):
    seq_reader = sequences.file_reader(infile)
    fout = utils.open_file_write(outfile)
    for seq in seq_reader:
        seq.trim(start, end)
        if len(seq):
            print(seq, file=fout)
    utils.close(fout)
def to_fasta_union(infile, outfile, seqname='union'):
    seq_reader = sequences.file_reader(infile)
    new_seq = []

    for seq in seq_reader:
        new_seq.append(seq.seq)

    f_out = utils.open_file_write(outfile)
    print(sequences.Fasta(seqname, ''.join(new_seq)), file=f_out)
    utils.close(f_out)
def to_boulderio(infile, outfile):
    '''Converts input sequence file into "Boulder-IO" format, as used by primer3'''
    seq_reader = sequences.file_reader(infile)
    f_out = utils.open_file_write(outfile)

    for sequence in seq_reader:
        print("SEQUENCE_ID=" + sequence.id, file=f_out)
        print("SEQUENCE_TEMPLATE=" + sequence.seq, file=f_out)
        print("=", file=f_out)

    utils.close(f_out)
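# For a record named seq1 with sequence ACGT, to_boulderio writes three lines:
#   SEQUENCE_ID=seq1
#   SEQUENCE_TEMPLATE=ACGT
#   =
# Usage sketch with hypothetical file names:
def _example_to_boulderio():
    to_boulderio('templates.fa', 'primer3_input.txt')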
def test_file_reader_embl(self):
    '''Test read embl file'''
    reader = sequences.file_reader(os.path.join(data_dir, 'sequences_test.embl'))

    counter = 1
    for seq in reader:
        self.assertEqual(seq, sequences.Fasta('seq' + str(counter), expected_embl[counter - 1]))
        counter += 1

    bad_files = [
        'sequences_test.embl.bad',
        'sequences_test.embl.bad2',
    ]
    bad_files = [os.path.join(data_dir, x) for x in bad_files]

    for filename in bad_files:
        with self.assertRaises(sequences.Error):
            reader = sequences.file_reader(filename)
            for seq in reader:
                pass
def expand_nucleotides(infile, outfile):
    seq_reader = sequences.file_reader(infile)
    fout = utils.open_file_write(outfile)

    for seq in seq_reader:
        seqs = seq.expand_nucleotides()
        if len(seqs) > 1:
            for s in seqs:
                print(s, file=fout)
        else:
            print(seq, file=fout)

    utils.close(fout)
def make_long_reads(infile, outfile, method='tiling', fixed_read_length=20000, tile_step=10000,
                    gamma_shape=1.2, gamma_scale=6000, coverage=10, gamma_min_length=20000,
                    seed=None, ins_skip=None, ins_window=None):
    assert method in ['tiling', 'gamma', 'uniform']
    assert ins_skip == ins_window == None or None not in [ins_skip, ins_window]

    if seed is not None:
        random.seed(a=seed)

    seq_reader = sequences.file_reader(infile)
    f = utils.open_file_write(outfile)

    for seq in seq_reader:
        if method == 'tiling':
            if len(seq) < fixed_read_length:
                print('Skipping sequence', seq.id, 'because it is too short at', len(seq), 'bases', file=sys.stderr)
                continue
            for i in range(0, len(seq), tile_step):
                end = min(len(seq), i + fixed_read_length)
                fa = sequences.Fasta('_'.join([seq.id, str(i + 1), str(end)]), seq[i:end])
                if ins_skip:
                    fa.add_insertions(skip=ins_skip, window=ins_window)
                print(fa, file=f)
                if end >= len(seq):
                    break
        elif method == 'gamma':
            if len(seq) < gamma_min_length:
                print('Skipping sequence', seq.id, 'because it is too short at', len(seq), 'bases', file=sys.stderr)
                continue
            total_read_length = 0
            while total_read_length < coverage * len(seq) - 0.5 * gamma_min_length:
                read_length = int(numpy.random.gamma(gamma_shape, scale=gamma_scale))
                while read_length < gamma_min_length or read_length > len(seq):
                    read_length = int(numpy.random.gamma(gamma_shape, scale=gamma_scale))

                start = random.randint(0, len(seq) - read_length)
                end = start + read_length - 1
                fa = sequences.Fasta('_'.join([seq.id, str(start + 1), str(end + 1)]), seq[start:end + 1])
                total_read_length += len(fa)
                if ins_skip:
                    fa.add_insertions(skip=ins_skip, window=ins_window)
                print(fa, file=f)
        elif method == 'uniform':
            if len(seq) < fixed_read_length:
                print('Skipping sequence', seq.id, 'because it is too short at', len(seq), 'bases', file=sys.stderr)
                continue
            total_read_length = 0
            while total_read_length < coverage * len(seq) - 0.5 * fixed_read_length:
                start = random.randint(0, len(seq) - fixed_read_length)
                end = start + fixed_read_length - 1
                fa = sequences.Fasta('_'.join([seq.id, str(start + 1), str(end + 1)]), seq[start:end + 1])
                total_read_length += len(fa)
                if ins_skip:
                    fa.add_insertions(skip=ins_skip, window=ins_window)
                print(fa, file=f)

    utils.close(f)
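# Usage sketch (hypothetical file names): simulate roughly 20x of long reads with
# gamma-distributed lengths, seeding the generators for reproducibility.
def _example_make_long_reads():
    make_long_reads('ref.fa', 'long_reads.fa', method='gamma', coverage=20, seed=42)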
def sequence_trim(infile_1, infile_2, outfile_1, outfile_2, to_trim_file, min_length=50, check_revcomp=False):
    to_trim_seqs = {}
    file_to_dict(to_trim_file, to_trim_seqs)
    trim_seqs = [x.seq for x in to_trim_seqs.values()]
    if check_revcomp:
        for seq in to_trim_seqs.values():
            seq.revcomp()
        trim_seqs_revcomp = [x.seq for x in to_trim_seqs.values()]
    else:
        trim_seqs_revcomp = []

    seq_reader_1 = sequences.file_reader(infile_1)
    seq_reader_2 = sequences.file_reader(infile_2)
    f_out_1 = utils.open_file_write(outfile_1)
    f_out_2 = utils.open_file_write(outfile_2)

    for seq_1 in seq_reader_1:
        try:
            seq_2 = next(seq_reader_2)
        except StopIteration:
            utils.close(f_out_1)
            utils.close(f_out_2)
            raise Error('Error getting mate for sequence', seq_1.id, ' ... cannot continue')

        for seq in seq_1, seq_2:
            for trim_seq in trim_seqs:
                if seq.seq.startswith(trim_seq):
                    seq.trim(len(trim_seq), 0)
                    break
            for trim_seq in trim_seqs_revcomp:
                if seq.seq.endswith(trim_seq):
                    seq.trim(0, len(trim_seq))
                    break

        if len(seq_1) >= min_length and len(seq_2) >= min_length:
            print(seq_1, file=f_out_1)
            print(seq_2, file=f_out_2)

    utils.close(f_out_1)
    utils.close(f_out_2)
def test_file_reader_phylip(self):
    '''Test read phylip file'''
    test_files = [
        'sequences_test_phylip.interleaved',
        'sequences_test_phylip.interleaved2',
        'sequences_test_phylip.sequential'
    ]
    test_files = [os.path.join(data_dir, f) for f in test_files]
    expected_seqs = [
        sequences.Fasta('Turkey', 'AACTNGGGCATTTCAGGGTGAGCCCGGGCAATACAGGGTAT'),
        sequences.Fasta('Salmo_gair', 'AAGCCTTGGCAGTGCAGGGTGAGCCGTGGCCGGGCACGGTAT'),
        sequences.Fasta('H. Sapiens', 'ACCGGTTGGCCGTTCAGGGTACAGGTTGGCCGTTCAGGGTAA')
    ]

    for fname in test_files:
        reader = sequences.file_reader(fname)
        i = 0
        for seq in reader:
            self.assertEqual(expected_seqs[i], seq)
            i += 1

    # files made by seaview are a little different in the first line.
    # Test one of these
    expected_seqs = [
        sequences.Fasta('seq1', 96 * 'G' + 'T'),
        sequences.Fasta('seq2', 94 * 'A' + 'G')
    ]

    reader = sequences.file_reader(os.path.join(data_dir, 'sequences_test_phylip.made_by_seaview'))
    i = 0
    for seq in reader:
        self.assertEqual(expected_seqs[i], seq)
        i += 1
def fastaq_to_fake_qual(infile, outfile, q=40):
    seq_reader = sequences.file_reader(infile)
    fout = utils.open_file_write(outfile)
    for seq in seq_reader:
        print('>' + seq.id, file=fout)
        if sequences.Fasta.line_length == 0:
            print(' '.join([str(q)] * len(seq)), file=fout)
        else:
            for i in range(0, len(seq), sequences.Fasta.line_length):
                print(' '.join([str(q)] * min(sequences.Fasta.line_length, len(seq) - i)), file=fout)
    utils.close(fout)
def mean_length(infile, limit=None):
    '''Returns the mean length of the sequences in the input file. By default uses all
       sequences. To limit to the first N sequences, use limit=N'''
    total = 0
    count = 0
    seq_reader = sequences.file_reader(infile)
    for seq in seq_reader:
        total += len(seq)
        count += 1
        if limit is not None and count >= limit:
            break

    assert count > 0
    return total / count
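# Usage sketch (hypothetical file name): estimate mean read length from the
# first 1000 records rather than scanning the whole file.
def _example_mean_length():
    return mean_length('reads.fq', limit=1000)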
def fastaq_to_orfs_gff(infile, outfile, min_length=300, tool_name='fastaq'):
    seq_reader = sequences.file_reader(infile)
    fout = utils.open_file_write(outfile)
    for seq in seq_reader:
        orfs = seq.all_orfs(min_length=min_length)
        for coords, revcomp in orfs:
            if revcomp:
                strand = '-'
            else:
                strand = '+'

            print(seq.id, tool_name, 'CDS', coords.start + 1, coords.end + 1, '.', strand, '.', sep='\t', file=fout)

    utils.close(fout)
def run(description):
    parser = argparse.ArgumentParser(
        description = 'Takes a random subset of reads from a sequence file and optionally the corresponding read ' +
                      'from a mates file. Output is interleaved if mates file given',
        usage = 'fastaq to_random_subset [options] <infile> <outfile> <percent>')
    parser.add_argument('--mate_file', help='Name of mates file')
    parser.add_argument('infile', help='Name of input file')
    parser.add_argument('outfile', help='Name of output file')
    parser.add_argument('percent', type=int, help='Per cent probability of keeping any given read (pair) in [0,100]', metavar='INT')
    options = parser.parse_args()

    seq_reader = sequences.file_reader(options.infile)
    fout = utils.open_file_write(options.outfile)

    if options.mate_file:
        mate_seq_reader = sequences.file_reader(options.mate_file)

    for seq in seq_reader:
        if options.mate_file:
            try:
                mate_seq = next(mate_seq_reader)
            except StopIteration:
                print('Error! Didn\'t get mate for read', seq.id, file=sys.stderr)
                sys.exit(1)
        if random.randint(0, 100) <= options.percent:
            print(seq, file=fout)
            if options.mate_file:
                print(mate_seq, file=fout)

    utils.close(fout)
def fastaq_to_mira_xml(infile, outfile):
    seq_reader = sequences.file_reader(infile)
    fout = utils.open_file_write(outfile)
    print('<?xml version="1.0"?>', '<trace_volume>', sep='\n', file=fout)

    for seq in seq_reader:
        print('    <trace>',
              '        <trace_name>' + seq.id + '</trace_name>',
              '        <clip_quality_right>' + str(len(seq)) + '</clip_quality_right>',
              '        <clip_vector_left>1</clip_vector_left>',
              '    </trace>', sep='\n', file=fout)

    print('</trace_volume>', file=fout)
    utils.close(fout)
def test_print_line_length(self):
    '''__str__ should be formatted correctly with the right number of chars per line of sequence'''
    line_lengths = [0, 3]
    correct_files = [os.path.join(data_dir, x) for x in ['sequences_test_one-per-line.fa', 'sequences_test_3-per-line.fa']]

    for i in range(len(line_lengths)):
        seq_reader = sequences.file_reader(os.path.join(data_dir, 'sequences_test_one-per-line.fa'))
        sequences.Fasta.line_length = line_lengths[i]
        tmp_out = 'tmp.line_length_test.fa'
        f = utils.open_file_write(tmp_out)
        for s in seq_reader:
            print(s, file=f)
        utils.close(f)
        self.assertTrue(filecmp.cmp(correct_files[i], tmp_out))
        os.unlink(tmp_out)

    sequences.Fasta.line_length = 60
def capillary_to_pairs(infile, outprefix):
    # hash the sequences, only taking longest where an end has been sequenced more than once
    seq_reader = sequences.file_reader(infile)
    fwd_seqs = {}
    rev_seqs = {}
    unpaired_seqs = {}

    for seq in seq_reader:
        id_info = seq.split_capillary_id()
        if id_info['dir'] == 'fwd':
            seq.id = id_info['prefix'] + '/1'
            h = fwd_seqs
        elif id_info['dir'] == 'rev':
            seq.id = id_info['prefix'] + '/2'
            h = rev_seqs
        else:
            seq.id = id_info['prefix']
            h = unpaired_seqs

        key = id_info['prefix']

        if key not in h or len(h[key]) < len(seq):
            h[key] = copy.copy(seq)

    # write the output files
    f_pe = utils.open_file_write(outprefix + '.paired.gz')
    f_up = utils.open_file_write(outprefix + '.unpaired.gz')

    for id in fwd_seqs:
        if id in rev_seqs:
            print(fwd_seqs[id], file=f_pe)
            print(rev_seqs[id], file=f_pe)
            del rev_seqs[id]
        else:
            print(fwd_seqs[id], file=f_up)

    for seq in rev_seqs.values():
        print(seq, file=f_up)

    for seq in unpaired_seqs.values():
        print(seq, file=f_up)

    utils.close(f_pe)
    utils.close(f_up)
def scaffolds_to_contigs(infile, outfile, number_contigs=False):
    '''Makes a file of contigs from scaffolds by splitting at every N.
       Use number_contigs=True to add .1, .2, etc onto the end of each contig
       name, instead of the default of appending the coordinates.'''
    seq_reader = sequences.file_reader(infile)
    fout = utils.open_file_write(outfile)

    for seq in seq_reader:
        contigs = seq.contig_coords()
        counter = 1
        for contig in contigs:
            if number_contigs:
                name = seq.id + '.' + str(counter)
                counter += 1
            else:
                name = '.'.join([seq.id, str(contig.start + 1), str(contig.end + 1)])

            print(sequences.Fasta(name, seq[contig.start:contig.end + 1]), file=fout)

    utils.close(fout)
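# A scaffold named scaf with contigs at positions 1-100 and 151-300 yields records
# named scaf.1.100 and scaf.151.300, or scaf.1 and scaf.2 with number_contigs=True.
# Usage sketch, hypothetical file names:
def _example_scaffolds_to_contigs():
    scaffolds_to_contigs('scaffolds.fa', 'contigs.fa', number_contigs=True)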
def get_fast5(fastq, fast5, subset, batch_size, outdir, extension, threads):
    """ Get Fast5 reads from basecalled Fastq """
    outdir.mkdir(parents=True, exist_ok=True)
    fq_files = fastq.glob(f"*{extension}")

    if subset:
        df = pandas.read_csv(subset, sep='\t')
        names = df.iloc[:, 0].tolist()
        print(f"Subset: {names}")
    else:
        names = []

    for fq in fq_files:
        name = fq.stem

        # skip files that are not in the requested subset
        if names and name not in names:
            print(f"{name} not in subset, ignoring")
            continue

        # create one read-id list file per Fastq
        read_ids = [seq.id for seq in sequences.file_reader(str(fq))]
        (outdir / name).mkdir(parents=True, exist_ok=True)
        read_id_list = outdir / name / f"{name}.txt"
        with open(read_id_list, 'w') as outfile:
            for read_id in read_ids:
                outfile.write(read_id + '\n')

        # Run ONT Fast5 API:
        print(f"Fetching Fast5 for: {fq}")
        run_cmd(
            f"fast5_subset --input {fast5} --save_path {outdir / name} --read_id_list {read_id_list} "
            f"--batch_size {batch_size} --recursive --filename_base {name}_ --threads {threads}"
        )
def get_seqs_flanking_gaps(infile, outfile, left, right):
    seq_reader = sequences.file_reader(infile)
    fout = utils.open_file_write(outfile)

    print('#id', 'gap_start', 'gap_end', 'left_bases', 'right_bases', sep='\t', file=fout)

    for seq in seq_reader:
        gaps = seq.gaps()
        for gap in gaps:
            left_start = max(gap.start - left, 0)
            right_end = min(gap.end + right + 1, len(seq))
            print(seq.id,
                  gap.start + 1,
                  gap.end + 1,
                  seq.seq[left_start:gap.start],
                  seq.seq[gap.end + 1:right_end],
                  sep='\t', file=fout)

    utils.close(fout)
def to_fasta(infile, outfile, line_length=60, strip_after_first_whitespace=False):
    seq_reader = sequences.file_reader(infile)
    f_out = utils.open_file_write(outfile)
    original_line_length = sequences.Fasta.line_length
    sequences.Fasta.line_length = line_length

    for seq in seq_reader:
        if strip_after_first_whitespace:
            seq.strip_after_first_whitespace()

        if type(seq) == sequences.Fastq:
            print(sequences.Fasta(seq.id, seq.seq), file=f_out)
        else:
            print(seq, file=f_out)

    utils.close(f_out)
    sequences.Fasta.line_length = original_line_length
def enumerate_names(infile, outfile, start_index=1, keep_illumina_suffix=False, rename_file=None, suffix=None):
    seq_reader = sequences.file_reader(infile)
    fout_seqs = utils.open_file_write(outfile)
    counter = start_index

    if keep_illumina_suffix:
        sequence_suffixes = ['/1', '/2']
    else:
        sequence_suffixes = []

    if rename_file is not None:
        fout_rename = utils.open_file_write(rename_file)
        print('#old\tnew', file=fout_rename)

    for seq in seq_reader:
        old_id = seq.id
        seq.id = str(counter)

        for suff in sequence_suffixes:
            if old_id.endswith(suff):
                seq.id += suff
                break

        if rename_file is not None:
            print(old_id, seq.id, sep='\t', file=fout_rename)

        if suffix is not None:
            seq.id += suffix

        print(seq, file=fout_seqs)
        counter += 1

    utils.close(fout_seqs)
    if rename_file is not None:
        utils.close(fout_rename)
def merge_to_one_seq(infile, outfile, seqname='union'):
    '''Takes a multi fasta or fastq file and writes a new file that contains just one
       sequence, with the original sequences catted together, preserving their order'''
    seq_reader = sequences.file_reader(infile)
    seqs = []

    for seq in seq_reader:
        seqs.append(copy.copy(seq))

    new_seq = ''.join([seq.seq for seq in seqs])

    if type(seqs[0]) == sequences.Fastq:
        new_qual = ''.join([seq.qual for seq in seqs])
        seqs[:] = []
        merged = sequences.Fastq(seqname, new_seq, new_qual)
    else:
        merged = sequences.Fasta(seqname, new_seq)
        seqs[:] = []

    f = utils.open_file_write(outfile)
    print(merged, file=f)
    utils.close(f)
def split_by_fixed_size_onefile(infile, outfile, chunk_size, tolerance, skip_if_all_Ns=False):
    '''Splits each sequence in infile into chunks of fixed size, last chunk can be up to
       (chunk_size + tolerance) in length'''
    seq_reader = sequences.file_reader(infile)
    f_out = utils.open_file_write(outfile)
    for seq in seq_reader:
        for i in range(0, len(seq), chunk_size):
            if i + chunk_size + tolerance >= len(seq):
                end = len(seq)
            else:
                end = i + chunk_size

            subseq = seq.subseq(i, end)
            if not (skip_if_all_Ns and subseq.is_all_Ns()):
                subseq.id += '.' + str(i + 1) + '_' + str(end)
                print(subseq, file=f_out)

            if end == len(seq):
                break

    utils.close(f_out)
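# Each chunk keeps its parent name with '.<start>_<end>' appended (1-based start),
# e.g. chr1.1_10000 then chr1.10001_20000. Usage sketch, hypothetical file names:
def _example_split_by_fixed_size_onefile():
    split_by_fixed_size_onefile('genome.fa', 'chunks.fa', 10000, 500)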
def to_unique_by_id(infile, outfile):
    seq_reader = sequences.file_reader(infile)
    seqs = {}
    ids_in_order = []

    # hash the reads, keeping the longest one when we get the same
    # name more than once
    for seq in seq_reader:
        if len(seq) == 0:
            continue
        if seq.id not in seqs:
            seqs[seq.id] = copy.copy(seq)
            ids_in_order.append(seq.id)
        elif len(seqs[seq.id]) < len(seq):
            seqs[seq.id] = copy.copy(seq)

    # write the output
    f_out = utils.open_file_write(outfile)
    for id in ids_in_order:
        print(seqs[id], file=f_out)
    utils.close(f_out)
def deinterleave(infile, outfile_1, outfile_2, fasta_out=False):
    seq_reader = sequences.file_reader(infile)
    f_1 = utils.open_file_write(outfile_1)
    f_2 = utils.open_file_write(outfile_2)

    for seq in seq_reader:
        if fasta_out:
            print(sequences.Fasta(seq.id, seq.seq), file=f_1)
        else:
            print(seq, file=f_1)

        # the mate must be assigned to seq, otherwise the first read of the
        # pair would be written to both output files
        try:
            seq = next(seq_reader)
        except StopIteration:
            utils.close(f_1)
            utils.close(f_2)
            raise Error('Error getting mate for sequence. Cannot continue')

        if fasta_out:
            print(sequences.Fasta(seq.id, seq.seq), file=f_2)
        else:
            print(seq, file=f_2)

    utils.close(f_1)
    utils.close(f_2)
def to_fastg(infile, outfile, circular=None):
    '''Writes a FASTG file in SPAdes format from input file. Currently only whether or not
       a sequence is circular is supported. Put circular=set of ids, or circular=filename
       to make those sequences circular in the output. Puts coverage=1 on all contigs'''
    if circular is None:
        to_circularise = set()
    elif type(circular) is not set:
        f = utils.open_file_read(circular)
        to_circularise = set([x.rstrip() for x in f.readlines()])
        utils.close(f)
    else:
        to_circularise = circular

    seq_reader = sequences.file_reader(infile)
    fout = utils.open_file_write(outfile)
    nodes = 1

    for seq in seq_reader:
        new_id = '_'.join([
            'NODE', str(nodes),
            'length', str(len(seq)),
            'cov', '1',
            'ID', seq.id
        ])

        if seq.id in to_circularise:
            seq.id = new_id + ':' + new_id + ';'
            print(seq, file=fout)
            seq.revcomp()
            seq.id = new_id + "':" + new_id + "';"
            print(seq, file=fout)
        else:
            seq.id = new_id + ';'
            print(seq, file=fout)
            seq.revcomp()
            seq.id = new_id + "';"
            print(seq, file=fout)

        nodes += 1

    utils.close(fout)
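# Contigs are renamed NODE_<n>_length_<len>_cov_1_ID_<original id> and written on
# both strands, as SPAdes-style FASTG expects. Usage sketch with a hypothetical
# set of sequence names to mark as circular:
def _example_to_fastg():
    to_fastg('assembly.fa', 'assembly.fastg', circular={'plasmid1'})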
def split_by_base_count(infile, outfiles_prefix, max_bases, max_seqs=None):
    '''Splits a fasta/q file into separate files, file size determined by number of bases.

    Puts <= max_bases in each split file. The exception is that a single sequence
    >= max_bases is put in its own file. This does not split sequences.
    '''
    seq_reader = sequences.file_reader(infile)
    base_count = 0
    file_count = 1
    seq_count = 0
    fout = None
    if max_seqs is None:
        max_seqs = float('inf')

    for seq in seq_reader:
        if base_count == 0:
            fout = utils.open_file_write(outfiles_prefix + '.' + str(file_count))
            file_count += 1

        if base_count + len(seq) > max_bases or seq_count >= max_seqs:
            if base_count == 0:
                print(seq, file=fout)
                utils.close(fout)
            else:
                utils.close(fout)
                fout = utils.open_file_write(outfiles_prefix + '.' + str(file_count))
                print(seq, file=fout)
                base_count = len(seq)
                file_count += 1
                seq_count = 1
        else:
            base_count += len(seq)
            seq_count += 1
            print(seq, file=fout)

    utils.close(fout)
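# Usage sketch (hypothetical names): split reads into files of at most 5 Mb of
# sequence and at most 50000 reads each, named reads_split.1, reads_split.2, ...
def _example_split_by_base_count():
    split_by_base_count('reads.fq', 'reads_split', 5000000, max_seqs=50000)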
def filter(
      infile,
      outfile,
      minlength=0,
      maxlength=float('inf'),
      regex=None,
      ids_file=None,
      invert=False,
      mate_in=None,
      mate_out=None,
      both_mates_pass=True,
    ):
    ids_from_file = set()
    if ids_file is not None:
        f = utils.open_file_read(ids_file)
        for line in f:
            ids_from_file.add(line.rstrip())
        utils.close(f)

    if mate_in:
        if mate_out is None:
            raise Error('Error in filter! mate_in provided. Must also provide mate_out')
        seq_reader_mate = sequences.file_reader(mate_in)
        f_out_mate = utils.open_file_write(mate_out)

    seq_reader = sequences.file_reader(infile)
    f_out = utils.open_file_write(outfile)
    if regex is not None:
        r = re.compile(regex)

    def passes(seq, name_regex):
        # remove trailing comments from FASTQ read name lines
        matches = name_regex.match(seq.id)
        if matches is not None:
            clean_seq_id = matches.group(1)
        else:
            clean_seq_id = seq.id

        return minlength <= len(seq) <= maxlength \
            and (regex is None or r.search(clean_seq_id) is not None) \
            and (ids_file is None or clean_seq_id in ids_from_file)

    name_regex = re.compile(r'^([^\s]+).*?$')

    for seq in seq_reader:
        seq_passes = passes(seq, name_regex)
        if mate_in:
            try:
                seq_mate = next(seq_reader_mate)
            except StopIteration:
                utils.close(f_out)
                raise Error('Error getting mate for sequence', seq.id, ' ... cannot continue')

            mate_passes = passes(seq_mate, name_regex)
            want_the_pair = (seq_passes and mate_passes) \
                or ((seq_passes or mate_passes) and not both_mates_pass)
            if want_the_pair != invert:
                print(seq, file=f_out)
                print(seq_mate, file=f_out_mate)
        elif seq_passes != invert:
            print(seq, file=f_out)

    utils.close(f_out)
    if mate_in:
        utils.close(f_out_mate)
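# Usage sketch (hypothetical file names): keep read pairs where both mates are
# between 50 and 500 bases, writing the two mates to separate output files.
def _example_filter_pairs():
    filter('reads_1.fq', 'filtered_1.fq', minlength=50, maxlength=500,
           mate_in='reads_2.fq', mate_out='filtered_2.fq', both_mates_pass=True)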
def test_file_reader_bad_format(self):
    '''file_reader should die properly when not given fasta or fastq file'''
    with self.assertRaises(sequences.Error):
        reader = sequences.file_reader(os.path.join(data_dir, 'sequences_test_not_a_fastaq_file'))
        for seq in reader:
            pass
def test_file_reader_fastq(self):
    '''file_reader should iterate through a fastq file correctly'''
    reader = sequences.file_reader(os.path.join(data_dir, 'sequences_test_good_file.fq'))
    for seq in reader:
        self.assertEqual(seq, sequences.Fastq('ID', 'ACGTA', 'IIIII'))
def file_to_dict(infile, d):
    seq_reader = sequences.file_reader(infile)
    for seq in seq_reader:
        d[seq.id] = copy.copy(seq)
def get_ids(infile, outfile):
    seq_reader = sequences.file_reader(infile)
    f_out = utils.open_file_write(outfile)
    for seq in seq_reader:
        print(seq.id, file=f_out)
    utils.close(f_out)
def _get_max_length(self, filename):
    '''Get the length of the longest contig in file'''
    contigs = sequences.file_reader(filename)
    max_length = max([len(x) for x in contigs])
    return max_length + 20  # adding 20 for extra allowance when running promer
def run(description):
    parser = argparse.ArgumentParser(
        description = 'Makes perfect paired end fastq reads from a sequence file, with insert sizes sampled from a normal distribution. Read orientation is innies. Output is an interleaved FASTQ file.',
        usage = 'fastaq to_perfect_reads [options] <infile> <outfile> <mean insert size> <insert std deviation> <mean coverage> <read length>')
    parser.add_argument('infile', help='Name of input file')
    parser.add_argument('outfile', help='Name of output file')
    parser.add_argument('mean_insert', type=int, help='Mean insert size of read pairs', metavar='mean insert size')
    parser.add_argument('insert_std', type=float, help='Standard deviation of insert size', metavar='insert std deviation')
    parser.add_argument('coverage', type=float, help='Mean coverage of the reads', metavar='mean coverage')
    parser.add_argument('readlength', type=int, help='Length of each read', metavar='read length')
    parser.add_argument('--fragments', help='Write FASTA sequences of fragments (i.e. read pairs plus sequences in between them) to the given filename', metavar='FILENAME')
    parser.add_argument('--no_n', action='store_true', help='Don\'t allow any N or n characters in the reads')
    parser.add_argument('--seed', type=int, help='Seed for random number generator. Default is to use python\'s default', default=None, metavar='INT')
    options = parser.parse_args()

    random.seed(a=options.seed)

    seq_reader = sequences.file_reader(options.infile)
    fout = utils.open_file_write(options.outfile)
    pair_counter = 1

    if options.fragments:
        fout_frags = utils.open_file_write(options.fragments)

    for ref in seq_reader:
        # check if current seq is long enough
        if len(ref) < options.mean_insert + 4 * options.insert_std:
            print('Warning, sequence ', ref.id, ' too short. Skipping it...', file=sys.stderr)
            continue

        # work out how many reads to simulate
        read_pairs = int(0.5 * options.coverage * len(ref) / options.readlength)

        # it's possible that we pick the same fragment twice, in which case the
        # reads would get the same name. So remember the frag coords
        used_fragments = {}  # (middle_position, length) => count

        # do the simulation: pick insert size from normal distribution, and
        # position in genome from uniform distribution
        x = 0
        while x < read_pairs:
            isize = int(random.normalvariate(options.mean_insert, options.insert_std))
            while isize > len(ref) or isize < options.readlength:
                isize = int(random.normalvariate(options.mean_insert, options.insert_std))
            middle_pos = random.randint(ceil(0.5 * isize), floor(len(ref) - 0.5 * isize))
            read_start1 = int(middle_pos - ceil(0.5 * isize))
            read_start2 = read_start1 + isize - options.readlength

            readname = ':'.join([ref.id, str(pair_counter), str(read_start1 + 1), str(read_start2 + 1)])

            fragment = (middle_pos, isize)
            if fragment in used_fragments:
                used_fragments[fragment] += 1
                readname += '.dup.' + str(used_fragments[fragment])
            else:
                used_fragments[fragment] = 1

            read1 = sequences.Fastq(readname + '/1', ref.seq[read_start1:read_start1 + options.readlength], 'I' * options.readlength)
            read2 = sequences.Fastq(readname + '/2', ref.seq[read_start2:read_start2 + options.readlength], 'I' * options.readlength)

            if options.no_n and ('n' in read1.seq or 'N' in read1.seq or 'n' in read2.seq or 'N' in read2.seq):
                continue

            read2.revcomp()

            print(read1, file=fout)
            print(read2, file=fout)

            if options.fragments:
                frag = sequences.Fasta(readname, ref.seq[read_start1:read_start2 + options.readlength])
                print(frag, file=fout_frags)

            pair_counter += 1
            x += 1

    utils.close(fout)
    if options.fragments:
        utils.close(fout_frags)
def run(description):
    parser = argparse.ArgumentParser(
        description = 'Takes a sequence file. Makes a BAM file containing perfect (unpaired) reads tiling the whole genome',
        usage = 'fastaq to_tiling_bam [options] <infile> <read_length> <read_step> <read_prefix> <outfile>',
        epilog = 'Important: assumes that samtools is in your path')
    parser.add_argument('infile', help='Name of input fasta/q file')
    parser.add_argument('read_length', type=int, help='Length of reads')
    parser.add_argument('read_step', type=int, help='Distance between start of each read')
    parser.add_argument('read_prefix', help='Prefix of read names')
    parser.add_argument('outfile', help='Name of output BAM file')
    parser.add_argument('--read_group', help='Add the given read group ID to all reads [%(default)s]', default='42')
    options = parser.parse_args()

    # make a header first - we need to add the @RG line to the default header made by samtools
    tmp_empty_file = options.outfile + '.tmp.empty'
    f = utils.open_file_write(tmp_empty_file)
    utils.close(f)
    try:
        f = os.popen('samtools view -H -T ' + options.infile + ' ' + tmp_empty_file)
    except IOError:
        print('Error making tmp header file', file=sys.stderr)
        sys.exit(1)

    header_lines = f.readlines()
    header_lines.append('@RG\tID:' + options.read_group + '\tSM:FAKE')
    f.close()
    os.unlink(tmp_empty_file)

    seq_reader = sequences.file_reader(options.infile)
    try:
        f = os.popen('samtools view -hbS - > ' + options.outfile, 'w')
    except IOError:
        print("Error opening for writing BAM file '" + options.outfile + "'", file=sys.stderr)
        sys.exit(1)

    print(''.join(header_lines), file=f)

    for seq in seq_reader:
        end_range = len(seq)
        if len(seq) < options.read_length:
            end_range = 1
        for i in range(0, end_range, options.read_step):
            if len(seq) <= options.read_length:
                start = 0
                end = len(seq) - 1
            else:
                start = i
                end = start + options.read_length - 1

                if end > len(seq) - 1:
                    end = len(seq) - 1
                    start = end - options.read_length + 1

            read = sequences.Fastq(options.read_prefix + ':' + seq.id + ':' + str(start + 1) + ':' + str(end + 1),
                                   seq[start:end + 1],
                                   'I' * (end - start + 1))

            print('\t'.join([read.id,
                             '0',
                             seq.id,
                             str(start + 1),
                             '60',
                             str(len(read)) + 'M',
                             '*',
                             '*',
                             '*',
                             read.seq,
                             read.qual,
                             'RG:Z:' + options.read_group]), file=f)

            if end == len(seq) - 1:
                break

    f.close()