def filter(
    infile,
    outfile,
    minlength=0,
    maxlength=float('inf'),
    regex=None,
    ids_file=None,
    invert=False,
    mate_in=None,
    mate_out=None,
    both_mates_pass=True,
):
    '''Filters sequences from infile into outfile.

    A sequence passes when its length is in [minlength, maxlength], its id
    matches regex (if given) and its id is listed in ids_file (if given).
    invert=True writes the sequences that fail instead. When mate_in/mate_out
    are given, reads are filtered as pairs: by default both mates must pass
    (both_mates_pass=True); otherwise one passing mate keeps the pair.

    Raises Error if mate_in is given without mate_out, or if mate_in has fewer
    sequences than infile.
    '''
    ids_from_file = set()
    if ids_file is not None:
        f = utils.open_file_read(ids_file)
        for line in f:
            ids_from_file.add(line.rstrip())
        utils.close(f)

    if mate_in:
        if mate_out is None:
            raise Error(
                'Error in filter! mate_in provided. Must also provide mate_out'
            )
        seq_reader_mate = sequences.file_reader(mate_in)
        f_out_mate = utils.open_file_write(mate_out)

    seq_reader = sequences.file_reader(infile)
    f_out = utils.open_file_write(outfile)
    if regex is not None:
        r = re.compile(regex)

    def passes(seq):
        # A sequence passes when it satisfies every filter that was requested.
        return minlength <= len(seq) <= maxlength \
            and (regex is None or r.search(seq.id) is not None) \
            and (ids_file is None or seq.id in ids_from_file)

    for seq in seq_reader:
        seq_passes = passes(seq)
        if mate_in:
            try:
                seq_mate = next(seq_reader_mate)
            except StopIteration:
                # was a bare "except:"; also close the mate output file,
                # which was previously leaked on this error path
                utils.close(f_out)
                utils.close(f_out_mate)
                raise Error('Error getting mate for sequence', seq.id, ' ... cannot continue')

            mate_passes = passes(seq_mate)
            want_the_pair = (seq_passes and mate_passes) \
                or ((seq_passes or mate_passes) and not both_mates_pass)
            if want_the_pair != invert:
                print(seq, file=f_out)
                print(seq_mate, file=f_out_mate)
        elif seq_passes != invert:
            print(seq, file=f_out)

    utils.close(f_out)
    if mate_in:
        utils.close(f_out_mate)
def filter(
    infile,
    outfile,
    minlength=0,
    maxlength=float('inf'),
    regex=None,
    ids_file=None,
    invert=False,
    mate_in=None,
    mate_out=None,
    both_mates_pass=True,
):
    '''Filters sequences from infile into outfile.

    A sequence passes when its length is within [minlength, maxlength], its id
    matches regex (if given) and its id is in ids_file (if given). invert=True
    writes the failing sequences instead. With mate_in/mate_out, reads are
    filtered as pairs: both mates must pass when both_mates_pass=True, else a
    single passing mate keeps the pair.
    '''
    # collect the allowed ids, one per line
    ids_from_file = set()
    if ids_file is not None:
        f = utils.open_file_read(ids_file)
        for line in f:
            ids_from_file.add(line.rstrip())
        utils.close(f)

    if mate_in:
        if mate_out is None:
            raise Error('Error in filter! mate_in provided. Must also provide mate_out')
        seq_reader_mate = sequences.file_reader(mate_in)
        f_out_mate = utils.open_file_write(mate_out)

    seq_reader = sequences.file_reader(infile)
    f_out = utils.open_file_write(outfile)
    if regex is not None:
        # compiled once; used inside passes() below
        r = re.compile(regex)

    def passes(seq):
        # True when the sequence satisfies every filter that was requested
        return minlength <= len(seq) <= maxlength \
            and (regex is None or r.search(seq.id) is not None) \
            and (ids_file is None or seq.id in ids_from_file)

    for seq in seq_reader:
        seq_passes = passes(seq)
        if mate_in:
            try:
                seq_mate = next(seq_reader_mate)
            except:
                # NOTE(review): bare except, and f_out_mate is left open on
                # this error path — consider "except StopIteration" and
                # closing both outputs.
                utils.close(f_out)
                raise Error('Error getting mate for sequence', seq.id, ' ... cannot continue')

            mate_passes = passes(seq_mate)
            # pair is wanted if both pass, or any passes when both_mates_pass is False
            want_the_pair = (seq_passes and mate_passes) \
                or ((seq_passes or mate_passes) and not both_mates_pass)
            # "!=" implements the invert option (xor)
            if want_the_pair != invert:
                print(seq, file=f_out)
                print(seq_mate, file=f_out_mate)
        elif seq_passes != invert:
            print(seq, file=f_out)

    utils.close(f_out)
    if mate_in:
        utils.close(f_out_mate)
def split_by_fixed_size(infile, outfiles_prefix, chunk_size, tolerance, skip_if_all_Ns=False):
    '''Splits fasta/q file into separate files, with up to (chunk_size + tolerance) bases in each file'''
    file_count = 1
    small_sequences = []  # sequences shorter than chunk_size, batched at the end
    seq_reader = sequences.file_reader(infile)
    f_coords = utils.open_file_write(outfiles_prefix + '.coords')

    for seq in seq_reader:
        if skip_if_all_Ns and seq.is_all_Ns():
            continue
        if len(seq) < chunk_size:
            # file_reader reuses its sequence object elsewhere in this
            # project, so store a copy
            small_sequences.append(copy.copy(seq))
        elif len(seq) <= chunk_size + tolerance:
            # sequence fits in one file as-is
            f = utils.open_file_write(outfiles_prefix + '.' + str(file_count))
            print(seq, file=f)
            utils.close(f)
            file_count += 1
        else:
            # make list of chunk coords
            chunks = [(x, x + chunk_size) for x in range(0, len(seq), chunk_size)]
            # clamp the final chunk to the end of the sequence. The old test
            # "chunks[-1][1] - 1 > len(seq)" missed the case where the final
            # chunk overshot by exactly one base, producing a wrong chunk id.
            if chunks[-1][1] > len(seq):
                chunks[-1] = (chunks[-1][0], len(seq))
            # merge an undersized final chunk into the previous one
            if len(chunks) > 1 and (chunks[-1][1] - chunks[-1][0]) <= tolerance:
                chunks[-2] = (chunks[-2][0], chunks[-1][1])
                chunks.pop()

            # write one output file per chunk
            offset = 0
            for chunk in chunks:
                if not (skip_if_all_Ns and seq.is_all_Ns(start=chunk[0], end=chunk[1] - 1)):
                    f = utils.open_file_write(outfiles_prefix + '.' + str(file_count))
                    chunk_id = seq.id + ':' + str(chunk[0] + 1) + '-' + str(chunk[1])
                    print(sequences.Fasta(chunk_id, seq[chunk[0]:chunk[1]]), file=f)
                    print(chunk_id, seq.id, offset, sep='\t', file=f_coords)
                    utils.close(f)
                    file_count += 1
                offset += chunk[1] - chunk[0]

    # batch the small sequences into files of up to (chunk_size + tolerance) bases
    if len(small_sequences):
        f = utils.open_file_write(outfiles_prefix + '.' + str(file_count))
        file_count += 1
        base_count = 0
        for seq in small_sequences:
            if base_count > 0 and base_count + len(seq) > chunk_size + tolerance:
                utils.close(f)
                f = utils.open_file_write(outfiles_prefix + '.' + str(file_count))
                file_count += 1
                base_count = 0
            print(seq, file=f)
            base_count += len(seq)
        utils.close(f)

    # was missing: the coords file was never closed
    utils.close(f_coords)
def test_raise_exception(self):
    '''open_file_write() and open_file_read() should raise an exception when can't do the opening'''
    unreadable = (
        'this_file_is_not_here_so_throw_error',
        'this_file_is_not_here_so_throw_error.gz',
        os.path.join(data_dir, 'utils_test_not_really_zipped.gz'),
    )
    for filename in unreadable:
        with self.assertRaises(utils.Error):
            utils.open_file_read(filename)

    unwritable = (
        os.path.join('not_a_directory', 'this_file_is_not_here_so_throw_error'),
        os.path.join('not_a_directory', 'this_file_is_not_here_so_throw_error.gz'),
    )
    for filename in unwritable:
        with self.assertRaises(utils.Error):
            utils.open_file_write(filename)
def make_random_contigs(contigs, length, outfile, name_by_letters=False, prefix='', seed=None, first_number=1):
    '''Makes a multi fasta file of random sequences, all the same length'''
    random.seed(a=seed)
    fout = utils.open_file_write(outfile)
    alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

    for i in range(contigs):
        if name_by_letters:
            # names cycle A..Z, wrapping back to A
            name = alphabet[i % len(alphabet)]
        else:
            name = str(i + first_number)
        bases = ''.join(random.choice('ACGT') for _ in range(length))
        print(sequences.Fasta(prefix + name, bases), file=fout)

    utils.close(fout)
def get_seqs_flanking_gaps(infile, outfile, left, right):
    '''For each gap in each sequence, writes a tab-delimited line: sequence id,
    1-based gap start/end, and up to `left`/`right` bases flanking the gap.'''
    fout = utils.open_file_write(outfile)
    print('#id', 'gap_start', 'gap_end', 'left_bases', 'right_bases', sep='\t', file=fout)

    for seq in sequences.file_reader(infile):
        for gap in seq.gaps():
            # clamp the flanking windows to the sequence boundaries
            flank_start = max(gap.start - left, 0)
            flank_end = min(gap.end + right + 1, len(seq))
            print(seq.id,
                  gap.start + 1,
                  gap.end + 1,
                  seq.seq[flank_start:gap.start],
                  seq.seq[gap.end + 1:flank_end],
                  sep='\t', file=fout)

    utils.close(fout)
def run(description):
    '''Command-line entry point: keeps each read (or read pair) with the given
    per cent probability and writes the survivors, interleaved when a mates
    file is supplied.'''
    parser = argparse.ArgumentParser(
        description = 'Takes a random subset of reads from a sequence file and optionally the corresponding read ' + 'from a mates file. Output is interleaved if mates file given',
        usage = 'fastaq to_random_subset [options] <infile> <outfile> <percent>')
    parser.add_argument('--mate_file', help='Name of mates file')
    parser.add_argument('--seed', help='Seed for random number generator. If not given, python\'s default is used', metavar='INT')
    parser.add_argument('infile', help='Name of input file')
    parser.add_argument('outfile', help='Name of output file')
    parser.add_argument('percent', type=float, help='Per cent probability of keeping any given read (pair) in [0,100]', metavar='FLOAT')
    options = parser.parse_args()

    random.seed(a=options.seed)
    seq_reader = sequences.file_reader(options.infile)
    fout = utils.open_file_write(options.outfile)
    mate_reader = sequences.file_reader(options.mate_file) if options.mate_file else None

    for seq in seq_reader:
        mate_seq = None
        if mate_reader is not None:
            try:
                mate_seq = next(mate_reader)
            except StopIteration:
                print('Error! Didn\'t get mate for read', seq.id, file=sys.stderr)
                sys.exit(1)
        # one RNG draw per read pair decides whether to keep it
        if 100 * random.random() <= options.percent:
            print(seq, file=fout)
            if mate_seq is not None:
                print(mate_seq, file=fout)

    utils.close(fout)
def to_fasta(infile, outfile, line_length=60, strip_after_first_whitespace=False, check_unique=False):
    '''Writes infile as fasta with the given line-wrap length. Optionally
    truncates each name at its first whitespace, and/or raises Error after
    writing when names are not unique.'''
    seq_reader = sequences.file_reader(infile)
    f_out = utils.open_file_write(outfile)
    # temporarily override the class-level wrap length, restored below
    original_line_length = sequences.Fasta.line_length
    sequences.Fasta.line_length = line_length
    used_names = {} if check_unique else None

    for seq in seq_reader:
        if strip_after_first_whitespace:
            seq.strip_after_first_whitespace()
        if used_names is not None:
            used_names[seq.id] = used_names.get(seq.id, 0) + 1

        # fastq records are re-emitted as plain fasta
        if type(seq) == sequences.Fastq:
            print(sequences.Fasta(seq.id, seq.seq), file=f_out)
        else:
            print(seq, file=f_out)

    utils.close(f_out)
    sequences.Fasta.line_length = original_line_length

    if used_names is not None:
        duplicated = {name: count for name, count in used_names.items() if count > 1}
        for name, count in duplicated.items():
            print('Sequence name "' + name + '" not unique. Found', count, 'times', file=sys.stderr)
        if duplicated:
            raise Error('Not all sequence names unique. Cannot continue')
def acgtn_only(infile, outfile):
    '''Replace every non-acgtn (case insensitive) character with an N'''
    fout = utils.open_file_write(outfile)
    for seq in sequences.file_reader(infile):
        seq.replace_non_acgt()
        print(seq, file=fout)
    utils.close(fout)
def interleave(infile_1, infile_2, outfile, suffix1=None, suffix2=None):
    '''Makes interleaved file from two sequence files. If used, will append
    suffix1 onto end of every sequence name in infile_1, unless it already
    ends with suffix1. Similar for suffix2.

    Raises Error if the two input files do not have the same number of
    sequences.'''
    seq_reader_1 = sequences.file_reader(infile_1)
    seq_reader_2 = sequences.file_reader(infile_2)
    f_out = utils.open_file_write(outfile)

    for seq_1 in seq_reader_1:
        try:
            seq_2 = next(seq_reader_2)
        except StopIteration:
            # was a bare "except:"; infile_2 ran out of sequences first
            utils.close(f_out)
            raise Error('Error getting mate for sequence', seq_1.id, ' ... cannot continue')

        if suffix1 is not None and not seq_1.id.endswith(suffix1):
            seq_1.id += suffix1
        if suffix2 is not None and not seq_2.id.endswith(suffix2):
            seq_2.id += suffix2

        print(seq_1, file=f_out)
        print(seq_2, file=f_out)

    # infile_2 must be exhausted as well, otherwise the files were out of sync
    try:
        seq_2 = next(seq_reader_2)
    except StopIteration:
        seq_2 = None

    if seq_2 is not None:
        utils.close(f_out)
        raise Error('Error getting mate for sequence', seq_2.id, ' ... cannot continue')

    utils.close(f_out)
def run(self):
    '''Finds overlaps between contigs (self-vs-self nucmer), optionally trims
    them, and writes every contig to self.output_file.

    Runs inside self.working_directory; the original working directory is
    restored at the end.
    '''
    original_dir = os.getcwd()
    os.chdir(self.working_directory)
    contigs_in_file = set(self.contigs.keys())
    # Only run nucmer when there is at least one contig that is not skipped
    # and no alignments were supplied up front
    if contigs_in_file != self.ids_to_skip and not self.alignments:
        self.alignments = utils.run_nucmer(self.fasta_file, self.fasta_file, self._build_alignments_filename(), min_percent_id=self.overlap_percent_identity)
    output_fw = fastaqutils.open_file_write(self.output_file)
    for contig_id in sorted(self.contigs.keys()):
        # Look for overlaps, trim if applicable
        if contig_id not in self.ids_to_skip:
            best_overlap = self._find_best_overlap(contig_id)
            trim_status = None
            if best_overlap and self.trim:
                trim_status = self._trim(contig_id, best_overlap)
            self._write_summary(contig_id, best_overlap, trim_status)
        # every contig is written out, trimmed or not (skipped ids pass
        # through unchanged)
        print(sequences.Fasta(contig_id, self.contigs[contig_id].seq), file=output_fw)
    fastaqutils.close(output_fw)
    # tasks.sort_by_size(self._build_intermediate_filename(), self.output_file) # Sort contigs in final file according to size
    if not self.debug:
        utils.delete(self._build_alignments_filename())
        # utils.delete(self._build_intermediate_filename())
    os.chdir(original_dir)
def interleave(infile_1, infile_2, outfile):
    '''Makes an interleaved file from two sequence files, which must contain
    the mates in the same order.

    Raises Error if the two input files do not have the same number of
    sequences.'''
    seq_reader_1 = sequences.file_reader(infile_1)
    seq_reader_2 = sequences.file_reader(infile_2)
    f_out = utils.open_file_write(outfile)

    for seq_1 in seq_reader_1:
        try:
            seq_2 = next(seq_reader_2)
        except StopIteration:
            # was a bare "except:"; infile_2 ran out of sequences first
            utils.close(f_out)
            raise Error('Error getting mate for sequence', seq_1.id, ' ... cannot continue')
        print(seq_1, file=f_out)
        print(seq_2, file=f_out)

    # infile_2 must be exhausted too, otherwise the files were out of sync
    try:
        seq_2 = next(seq_reader_2)
    except StopIteration:
        seq_2 = None

    if seq_2 is not None:
        utils.close(f_out)
        raise Error('Error getting mate for sequence', seq_2.id, ' ... cannot continue')

    utils.close(f_out)
def fix_blast_coords(blast_file, coords_file, outfile):
    '''Rewrites tabular blast output: for every hit whose query name appears
    in coords_file, shifts columns 7 and 8 by the recorded offset and restores
    the original query name.'''
    offsets = offset_coords_file_to_dict(coords_file)
    f_in = utils.open_file_read(blast_file)
    f_out = utils.open_file_write(outfile)

    for line in f_in:
        # blastn writes header lines into the tabulated output; they contain
        # no tab characters, so skip them
        if '\t' not in line:
            continue

        # Split on any whitespace: some lines have a stray space after a tab.
        # Safe because the pipeline already removed whitespace from names.
        fields = line.rstrip().split()
        if fields[0] in offsets:
            original_name, offset = offsets[fields[0]]
            fields[6] = str(int(fields[6]) + offset)
            fields[7] = str(int(fields[7]) + offset)
            fields[0] = original_name

        # rebuild the line unconditionally to normalise the stray-space case
        print('\t'.join(fields).rstrip(), file=f_out)

    utils.close(f_in)
    utils.close(f_out)
def split_by_fixed_size_onefile(infile, outfile, chunk_size, tolerance, skip_if_all_Ns=False):
    '''Splits each sequence in infile into chunks of fixed size, last chunk can be up to (chunk_size + tolerance) in length'''
    seq_reader = sequences.file_reader(infile)
    f_out = utils.open_file_write(outfile)

    for seq in seq_reader:
        seq_length = len(seq)
        for start in range(0, seq_length, chunk_size):
            # the final chunk absorbs up to `tolerance` extra trailing bases
            end = seq_length if start + chunk_size + tolerance >= seq_length else start + chunk_size
            chunk = seq.subseq(start, end)
            if not (skip_if_all_Ns and chunk.is_all_Ns()):
                chunk.id += '.' + str(start + 1) + '_' + str(end)
                print(chunk, file=f_out)
            if end == seq_length:
                break

    utils.close(f_out)
def fix_blast_coords(blast_file, coords_file, outfile):
    '''Rewrites tabular blast output, shifting columns 7 and 8 by the offset
    recorded in coords_file for the query name in column 1, and replacing that
    name with the original one. (Columns 7/8 are presumably the query
    start/end of blast outfmt 6 — TODO confirm.)'''
    # maps chunked query name -> (original name, coordinate offset) —
    # presumably written by split_by_fixed_size; verify against caller
    coords_offset = offset_coords_file_to_dict(coords_file)
    fin = utils.open_file_read(blast_file)
    fout = utils.open_file_write(outfile)
    for line in fin:
        # blastn sticks a bunch of header lines in the tabulated
        # output file. Need to ignore them
        if '\t' not in line:
            continue

        # Lines are supposed to be tab delimited. Sometimes they
        # have a space character following a tab character, so
        # split on whitespace. This is OK because the pipeline has already
        # removed whitespace from sequence names
        data = line.rstrip().split()
        if data[0] in coords_offset:
            data[6] = str(int(data[6]) + coords_offset[data[0]][1])
            data[7] = str(int(data[7]) + coords_offset[data[0]][1])
            data[0] = coords_offset[data[0]][0]

        # always reconstruct the line, because of spaces bug mentioned above
        line = '\t'.join(data)
        print(line.rstrip(), file=fout)

    utils.close(fin)
    utils.close(fout)
def trim_contigs(infile, outfile, trim):
    '''Widens every gap by `trim` Ns on each side, then trims `trim` bases off
    both ends of each sequence and strips any resulting terminal Ns.
    Sequences shorter than 2*trim, or left with no non-N bases, are dropped.
    '''
    seq_reader = sequences.file_reader(infile)
    fout = utils.open_file_write(outfile)
    for seq in seq_reader:
        # nothing would be left after trimming both ends
        if len(seq) < 2 * trim:
            continue

        gaps = seq.gaps()
        bases = list(seq.seq)

        # extend the length of each gap
        # (gap.start/gap.end are presumably 0-based inclusive — TODO confirm
        # against sequences.gaps())
        for gap in gaps:
            left_start = max(gap.start - trim, 0)
            right_end = min(gap.end + trim + 1, len(seq))
            for i in range(left_start, gap.start):
                bases[i] = 'N'
            for i in range(gap.end, right_end):
                bases[i] = 'N'

        seq.seq = ''.join(bases)

        # trim start/end bases and tidy up any resulting Ns at either end of
        # the trimmed seq
        seq.trim(trim, trim)
        seq.trim_Ns()

        # check that there is some non-N sequence left over
        regex = re.compile('[^nN]')
        if regex.search(seq.seq) is not None:
            print(seq, file=fout)

    utils.close(fout)
def sequence_trim(infile_1, infile_2, outfile_1, outfile_2, to_trim_file, min_length=50, check_revcomp=False):
    '''Trims any sequence from to_trim_file off the start of each read (and,
    with check_revcomp=True, its reverse complement off the end). Pairs where
    both reads are still at least min_length long are written to the output
    files; shorter pairs are dropped.

    Raises Error if infile_2 has fewer sequences than infile_1.'''
    to_trim_seqs = {}
    file_to_dict(to_trim_file, to_trim_seqs)
    trim_seqs = [x.seq for x in to_trim_seqs.values()]
    if check_revcomp:
        # revcomp() mutates in place, so build the second list afterwards
        for seq in to_trim_seqs.values():
            seq.revcomp()
        trim_seqs_revcomp = [x.seq for x in to_trim_seqs.values()]
    else:
        trim_seqs_revcomp = []

    seq_reader_1 = sequences.file_reader(infile_1)
    seq_reader_2 = sequences.file_reader(infile_2)
    f_out_1 = utils.open_file_write(outfile_1)
    f_out_2 = utils.open_file_write(outfile_2)

    for seq_1 in seq_reader_1:
        try:
            seq_2 = next(seq_reader_2)
        except StopIteration:
            # was "utils.close(f_out)": NameError, no f_out was ever defined.
            # Close both real output files before raising.
            utils.close(f_out_1)
            utils.close(f_out_2)
            raise Error('Error getting mate for sequence', seq_1.id, ' ... cannot continue')

        for seq in seq_1, seq_2:
            # trim the first matching sequence off the start ...
            for trim_seq in trim_seqs:
                if seq.seq.startswith(trim_seq):
                    seq.trim(len(trim_seq), 0)
                    break
            # ... and the first matching reverse complement off the end
            for trim_seq in trim_seqs_revcomp:
                if seq.seq.endswith(trim_seq):
                    seq.trim(0, len(trim_seq))
                    break

        if len(seq_1) >= min_length and len(seq_2) >= min_length:
            print(seq_1, file=f_out_1)
            print(seq_2, file=f_out_2)

    utils.close(f_out_1)
    utils.close(f_out_2)
def translate(infile, outfile, frame=0):
    '''Writes the protein translation of every sequence in infile, using the given reading frame'''
    fout = utils.open_file_write(outfile)
    for seq in sequences.file_reader(infile):
        print(seq.translate(frame=frame), file=fout)
    utils.close(fout)
def reverse_complement(infile, outfile):
    '''Writes the reverse complement of every sequence in infile to outfile'''
    fout = utils.open_file_write(outfile)
    for seq in sequences.file_reader(infile):
        seq.revcomp()
        print(seq, file=fout)
    utils.close(fout)
def strip_illumina_suffix(infile, outfile):
    '''Removes the Illumina read-pair suffix from every sequence name'''
    f_out = utils.open_file_write(outfile)
    for seq in sequences.file_reader(infile):
        seq.strip_illumina_suffix()
        print(seq, file=f_out)
    utils.close(f_out)
def replace_bases(infile, outfile, old, new):
    '''Replaces every occurrence of base `old` with `new` in each sequence'''
    f_out = utils.open_file_write(outfile)
    for seq in sequences.file_reader(infile):
        seq.replace_bases(old, new)
        print(seq, file=f_out)
    utils.close(f_out)
def test_raise_exception(self):
    '''open_file_write() and open_file_read() should raise an exception when can't do the opening'''
    for bad_read in (
        'this_file_is_not_here_so_throw_error',
        'this_file_is_not_here_so_throw_error.gz',
        os.path.join(data_dir, 'utils_test_not_really_zipped.gz'),
    ):
        with self.assertRaises(utils.Error):
            utils.open_file_read(bad_read)

    for bad_write in (
        os.path.join('not_a_directory', 'this_file_is_not_here_so_throw_error'),
        os.path.join('not_a_directory', 'this_file_is_not_here_so_throw_error.gz'),
    ):
        with self.assertRaises(utils.Error):
            utils.open_file_write(bad_write)
def capillary_to_pairs(infile, outprefix):
    '''Pairs capillary reads by name prefix. Paired fwd/rev reads are written
    interleaved to outprefix.paired.gz (renamed prefix/1 and prefix/2);
    everything else goes to outprefix.unpaired.gz. Where an end was sequenced
    more than once, only the longest read is kept.'''
    # hash the sequences, only taking longest where an end has been sequenced
    # more than once
    fwd_seqs = {}
    rev_seqs = {}
    unpaired_seqs = {}

    for seq in sequences.file_reader(infile):
        id_info = seq.split_capillary_id()
        direction = id_info['dir']
        if direction == 'fwd':
            seq.id = id_info['prefix'] + '/1'
            destination = fwd_seqs
        elif direction == 'rev':
            seq.id = id_info['prefix'] + '/2'
            destination = rev_seqs
        else:
            seq.id = id_info['prefix']
            destination = unpaired_seqs

        key = id_info['prefix']
        current = destination.get(key)
        if current is None or len(current) < len(seq):
            # the reader reuses its sequence object, so store a copy
            destination[key] = copy.copy(seq)

    # write the output files
    f_pe = utils.open_file_write(outprefix + '.paired.gz')
    f_up = utils.open_file_write(outprefix + '.unpaired.gz')

    for name in fwd_seqs:
        if name in rev_seqs:
            print(fwd_seqs[name], file=f_pe)
            print(rev_seqs[name], file=f_pe)
            del rev_seqs[name]
        else:
            print(fwd_seqs[name], file=f_up)

    for seq in rev_seqs.values():
        print(seq, file=f_up)
    for seq in unpaired_seqs.values():
        print(seq, file=f_up)

    utils.close(f_pe)
    utils.close(f_up)
def sort_by_size(infile, outfile, smallest_first=False):
    '''Sorts input sequence file by biggest sequence first, writes sorted output file. Set smallest_first=True to have smallest first'''
    seqs = {}
    file_to_dict(infile, seqs)
    ordered = sorted(seqs.values(), key=len, reverse=not smallest_first)
    fout = utils.open_file_write(outfile)
    for seq in ordered:
        print(seq, file=fout)
    utils.close(fout)
def to_fasta_union(infile, outfile, seqname='union'):
    '''Concatenates every sequence in infile into one fasta record named seqname'''
    pieces = [seq.seq for seq in sequences.file_reader(infile)]
    f_out = utils.open_file_write(outfile)
    print(sequences.Fasta(seqname, ''.join(pieces)), file=f_out)
    utils.close(f_out)
def sort_by_name(infile, outfile):
    '''Sorts input sequence file by sort -d -k1,1, writes sorted output file.'''
    seqs = {}
    file_to_dict(infile, seqs)
    fout = utils.open_file_write(outfile)
    # NOTE(review): this is plain Python lexicographic ordering of the names,
    # which only approximates GNU "sort -d -k1,1" (dictionary order); the two
    # can differ for names containing punctuation — confirm callers don't
    # depend on the difference.
    for name in sorted(seqs):
        print(seqs[name], file=fout)
    utils.close(fout)
def trim(infile, outfile, start, end):
    '''Trims `start` bases off the beginning and `end` bases off the end of
    every sequence; sequences trimmed down to nothing are dropped'''
    fout = utils.open_file_write(outfile)
    for seq in sequences.file_reader(infile):
        seq.trim(start, end)
        if len(seq):
            print(seq, file=fout)
    utils.close(fout)
def search_for_seq(infile, outfile, search_string):
    '''Writes one tab-delimited line per hit of search_string in each
    sequence: name, 1-based position, and the second field reported by
    Sequence.search (presumably the strand — TODO confirm)'''
    fout = utils.open_file_write(outfile)
    for seq in sequences.file_reader(infile):
        for hit in seq.search(search_string):
            # hit[0] is a 0-based position; report it 1-based
            print(seq.id, hit[0] + 1, hit[1], sep='\t', file=fout)
    utils.close(fout)
def trim_Ns_at_end(infile, outfile):
    '''Strips Ns off both ends of every sequence; sequences that become empty are dropped'''
    fout = utils.open_file_write(outfile)
    for seq in sequences.file_reader(infile):
        seq.trim_Ns()
        if len(seq):
            print(seq, file=fout)
    utils.close(fout)
def expand_nucleotides(infile, outfile):
    '''Writes every expansion of each sequence's ambiguous nucleotide codes.
    A sequence that expands to more than one variant is written once per
    variant; otherwise the original sequence is written unchanged.'''
    seq_reader = sequences.file_reader(infile)
    fout = utils.open_file_write(outfile)
    for seq in seq_reader:
        seqs = seq.expand_nucleotides()
        if len(seqs) > 1:
            for s in seqs:
                print(s, file=fout)
        else:
            print(seq, file=fout)
    # was missing: the output file was never closed, so buffered output
    # could be lost
    utils.close(fout)
def make_long_reads(infile, outfile, method='tiling', fixed_read_length=20000, tile_step=10000, gamma_shape=1.2, gamma_scale=6000, coverage=10, gamma_min_length=20000, seed=None, ins_skip=None, ins_window=None,):
    '''Simulates long reads from each sequence in infile and writes them as
    fasta to outfile.

    method is one of:
      - 'tiling': fixed_read_length reads every tile_step bases
      - 'gamma': gamma-distributed read lengths at random positions until
        roughly `coverage` coverage is reached
      - 'uniform': fixed_read_length reads at random positions until roughly
        `coverage` coverage is reached

    Read names are '<seq.id>_<start>_<end>' with 1-based coordinates. If
    ins_skip/ins_window are given (both together or neither), insertions are
    added to each read via Fasta.add_insertions.
    '''
    assert method in ['tiling', 'gamma', 'uniform']
    # ins_skip and ins_window must be supplied together or not at all
    assert ins_skip == ins_window == None or None not in [ins_skip, ins_window]

    if seed is not None:
        random.seed(a=seed)
    seq_reader = sequences.file_reader(infile)
    f = utils.open_file_write(outfile)

    for seq in seq_reader:
        if method == 'tiling':
            if len(seq) < fixed_read_length:
                print('Skipping sequence', seq.id, 'because it is too short at', len(seq), 'bases', file=sys.stderr)
                continue
            for i in range(0, len(seq), tile_step):
                end = min(len(seq), i + fixed_read_length)
                fa = sequences.Fasta('_'.join([seq.id, str(i + 1), str(end)]), seq[i:end])
                if ins_skip:
                    fa.add_insertions(skip=ins_skip, window=ins_window)
                print(fa, file=f)
                if end >= len(seq):
                    break
        elif method == 'gamma':
            if len(seq) < gamma_min_length:
                print('Skipping sequence', seq.id, 'because it is too short at', len(seq), 'bases', file=sys.stderr)
                continue
            total_read_length = 0
            while total_read_length < coverage * len(seq) - 0.5 * gamma_min_length:
                # rejection-sample a length that fits the sequence and minimum
                read_length = int(numpy.random.gamma(gamma_shape, scale=gamma_scale))
                while read_length < gamma_min_length or read_length > len(seq):
                    read_length = int(numpy.random.gamma(gamma_shape, scale=gamma_scale))

                start = random.randint(0, len(seq) - read_length)
                end = start + read_length - 1
                fa = sequences.Fasta('_'.join([seq.id, str(start + 1), str(end + 1)]), seq[start:end+1])
                # coverage is counted before insertions are added
                total_read_length += len(fa)
                if ins_skip:
                    fa.add_insertions(skip=ins_skip, window=ins_window)
                print(fa, file=f)
        elif method == 'uniform':
            if len(seq) < fixed_read_length:
                print('Skipping sequence', seq.id, 'because it is too short at', len(seq), 'bases', file=sys.stderr)
                continue
            total_read_length = 0
            while total_read_length < coverage * len(seq) - 0.5 * fixed_read_length:
                start = random.randint(0, len(seq) - fixed_read_length)
                end = start + fixed_read_length - 1
                fa = sequences.Fasta('_'.join([seq.id, str(start + 1), str(end + 1)]), seq[start:end+1])
                total_read_length += len(fa)
                if ins_skip:
                    fa.add_insertions(skip=ins_skip, window=ins_window)
                print(fa, file=f)

    utils.close(f)
def to_boulderio(infile, outfile):
    '''Converts input sequence file into a "Boulder-IO format", as used by primer3'''
    f_out = utils.open_file_write(outfile)
    for record in sequences.file_reader(infile):
        print("SEQUENCE_ID=" + record.id, file=f_out)
        print("SEQUENCE_TEMPLATE=" + record.seq, file=f_out)
        # record separator required by Boulder-IO
        print("=", file=f_out)
    utils.close(f_out)
def enumerate_names(infile, outfile, start_index=1, keep_illumina_suffix=False, rename_file=None, suffix=None):
    '''Renames every sequence to consecutive integers starting at start_index.
    keep_illumina_suffix preserves a trailing /1 or /2; rename_file, if given,
    records old -> new names; suffix, if given, is appended to each new name.'''
    seq_reader = sequences.file_reader(infile)
    fout_seqs = utils.open_file_write(outfile)
    sequence_suffixes = ['/1', '/2'] if keep_illumina_suffix else []

    fout_rename = None
    if rename_file is not None:
        fout_rename = utils.open_file_write(rename_file)
        print('#old\tnew', file=fout_rename)

    for counter, seq in enumerate(seq_reader, start=start_index):
        old_id = seq.id
        seq.id = str(counter)

        # carry over the read-pair suffix, if requested and present
        for suff in sequence_suffixes:
            if old_id.endswith(suff):
                seq.id += suff
                break

        # the rename mapping is recorded before `suffix` is appended
        if fout_rename is not None:
            print(old_id, seq.id, sep='\t', file=fout_rename)

        if suffix is not None:
            seq.id += suffix

        print(seq, file=fout_seqs)

    utils.close(fout_seqs)
    if fout_rename is not None:
        utils.close(fout_rename)
def sequence_trim(infile_1, infile_2, outfile_1, outfile_2, to_trim_file, min_length=50, check_revcomp=False):
    '''Trims any sequence from to_trim_file off the start of each read (and,
    with check_revcomp=True, its reverse complement off the end). Pairs where
    both reads are still at least min_length long are written to the output
    files; shorter pairs are dropped.

    Raises Error if infile_2 has fewer sequences than infile_1.'''
    to_trim_seqs = {}
    file_to_dict(to_trim_file, to_trim_seqs)
    trim_seqs = [x.seq for x in to_trim_seqs.values()]
    if check_revcomp:
        # revcomp() mutates in place, so collect the second list afterwards
        for seq in to_trim_seqs.values():
            seq.revcomp()
        trim_seqs_revcomp = [x.seq for x in to_trim_seqs.values()]
    else:
        trim_seqs_revcomp = []

    seq_reader_1 = sequences.file_reader(infile_1)
    seq_reader_2 = sequences.file_reader(infile_2)
    f_out_1 = utils.open_file_write(outfile_1)
    f_out_2 = utils.open_file_write(outfile_2)

    for seq_1 in seq_reader_1:
        try:
            seq_2 = next(seq_reader_2)
        except StopIteration:
            # was "utils.close(f_out)": NameError, no f_out exists in this
            # function. Close both real output files before raising.
            utils.close(f_out_1)
            utils.close(f_out_2)
            raise Error('Error getting mate for sequence', seq_1.id, ' ... cannot continue')

        for seq in seq_1, seq_2:
            # trim the first matching sequence off the start ...
            for trim_seq in trim_seqs:
                if seq.seq.startswith(trim_seq):
                    seq.trim(len(trim_seq), 0)
                    break
            # ... and the first matching reverse complement off the end
            for trim_seq in trim_seqs_revcomp:
                if seq.seq.endswith(trim_seq):
                    seq.trim(0, len(trim_seq))
                    break

        if len(seq_1) >= min_length and len(seq_2) >= min_length:
            print(seq_1, file=f_out_1)
            print(seq_2, file=f_out_2)

    utils.close(f_out_1)
    utils.close(f_out_2)
def run(self):
    '''Look for break points in contigs.

    For each contig not in self.ids_to_skip, finds the best promer hit
    (dnaA -> named chromosome_N) or, failing that, optionally a prodigal
    gene (-> named plasmid_N), rotates ("circularises") the contig at the
    break point, and writes every contig to self.output_file.
    '''
    contigs_in_file = set(self.contigs.keys())
    # only run promer when at least one contig is not skipped
    if contigs_in_file != self.ids_to_skip:
        full_hits, partial_hits_at_start, partial_hits_at_end = self._run_promer_and_store_hits()
    chromosome_count = 1
    plasmid_count = 1
    output_fw = fastaqutils.open_file_write(self.output_file)
    for contig_id in self.contigs:
        contig_sequence = self.contigs[contig_id]
        gene_name = None
        new_name = contig_id
        skipped = False
        break_point = None
        on_reverse_strand = False
        if contig_id not in self.ids_to_skip:
            dnaA_found, break_point, on_reverse_strand, gene_name = self._best_promer_hit_for_contig(full_hits, partial_hits_at_start, partial_hits_at_end, contig_id)
            if dnaA_found:
                new_name = 'chromosome_' + str(chromosome_count)
                chromosome_count += 1
            else:
                # If the dnaa has still not been found, look for a gene in prodigal results
                if self.choose_random_gene:
                    self.random_gene_starts = self._run_prodigal_and_store_gene_starts()
                    break_point, on_reverse_strand, gene_name = self._find_best_prodigal_gene(contig_id)
                # NOTE(review): plasmid_count is never incremented anywhere in
                # this method, so every plasmid is named 'plasmid_1' — confirm
                # whether an increment is missing here.
                new_name = 'plasmid_' + str(plasmid_count)
            # circularise the contig
            if break_point:
                if on_reverse_strand:
                    contig_sequence.revcomp()
                # rotate so the break point becomes position 0
                contig_sequence = contig_sequence[break_point:] + contig_sequence[0:break_point]
                self.contigs[contig_id].seq = contig_sequence
        else:
            # Skipped, just write contig as it is
            skipped = True
        # write the contig out
        contig_name = new_name if self.rename else contig_id
        print(sequences.Fasta(contig_name, contig_sequence), file=output_fw)
        self._write_summary(contig_id, break_point, gene_name, on_reverse_strand, new_name, skipped)
    fastaqutils.close(output_fw)
    # clean up
    if not self.debug:
        utils.delete(self._build_promer_filename())
        utils.delete(self._build_prodigal_filename())
        utils.delete(self._build_temp_fasta_filename())
def test_write_and_read(self):
    '''open_file_write() and open_file_read() should do the right thing depending gzipped or not'''
    for filename in ('utils.tmp', 'utils.tmp.gz', 'utils.tmp.bgz'):
        writer = utils.open_file_write(filename)
        for value in range(3):
            print(value, file=writer)
        utils.close(writer)

        # read it back and check we get the same numbers in order
        reader = utils.open_file_read(filename)
        for expected, line in enumerate(reader):
            self.assertEqual(expected, int(line.strip()))
        utils.close(reader)
        os.unlink(filename)

    # '-' means stdin for reading and stdout for writing
    self.assertEqual(sys.stdin, utils.open_file_read('-'))
    self.assertEqual(sys.stdout, utils.open_file_write('-'))
def deinterleave(infile, outfile_1, outfile_2, fasta_out=False):
    '''Splits an interleaved sequence file into two mate files; fasta_out=True
    forces fasta output regardless of input type.

    Raises Error if infile has an odd number of sequences.
    '''
    seq_reader = sequences.file_reader(infile)
    f_1 = utils.open_file_write(outfile_1)
    f_2 = utils.open_file_write(outfile_2)
    for seq in seq_reader:
        if fasta_out:
            print(sequences.Fasta(seq.id, seq.seq), file=f_1)
        else:
            print(seq, file=f_1)
        # Advance the reader to the mate. The return value is deliberately
        # not stored: sequences.file_reader appears to yield one object that
        # it mutates in place (other code in this project copy.copy()s the
        # yielded seq before storing it), so after next() `seq` already holds
        # the mate. NOTE(review): confirm this reuse behaviour in
        # sequences.file_reader before refactoring.
        try:
            next(seq_reader)
        except StopIteration:
            utils.close(f_1)
            utils.close(f_2)
            raise Error('Error getting mate for sequence. Cannot continue')
        if fasta_out:
            print(sequences.Fasta(seq.id, seq.seq), file=f_2)
        else:
            print(seq, file=f_2)
    utils.close(f_1)
    utils.close(f_2)
def fastaq_to_fake_qual(infile, outfile, q=40):
    '''Writes a fake quality file: every base of every sequence gets score q,
    wrapped to match sequences.Fasta.line_length (0 means one line per record)'''
    seq_reader = sequences.file_reader(infile)
    fout = utils.open_file_write(outfile)
    for seq in seq_reader:
        print('>' + seq.id, file=fout)
        width = sequences.Fasta.line_length
        if width == 0:
            # unwrapped output: all scores on a single line
            print(' '.join([str(q)] * len(seq)), file=fout)
        else:
            for start in range(0, len(seq), width):
                count = min(width, len(seq) - start)
                print(' '.join([str(q)] * count), file=fout)
    utils.close(fout)
def fastaq_to_orfs_gff(infile, outfile, min_length=300, tool_name='fastaq'):
    '''Writes one GFF CDS line (1-based coordinates) for every ORF of at
    least min_length bases found in each sequence'''
    fout = utils.open_file_write(outfile)
    for seq in sequences.file_reader(infile):
        for coords, revcomp in seq.all_orfs(min_length=min_length):
            strand = '-' if revcomp else '+'
            print(seq.id, tool_name, 'CDS', coords.start + 1, coords.end + 1, '.', strand, '.', sep='\t', file=fout)
    utils.close(fout)