def split_fasta_file(input_file_path, dest_dir, prefix='part', num_reads_per_file=5000):
    """Split a FASTA file into parts of at most `num_reads_per_file` reads each.

    Each part file name carries a random 8-character suffix so repeated runs
    do not collide. Returns the list of part file paths, in order.
    """
    source = u.SequenceSource(input_file_path)
    alphabet = string.ascii_letters + string.digits

    part_paths = []
    part_no = 1
    current_part = None

    while source.next():
        # open a fresh part file every `num_reads_per_file` reads
        if (source.pos - 1) % num_reads_per_file == 0:
            if current_part:
                current_part.close()
            suffix = ''.join(random.choice(alphabet) for _ in range(8))
            part_path = os.path.join(dest_dir, '%s-%d-%s.fa' % (prefix, part_no, suffix))
            part_paths.append(part_path)
            part_no += 1
            current_part = u.FastaOutput(part_path)
        current_part.store(source, split=False)

    if current_part:
        current_part.close()

    return part_paths
def main(input_fasta_path, output_fasta_path=None, reverse=False):
    """Pad every read in a FASTA file with gaps to the length of the longest read.

    If `output_fasta_path` is not given, '<input>-PADDED-WITH-GAPS' is used.
    With `reverse=True` the gaps are prepended instead of appended.
    """
    if not output_fasta_path:
        output_fasta_path = input_fasta_path + '-PADDED-WITH-GAPS'

    fasta = u.SequenceSource(input_fasta_path)
    output = u.FastaOutput(output_fasta_path)

    # first pass: find the longest read
    longest_read = 0
    while next(fasta):
        if len(fasta.seq) > longest_read:
            longest_read = len(fasta.seq)
    fasta.reset()

    # second pass: write each read padded with '-' up to `longest_read`
    while next(fasta):
        if fasta.pos % 10000 == 0:
            sys.stderr.write('\rreads processed so far: %d' % (fasta.pos))
            sys.stderr.flush()
        gaps = longest_read - len(fasta.seq)
        output.write_id(fasta.id)
        if reverse:
            output.write_seq('-' * gaps + fasta.seq, split=False)
        else:
            output.write_seq(fasta.seq + '-' * gaps, split=False)

    fasta.close()
    # bug fix: the output file was never closed, risking unflushed writes
    output.close()
    sys.stderr.write('\n')
def trim_uninformative_columns_from_alignment(input_file_path):
    """Remove alignment columns that contain only gap ('-') characters.

    The input file is overwritten in place with the trimmed alignment.
    """
    input_fasta = u.SequenceSource(input_file_path, lazy_init=False)

    # alignment length is taken from the first read
    input_fasta.next()
    fasta_read_len = len(input_fasta.seq)
    input_fasta.reset()

    # first pass: keep only columns that are '-' in every read.
    # bug fix: the original removed items from `invalid_columns` while
    # iterating over that same list, which skips the element right after each
    # removal and could leave informative columns marked as invalid; rebuild
    # the list on each read instead.
    invalid_columns = list(range(0, fasta_read_len))
    while input_fasta.next():
        invalid_columns = [i for i in invalid_columns if input_fasta.seq[i] == '-']

    # set gives O(1) membership tests instead of O(n) list scans
    invalid = set(invalid_columns)
    columns_to_keep = [x for x in range(0, fasta_read_len) if x not in invalid]

    input_fasta.reset()

    temp_file = tempfile.NamedTemporaryFile(delete=False)
    temp_file_path = temp_file.name
    temp_file.close()

    # second pass: write each read with only the informative columns kept
    temp_file = u.FastaOutput(temp_file_path)
    while input_fasta.next():
        new_seq = ''.join(input_fasta.seq[i] for i in columns_to_keep)
        temp_file.write_id(input_fasta.id)
        temp_file.write_seq(new_seq, split=False)
    temp_file.close()

    # overwrite the original file with trimmed content
    shutil.move(temp_file_path, input_file_path)
def gen_tmpl(taxon, otu_id_to_greengenes, greengenes_alignment, output_file_path=None):
    """Generate a template FASTA file with GreenGenes alignments for `taxon`.

    Collects the OTU ids whose taxonomy string contains `taxon` from the
    tab-delimited `otu_id_to_greengenes` mapping, then copies the matching
    alignments from `greengenes_alignment` into the template file
    (`output_file_path` if given, otherwise '<taxon>.tmpl').
    """
    ids = []
    for id, tax in [line.strip().split('\t') for line in open(otu_id_to_greengenes).readlines()]:
        # bug fix: `find(...) > 0` missed matches at index 0; a match at the
        # start of the taxonomy string is still a match
        if tax.find(taxon) >= 0:
            ids.append(id)
    # a set deduplicates and gives O(1) membership/removal below
    ids = set(ids)

    print('%d ids found for %s.' % (len(ids), taxon))

    # bug fix: `output_file_path` used to be silently ignored
    template = u.FastaOutput(output_file_path if output_file_path else '%s.tmpl' % taxon)
    fasta = u.SequenceSource(greengenes_alignment)

    while fasta.next():
        if fasta.id in ids:
            template.store(fasta, split=False)
            ids.remove(fasta.id)

    fasta.close()
    template.close()
def split_fasta_file(input_file_path, dest_dir, prefix='part', num_reads_per_file=5000):
    """Split a FASTA file into sequentially numbered parts.

    Each part holds at most `num_reads_per_file` reads and is written to
    `dest_dir` as '<prefix>-<n>'. Returns the list of part file paths.
    """
    source = u.SequenceSource(input_file_path)

    part_paths = []
    part_no = 1
    current_part = None

    while source.next():
        # time to roll over to a new part file?
        if (source.pos - 1) % num_reads_per_file == 0:
            if current_part:
                current_part.close()
            part_path = os.path.join(dest_dir, '%s-%d' % (prefix, part_no))
            part_paths.append(part_path)
            part_no += 1
            current_part = u.FastaOutput(part_path)
        current_part.store(source, split=False)

    if current_part:
        current_part.close()

    return part_paths
def mask_defline_whitespaces_in_FASTA(fasta_file_path, defline_white_space_mask='<$!$>'):
    """Replace spaces in every defline of a FASTA file with a mask token.

    The file is rewritten in place via a temporary '<file>.tmp' sibling.
    """
    temp_file_path = fasta_file_path + '.tmp'
    fasta = u.SequenceSource(fasta_file_path)
    output = u.FastaOutput(temp_file_path)

    while fasta.next():
        output.write_id(fasta.id.replace(' ', defline_white_space_mask))
        output.write_seq(fasta.seq, split=False)

    fasta.close()
    # bug fix: the output was never closed before shutil.move overwrote the
    # original file, so buffered writes could be lost
    output.close()

    shutil.move(temp_file_path, fasta_file_path)
def main(input_fasta, subsample_to, output_fasta):
    """Randomly subsample each sample in a FASTA file to `subsample_to` reads.

    Reads the whole FASTA into memory grouped by sample name (derived from the
    defline), shuffles each group, and writes the first `subsample_to` entries
    per sample to `output_fasta`.
    """
    fasta = u.SequenceSource(input_fasta)

    fasta_content = {}
    while fasta.next():
        if fasta.pos % 1000 == 0:
            sys.stderr.write('\r[Reading FASTA into memory] reads processed so far: %d' % (fasta.pos))
            sys.stderr.flush()
        sample_name = get_sample_name_from_defline(fasta.id)
        if sample_name not in fasta_content:
            fasta_content[sample_name] = []
        fasta_content[sample_name].append((fasta.id, fasta.seq))

    samples = sorted(fasta_content.keys())
    # bug fix: when there were <= 3 samples the original passed an
    # already-joined string back into ', '.join(), which printed the sample
    # names one character at a time
    sys.stderr.write('\n%d samples found in the FASTA file: %s%s\n'
                     % (len(samples),
                        ', '.join(samples[0:3] if len(samples) > 3 else samples),
                        ' (...)' if len(samples) > 3 else '.'))

    sample_counter = 0
    for sample in samples:
        sample_counter += 1
        sys.stderr.write('\r[Shuffling] Sample %d of %d' % (sample_counter, len(samples)))
        sys.stderr.flush()
        random.shuffle(fasta_content[sample])

    output = u.FastaOutput(output_fasta)
    sample_counter = 0
    for sample in samples:
        sample_counter += 1
        sys.stderr.write('\r[Writing Output] Sample %d of %d' % (sample_counter, len(samples)))
        sys.stderr.flush()
        for e in fasta_content[sample][0:subsample_to]:
            output.write_id(e[0])
            output.write_seq(e[1], split=False)
    # bug fix: the output file was never closed; close it to flush writes
    output.close()

    sys.stderr.write('\n')
    sys.stderr.flush()
def store_node_representatives(self, node_ids, output_file_path, store_gaps=False):
    """Write the representative sequence of each node in `node_ids` to a FASTA file.

    When `store_gaps` is False, gap characters ('-') are stripped from each
    representative sequence before it is written.
    """
    output = u.FastaOutput(output_file_path)

    for node_id in node_ids:
        representative = self.nodes[node_id].representative_seq
        if not store_gaps:
            representative = representative.replace('-', '')
        output.write_id(node_id)
        output.write_seq(representative, split=False)

    output.close()
def unique_and_store_alignment(alignment_path, output_path):
    """Store the unique reads of an alignment and report their abundances.

    Returns a tuple of (all read ids, per-unique-read counts, and the sequence
    of the most abundant unique read).
    """
    output = u.FastaOutput(output_path)
    alignment = u.SequenceSource(alignment_path, unique=True)

    # the first entry of a unique SequenceSource is the most abundant one
    alignment.next()
    most_abundant_unique_read = alignment.seq
    alignment.reset()

    read_ids = []
    unique_read_counts = []
    while alignment.next():
        read_ids.extend(alignment.ids)
        unique_read_counts.append(len(alignment.ids))
        output.store(alignment, split=False)

    output.close()
    alignment.close()

    return (read_ids, unique_read_counts, most_abundant_unique_read)
import sys
import Oligotyping.lib.fastalib as u

# Pad every read in the FASTA file given as argv[1] with trailing gaps up to
# the length of the longest read; output goes to '<input>-PADDED-WITH-GAPS'.
fasta = u.SequenceSource(sys.argv[1])
output = u.FastaOutput(sys.argv[1] + '-PADDED-WITH-GAPS')

# first pass: find the longest read
longest_read = 0
while fasta.next():
    if len(fasta.seq) > longest_read:
        longest_read = len(fasta.seq)
fasta.reset()

# second pass: write each read padded with '-' to `longest_read`
while fasta.next():
    if fasta.pos % 10000 == 0:
        sys.stdout.write('\rreads processed so far: %d' % (fasta.pos))
        sys.stdout.flush()
    gaps = longest_read - len(fasta.seq)
    output.write_id(fasta.id)
    output.write_seq(fasta.seq + '-' * gaps, split=False)

fasta.close()
# bug fix: the output file was never closed; close it to flush writes
output.close()
print('')
def main(fasta_file_path, min_percent=95.0, output_file_path=None):
    # Determine a left trim position for an aligned FASTA file such that at
    # least `min_percent` of the reads have started (show a non-gap character)
    # by that column, then store the trimmed reads. Reads that have not
    # started by the trim point are dropped in the output pass.
    fasta = u.SequenceSource(fasta_file_path)

    # alignment length is taken from the first read
    fasta.next()
    alignment_length = len(fasta.seq)
    fasta.reset()

    # positions[j] counts how many reads have started at or before column j
    positions = {}
    while fasta.next():
        if fasta.pos % 1000 == 0:
            sys.stderr.write('\rAnalyzing all reads; pos: %d' % fasta.pos)
            sys.stderr.flush()
        for i in range(0, alignment_length):
            if fasta.seq[i] != '-':
                # read starts at column i; credit every column from i onward
                for j in range(i, alignment_length):
                    try:
                        positions[j] += 1
                    except:  # NOTE(review): bare except; presumably KeyError on first hit
                        positions[j] = 1
                break
    fasta.reset()
    sys.stderr.write('\n')

    # by the last column every read has started, so this is the read count
    num_reads = positions[alignment_length - 1]

    trim_location = 0
    for i in range(0, alignment_length):
        pct_reads_will_survive = positions[i] * 100.0 / num_reads
        # remember the first column where enough reads survive
        if pct_reads_will_survive >= min_percent and not trim_location:
            trim_location = i
            trim_location_pct_reads_survive = pct_reads_will_survive
        if pct_reads_will_survive == 100:
            print
            # NOTE(review): a percentage is formatted into a message that reads
            # like a column index -- looks like `trim_location` was intended;
            # confirm before changing
            print 'All reads are going to be trimmed from the %dth position.' % (trim_location_pct_reads_survive)
            if 100 - trim_location_pct_reads_survive:
                print
                print '%d reads that do not reach to this locaition will be eliminated.' % ((100 - trim_location_pct_reads_survive) / 100.0 * num_reads)
            if min_percent < 100:
                print
                print 'If all reads were to be retained, alignments should have been trimmed from'
                print 'the %dth location, however, this would have required all reads to lose %d' % (i, i - trim_location)
                print 'bases'
                print
            break

    # second pass: store trimmed reads, skipping reads that still begin with
    # a gap after the trim point
    output = u.FastaOutput(output_file_path if output_file_path else sys.argv[1] + '-TRIMMED')
    while fasta.next():
        if fasta.pos % 1000 == 0:
            sys.stderr.write('\rStoring trimmed reads; pos: %d' % fasta.pos)
            sys.stderr.flush()
        if fasta.seq[trim_location:].startswith('-'):
            continue
        else:
            output.write_id(fasta.id)
            output.write_seq(fasta.seq[trim_location:], split=False)
    sys.stderr.write('\n')
    sys.stderr.write('\n')

    print 'Trimmed reads stored: "%s"\n' % (output_file_path if output_file_path else sys.argv[1] + '-TRIMMED')
# removes samples from FASTA file:
#
# ./me FASTA_FILE sample_1,sample_2,[...],sample_N
#
import sys
import Oligotyping.lib.fastalib as u
from Oligotyping.utils.utils import pretty_print as pp

fasta = u.SequenceSource(sys.argv[1])
output = u.FastaOutput(sys.argv[1] + '-SAMPLES-REMOVED.fa')

# sample names to drop, comma-separated on the command line
samples_to_be_removed = set(s.strip() for s in sys.argv[2].split(','))

while fasta.next():
    if fasta.pos % 1000 == 0:
        sys.stderr.write('\rreads processed so far: %s' % (pp(fasta.pos)))
        sys.stderr.flush()

    # sample name is everything before the last '_' in the defline
    sample_name = '_'.join(fasta.id.split('_')[:-1])
    if sample_name not in samples_to_be_removed:
        output.store(fasta, split=False)

sys.stderr.write('\rNew FASTA file .............: %s\n' % (sys.argv[1] + '-SAMPLES-REMOVED.fa'))

fasta.close()
output.close()
# -*- coding: utf-8 -*-
import sys
import Oligotyping.lib.fastalib as u

# Extract reads whose defline mentions the taxon given as argv[2], and write
# each one out `abundance` times (abundance is parsed from the defline).
fasta = u.SequenceSource(sys.argv[1])
taxon = sys.argv[2]
output = u.FastaOutput(sys.argv[2].replace(';', ''))

while fasta.next():
    if fasta.id.find(taxon) > -1:
        # defline fields are '|'-separated; several carry 'key=value' pairs
        fields = fasta.id.split('|')
        acc = fields[0]
        project = fields[1].split('=')[1]
        sample = fields[2].split('=')[1]
        new_id = '%s_%s_%s' % (project, sample, acc)

        abundance = int(fields[7].split('=')[1])
        for copy_no in range(abundance):
            output.write_id('%s-%s|%s' % (new_id, str(copy_no), fasta.id))
            output.write_seq(fasta.seq, split=False)

fasta.close()
output.close()
# -*- coding: utf-8 -*-
import sys
import Oligotyping.lib.fastalib as u

# Trim every read in argv[1] to the [argv[2], argv[3]) slice (argv[3] is
# optional); output goes to '<input>-TRIMMED'.
fasta = u.SequenceSource(sys.argv[1], lazy_init=False)
output = u.FastaOutput(sys.argv[1] + '-TRIMMED')

trim_from = int(sys.argv[2])
trim_to = int(sys.argv[3]) if len(sys.argv) == 4 else None

while fasta.next():
    output.write_id(fasta.id)
    # an absent (or zero) trim_to means "to the end of the read"
    end = trim_to if trim_to else None
    output.write_seq(fasta.seq[trim_from:end], split=False)

fasta.close()
output.close()