def combine_quiver_results(split_dirs, output_dir, hq_filename, lq_filename, tofu_prefix=''):
    """
    For each size bin result, ex: clusterOut/0to2k/all.quiveredXXX.fastq
    combine it together, remember to add a prefix (ex: i0|c12, i1|c13/....)
    """
    prefix_dict_hq = {}
    prefix_dict_lq = {}
    fout_hq = FastqWriter(os.path.join(output_dir, 'all_sizes.quivered_hq.fastq'))
    fout_lq = FastqWriter(os.path.join(output_dir, 'all_sizes.quivered_lq.fastq'))
    for i, d in enumerate(split_dirs):
        file_hq = os.path.join(d, hq_filename)  # ex: 'all_quivered_hq.100_30_0.99.fastq'
        file_lq = os.path.join(d, lq_filename)  # ex: 'all_quivered_lq.fastq'
        print >> sys.stderr, "Adding prefix i{0}| to {1},{2}...".format(i, file_hq, file_lq)
        prefix_dict_hq["i{i}HQ_{p}".format(i=i, p=tofu_prefix)] = os.path.abspath(d)
        prefix_dict_lq["i{i}LQ_{p}".format(i=i, p=tofu_prefix)] = os.path.abspath(d)
        for r in FastqReader(file_hq):
            _name_ = "i{i}HQ_{p}|{n}".format(p=tofu_prefix, i=i, n=r.name)
            fout_hq.writeRecord(_name_, r.sequence, r.quality)
        for r in FastqReader(file_lq):
            _name_ = "i{i}LQ_{p}|{n}".format(p=tofu_prefix, i=i, n=r.name)
            fout_lq.writeRecord(_name_, r.sequence, r.quality)
    fout_hq.close()
    fout_lq.close()
    print >> sys.stderr, "HQ quivered output combined to:", fout_hq.file.name
    print >> sys.stderr, "LQ quivered output combined to:", fout_lq.file.name
    return fout_hq.file.name, fout_lq.file.name, prefix_dict_hq, prefix_dict_lq
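# Hedged worked example (added for illustration, not part of the original source): the
# renaming scheme used by combine_quiver_results() joins the bin index i and tofu_prefix
# into an 'i{i}HQ_{prefix}|' tag in front of the original quivered read name. The read
# name and prefix below are hypothetical.
def _example_combined_name(i=0, tofu_prefix='lima', original_name='c12/f3p0/1824'):
    # mirrors the _name_ construction in combine_quiver_results()
    return "i{i}HQ_{p}|{n}".format(i=i, p=tofu_prefix, n=original_name)

assert _example_combined_name() == "i0HQ_lima|c12/f3p0/1824"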
def trim_fastq(fastq_file, output_file, window=WINDOW):
    with FastqWriter(output_file) as writer:
        for record in FastqReader(fastq_file):
            start = _find_start(record, window)
            end = _find_end(record, window)
            trimmed_record = _trim_fastq(record, start, end)
            writer.writeRecord(trimmed_record)
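# The helpers used above (_find_start, _find_end, _trim_fastq) are not shown in this
# file. A minimal sketch of one plausible implementation, assuming the trim points are
# chosen where a sliding window of quality values clears a threshold (the threshold and
# the exact window semantics are assumptions, not the original code):
def _sketch_find_start(record, window, min_qv=20):
    quals = record.quality
    for i in range(len(quals) - window + 1):
        if sum(quals[i:i + window]) / float(window) >= min_qv:
            return i
    return 0

def _sketch_trim_fastq(record, start, end):
    # FastqRecord(name, sequence, quality) is the pbcore constructor used elsewhere here
    return FastqRecord(record.name, record.sequence[start:end], record.quality[start:end])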
def __init__(self, transfrag_filename, fsm_maps, cov_threshold=2,
             min_aln_coverage=.99, min_aln_identity=.85, is_fq=False):
    self.contiVec = None  # current ContiVec object
    self.exons = None
    #self.MIN_EXON_SIZE = max_fuzzy_junction
    self.transfrag_filename = transfrag_filename
    if is_fq:
        self.transfrag_len_dict = dict((r.name.split()[0], len(r.sequence))
                                       for r in FastqReader(transfrag_filename))
    else:
        self.transfrag_len_dict = dict((r.name.split()[0], len(r.sequence))
                                       for r in FastaReader(transfrag_filename))
    self.fsm_maps = fsm_maps
    # only output GTF records if >= this many GMAP records support it
    # (this must be if I'm running non-clustered fasta on GMAP)
    self.cov_threshold = cov_threshold
    self.min_aln_coverage = min_aln_coverage
    self.min_aln_identity = min_aln_identity
    self.cuff_index = 1
def precache_fastq(self, fastq_filename):
    """
    Cache each sequence in the FASTQ file into self.qv
    """
    for r in FastqReader(fastq_filename):
        seqid = r.name.split()[0]
        self.qv[seqid] = {}
        c_basQV.fastq_precache_helper(seqid, r.quality, self.qv)
def parseSequenceData(self):
    self.sequenceData = {}
    self.qualityData = {}
    for record in FastqReader(self.ccsFile):
        zmw = get_zmw(record.name)
        new_record = FastqRecord(zmw, record.sequence, record.quality)
        self.sequenceData[zmw] = new_record
        self.qualityData[zmw] = meanPQv(new_record)
def _parse_exon_records(exon_file, output_type):
    if output_type == 'fasta':
        return list(FastaReader(exon_file))
    elif output_type == 'fastq':
        return list(FastqReader(exon_file))
    msg = 'Exon data must be in either Fasta or Fastq format'
    log.error(msg)
    raise TypeError(msg)
def rename_resequencing(input_fastq, output_fastq):
    """
    Rename resequenced Fastq to have an AA-style NumReads tag
    """
    with FastqWriter(output_fastq) as writer:
        for record in FastqReader(input_fastq):
            new_name = get_new_name(record.name)
            new_record = FastqRecord(new_name, record.sequence, record.quality)
            writer.writeRecord(new_record)
def parseFastqData(self):
    self.sequenceData = {}
    log.info('Reading QV data from "%s"...' % self.fastq)
    counter = 0
    for record in FastqReader(self.fastq):
        zmw = self.getZmw(record)
        self.sequenceData[zmw] = record
        counter += 1
    log.info('A total of %s Fastq records were read into memory' % counter)
def read_names(sequence_file):
    # Open the sequence file with the appropriate reader
    if is_fasta(sequence_file):
        reader = FastaReader(sequence_file)
    elif is_fastq(sequence_file):
        reader = FastqReader(sequence_file)
    else:
        raise ValueError('Sequence file must be either Fasta or Fastq')
    # Extract and return the sequence names
    return [r.name.strip().split()[0] for r in reader]
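# is_fasta / is_fastq are not defined in this file. A minimal sketch, assuming they
# simply test the file extension (the real helpers may inspect file content instead):
def _sketch_is_fasta(filename):
    return filename.lower().endswith(('.fa', '.fasta'))

def _sketch_is_fastq(filename):
    return filename.lower().endswith(('.fq', '.fastq'))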
def make_current_fastq(icec_obj, flnc_filename, root_dir):
    """
    The current FASTQ will consist of all ids.

    However, if this was an already-finished run and we are adding more input,
    then newids is empty; in that case we set newids = everything that has no
    affiliation or more than one affiliated cluster in d.
    """
    with FastqWriter(os.path.join(root_dir, 'current.fastq')) as f:
        for r in FastqReader(flnc_filename):
            f.writeRecord(r)
def read_sequences(sequence_file):
    """
    Parse a list of records from either a Fasta or Fastq file
    """
    if is_fasta(sequence_file):
        return list(FastaReader(sequence_file))
    elif is_fastq(sequence_file):
        return list(FastqReader(sequence_file))
    else:
        msg = 'Sequence file must be either Fasta or Fastq'
        log.error(msg)
        raise TypeError(msg)
def combine_fastq(sequence_files, output_file):
    """
    Combine a series of sequence files into one Fastq
    """
    with FastqWriter(output_file) as handle:
        for filename in sequence_files:
            try:
                for record in FastqReader(filename):
                    handle.writeRecord(record)
            except:
                log.warn('Could not open "%s" as Fastq' % filename)
    check_output_file(output_file)
    return output_file
def _parse_input_records(input_file):
    """
    Parse the input sequence records with the appropriate pbcore Reader
    """
    input_type = get_file_type(input_file)
    if input_type == 'fasta':
        return list(FastaReader(input_file))
    elif input_type == 'fastq':
        return list(FastqReader(input_file))
    else:
        msg = 'Input file must be either Fasta or Fastq'
        log.error(msg)
        raise TypeError(msg)
def separate_sequences(self):
    # Open the appropriate sequence reader
    if self.filetype == 'fasta':
        reader = FastaReader(self.input_file)
    elif self.filetype == 'fastq':
        reader = FastqReader(self.input_file)
    # Iterate through records, writing out those that belong to a known group
    for record in reader:
        try:
            group = self.groups[record.name]
        except KeyError:
            continue
        self.write_record(record, group)
def check_ids_unique(fa_or_fq_filename, is_fq=False):
    """
    Confirm that a FASTA/FASTQ file has all unique IDs
    (used probably by collapse or fusion finding script)
    """
    if is_fq:
        reader = FastqReader(fa_or_fq_filename)
    else:
        reader = FastaReader(fa_or_fq_filename)
    seen = set()
    for r in reader:
        if r.id in seen:
            raise Exception("Duplicate id {0} detected. Abort!".format(r.id))
        seen.add(r.id)
def combine_amp_analysis(input_dir, output_file):
    """
    Combine all AmpAnalysis results into a single Fastq file
    """
    log.info("Combining AmpliconAnalysis outputs")
    record_counter = 0
    file_counter = 0
    with FastqWriter(output_file) as writer:
        for result in find_amp_analysis_results(input_dir):
            file_counter += 1
            for record in FastqReader(result):
                record_counter += 1
                writer.writeRecord(record)
    log.info("Found {0} consensus sequences in {1} outputs".format(record_counter, file_counter))
    return output_file
def combine_fastq(input_files, output_file):
    """
    Combine sequences from multiple Fastq files into one
    """
    log.info("Combining multiple Fastq outputs")
    record_counter = 0
    file_counter = 0
    with FastqWriter(output_file) as writer:
        for filename in input_files:
            file_counter += 1
            for record in FastqReader(filename):
                record_counter += 1
                writer.writeRecord(record)
    log.info("Found {0} consensus sequences in {1} outputs".format(record_counter, file_counter))
    return output_file
def quality_filter(input_fastq, output_fastq, min_accuracy=ACCURACY):
    """
    Filter out sequences below a threshold of predicted accuracy
    """
    log.info("Filtering sequences below {0}% predicted accuracy".format(100 * min_accuracy))
    seq_count = 0
    pass_count = 0
    with FastqWriter(output_fastq) as writer:
        for record in FastqReader(input_fastq):
            seq_count += 1
            if predicted_accuracy(record) >= min_accuracy:
                pass_count += 1
                writer.writeRecord(record)
    percentage = round(100.0 * pass_count / seq_count, 4)
    log.info("{0} sequences of {1} ({2}%) passed filtering".format(pass_count, seq_count, percentage))
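# predicted_accuracy() is not defined in this file. A minimal sketch, assuming it is the
# usual Phred-style estimate (one minus the mean per-base error implied by the QVs); the
# real helper may compute the read-level accuracy differently:
def _sketch_predicted_accuracy(record):
    errors = [10.0 ** (-qv / 10.0) for qv in record.quality]
    return 1.0 - sum(errors) / len(errors)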
def filter_fastq(input_fastq, output_fastq, min_length=None, min_num_reads=None):
    """
    Filter a Fastq file based on various criteria
    """
    kept = 0
    total = 0
    with FastqWriter(output_fastq) as writer:
        for record in FastqReader(input_fastq):
            total += 1
            if min_length and len(record.sequence) < min_length:
                continue
            if min_num_reads and get_num_reads(record) < min_num_reads:
                continue
            kept += 1
            writer.writeRecord(record)
    log.info("Kept %s of %s consensus sequences" % (kept, total))
def convert_to_dazz_fasta(self):
    """
    Convert input fasta/fastq file to daligner-compatible fasta with ids:
    <prefix>/<index>/0_<seqlen>

    Also write out mappings to pickle
    """
    i = 1
    reader = FastaReader(self.input_filename) if self.filetype == 'fasta' \
        else FastqReader(self.input_filename)
    f = FastaWriter(self.dazz_filename)
    for r in reader:
        f.writeRecord("{p}/{i}/0_{len}".format(p=self.dazz_movie_name, i=i, len=len(r.sequence)),
                      r.sequence)
        self.dazz_mapping[i] = r.id
        i += 1
    f.close()
    with open(self.dazz_filename + '.pickle', 'w') as f:
        dump(self.dazz_mapping, f)
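# Hedged illustration (added; the movie name and read length are hypothetical): the
# daligner-style id written above for, say, the 3rd input read of length 1824 with
# dazz_movie_name 'prefix' would be 'prefix/3/0_1824', and dazz_mapping[3] would keep
# the original read id for the reverse lookup.
assert "{p}/{i}/0_{len}".format(p='prefix', i=3, len=1824) == "prefix/3/0_1824"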
def snr_filter(input_fastq, raw_data_file, output_fastq, min_snr=SNR):
    """
    Filter out sequences below a Signal-to-Noise Ratio threshold
    """
    log.info("Filtering sequences below {0} Signal-To-Noise Ratio".format(min_snr))
    seq_count = 0
    pass_count = 0
    raw_data = BasH5Collection(raw_data_file)
    with FastqWriter(output_fastq) as writer:
        for record in FastqReader(input_fastq):
            seq_count += 1
            zmw_name = '/'.join(record.name.strip().split('/')[:2])
            zmw = raw_data[zmw_name]
            zmw_snr = min(zmw.zmwMetric("HQRegionSNR"))
            print zmw_name, zmw_snr
            if zmw_snr >= min_snr:
                pass_count += 1
                writer.writeRecord(record)
    percentage = round(100.0 * pass_count / seq_count)
    log.info("{0} sequences of {1} ({2}%) passed filtering".format(pass_count, seq_count, percentage))
#!/usr/bin/env python
from pbcore.io.FastqIO import FastqReader, FastqWriter, FastqRecord
import shlex
import sys
import subprocess
import os
import re

usage = "usage: circulization.py initial_contigs.fastq 20000 /tmp circulaized_contigs.fastq"

try:
    fastq_f = FastqReader(sys.argv[1])
    prepostfix_size = int(sys.argv[2])
    tmp_dir = sys.argv[3]
    output_fn = sys.argv[4]
except:
    print usage
    sys.exit(1)

prefix_N = re.compile("^[Nn]+")
postfix_N = re.compile("[Nn]+$")

prefix_fn = os.path.join(tmp_dir, "prefix.fa")
postfix_fn = os.path.join(tmp_dir, "postfix.fa")

with FastqWriter(open(output_fn, "w")) as output_fh:
    for r in fastq_f:
        r_id = r.name
        r_seq = r.sequence
def __init__(self, input_fastq, output_fastq, min_accuracy=MIN_ACCURACY):
    self.input_reader = FastqReader(input_fastq)
    self.output_writer = FastqWriter(output_fastq)
    self.min_accuracy = min_accuracy
def open_reader(self):
    if self.filetype == 'fasta':
        self.reader = FastaReader(self.input_file)
    elif self.filetype == 'fastq':
        self.reader = FastqReader(self.input_file)
def filter_by_count(input_prefix, output_prefix, min_count):
    group_filename = input_prefix + '.group.txt'
    count_filename = input_prefix + '.abundance.txt'
    gff_filename = input_prefix + '.gff'
    rep_filename = input_prefix + '.rep.fq'

    # read group info, tracking the max FL/partial counts seen per PB id
    group_max_count_fl = {}
    group_max_count_p = {}
    f = open(group_filename)
    for line in f:
        # ex: PB.1.1    i0HQ_54b0ca|c58773/f30p16/700
        pbid, members = line.strip().split('\t')
        group_max_count_fl[pbid] = 0
        group_max_count_p[pbid] = 0
        members = members.split(',')
        for m in members:
            tmp = m.split('|')[1].split('/')[1]  # ex: tmp = f30p16
            fl_count, p_count = tmp.split('p')
            fl_count = int(fl_count[1:])
            p_count = int(p_count)
            group_max_count_fl[pbid] = max(group_max_count_fl[pbid], fl_count)
            group_max_count_p[pbid] = max(group_max_count_p[pbid], p_count)
    f.close()

    # read abundance first
    f = open(count_filename)
    count_header = ''
    while True:
        cur_pos = f.tell()
        line = f.readline()
        if not line.startswith('#'):
            f.seek(cur_pos)
            break
        else:
            count_header += line
    d = dict((r['pbid'], r) for r in DictReader(f, delimiter='\t'))
    for k, v in d.iteritems():
        print k, v
    f.close()

    # group_max_count_p NOT used for now
    good = filter(lambda x: int(d[x]['count_fl']) >= min_count and
                            group_max_count_fl[x] >= min_count and
                            group_max_count_p[x] >= 0, d)

    # write output GFF
    f = open(output_prefix + '.gff', 'w')
    for r in GFF.collapseGFFReader(gff_filename):
        if r.seqid in good:
            GFF.write_collapseGFF_format(f, r)
    f.close()

    # write output rep.fq
    f = FastqWriter(output_prefix + '.rep.fq')
    for r in FastqReader(rep_filename):
        if r.name.split('|')[0] in good:
            f.writeRecord(r)
    f.close()

    # write output to .abundance.txt
    f = open(output_prefix + '.abundance.txt', 'w')
    f.write(count_header)
    writer = DictWriter(f, fieldnames=['pbid', 'count_fl', 'count_nfl', 'count_nfl_amb',
                                       'norm_fl', 'norm_nfl', 'norm_nfl_amb'],
                        delimiter='\t', lineterminator='\n')
    writer.writeheader()
    for k in good:
        r = d[k]
        writer.writerow(r)
    f.close()
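# Hedged worked example (added for illustration): how the group member string parsed in
# filter_by_count() breaks down. Using the example member from the comment above, the
# 'f30p16' field means 30 full-length and 16 partial reads support that cluster:
member = 'i0HQ_54b0ca|c58773/f30p16/700'
tmp = member.split('|')[1].split('/')[1]   # 'f30p16'
fl_count, p_count = tmp.split('p')         # 'f30', '16'
assert int(fl_count[1:]) == 30 and int(p_count) == 16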
def _get_stats(fastq_file_name):
    raw_qvs = np.array([r.quality for r in FastqReader(fastq_file_name)])
    qvs = np.hstack(raw_qvs)
    reads = np.array([len(r.sequence) for r in FastqReader(fastq_file_name)])
    return qvs, reads
def add_seqs_from_fastq(self, fastq_filename, smooth=True):
    """Add sequence ids from a fastq file."""
    self.qver.precache_fastq(fastq_filename)
    newids = [r.name.split()[0] for r in FastqReader(fastq_filename)]
    self.qver.presmooth(newids, self.window_size)
def parseFastqData(self):
    self.fastqData = []
    log.info('Reading Fastq data into memory from %s...' % self.fastq)
    for fastqRecord in FastqReader(self.fastq):
        self.fastqData.append(fastqRecord)
def _parse_fastq(filename):
    return set([rec.name.split()[0] for rec in FastqReader(filename)])