def combine_quiver_results(split_dirs, output_dir, hq_filename, lq_filename, tofu_prefix=''):
    """
    Merge the per-size-bin quivered FASTQ files into two combined files.

    For each directory in `split_dirs` (ex: clusterOut/0to2k) the records of
    `<dir>/<hq_filename>` and `<dir>/<lq_filename>` are appended to
    `<output_dir>/all_sizes.quivered_hq.fastq` and
    `<output_dir>/all_sizes.quivered_lq.fastq`. Every record name gets a
    prefix built from the bin index and `tofu_prefix`
    (ex: i0HQ_<tofu_prefix>|<name>, i1LQ_<tofu_prefix>|<name>).

    Returns a tuple (hq_path, lq_path, prefix_dict_hq, prefix_dict_lq); the
    two dicts map each prefix to the absolute path of its bin directory.
    """
    prefix_dict_hq = {}
    prefix_dict_lq = {}
    fout_hq = FastqWriter(os.path.join(output_dir, 'all_sizes.quivered_hq.fastq'))
    fout_lq = FastqWriter(os.path.join(output_dir, 'all_sizes.quivered_lq.fastq'))
    try:
        for i, d in enumerate(split_dirs):
            file_hq = os.path.join(d, hq_filename)  # ex: 'all_quivered_hq.100_30_0.99.fastq'
            file_lq = os.path.join(d, lq_filename)  # ex: 'all_quivered_lq.fastq'
            sys.stderr.write("Adding prefix i{0}| to {1},{2}...\n".format(i, file_hq, file_lq))
            hq_prefix = "i{i}HQ_{p}".format(i=i, p=tofu_prefix)
            lq_prefix = "i{i}LQ_{p}".format(i=i, p=tofu_prefix)
            bin_dir = os.path.abspath(d)
            prefix_dict_hq[hq_prefix] = bin_dir
            prefix_dict_lq[lq_prefix] = bin_dir
            for r in FastqReader(file_hq):
                fout_hq.writeRecord("{0}|{1}".format(hq_prefix, r.name), r.sequence, r.quality)
            for r in FastqReader(file_lq):
                fout_lq.writeRecord("{0}|{1}".format(lq_prefix, r.name), r.sequence, r.quality)
    finally:
        # Close both outputs even when one of the inputs cannot be read
        fout_hq.close()
        fout_lq.close()
    sys.stderr.write("HQ quivered output combined to: {0}\n".format(fout_hq.file.name))
    sys.stderr.write("LQ quivered output combined to: {0}\n".format(fout_lq.file.name))
    return fout_hq.file.name, fout_lq.file.name, prefix_dict_hq, prefix_dict_lq
def open_writer(self):
    """
    Create self.writer for the trimmed output, named '<prefix>.trim.<ext>'.

    A FastaWriter is used when self.filetype is 'fasta' and a FastqWriter
    when it is 'fastq'; for any other filetype self.writer is left untouched.
    """
    kind = self.filetype
    if kind == 'fasta':
        self.writer = FastaWriter('%s.trim.fasta' % self.prefix)
        return
    if kind == 'fastq':
        self.writer = FastqWriter('%s.trim.fastq' % self.prefix)
def open_writer(self):
    """
    Open the output writer for trimmed records as '<prefix>.trim.<ext>'.

    self.filetype == 'fasta' yields a FastaWriter, 'fastq' a FastqWriter;
    any other value leaves self.writer unset.
    """
    filetype = self.filetype
    if filetype == 'fasta':
        target = '%s.trim.fasta' % self.prefix
        self.writer = FastaWriter(target)
    elif filetype == 'fastq':
        target = '%s.trim.fastq' % self.prefix
        self.writer = FastqWriter(target)
    return None
def pick_rep(fa_fq_filename, gff_filename, group_filename, output_filename, is_fq=False, pick_least_err_instead=True):
    """
    For each group, select the representative record and write it out.

    If is FASTA file (is_fq False) -- then always pick the longest one
    If is FASTQ file (is_fq True) -- then
          If pick_least_err_instead is True, pick the one w/ least number of expected base errors
          Else, pick the longest one

    The expected number of base errors of a read is the sum over its Phred
    quality values Q of 10**(-Q/10).

    Each output record is named "<pb_id>|<chr>:<start>-<end>(<strand>)|<member id>",
    with the coordinates taken from the 'transcript' lines of gff_filename.
    """
    use_err = is_fq and pick_least_err_instead
    if is_fq:
        fd = LazyFastqReader(fa_fq_filename)
        fout = FastqWriter(output_filename)
    else:
        fd = LazyFastaReader(fa_fq_filename)
        fout = FastaWriter(output_filename)

    try:
        coords = {}
        with open(gff_filename) as gff:
            for line in gff:
                # ex: chr1 PacBio transcript 27567 29336 . - . gene_id "PB.1"; transcript_id "PB.1.1";
                raw = line.strip().split('\t')
                if len(raw) < 3 or raw[2] != 'transcript':
                    continue  # blank/comment lines and non-transcript features
                # second attribute is transcript_id; [1:-2] strips the leading '"' and trailing '";'
                tid = raw[-1].split('; ')[1].split()[1][1:-2]
                coords[tid] = "{0}:{1}-{2}({3})".format(raw[0], raw[3], raw[4], raw[6])

        with open(group_filename) as groups:
            for line in groups:
                pb_id, members = line.strip().split('\t')
                sys.stderr.write("Picking representative sequence for {0}\n".format(pb_id))
                best_id = None
                best_seq = None
                best_qual = None
                best_err = float('inf')
                err = best_err
                max_len = 0
                for x in members.split(','):
                    rec = fd[x]  # look the record up once per member
                    if use_err:
                        err = sum(10 ** -(q / 10.) for q in rec.quality)
                        is_better = err < best_err
                    else:
                        # '>=' means the last of several equally long members wins
                        is_better = len(rec.sequence) >= max_len
                    if is_better:
                        best_id = x
                        best_seq = rec.sequence
                        if is_fq:
                            best_qual = rec.quality
                            best_err = err
                        max_len = len(rec.sequence)

                _id_ = "{0}|{1}|{2}".format(pb_id, coords[pb_id], best_id)
                if is_fq:
                    fout.writeRecord(_id_, best_seq, best_qual)
                else:
                    fout.writeRecord(_id_, best_seq)
    finally:
        fout.close()
class QualityFilter(object):
    """
    A tool for filtering low-quality records out of a Fastq file
    """

    def __init__(self, input_fastq, output_fastq, min_accuracy=MIN_ACCURACY):
        """
        Open input_fastq for reading and output_fastq for writing, and
        remember the minimum predicted accuracy a record needs to be kept.
        """
        self.input_reader = FastqReader(input_fastq)
        self.output_writer = FastqWriter(output_fastq)
        self.min_accuracy = min_accuracy

    def __call__(self):
        """
        Copy every record whose predicted_accuracy() is at least
        self.min_accuracy to the output, then close both files.
        """
        try:
            for fastq in self.input_reader:
                if predicted_accuracy(fastq) >= self.min_accuracy:
                    self.output_writer.writeRecord(fastq)
        finally:
            # Closing flushes the output; without it records may stay buffered
            self.output_writer.close()
            self.input_reader.close()
class QualityFilter(object):
    """
    A tool for filtering low-quality records out of a Fastq file
    """

    def __init__(self, input_fastq, output_fastq, min_accuracy=MIN_ACCURACY):
        """
        Open input_fastq for reading and output_fastq for writing, and
        remember the minimum predicted accuracy a record needs to be kept.
        """
        self.input_reader = FastqReader(input_fastq)
        self.output_writer = FastqWriter(output_fastq)
        self.min_accuracy = min_accuracy

    def __call__(self):
        """
        Copy every record whose predicted_accuracy() is at least
        self.min_accuracy to the output, then close both files.
        """
        try:
            for fastq in self.input_reader:
                if predicted_accuracy(fastq) >= self.min_accuracy:
                    self.output_writer.writeRecord(fastq)
        finally:
            # Closing flushes the output; without it records may stay buffered
            self.output_writer.close()
            self.input_reader.close()
class BarcodeTrimmer(object):
    """
    Trim sequence records according to per-record positions read from a
    tab-delimited barcode file, writing the result to '<prefix>.trim.<ext>'.
    """

    def __init__(self, input_file, barcode_file, prefix=None, filetype=None):
        """
        Store the inputs; prefix and filetype fall back to get_prefix() and
        get_filetype() of the input file when not given.
        """
        self.input_file = input_file
        self.barcode_file = barcode_file
        self.prefix = prefix or get_prefix(input_file)
        self.filetype = filetype or get_filetype(input_file)
        self.positions = {}

    def run(self):
        """Parse the barcode positions, then trim every input record."""
        self.parse_barcode_data()
        self.open_reader()
        self.open_writer()
        try:
            self.trim_sequences()
        finally:
            # Release the handles (and flush the output) even on failure.
            # getattr: open_reader/open_writer set nothing for unknown filetypes.
            for handle in (getattr(self, 'writer', None), getattr(self, 'reader', None)):
                if handle is not None:
                    handle.close()

    def parse_barcode_data(self):
        """
        Fill self.positions with {record id: (start, end)} from the barcode
        file. The header row (id == 'ID') is skipped; 'NA' becomes None.
        """
        with open(self.barcode_file) as handle:
            for row in csv.reader(handle, delimiter='\t'):
                entry = barcode._make(row)
                if entry.id == 'ID':
                    continue
                start = None if entry.end5 == 'NA' else int(entry.end5)
                end = None if entry.end3 == 'NA' else int(entry.end3)
                self.positions[entry.id] = (start, end)

    def open_reader(self):
        """Open self.reader as a Fasta or Fastq reader, per self.filetype."""
        if self.filetype == 'fasta':
            self.reader = FastaReader(self.input_file)
        elif self.filetype == 'fastq':
            self.reader = FastqReader(self.input_file)

    def open_writer(self):
        """Open self.writer on '<prefix>.trim.fasta' or '<prefix>.trim.fastq'."""
        if self.filetype == 'fasta':
            output_file = '%s.trim.fasta' % self.prefix
            self.writer = FastaWriter(output_file)
        elif self.filetype == 'fastq':
            output_file = '%s.trim.fastq' % self.prefix
            self.writer = FastqWriter(output_file)

    def trim_sequences(self):
        """
        Trim each record with trim_record() and write it out.

        Raises ValueError when a record has no entry in self.positions.
        """
        for record in self.reader:
            try:
                start, end = self.positions[record.name]
            except KeyError:
                msg = 'Unknown sequence record "%s"!' % record.name
                log.error(msg)
                raise ValueError(msg)
            trimmed_record = trim_record(record, start, end)
            self.writer.writeRecord(trimmed_record)
class BarcodeTrimmer(object):
    """
    Trim sequence records according to per-record positions read from a
    tab-delimited barcode file, writing the result to '<prefix>.trim.<ext>'.
    """

    def __init__(self, input_file, barcode_file, prefix=None, filetype=None):
        """
        Store the inputs; prefix and filetype fall back to get_prefix() and
        get_filetype() of the input file when not given.
        """
        self.input_file = input_file
        self.barcode_file = barcode_file
        self.prefix = prefix or get_prefix(input_file)
        self.filetype = filetype or get_filetype(input_file)
        self.positions = {}

    def run(self):
        """Parse the barcode positions, then trim every input record."""
        self.parse_barcode_data()
        self.open_reader()
        self.open_writer()
        try:
            self.trim_sequences()
        finally:
            # Release the handles (and flush the output) even on failure.
            # getattr: open_reader/open_writer set nothing for unknown filetypes.
            for handle in (getattr(self, 'writer', None), getattr(self, 'reader', None)):
                if handle is not None:
                    handle.close()

    def parse_barcode_data(self):
        """
        Fill self.positions with {record id: (start, end)} from the barcode
        file. The header row (id == 'ID') is skipped; 'NA' becomes None.
        """
        with open(self.barcode_file) as handle:
            for row in csv.reader(handle, delimiter='\t'):
                entry = barcode._make(row)
                if entry.id == 'ID':
                    continue
                start = None if entry.end5 == 'NA' else int(entry.end5)
                end = None if entry.end3 == 'NA' else int(entry.end3)
                self.positions[entry.id] = (start, end)

    def open_reader(self):
        """Open self.reader as a Fasta or Fastq reader, per self.filetype."""
        if self.filetype == 'fasta':
            self.reader = FastaReader(self.input_file)
        elif self.filetype == 'fastq':
            self.reader = FastqReader(self.input_file)

    def open_writer(self):
        """Open self.writer on '<prefix>.trim.fasta' or '<prefix>.trim.fastq'."""
        if self.filetype == 'fasta':
            output_file = '%s.trim.fasta' % self.prefix
            self.writer = FastaWriter(output_file)
        elif self.filetype == 'fastq':
            output_file = '%s.trim.fastq' % self.prefix
            self.writer = FastqWriter(output_file)

    def trim_sequences(self):
        """
        Trim each record with trim_record() and write it out.

        Raises ValueError when a record has no entry in self.positions.
        """
        for record in self.reader:
            try:
                start, end = self.positions[record.name]
            except KeyError:
                msg = 'Unknown sequence record "%s"!' % record.name
                log.error(msg)
                raise ValueError(msg)
            trimmed_record = trim_record(record, start, end)
            self.writer.writeRecord(trimmed_record)
def pick_rep(fa_fq_filename, gff_filename, group_filename, output_filename, is_fq=False, pick_least_err_instead=False, bad_gff_filename=None):
    """
    For each group, select the representative record and write it out.

    If is FASTA file (is_fq False) -- then always pick the longest one
    If is FASTQ file (is_fq True) -- then
          If pick_least_err_instead is True, pick the one w/ least number of expected base errors
          Else, pick the longest one

    The expected number of base errors of a read is the sum over its Phred
    quality values Q of 10**(-Q/10).

    Each output record is named "<pb_id>|<chr>:<start>-<end>(<strand>)|<member id>",
    with the coordinates taken from the 'transcript' lines of gff_filename
    and, when given, bad_gff_filename (whose entries override on a clash).
    """
    use_err = is_fq and pick_least_err_instead
    if is_fq:
        fd = LazyFastqReader(fa_fq_filename)
        fout = FastqWriter(output_filename)
    else:
        fd = LazyFastaReader(fa_fq_filename)
        fout = FastaWriter(output_filename)

    try:
        gff_files = [gff_filename]
        if bad_gff_filename is not None:
            gff_files.append(bad_gff_filename)

        coords = {}
        for path in gff_files:
            with open(path) as gff:
                for line in gff:
                    # ex: chr1 PacBio transcript 27567 29336 . - . gene_id "PB.1"; transcript_id "PB.1.1";
                    raw = line.strip().split('\t')
                    if len(raw) < 3 or raw[2] != 'transcript':
                        continue  # blank/comment lines and non-transcript features
                    # second attribute is transcript_id; [1:-2] strips the leading '"' and trailing '";'
                    tid = raw[-1].split('; ')[1].split()[1][1:-2]
                    coords[tid] = "{0}:{1}-{2}({3})".format(raw[0], raw[3], raw[4], raw[6])

        with open(group_filename) as groups:
            for line in groups:
                pb_id, members = line.strip().split('\t')
                sys.stderr.write("Picking representative sequence for {0}\n".format(pb_id))
                best_id = None
                best_seq = None
                best_qual = None
                best_err = float('inf')
                err = best_err
                max_len = 0
                for x in members.split(','):
                    rec = fd[x]  # look the record up once per member
                    if use_err:
                        err = sum(10 ** -(q / 10.) for q in rec.quality)
                        is_better = err < best_err
                    else:
                        # '>=' means the last of several equally long members wins
                        is_better = len(rec.sequence) >= max_len
                    if is_better:
                        best_id = x
                        best_seq = rec.sequence
                        if is_fq:
                            best_qual = rec.quality
                            best_err = err
                        max_len = len(rec.sequence)

                _id_ = "{0}|{1}|{2}".format(pb_id, coords[pb_id], best_id)
                if is_fq:
                    fout.writeRecord(_id_, best_seq, best_qual)
                else:
                    fout.writeRecord(_id_, best_seq)
    finally:
        fout.close()
def trim_fastq(fastq_file, output_file, window=WINDOW):
    """
    Trim every record of fastq_file and write the results to output_file.

    The boundaries of each record come from _find_start() and _find_end(),
    both called with the given window; _trim_fastq() applies them.
    """
    with FastqWriter(output_file) as out:
        for rec in FastqReader(fastq_file):
            left = _find_start(rec, window)
            right = _find_end(rec, window)
            out.writeRecord(_trim_fastq(rec, left, right))
def writerProcess(outDir):
    """
    Drain the result queue into a CSV report and per-barcode sequence files.

    Takes `totalNumber` results from `resultQueue` (both module-level names).
    Every result gets a row in <outDir>/Report.csv; results that pass all
    filters are also written to <outDir>/Demultiplexed/<barcode>.fastq, or
    .fasta when args.fastq is false.
    """
    # makes output directories
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    fastOutDir = os.path.join(outDir, "Demultiplexed/")
    if not os.path.exists(fastOutDir):
        os.makedirs(fastOutDir)

    # opens files
    writers = {}
    csvOut = open(os.path.join(outDir, "Report.csv"), "w")
    try:
        csvOut.write("Name,Barcode,NumPasses,Coverage,AvgConfidence,MinConfidence,TrimFail,MappingFail\n")
        for writecount in range(totalNumber):
            result = resultQueue.get()
            csvOut.write("%s,%s,%d,%d,%0.6f,%0.6f,%s,%s\n" % (
                result.name, result.barcode, result.numPasses, result.coverage,
                result.predictedAccuracy, result.minConfidence,
                result.trimFail, result.mappingFail))
            # One writer per barcode, created the first time the barcode is seen
            if result.barcode not in writers:
                if args.fastq:
                    writers[result.barcode] = FastqWriter(os.path.join(fastOutDir, result.barcode + ".fastq"))
                else:
                    writers[result.barcode] = FastaWriter(os.path.join(fastOutDir, result.barcode + ".fasta"))
            failed = any((result.minNumPassesFail, result.mappingFail, result.trimFail,
                          result.minCoverageFail, result.minAvgConfidenceFail,
                          result.minConfidenceFail))
            if not failed:
                if args.fastq:
                    writers[result.barcode].writeRecord(result.name, result.seq, result.qual)
                else:
                    writers[result.barcode].writeRecord(result.name, result.seq)
    finally:
        # Flush and release the report and every per-barcode file
        csvOut.close()
        for writer in writers.values():
            writer.close()
def main(parser):
    """
    Split the CCS reads of a BAM file into one Fastq file per barcode.

    Reads without a barcode call are skipped. A read is written to
    <outDir>/<barcode label>.fastq when its readScore, its barcode
    averageScore and its number of passes all reach the thresholds given on
    the command line.
    """
    args = parser.parse_args()

    bam = BamReader(args.ccsBAM)
    bcFofn = BarcodeH5Fofn(args.barcodeFofn)

    oFiles = {bc: FastqWriter('{dir}/{bc}.fastq'.format(dir=args.outDir, bc=bc))
              for bc in bcFofn.barcodeLabels}
    try:
        for rec in bam:
            try:
                lZmw = bcFofn.labeledZmwFromName(rec.readName)
            except KeyError:
                # catch zmws with no barcode and skip
                continue
            if rec.readScore >= args.minPredictedAccuracy \
               and lZmw.averageScore >= args.minAvgBarcodeScore \
               and rec.numPasses >= args.minNumPasses:
                header = rec.readName
                if args.extendedHeader:
                    header += ' predictedAccuracy={predAcc} numPasses={numPasses} barcodeScore={bcScore}'\
                              .format(predAcc=rec.readScore,
                                      numPasses=rec.numPasses,
                                      bcScore=lZmw.averageScore)
                # Decode the quality string (ASCII offset 33) into integers
                qual = [ord(q) - 33 for q in rec.peer.qual]
                writer = oFiles[bcFofn.barcodeLabels[lZmw.bestIdx]]
                writer.writeRecord(header, rec.read(aligned=False), qual)
    finally:
        # Close every per-barcode output, also when processing fails
        for f in oFiles.values():
            f.close()
def add_writer(self, group):
    """
    Register an output writer for `group` in self.writers.

    The file is named '<prefix>.g<group>.fasta' or '<prefix>.g<group>.fastq'
    depending on self.filetype; other filetypes add nothing.
    """
    kind = self.filetype
    if kind == 'fasta':
        self.writers[group] = FastaWriter('%s.g%s.fasta' % (self.prefix, group))
    if kind == 'fastq':
        self.writers[group] = FastqWriter('%s.g%s.fastq' % (self.prefix, group))
def _write_output(records, output_file, output_type):
    """Write the records to output_file (Fasta if requested, otherwise Fastq) and check the result"""
    if output_type != 'fasta':
        with FastqWriter(output_file) as fastq_out:
            for rec in records:
                fastq_out.writeRecord(rec)
    else:
        write_fasta(records, output_file)
    check_output_file(output_file)
def rename_resequencing(input_fastq, output_fastq):
    """
    Copy a resequenced Fastq, replacing each record name with the
    AA-style (NumReads-tagged) name returned by get_new_name()
    """
    with FastqWriter(output_fastq) as out:
        for rec in FastqReader(input_fastq):
            renamed = FastqRecord(get_new_name(rec.name), rec.sequence, rec.quality)
            out.writeRecord(renamed)
def write_fastq(records, output_file):
    """
    Write a FastqRecord, or a list of FastqRecords, out to file

    Returns output_file after check_output_file() has validated it.
    """
    # A lone record is wrapped so both documented input forms are accepted
    if isinstance(records, FastqRecord):
        records = [records]
    with FastqWriter(output_file) as handle:
        for record in records:
            assert isinstance(record, FastqRecord), 'Expected a FastqRecord'
            handle.writeRecord(record)
    check_output_file(output_file)
    return output_file
def _open_output_handle(output_file, output_type):
    """
    Return a FastaWriter or FastqWriter on output_file for recording exon
    sequences; any other output_type is logged and raises TypeError
    """
    writer_cls = None
    if output_type == 'fasta':
        writer_cls = FastaWriter
    elif output_type == 'fastq':
        writer_cls = FastqWriter

    if writer_cls is None:
        msg = 'Output type must be Fasta or Fastq'
        log.error(msg)
        raise TypeError(msg)
    return writer_cls(output_file)
def make_current_fastq(icec_obj, flnc_filename, root_dir):
    """
    Copy every record of flnc_filename into <root_dir>/current.fastq.

    icec_obj is accepted but not used by this function.
    """
    current_path = os.path.join(root_dir, 'current.fastq')
    with FastqWriter(current_path) as out:
        for rec in FastqReader(flnc_filename):
            out.writeRecord(rec)
def combine_fastq(sequence_files, output_file):
    """
    Combine a series of sequence files into one Fastq

    Files that cannot be processed as Fastq are skipped with a warning.
    Returns output_file after check_output_file() has validated it.
    """
    with FastqWriter(output_file) as handle:
        for filename in sequence_files:
            try:
                for record in FastqReader(filename):
                    handle.writeRecord(record)
            except Exception:
                # Best effort: report the offending file and keep going
                log.warn('Could not open "%s" as Fastq' % filename)
    check_output_file(output_file)
    return output_file
def combine_fastq(input_files, output_file):
    """
    Concatenate the records of several Fastq files into output_file,
    logging how many records and files were seen; returns output_file
    """
    log.info("Combining multiple Fastq outputs")
    n_records = 0
    n_files = 0
    with FastqWriter(output_file) as out:
        for n_files, path in enumerate(input_files, 1):
            for rec in FastqReader(path):
                out.writeRecord(rec)
                n_records += 1
    summary = "Found {0} consensus sequences in {1} outputs".format(n_records, n_files)
    log.info(summary)
    return output_file
def combine_amp_analysis(input_dir, output_file):
    """
    Concatenate every AmpAnalysis result found under input_dir (via
    find_amp_analysis_results) into one Fastq file; returns output_file
    """
    log.info("Combining AmpliconAnalysis outputs")
    n_records = 0
    n_files = 0
    with FastqWriter(output_file) as out:
        for n_files, result_path in enumerate(find_amp_analysis_results(input_dir), 1):
            for rec in FastqReader(result_path):
                out.writeRecord(rec)
                n_records += 1
    summary = "Found {0} consensus sequences in {1} outputs".format(n_records, n_files)
    log.info(summary)
    return output_file
def quality_filter(input_fastq, output_fastq, min_accuracy=ACCURACY):
    """
    Filter out sequences below a threshold of predicted accuracy

    Records of input_fastq whose predicted_accuracy() is at least
    min_accuracy are copied to output_fastq; the pass rate is logged.
    """
    log.info("Filtering sequences below {0}% predicted accuracy".format(
        100 * min_accuracy))
    seq_count = 0
    pass_count = 0
    with FastqWriter(output_fastq) as writer:
        for record in FastqReader(input_fastq):
            seq_count += 1
            if predicted_accuracy(record) >= min_accuracy:
                pass_count += 1
                writer.writeRecord(record)
    # An empty input must not crash the summary with a division by zero
    if seq_count:
        percentage = round(100.0 * pass_count / seq_count, 4)
    else:
        percentage = 0.0
    log.info("{0} sequences of {1} ({2}%) passed filtering".format(
        pass_count, seq_count, percentage))
def filter_fastq(input_fastq, output_fastq, min_length=None, min_num_reads=None):
    """
    Copy the records of input_fastq that satisfy the optional thresholds.

    A record is dropped when min_length is set and its sequence is shorter,
    or when min_num_reads is set and get_num_reads() is lower. The number
    of kept records is logged.
    """
    n_kept = 0
    n_seen = 0
    with FastqWriter(output_fastq) as out:
        for rec in FastqReader(input_fastq):
            n_seen += 1
            if (min_length and len(rec.sequence) < min_length) or \
               (min_num_reads and get_num_reads(rec) < min_num_reads):
                continue
            out.writeRecord(rec)
            n_kept += 1
    log.info("Kept %s of %s consensus sequences" % (n_kept, n_seen))
def extract_ccs_fastq(collection, output_file, min_length, min_snr):
    """
    Write the CCS reads of every movie in `collection` to a Fastq file.

    A CCS read is kept when it has at least min_length basecalls and the
    minimum of its ZMW's "HQRegionSNR" metric is at least min_snr. Pass
    rates are logged per movie and in total.
    """
    log.info('Extracting fastq CCS reads from input files')
    log.debug(' min_length: %s' % min_length)
    log.debug(' min_snr: %s' % min_snr)

    ccs_total = 0
    pass_total = 0
    with FastqWriter(output_file) as writer:
        for movie in collection.movieNames:
            log.info('Extracting fastq CCS reads from %s' % os.path.basename(movie))
            ccs_count = 0
            pass_count = 0
            for well in collection[movie].sequencingZmws:
                zmw = collection[movie][well]

                # Skip non-CCS ZMWs
                if not zmw.ccsRead:
                    continue
                ccs_count += 1

                # Skip short and low-SNR sequences
                basecalls = zmw.ccsRead.basecalls()
                if len(basecalls) < min_length:
                    continue
                zmw_snr = min(zmw.zmwMetric("HQRegionSNR"))
                if zmw_snr < min_snr:
                    continue
                pass_count += 1

                # Finally write the CCS Fastq to file
                record = FastqRecord(zmw.ccsRead.readName, basecalls,
                                     zmw.ccsRead.QualityValue())
                writer.writeRecord(record)

            # A movie without CCS reads must not divide by zero
            percentage = round(100.0 * pass_count / ccs_count) if ccs_count else 0.0
            log.info(
                "Identified {0} CCS reads, of which {1} ({2}%) passed filter".
                format(ccs_count, pass_count, percentage))
            ccs_total += ccs_count
            pass_total += pass_count

    percentage = round(100.0 * pass_total / ccs_total) if ccs_total else 0.0
    log.info(
        'Found a total of {0} CCS reads, of which {1} ({2}%) passed filter'.
        format(ccs_total, pass_total, percentage))
def snr_filter(input_fastq, raw_data_file, output_fastq, min_snr=SNR):
    """
    Filter out sequences below a threshold of Signal-To-Noise Ratio

    For each record of input_fastq the ZMW is looked up in raw_data_file by
    the first two '/'-separated fields of the record name; the record is
    kept when the minimum of the ZMW's "HQRegionSNR" metric is at least
    min_snr. The ZMW name and SNR of every record are printed to stdout.
    """
    log.info(
        "Filtering sequences below {0} Signal-To-Noise Ratio".format(min_snr))
    seq_count = 0
    pass_count = 0
    raw_data = BasH5Collection(raw_data_file)
    with FastqWriter(output_fastq) as writer:
        for record in FastqReader(input_fastq):
            seq_count += 1
            zmw_name = '/'.join(record.name.strip().split('/')[:2])
            zmw = raw_data[zmw_name]
            zmw_snr = min(zmw.zmwMetric("HQRegionSNR"))
            print("%s %s" % (zmw_name, zmw_snr))
            if zmw_snr >= min_snr:
                pass_count += 1
                writer.writeRecord(record)
    # An empty input must not crash the summary with a division by zero
    percentage = round(100.0 * pass_count / seq_count) if seq_count else 0.0
    log.info("{0} sequences of {1} ({2}%) passed filtering".format(
        pass_count, seq_count, percentage))
def pick_rep(fa_fq_filename, sam_filename, gff_filename, group_filename, output_filename, is_fq=False, pick_least_err_instead=False):
    """
    For each group, select the representative record, write a GFF for the
    representatives' alignments and write the representative sequences.

    If is FASTA file (is_fq False) -- then always pick the longest one
    If is FASTQ file (is_fq True) -- then
          If pick_least_err_instead is True, pick the one w/ least number of expected base errors
          Else, pick the longest one

    The expected number of base errors of a read is the sum over its Phred
    quality values Q of 10**(-Q/10).

    NOTE: gff_filename is an OUTPUT. It is (over)written with transcript and
    exon lines built from the records of sam_filename that belong to a
    representative. Output sequences are named
    "<pb_id>|<loc>+<loc>...|<member id>" where each <loc> is
    "<chr>:<start>-<end>(<strand>)" of one SAM record.
    """
    gff_template = "{chr}\tPacBio\t{feature}\t{s}\t{e}\t.\t{strand}\t.\tgene_id \"{pi}\"; transcript_id \"{pi}.{j}\";\n"

    def write_gff_record(handle, rec, pb_id, isoform_index):
        # One transcript line spanning all segments, then one exon line per segment
        handle.write(gff_template.format(
            chr=rec.sID, feature='transcript', s=rec.segments[0].start+1,
            e=rec.segments[-1].end, pi=pb_id, j=isoform_index, strand=rec.flag.strand))
        for seg in rec.segments:
            handle.write(gff_template.format(
                chr=rec.sID, feature='exon', s=seg.start+1, e=seg.end,
                pi=pb_id, j=isoform_index, strand=rec.flag.strand))

    use_err = is_fq and pick_least_err_instead
    if is_fq:
        fd = LazyFastqReader(fa_fq_filename)
        fout = FastqWriter(output_filename)
    else:
        fd = LazyFastaReader(fa_fq_filename)
        fout = FastaWriter(output_filename)

    try:
        rep_info = {}   # pb_id --> (best_id, best_seq, best_qual)
        id_to_rep = {}  # best_id --> pb_id
        with open(group_filename) as groups:
            for line in groups:
                pb_id, members = line.strip().split('\t')
                sys.stderr.write("Picking representative sequence for {0}\n".format(pb_id))
                best_id = None
                best_seq = None
                best_qual = None
                best_err = float('inf')
                err = best_err
                max_len = 0
                for x in members.split(','):
                    rec = fd[x]  # look the record up once per member
                    if use_err:
                        err = sum(10 ** -(q / 10.) for q in rec.quality)
                        is_better = err < best_err
                    else:
                        # '>=' means the last of several equally long members wins
                        is_better = len(rec.sequence) >= max_len
                    if is_better:
                        best_id = x
                        best_seq = rec.sequence
                        if is_fq:
                            best_qual = rec.quality
                            best_err = err
                        max_len = len(rec.sequence)
                rep_info[pb_id] = (best_id, best_seq, best_qual)
                id_to_rep[best_id] = pb_id

        coords = {}
        record_storage = {}  # temporary storage for the .1 record to write in conjunction with second record
        with open(gff_filename, 'w') as f_gff:
            for r in BioReaders.GMAPSAMReader(sam_filename, True):
                if r.qID not in id_to_rep:
                    continue
                pb_id = id_to_rep[r.qID]
                loc = "{0}:{1}-{2}({3})".format(r.sID, r.sStart, r.sEnd, r.flag.strand)
                if r.qID not in coords:
                    # this is the .1 portion: remember it until its partner shows up
                    coords[r.qID] = loc
                    record_storage[pb_id] = r
                else:
                    # this is the .2 portion: write the stored .1 record, then this one
                    # NOTE(review): a third record for the same read would re-emit
                    # the stored record as .1 and itself as .2 -- confirm intended
                    coords[r.qID] += "+" + loc
                    write_gff_record(f_gff, record_storage[pb_id], pb_id, 1)
                    write_gff_record(f_gff, r, pb_id, 2)

        for pb_id in rep_info:
            best_id, best_seq, best_qual = rep_info[pb_id]
            _id_ = "{0}|{1}|{2}".format(pb_id, coords[best_id], best_id)
            if is_fq:
                fout.writeRecord(_id_, best_seq, best_qual)
            else:
                fout.writeRecord(_id_, best_seq)
    finally:
        # The original never closed the sequence output
        fout.close()
def writeFastqData(self):
    """Write every record of self.maskedFastqs to the Fastq file self.output"""
    destination = self.output
    log.info('Writing the masked Fastq data out to "%s"...' % destination)
    with FastqWriter(destination) as fastq_out:
        for masked in self.maskedFastqs:
            fastq_out.writeRecord(masked)
def __init__(self, input_fastq, output_fastq, min_accuracy=MIN_ACCURACY):
    """
    Remember the accuracy threshold and open input_fastq for reading and
    output_fastq for writing
    """
    self.min_accuracy = min_accuracy
    reader = FastqReader(input_fastq)
    writer = FastqWriter(output_fastq)
    self.input_reader = reader
    self.output_writer = writer
def main():
    """
    Remove subset isoforms from a collapsed result and write filtered files.

    Reads <input_prefix>.gff, .abundance.txt and .rep.fq. The records of each
    PB.<n> locus are passed through filter_out_subsets(); the surviving
    records are written, in ascending locus order, to <output_prefix>.gff
    and the matching entries to <output_prefix>.rep.fq and
    <output_prefix>.abundance.txt.
    """
    from argparse import ArgumentParser
    parser = ArgumentParser()
    parser.add_argument("input_prefix", help="Input prefix")
    parser.add_argument("output_prefix", help="Output prefix")
    parser.add_argument("--fuzzy_junction", type=int, default=5, help="Fuzzy junction max dist (default: 5bp)")

    args = parser.parse_args()

    count_filename = args.input_prefix + '.abundance.txt'
    gff_filename = args.input_prefix + '.gff'
    rep_filename = args.input_prefix + '.rep.fq'

    # group the GFF records by the locus number n of their 'PB.<n>...' seqid
    recs = defaultdict(list)
    for r in GFF.collapseGFFReader(gff_filename):
        assert r.seqid.startswith('PB.')
        recs[int(r.seqid.split('.')[1])].append(r)

    good = []
    with open(args.output_prefix + '.gff', 'w') as f:
        # iterate the loci in sorted order (the sorted keys used to be ignored)
        for k in sorted(recs):
            xxx = recs[k]
            filter_out_subsets(xxx, args.fuzzy_junction)
            for r in xxx:
                GFF.write_collapseGFF_format(f, r)
                good.append(r.seqid)
    good_ids = set(good)  # O(1) membership tests below

    # read abundance first
    with open(count_filename) as f:
        count_header = ''
        while True:
            # collect the leading '#' lines, then rewind to the first data line
            cur_pos = f.tell()
            line = f.readline()
            if not line.startswith('#'):
                f.seek(cur_pos)
                break
            count_header += line
        d = dict((r['pbid'], r) for r in DictReader(f, delimiter='\t'))
        for k, v in d.items():
            print("%s %s" % (k, v))

    # write output rep.fq
    f = FastqWriter(args.output_prefix + '.rep.fq')
    try:
        for r in FastqReader(rep_filename):
            if r.name.split('|')[0] in good_ids:
                f.writeRecord(r)
    finally:
        f.close()

    # write output to .abundance.txt
    with open(args.output_prefix + '.abundance.txt', 'w') as f:
        f.write(count_header)
        writer = DictWriter(f, fieldnames=['pbid','count_fl','count_nfl','count_nfl_amb','norm_fl','norm_nfl','norm_nfl_amb'],
                            delimiter='\t', lineterminator='\n')
        writer.writeheader()
        for k in good:
            writer.writerow(d[k])
def writeFastqData(self):
    """Write every record of self.alignedFastqs to the Fastq file self.output"""
    destination = self.output
    log.info('Writing aligned Fastq data out to "%s"' % destination)
    with FastqWriter(destination) as fastq_out:
        for aligned in self.alignedFastqs:
            fastq_out.writeRecord(aligned)
def write_fastq_records(records, filename):
    """Write the given FastqRecords to filename and check the resulting file"""
    banner = "Writing {0} FastqRecords to {1}".format(len(records), filename)
    log.info(banner)
    with FastqWriter(filename) as fastq_out:
        for rec in records:
            fastq_out.writeRecord(rec)
    check_output_file(filename)
def filter_by_count(input_prefix, output_prefix, min_count):
    """
    Keep only the isoforms supported by at least min_count full-length reads.

    Reads <input_prefix>.group.txt, .abundance.txt, .gff and .rep.fq. An
    isoform is kept when its "count_fl" abundance is >= min_count and at
    least one of its group members has an f<N> count >= min_count. The kept
    isoforms are written to <output_prefix>.gff, .rep.fq and .abundance.txt.
    """
    group_filename = input_prefix + ".group.txt"
    count_filename = input_prefix + ".abundance.txt"
    gff_filename = input_prefix + ".gff"
    rep_filename = input_prefix + ".rep.fq"

    # read group
    group_max_count_fl = {}
    group_max_count_p = {}
    with open(group_filename) as f:
        for line in f:
            # ex: PB.1.1  i0HQ_54b0ca|c58773/f30p16/700
            pbid, members = line.strip().split("\t")
            fl_max = 0
            p_max = 0
            for m in members.split(","):
                tmp = m.split("|")[1].split("/")[1]  # ex: tmp = f30p16
                fl_count, p_count = tmp.split("p")
                fl_max = max(fl_max, int(fl_count[1:]))
                p_max = max(p_max, int(p_count))
            group_max_count_fl[pbid] = fl_max
            group_max_count_p[pbid] = p_max

    # read abundance first
    with open(count_filename) as f:
        count_header = ""
        while True:
            # collect the leading '#' lines, then rewind to the first data line
            cur_pos = f.tell()
            line = f.readline()
            if not line.startswith("#"):
                f.seek(cur_pos)
                break
            count_header += line
        d = dict((r["pbid"], r) for r in DictReader(f, delimiter="\t"))
        for k, v in d.items():
            print("%s %s" % (k, v))

    # group_max_count_p NOT used for now (the >= 0 test accepts every count)
    good = [x for x in d
            if int(d[x]["count_fl"]) >= min_count
            and group_max_count_fl[x] >= min_count
            and group_max_count_p[x] >= 0]
    good_ids = set(good)  # O(1) membership tests below

    # write output GFF
    with open(output_prefix + ".gff", "w") as f:
        for r in GFF.collapseGFFReader(gff_filename):
            if r.seqid in good_ids:
                GFF.write_collapseGFF_format(f, r)

    # write output rep.fq
    f = FastqWriter(output_prefix + ".rep.fq")
    try:
        for r in FastqReader(rep_filename):
            if r.name.split("|")[0] in good_ids:
                f.writeRecord(r)
    finally:
        f.close()

    # write output to .abundance.txt
    with open(output_prefix + ".abundance.txt", "w") as f:
        f.write(count_header)
        writer = DictWriter(
            f,
            fieldnames=["pbid", "count_fl", "count_nfl", "count_nfl_amb", "norm_fl", "norm_nfl", "norm_nfl_amb"],
            delimiter="\t",
            lineterminator="\n",
        )
        writer.writeheader()
        for k in good:
            writer.writerow(d[k])
def outputSubreadFastq(self):
    """
    Open self.output as Fastq and let writeSubreadFastq() write the
    subreads of each reader in self.bash5_readers into it
    """
    log.info('Parsing Fastq subreads from input BAS.H5 files')
    with FastqWriter(self.output) as fastq_out:
        for bash5 in self.bash5_readers:
            self.writeSubreadFastq(bash5, fastq_out)
def pick_rep(fa_fq_filename, sam_filename, gff_filename, group_filename, output_filename, is_fq=False, pick_least_err_instead=False):
    """
    For each group, select the representative record, write a GFF for the
    representatives' alignments and write the representative sequences.

    If is FASTA file (is_fq False) -- then always pick the longest one
    If is FASTQ file (is_fq True) -- then
          If pick_least_err_instead is True, pick the one w/ least number of expected base errors
          Else, pick the longest one

    The expected number of base errors of a read is the sum over its Phred
    quality values Q of 10**(-Q/10).

    NOTE: gff_filename is an OUTPUT. It is (over)written with transcript and
    exon lines built from the records of sam_filename that belong to a
    representative. Output sequences are named
    "<pb_id>|<loc>+<loc>...|<member id>" where each <loc> is
    "<chr>:<start>-<end>(<strand>)" of one SAM record.
    """
    gff_template = "{chr}\tPacBio\t{feature}\t{s}\t{e}\t.\t{strand}\t.\tgene_id \"{pi}\"; transcript_id \"{pi}.{j}\";\n"

    def write_gff_record(handle, rec, pb_id, isoform_index):
        # One transcript line spanning all segments, then one exon line per segment
        handle.write(gff_template.format(
            chr=rec.sID, feature='transcript', s=rec.segments[0].start+1,
            e=rec.segments[-1].end, pi=pb_id, j=isoform_index, strand=rec.flag.strand))
        for seg in rec.segments:
            handle.write(gff_template.format(
                chr=rec.sID, feature='exon', s=seg.start+1, e=seg.end,
                pi=pb_id, j=isoform_index, strand=rec.flag.strand))

    use_err = is_fq and pick_least_err_instead
    if is_fq:
        fd = LazyFastqReader(fa_fq_filename)
        fout = FastqWriter(output_filename)
    else:
        fd = LazyFastaReader(fa_fq_filename)
        fout = FastaWriter(output_filename)

    try:
        rep_info = {}   # pb_id --> (best_id, best_seq, best_qual)
        id_to_rep = {}  # best_id --> pb_id
        with open(group_filename) as groups:
            for line in groups:
                pb_id, members = line.strip().split('\t')
                sys.stderr.write("Picking representative sequence for {0}\n".format(pb_id))
                best_id = None
                best_seq = None
                best_qual = None
                best_err = float('inf')
                err = best_err
                max_len = 0
                for x in members.split(','):
                    rec = fd[x]  # look the record up once per member
                    if use_err:
                        err = sum(10 ** -(q / 10.) for q in rec.quality)
                        is_better = err < best_err
                    else:
                        # '>=' means the last of several equally long members wins
                        is_better = len(rec.sequence) >= max_len
                    if is_better:
                        best_id = x
                        best_seq = rec.sequence
                        if is_fq:
                            best_qual = rec.quality
                            best_err = err
                        max_len = len(rec.sequence)
                rep_info[pb_id] = (best_id, best_seq, best_qual)
                id_to_rep[best_id] = pb_id

        coords = {}
        record_storage = {}  # temporary storage for the .1 record to write in conjunction with second record
        with open(gff_filename, 'w') as f_gff:
            for r in BioReaders.GMAPSAMReader(sam_filename, True):
                if r.qID not in id_to_rep:
                    continue
                pb_id = id_to_rep[r.qID]
                loc = "{0}:{1}-{2}({3})".format(r.sID, r.sStart, r.sEnd, r.flag.strand)
                if r.qID not in coords:
                    # this is the .1 portion: remember it until its partner shows up
                    coords[r.qID] = loc
                    record_storage[pb_id] = r
                else:
                    # this is the .2 portion: write the stored .1 record, then this one
                    # NOTE(review): a third record for the same read would re-emit
                    # the stored record as .1 and itself as .2 -- confirm intended
                    coords[r.qID] += "+" + loc
                    write_gff_record(f_gff, record_storage[pb_id], pb_id, 1)
                    write_gff_record(f_gff, r, pb_id, 2)

        for pb_id in rep_info:
            best_id, best_seq, best_qual = rep_info[pb_id]
            _id_ = "{0}|{1}|{2}".format(pb_id, coords[best_id], best_id)
            if is_fq:
                fout.writeRecord(_id_, best_seq, best_qual)
            else:
                fout.writeRecord(_id_, best_seq)
    finally:
        # The original never closed the sequence output
        fout.close()
def filter_by_count(input_prefix, output_prefix, min_count):
    """
    Keep only the isoforms supported by at least min_count full-length reads.

    Reads <input_prefix>.group.txt, .abundance.txt, .gff and .rep.fq. An
    isoform is kept when its 'count_fl' abundance is >= min_count and at
    least one of its group members has an f<N> count >= min_count. The kept
    isoforms are written to <output_prefix>.gff, .rep.fq and .abundance.txt.
    """
    group_filename = input_prefix + '.group.txt'
    count_filename = input_prefix + '.abundance.txt'
    gff_filename = input_prefix + '.gff'
    rep_filename = input_prefix + '.rep.fq'

    # read group
    group_max_count_fl = {}
    group_max_count_p = {}
    with open(group_filename) as f:
        for line in f:
            # ex: PB.1.1  i0HQ_54b0ca|c58773/f30p16/700
            pbid, members = line.strip().split('\t')
            fl_max = 0
            p_max = 0
            for m in members.split(','):
                tmp = m.split('|')[1].split('/')[1]  # ex: tmp = f30p16
                fl_count, p_count = tmp.split('p')
                fl_max = max(fl_max, int(fl_count[1:]))
                p_max = max(p_max, int(p_count))
            group_max_count_fl[pbid] = fl_max
            group_max_count_p[pbid] = p_max

    # read abundance first
    with open(count_filename) as f:
        count_header = ''
        while True:
            # collect the leading '#' lines, then rewind to the first data line
            cur_pos = f.tell()
            line = f.readline()
            if not line.startswith('#'):
                f.seek(cur_pos)
                break
            count_header += line
        d = dict((r['pbid'], r) for r in DictReader(f, delimiter='\t'))
        for k, v in d.items():
            print("%s %s" % (k, v))

    # group_max_count_p NOT used for now (the >= 0 test accepts every count)
    good = [x for x in d
            if int(d[x]['count_fl']) >= min_count
            and group_max_count_fl[x] >= min_count
            and group_max_count_p[x] >= 0]
    good_ids = set(good)  # O(1) membership tests below

    # write output GFF
    with open(output_prefix + '.gff', 'w') as f:
        for r in GFF.collapseGFFReader(gff_filename):
            if r.seqid in good_ids:
                GFF.write_collapseGFF_format(f, r)

    # write output rep.fq
    f = FastqWriter(output_prefix + '.rep.fq')
    try:
        for r in FastqReader(rep_filename):
            if r.name.split('|')[0] in good_ids:
                f.writeRecord(r)
    finally:
        f.close()

    # write output to .abundance.txt
    with open(output_prefix + '.abundance.txt', 'w') as f:
        f.write(count_header)
        writer = DictWriter(f, fieldnames=['pbid','count_fl','count_nfl','count_nfl_amb','norm_fl','norm_nfl','norm_nfl_amb'],
                            delimiter='\t', lineterminator='\n')
        writer.writeheader()
        for k in good:
            writer.writerow(d[k])
try: fastq_f = FastqReader(sys.argv[1]) prepostfix_size = int(sys.argv[2]) tmp_dir = sys.argv[3] output_fn = sys.argv[4] except: print usage sys.exit(1) prefix_N = re.compile("^[Nn]+") postfix_N = re.compile("[Nn]+$") prefix_fn = os.path.join(tmp_dir, "prefix.fa") postfix_fn = os.path.join(tmp_dir, "postfix.fa") with FastqWriter(open(output_fn, "w")) as output_fh: for r in fastq_f: r_id = r.name r_seq = r.sequence r_qv = r.quality m = prefix_N.search(r_seq) if m: prefix_trim = m.end() else: prefix_trim = 0 m = postfix_N.search(r_seq) if m: postfix_trim = m.start() else: postfix_trim = len(r_seq)