class FastqEmitter(object):
    def __init__(self, filename):
        self.writer = FastqWriter(filename)

    def emit(self, zmwRead):
        self.writer.writeRecord(zmwRead.readName,
                                zmwRead.basecalls(),
                                zmwRead.QualityValue())
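# --- Hedged usage sketch (not from the source): driving FastqEmitter with
# pbcore's BasH5Reader. Assumes the pbcore.io API in which a reader is indexed
# by hole number and each Zmw exposes a `.subreads` list of ZmwRead objects;
# the function name and "movie.bas.h5" path are illustrative placeholders.
from pbcore.io import BasH5Reader

def emit_all_subreads(bas_h5_fn, fastq_fn):
    emitter = FastqEmitter(fastq_fn)
    with BasH5Reader(bas_h5_fn) as reader:
        for hn in reader.sequencingZmws:       # hole numbers with usable data
            for zmwRead in reader[hn].subreads:
                emitter.emit(zmwRead)
    emitter.writer.close()                     # FastqEmitter has no close() of its own

# emit_all_subreads("movie.bas.h5", "subreads.fastq")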
def combine_polished_isoforms(split_indices, split_hq_fns, split_lq_fns,
                              combined_hq_fa, combined_hq_fq,
                              combined_lq_fa, combined_lq_fq,
                              hq_lq_prefix_dict_pickle, sample_name):
    """Combine split HQ (LQ) files and save to combined_dir.
    Dump the HQ|LQ prefix dictionary to a pickle.

    Parameters:
      split_indices -- indices of split cluster bins.
      split_hq_fns -- HQ files, e.g., ['*/all_quivered_hq.100_30_0.99.fastq', ...]
      split_lq_fns -- LQ files, e.g., ['all_quivered_lq.fastq', ...]
    """
    assert len(split_indices) == len(split_hq_fns)
    assert len(split_indices) == len(split_lq_fns)
    assert all([f.endswith(".fastq") for f in split_hq_fns + split_lq_fns])

    hq_pre_dict, lq_pre_dict = {}, {}

    hq_fa_writer = FastaWriter(combined_hq_fa)
    hq_fq_writer = FastqWriter(combined_hq_fq)
    lq_fa_writer = FastaWriter(combined_lq_fa)
    lq_fq_writer = FastqWriter(combined_lq_fq)

    for i, split_hq, split_lq in zip(split_indices, split_hq_fns, split_lq_fns):
        logging.debug("Adding prefix i%s_| to %s, %s", str(i), split_hq, split_lq)
        hq_prefix = combined_prefix(cluster_bin_index=i, isoform_type="HQ",
                                    sample_name=sample_name)
        lq_prefix = combined_prefix(cluster_bin_index=i, isoform_type="LQ",
                                    sample_name=sample_name)

        hq_pre_dict[hq_prefix] = op.dirname(op.abspath(split_hq))
        lq_pre_dict[lq_prefix] = op.dirname(op.abspath(split_lq))

        with FastqReader(split_hq) as reader:
            for read in reader:
                name = combined_cid_hq_name(cluster_bin_index=i,
                                            name=read.name,
                                            sample_name=sample_name)
                hq_fa_writer.writeRecord(name, read.sequence[:])
                hq_fq_writer.writeRecord(name, read.sequence[:], read.quality)

        with FastqReader(split_lq) as reader:
            for read in reader:
                name = combined_cid_lq_name(cluster_bin_index=i,
                                            name=read.name,
                                            sample_name=sample_name)
                lq_fa_writer.writeRecord(name, read.sequence[:])
                lq_fq_writer.writeRecord(name, read.sequence[:], read.quality)

    hq_fa_writer.close()
    hq_fq_writer.close()
    lq_fa_writer.close()
    lq_fq_writer.close()

    logging.info("HQ polished output combined to: %s", combined_hq_fq)
    logging.info("LQ polished output combined to: %s", combined_lq_fq)
    logging.info("Dumping HQ|LQ prefix dictionary to: %s", hq_lq_prefix_dict_pickle)
    with open(hq_lq_prefix_dict_pickle, 'wb') as writer:
        cPickle.dump({'HQ': hq_pre_dict, 'LQ': lq_pre_dict}, writer)
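# --- Hedged sketch (not from the source): reading back the prefix pickle
# written above. Per the code, the pickle holds a dict keyed by 'HQ' and 'LQ',
# each mapping a combined prefix to the directory of its cluster bin. Uses
# Python 2's cPickle, matching the source; the helper name is an assumption.
import cPickle

def load_hq_lq_prefix_dict(hq_lq_prefix_dict_pickle):
    with open(hq_lq_prefix_dict_pickle, 'rb') as reader:
        d = cPickle.load(reader)
    assert 'HQ' in d and 'LQ' in d
    return d['HQ'], d['LQ']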
def split(self, first_split=None):
    """Split `input_fastq` into smaller files, each containing at most
    `reads_per_split` reads. Return the list of split fastq files."""
    split_index = 0
    self.out_fns = []
    writer = FastqWriter(self._out_fn(split_index))
    self.out_fns.append(self._out_fn(split_index))

    if first_split is None:
        first_split = self.reads_per_split

    with FastqReader(self.input_fastq) as reader:
        for ridx, r in enumerate(reader):
            # Open a new split file at each boundary; the first split may
            # hold a different number of reads than the rest.
            if ((split_index == 0 and ridx == first_split) or
                    (split_index > 0 and ridx % self.reads_per_split == 0)) \
                    and ridx != 0:
                split_index += 1
                writer.close()
                writer = FastqWriter(self._out_fn(split_index))
                self.out_fns.append(self._out_fn(split_index))
            writer.writeRecord(r.name, r.sequence, r.quality)

    writer.close()
    return list(self.out_fns)
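# --- Hedged sketch (not from the source): a hypothetical FastqSplitter class
# supplying the attributes split() above relies on (`input_fastq`,
# `reads_per_split`, `_out_fn`); split() would be defined on this class.
# The output naming scheme here is illustrative only.
class FastqSplitter(object):
    def __init__(self, input_fastq, reads_per_split, out_prefix):
        self.input_fastq = input_fastq
        self.reads_per_split = reads_per_split
        self.out_prefix = out_prefix
        self.out_fns = []

    def _out_fn(self, split_index):
        # e.g., <out_prefix>.0.fastq, <out_prefix>.1.fastq, ...
        return "%s.%d.fastq" % (self.out_prefix, split_index)

# Usage: out_fns = FastqSplitter("in.fastq", 1000, "out").split(first_split=100)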
class ResultCollector(object):
    """
    Gathers results and writes to a file.
    """

    def __init__(self, resultsQueue, algorithmName, algorithmConfig):
        self._resultsQueue = resultsQueue
        self._algorithmName = algorithmName
        self._algorithmConfig = algorithmConfig

    def _run(self):
        self.onStart()
        sentinelsReceived = 0
        while sentinelsReceived < options.numWorkers:
            result = self._resultsQueue.get()
            if result is None:
                sentinelsReceived += 1
            else:
                self.onResult(result)
        self.onFinish()

    def run(self):
        if options.doProfiling:
            cProfile.runctx("self._run()",
                            globals=globals(),
                            locals=locals(),
                            filename=os.path.join(options.temporaryDirectory,
                                                  "profile-%s.out" % (self.name)))
        else:
            self._run()

    # ==================================
    # Overridable interface begins here.
    #

    def onStart(self):
        self.referenceBasesProcessedById = OrderedDict()
        for refId in reference.byName:
            self.referenceBasesProcessedById[refId] = 0
        self.variantsByRefId = defaultdict(list)
        self.consensusChunksByRefId = defaultdict(list)

        # open file writers
        self.fastaWriter = None
        self.fastqWriter = None
        self.gffWriter = None
        self.vcfWriter = None
        if options.fastaOutputFilename:
            self.fastaWriter = FastaWriter(options.fastaOutputFilename)
        if options.fastqOutputFilename:
            self.fastqWriter = FastqWriter(options.fastqOutputFilename)
        if options.gffOutputFilename:
            self.gffWriter = VariantsGffWriter(options.gffOutputFilename,
                                               vars(options),
                                               reference.byName.values())
        if options.vcfOutputFilename:
            self.vcfWriter = VariantsVcfWriter(options.vcfOutputFilename,
                                               vars(options),
                                               reference.byName.values())

    def onResult(self, result):
        window, cssAndVariants = result
        css, variants = cssAndVariants
        self._recordNewResults(window, css, variants)
        self._flushContigIfCompleted(window)

    def onFinish(self):
        logging.info("Analysis completed.")
        if self.fastaWriter:
            self.fastaWriter.close()
        if self.fastqWriter:
            self.fastqWriter.close()
        if self.gffWriter:
            self.gffWriter.close()
        if self.vcfWriter:
            self.vcfWriter.close()
        logging.info("Output files completed.")

    def _recordNewResults(self, window, css, variants):
        refId, refStart, refEnd = window
        self.consensusChunksByRefId[refId].append(css)
        self.variantsByRefId[refId] += variants
        self.referenceBasesProcessedById[refId] += (refEnd - refStart)

    def _flushContigIfCompleted(self, window):
        refId, _, _ = window
        refEntry = reference.byName[refId]
        refName = refEntry.fullName
        basesProcessed = self.referenceBasesProcessedById[refId]
        requiredBases = reference.numReferenceBases(refId,
                                                    options.referenceWindows)
        if basesProcessed == requiredBases:
            # This contig is done, so we can dump to file and delete
            # the data structures.
            if self.gffWriter or self.vcfWriter:
                variants = sorted(self.variantsByRefId[refId])
                if self.gffWriter:
                    self.gffWriter.writeVariants(variants)
                if self.vcfWriter:
                    self.vcfWriter.writeVariants(variants)
            del self.variantsByRefId[refId]

            #
            # If the user asked to analyze a window or a set of
            # windows, we output a FAST[AQ] contig per analyzed
            # window. Otherwise we output a fasta contig per
            # reference contig.
            #
            # We try to be intelligent about naming the output
            # contigs, to include window information where applicable.
            #
            for span in reference.enumerateSpans(refId,
                                                 options.referenceWindows):
                _, s, e = span
                if (s == 0) and (e == refEntry.length):
                    spanName = refName
                else:
                    spanName = refName + "_%d_%d" % (s, e)

                cssName = consensus.consensusContigName(spanName,
                                                        self._algorithmName)
                # Gather just the chunks pertaining to this span
                chunksThisSpan = [
                    chunk for chunk in self.consensusChunksByRefId[refId]
                    if windows.windowsIntersect(chunk.refWindow, span)
                ]
                css = consensus.join(chunksThisSpan)

                if self.fastaWriter:
                    self.fastaWriter.writeRecord(cssName, css.sequence)
                if self.fastqWriter:
                    self.fastqWriter.writeRecord(cssName, css.sequence,
                                                 css.confidence)

            del self.consensusChunksByRefId[refId]
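# --- Hedged sketch (not from the source): the producer protocol implied by
# _run() above. Each worker puts (window, (css, variants)) tuples on the queue
# and one None sentinel when finished, so the collector exits after receiving
# options.numWorkers sentinels. All names below are illustrative placeholders.
def worker_main(resultsQueue, assignedWindows, computeConsensus):
    for window in assignedWindows:
        css, variants = computeConsensus(window)   # hypothetical compute step
        resultsQueue.put((window, (css, variants)))
    resultsQueue.put(None)   # sentinel: this worker has finished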
def pick_rep(isoform_filename, gff_filename, group_filename, output_filename,
             pick_least_err_instead=False, bad_gff_filename=None):
    """
    For each group of collapsed sam records, select the representative record.

    If the input is a FASTA file, always pick the longest sequence.
    If the input is a FASTQ file:
        If pick_least_err_instead is True, pick the one with the fewest
        expected base errors.
        Else, pick the longest one.
    """
    fd = None
    is_fq = False
    dummy_prefix, _suffix = parse_ds_filename(isoform_filename)
    if _suffix == "fasta":
        fd = FastaRandomReader(isoform_filename)
    elif _suffix == "fastq":
        fd = FastqRandomReader(isoform_filename)
        is_fq = True
    elif _suffix == "contigset.xml":
        fd = ContigSet(isoform_filename)
        _fns = fd.toExternalFiles()
        if len(_fns) == 1 and (_fns[0].endswith(".fq") or
                               _fns[0].endswith(".fastq")):
            fd = FastqRandomReader(_fns[0])
            is_fq = True
        else:
            # Must be an indexed FASTA, or contain exactly one FASTQ file
            if not fd.isIndexed:
                raise IOError("%s must contain either indexed FASTA files "
                              "or exactly one FASTQ file!" % isoform_filename)
    else:
        raise IOError("Unable to recognize file type of %s." % isoform_filename)

    fa_out_fn, fq_out_fn, ds_out_fn = None, None, None

    _prefix, _suffix = parse_ds_filename(output_filename)
    if _suffix == "fasta":
        fa_out_fn = output_filename
    elif _suffix == "fastq":
        if not is_fq:
            raise ValueError("Input file %s is not FASTQ while output is." %
                             isoform_filename)
        else:
            fq_out_fn = output_filename
    elif _suffix == "contigset.xml":  # output is contigset.xml
        ds_out_fn = output_filename
        fa_out_fn = _prefix + ".fasta"
        if is_fq:
            fq_out_fn = _prefix + ".fastq"
    else:
        raise IOError("Unable to recognize file type of %s." % output_filename)

    fa_writer = FastaWriter(fa_out_fn) if fa_out_fn is not None else None
    fq_writer = FastqWriter(fq_out_fn) if fq_out_fn is not None else None

    coords = {}
    for r in CollapseGffReader(gff_filename):
        tid = r.transcript_id
        coords[tid] = "{0}:{1}-{2}({3})".format(r.seqid, r.start, r.end, r.strand)

    if bad_gff_filename is not None:
        # Also index records from the bad GFF so every group can be resolved.
        for r in CollapseGffReader(bad_gff_filename):
            tid = r.transcript_id
            coords[tid] = "{0}:{1}-{2}({3})".format(r.seqid, r.start, r.end,
                                                    r.strand)

    for group in GroupReader(group_filename):
        pb_id, members = group.name, group.members
        if pb_id not in coords:
            raise ValueError("Could not find %s in %s and %s" %
                             (pb_id, gff_filename, bad_gff_filename))
        #logging.info("Picking representative sequence for %s", pb_id)
        best_id = None
        best_seq = None
        best_qual = None
        best_err = 9999999
        err = 9999999
        max_len = 0
        for x in members:
            if is_fq and pick_least_err_instead:
                # Expected number of base errors: a Phred QV of q implies a
                # per-base error probability of 10**(-q/10).
                err = sum(10 ** -(q / 10.) for q in fd[x].quality)
            if (is_fq and pick_least_err_instead and err < best_err) or \
               ((not is_fq or not pick_least_err_instead) and
                    len(fd[x].sequence) >= max_len):
                best_id = x
                best_seq = fd[x].sequence
                if is_fq:
                    best_qual = fd[x].quality
                    best_err = err
                max_len = len(fd[x].sequence)

        _id_ = "{0}|{1}|{2}".format(pb_id, coords[pb_id], best_id)
        _seq_ = best_seq
        if fq_writer is not None:
            fq_writer.writeRecord(_id_, _seq_, best_qual)
        if fa_writer is not None:
            fa_writer.writeRecord(_id_, _seq_)

    if fa_writer is not None:
        fa_writer.close()
    if fq_writer is not None:
        fq_writer.close()
    if ds_out_fn is not None:
        as_contigset(fa_out_fn, ds_out_fn)
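# --- Worked example (not from the source) of the expected-error score used in
# pick_rep: a Phred QV of q implies a per-base error probability of
# 10**(-q/10), so summing over a read's QVs gives its expected base errors.
quality = [30, 30, 20, 10]
expected_errors = sum(10 ** -(q / 10.) for q in quality)
# 0.001 + 0.001 + 0.01 + 0.1 == 0.112 expected errors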