def write_good_collapsed_isoforms(in_abundance_filename, in_gff_filename,
                                  in_rep_filename, out_abundance_filename,
                                  out_gff_filename, out_rep_filename, good):
    """Write good collapsed isoforms."""
    in_suffix = parse_ds_filename(in_rep_filename)[1]
    out_suffix = parse_ds_filename(out_rep_filename)[1]
    if in_suffix != out_suffix:
        raise ValueError("Format of input %s and output %s must match." %
                         (in_rep_filename, out_rep_filename))
    if in_suffix not in ("fasta", "fastq"):
        raise ValueError("Format of input %s and output %s must be either FASTA or FASTQ." %
                         (in_rep_filename, out_rep_filename))

    # then read gff, and write good gff record.
    with CollapseGffWriter(out_gff_filename) as gff_writer:
        for r in CollapseGffReader(in_gff_filename):
            if r.seqid in good:
                gff_writer.writeRecord(r)

    # next read rep fasta/fastq, and write good rep fasta/fastq record.
    rep_reader = FastaReader(in_rep_filename) if in_suffix == "fasta" \
                 else FastqReader(in_rep_filename)
    rep_writer = FastaWriter(out_rep_filename) if in_suffix == "fasta" \
                 else FastqWriter(out_rep_filename)
    for r in rep_reader:
        # r.name e.g., PB.1.1|PB.1.1:10712-11643(+)|i0_HQ_sample18ba5d|c1543/f8p1/465
        if r.name.split('|')[0] in good:
            rep_writer.writeRecord(r)

    # finally write abundance info of good records.
    with AbundanceReader(in_abundance_filename) as a_reader, \
         AbundanceWriter(out_abundance_filename, comments=a_reader.comments) as a_writer:
        for r in a_reader:
            if r.pbid in good:
                a_writer.writeRecord(r)
class FastaEmitter(object):
    def __init__(self, filename):
        self.writer = FastaWriter(filename)

    def emit(self, zmwRead):
        self.writer.writeRecord(zmwRead.readName,
                                zmwRead.basecalls())
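# A minimal, hedged usage sketch for FastaEmitter: the movie path and output
# file name below are hypothetical placeholders, and BasH5Reader is assumed to
# be available from pbcore.io, following the same access pattern used in the
# barcode-filtering snippet later in this file.
if __name__ == '__main__':
    emitter = FastaEmitter("subreads.fasta")
    bas = BasH5Reader("movie.bas.h5")
    for holeNum in bas.sequencingZmws:
        for subread in bas[holeNum].subreads:
            # each subread provides readName and basecalls(), which is all emit() needs
            emitter.emit(subread)
    emitter.writer.close()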
def combine_polished_isoforms(split_indices, split_hq_fns, split_lq_fns,
                              combined_hq_fa, combined_hq_fq,
                              combined_lq_fa, combined_lq_fq,
                              hq_lq_prefix_dict_pickle, sample_name):
    """Combine split hq (lq) files and save to combined_dir.
    Dumping hq|lq prefix dictionary to pickle.
    Return an instance of CombinedFiles.

    Parameters:
      split_indices -- indices of splitted cluster bins.
      split_hq_fns -- hq files, #['*/all_quivered_hq.100_30_0.99.fastq', ...]
      split_lq_fns -- lq files, #['all_quivered_lq.fastq', ...]
    """
    assert len(split_indices) == len(split_hq_fns)
    assert len(split_indices) == len(split_lq_fns)
    assert all([f.endswith(".fastq") for f in split_hq_fns + split_lq_fns])

    hq_pre_dict, lq_pre_dict = {}, {}

    hq_fa_writer = FastaWriter(combined_hq_fa)
    hq_fq_writer = FastqWriter(combined_hq_fq)
    lq_fa_writer = FastaWriter(combined_lq_fa)
    lq_fq_writer = FastqWriter(combined_lq_fq)

    for i, split_hq, split_lq in zip(split_indices, split_hq_fns, split_lq_fns):
        logging.debug("Adding prefix i%s_| to %s, %s", str(i), split_hq, split_lq)
        hq_prefix = combined_prefix(cluster_bin_index=i, isoform_type="HQ",
                                    sample_name=sample_name)
        lq_prefix = combined_prefix(cluster_bin_index=i, isoform_type="LQ",
                                    sample_name=sample_name)

        hq_pre_dict[hq_prefix] = op.dirname(op.abspath(split_hq))
        lq_pre_dict[lq_prefix] = op.dirname(op.abspath(split_lq))

        with FastqReader(split_hq) as reader:
            for read in reader:
                name = combined_cid_hq_name(cluster_bin_index=i,
                                            name=read.name, sample_name=sample_name)
                hq_fa_writer.writeRecord(name, read.sequence[:])
                hq_fq_writer.writeRecord(name, read.sequence[:], read.quality)

        with FastqReader(split_lq) as reader:
            for read in reader:
                name = combined_cid_lq_name(cluster_bin_index=i,
                                            name=read.name, sample_name=sample_name)
                lq_fa_writer.writeRecord(name, read.sequence[:])
                lq_fq_writer.writeRecord(name, read.sequence[:], read.quality)

    hq_fa_writer.close()
    hq_fq_writer.close()
    lq_fa_writer.close()
    lq_fq_writer.close()

    logging.info("HQ polished output combined to:%s", combined_hq_fq)
    logging.info("LQ polished output combined to:%s", combined_lq_fq)

    logging.info("Dumping hq|lq prefix dictionary to:%s", hq_lq_prefix_dict_pickle)
    with open(hq_lq_prefix_dict_pickle, 'wb') as writer:
        cPickle.dump({'HQ': hq_pre_dict, 'LQ': lq_pre_dict}, writer)
def test_contigset_write(self):
    fasta = upstreamData.getLambdaFasta()
    ds = ContigSet(fasta)
    assert isinstance(ds.resourceReaders()[0], IndexedFastaReader)
    outdir = tempfile.mkdtemp(suffix="dataset-unittest")
    outfn = os.path.join(outdir, 'test.fasta')
    w = FastaWriter(outfn)
    for rec in ds:
        w.writeRecord(rec)
    w.close()
    fas = FastaReader(outfn)
    for rec in fas:
        # make sure a __repr__ didn't slip through:
        assert not rec.sequence.startswith('<')
def test_contigset_write(self):
    fasta = upstreamData.getLambdaFasta()
    ds = ContigSet(fasta)
    self.assertTrue(isinstance(ds.resourceReaders()[0], IndexedFastaReader))
    outdir = tempfile.mkdtemp(suffix="dataset-unittest")
    outfn = os.path.join(outdir, 'test.fasta')
    w = FastaWriter(outfn)
    for rec in ds:
        w.writeRecord(rec)
    w.close()
    fas = FastaReader(outfn)
    for rec in fas:
        # make sure a __repr__ didn't slip through:
        self.assertFalse(rec.sequence.startswith('<'))
def split(self):
    """Split `input_fasta` into smaller files each containing
    `reads_per_split` reads. Return splitted fasta."""
    split_index = 0
    self.out_fns = []
    writer = FastaWriter(self._out_fn(split_index))
    self.out_fns.append(self._out_fn(split_index))
    with FastaReader(self.input_fasta) as reader:
        for ridx, r in enumerate(reader):
            if ridx % self.reads_per_split == 0 and ridx != 0:
                split_index += 1
                writer.close()
                writer = FastaWriter(self._out_fn(split_index))
                self.out_fns.append(self._out_fn(split_index))
            writer.writeRecord(r.name, r.sequence)
    writer.close()
    return list(self.out_fns)
def save(self, dir):
    """
    Save this ArrowEvidence to a directory.  The directory will be
    *created* by this method.

    Format of evidence dump:
    evidence_dump/
      ref000001/
        0-1005/
          consensus.fa
          arrow-scores.h5
        995-2005/
        ...
    """
    logging.info("Dumping evidence to %s" % (dir, ))
    join = os.path.join
    if os.path.exists(dir):
        raise Exception(
            "Evidence dump does not expect directory %s to exist." % dir)
    os.makedirs(dir)
    #refFasta       = FastaWriter(join(dir, "reference.fa"))
    #readsFasta     = FastaWriter(join(dir, "reads.fa"))
    consensusFasta = FastaWriter(join(dir, "consensus.fa"))
    windowName = self.refName + (":%d-%d" % (self.refStart, self.refEnd))
    #refFasta.writeRecord(windowName, self.refSequence)
    #refFasta.close()
    consensusFasta.writeRecord(windowName + "|arrow", self.consensus)
    consensusFasta.close()

    import h5py
    arrowScoreFile = h5py.File(join(dir, "arrow-scores.h5"))
    arrowScoreFile.create_dataset("Scores", data=self.scores)
    vlen_str = h5py.special_dtype(vlen=str)
    arrowScoreFile.create_dataset("RowNames", data=self.rowNames, dtype=vlen_str)
    arrowScoreFile.create_dataset("ColumnNames", data=self.colNames, dtype=vlen_str)
    arrowScoreFile.create_dataset("BaselineScores", data=self.baselineScores)
    arrowScoreFile.close()
def main():
    id2seq = {}
    parser = argparse.ArgumentParser()
    parser.add_argument("-b", "--breakpoint", help="file containing breakpoints")
    parser.add_argument("-a", "--assembly", help="fasta file containing contigs")
    parser.add_argument("-o", "--outfile", help="new assembly file")
    parser.add_argument("-l", "--lenfile", help="length of contigs")
    args = parser.parse_args()

    lenfile = open(args.lenfile, 'w')
    lenmap = {}
    f = FastaReader(args.assembly)
    for record in f:
        id = record.id
        id2seq[id] = record.sequence[0:-10]

    new_seq = {}
    f = open(args.breakpoint, 'r')
    lines = f.readlines()
    for line in lines:
        attrs = line.split()
        if len(attrs) == 1:
            # a line with a single field names the contig to break
            curr_contig = attrs[0]
            seq = id2seq[curr_contig]
        else:
            # a line with two fields gives a start/end breakpoint pair
            start = int(attrs[0])
            end = int(attrs[1])
            new_id = curr_contig + '_' + attrs[0] + '_' + attrs[1]
            new_seq[new_id] = seq[start:end]
            lenmap[new_id] = end - start + 1

    rec_list = []
    # write the broken contigs to the output assembly file
    writer = FastaWriter(args.outfile)
    for key in new_seq:
        writer.writeRecord(key, new_seq[key])
    writer.close()
    for key in lenmap:
        lenfile.write(key + "\t" + str(lenmap[key]) + '\n')
    lenfile.close()
def main(parser):
    args = parser.parse_args()

    # Get outfile name
    if args.outFile is None:
        outfile = 'nobarcode.fasta' if args.fasta else 'nobarcode.fastq'
    else:
        outfile = args.outFile

    # Input files
    barcodeFofn = (l.strip('\n') for l in args.barcode_fofn)
    ccsFofn = (l.strip('\n') for l in args.ccs_fofn)

    # Get the read names that are not barcoded
    no_barcode = set()
    for barcodeFile in barcodeFofn:
        bcH5 = BarcodeH5Reader(barcodeFile)
        for row in bcH5.bestDS:
            if row[3] / row[1] < args.minAvgBarcodeScore:
                no_barcode.add('%s/%d' % (bcH5.movieName, row[0]))

    if args.fasta:
        outh = FastaWriter(outfile)
    else:
        outh = FastqWriter(outfile)

    for ccsFile in ccsFofn:
        ccsH5 = BasH5Reader(ccsFile)
        for ccsRead in ccsH5.ccsReads():
            if ccsRead.zmw.zmwName in no_barcode:
                basecalls = ccsRead.basecalls()
                if len(basecalls) >= args.minMaxInsertLength:
                    if args.fasta:
                        outh.writeRecord(FastaRecord(ccsRead.zmw.zmwName,
                                                     ccsRead.basecalls()))
                    else:
                        outh.writeRecord(FastqRecord(ccsRead.zmw.zmwName,
                                                     ccsRead.basecalls(),
                                                     ccsRead.QualityValue()))
    outh.close()
def combine_consensus_isoforms(split_indices, split_files,
                               combined_consensus_isoforms_fa, sample_name):
    """
    Parameters:
      split_indices -- indices of splitted cluster bins.
      split_files -- consensus isoforms in each splitted cluster bin.
    """
    assert len(split_indices) == len(split_files)
    writer = FastaWriter(combined_consensus_isoforms_fa)
    for i, split_fn in zip(split_indices, split_files):
        logging.debug("Adding prefix i%s to %s.", str(i), split_fn)
        with ContigSetReaderWrapper(split_fn) as reader:
            for read in reader:
                name = combined_cid_ice_name(name=read.name,
                                             cluster_bin_index=i,
                                             sample_name=sample_name)
                writer.writeRecord(name, read.sequence[:])
    writer.close()
    logging.info("Consensus isoforms output combined to:%s",
                 combined_consensus_isoforms_fa)
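# A minimal, hedged usage sketch for combine_consensus_isoforms: the bin
# indices, per-bin consensus FASTA paths, output path, and sample name below
# are hypothetical placeholders, not taken from the original code.
if __name__ == '__main__':
    combine_consensus_isoforms(
        split_indices=[0, 1],
        split_files=["0/consensus_isoforms.fasta", "1/consensus_isoforms.fasta"],
        combined_consensus_isoforms_fa="combined/all.consensus_isoforms.fasta",
        sample_name="sample1")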
def main(parser):
    args = parser.parse_args()

    # Get outfile name
    if args.outFile is None:
        outfile = 'nobarcode.fasta' if args.fasta else 'nobarcode.fastq'
    else:
        outfile = args.outFile

    # Input files
    barcodeFofn = (l.strip('\n') for l in args.barcode_fofn)
    baxFofn = (l.strip('\n') for l in args.bax_fofn)

    # Get the read names that are not barcoded
    no_barcode = defaultdict(set)
    for barcodeFile in barcodeFofn:
        bcH5 = BarcodeH5Reader(barcodeFile)
        for row in bcH5.bestDS:
            if row[3] / row[1] < args.minAvgBarcodeScore:
                no_barcode[bcH5.movieName].add(row[0])

    if args.fasta:
        outh = FastaWriter(outfile)
    else:
        outh = FastqWriter(outfile)

    for baxFile in baxFofn:
        baxH5 = BasH5Reader(baxFile)
        for holeNum in baxH5.sequencingZmws:
            if holeNum in no_barcode[baxH5.movieName]:
                zmw = baxH5[holeNum]
                if len(zmw.subreads) and max(len(sr.basecalls())
                                             for sr in zmw.subreads) >= args.minMaxInsertLength:
                    for subread in zmw.subreads:
                        if len(subread.basecalls()) >= args.minSubreadLength:
                            if args.fasta:
                                outh.writeRecord(FastaRecord(subread.readName,
                                                             subread.basecalls()))
                            else:
                                outh.writeRecord(FastqRecord(subread.readName,
                                                             subread.basecalls(),
                                                             subread.QualityValue()))
    outh.close()
def split(self, reads_in_first_split=None):
    """Split `input_fasta` into smaller files each containing
    `reads_per_split` reads. Return splitted fasta."""
    split_index = 0
    self.out_fns = []
    writer = FastaWriter(self._out_fn(split_index))
    self.out_fns.append(self._out_fn(split_index))
    if reads_in_first_split is None:
        reads_in_first_split = self.reads_per_split
    with ContigSetReaderWrapper(self.input_fasta) as reader:
        for ridx, r in enumerate(reader):
            if ((split_index == 0 and ridx == reads_in_first_split) or
                    (split_index > 0 and ridx % self.reads_per_split == 0)) \
                    and ridx != 0:
                split_index += 1
                writer.close()
                writer = FastaWriter(self._out_fn(split_index))
                self.out_fns.append(self._out_fn(split_index))
            writer.writeRecord(r.name, r.sequence[:])
    writer.close()
    return list(self.out_fns)
def save(self, dir):
    """
    Save this ArrowEvidence to a directory.  The directory will be
    *created* by this method.

    Format of evidence dump:
    evidence_dump/
      ref000001/
        0-1005/
          consensus.fa
          arrow-scores.h5
        995-2005/
        ...
    """
    logging.info("Dumping evidence to %s" % (dir,))
    join = os.path.join
    if os.path.exists(dir):
        raise Exception(
            "Evidence dump does not expect directory %s to exist." % dir)
    os.makedirs(dir)
    #refFasta       = FastaWriter(join(dir, "reference.fa"))
    #readsFasta     = FastaWriter(join(dir, "reads.fa"))
    consensusFasta = FastaWriter(join(dir, "consensus.fa"))
    windowName = self.refName + (":%d-%d" % (self.refStart, self.refEnd))
    #refFasta.writeRecord(windowName, self.refSequence)
    #refFasta.close()
    consensusFasta.writeRecord(windowName + "|arrow", self.consensus)
    consensusFasta.close()
    arrowScoreFile = h5py.File(join(dir, "arrow-scores.h5"))
    arrowScoreFile.create_dataset("Scores", data=self.scores)
    vlen_str = h5py.special_dtype(vlen=str)
    arrowScoreFile.create_dataset("RowNames", data=self.rowNames, dtype=vlen_str)
    arrowScoreFile.create_dataset("ColumnNames", data=self.colNames, dtype=vlen_str)
    arrowScoreFile.create_dataset("BaselineScores", data=self.baselineScores)
    arrowScoreFile.close()
def dumpEvidence(evidenceDumpBaseDirectory,
                 refWindow, refSequence, alns,
                 quiverConsensus):
    """This will import h5py at runtime.
    """
    # Format of evidence dump:
    # evidence_dump/
    #   ref000001/
    #     0-1005/
    #       reference.fa
    #       reads.fa
    #       consensus.fa
    #       quiver-scores.h5
    #     995-2005/
    #     ...
    join = os.path.join
    refId, refStart, refEnd = refWindow
    refName = reference.idToName(refId)
    windowDirectory = join(evidenceDumpBaseDirectory,
                           refName,
                           "%d-%d" % (refStart, refEnd))
    logging.info("Dumping evidence to %s" % (windowDirectory, ))

    if os.path.exists(windowDirectory):
        raise Exception(
            "Evidence dump does not expect directory %s to exist." % windowDirectory)
    os.makedirs(windowDirectory)
    refFasta       = FastaWriter(join(windowDirectory, "reference.fa"))
    readsFasta     = FastaWriter(join(windowDirectory, "reads.fa"))
    consensusFasta = FastaWriter(join(windowDirectory, "consensus.fa"))
    windowName = refName + (":%d-%d" % (refStart, refEnd))
    refFasta.writeRecord(windowName, refSequence)
    refFasta.close()

    consensusFasta.writeRecord(windowName + "|quiver",
                               quiverConsensus.sequence)
    consensusFasta.close()

    rowNames, columnNames, baselineScores, scores = scoreMatrix(quiverConsensus.mms)
    import h5py
    quiverScoreFile = h5py.File(join(windowDirectory, "quiver-scores.h5"))
    quiverScoreFile.create_dataset("Scores", data=scores)
    vlen_str = h5py.special_dtype(vlen=str)
    quiverScoreFile.create_dataset("RowNames", data=rowNames, dtype=vlen_str)
    quiverScoreFile.create_dataset("ColumnNames", data=columnNames, dtype=vlen_str)
    quiverScoreFile.create_dataset("BaselineScores", data=baselineScores)
    quiverScoreFile.close()

    for aln in alns:
        readsFasta.writeRecord(str(aln.rowNumber),
                               aln.read(orientation="genomic", aligned=False))
    readsFasta.close()
def dumpEvidence(evidenceDumpBaseDirectory,
                 refWindow, refSequence, alns,
                 quiverConsensus):
    # Format of evidence dump:
    # evidence_dump/
    #   ref000001/
    #     0-1005/
    #       reference.fa
    #       reads.fa
    #       consensus.fa
    #       quiver-scores.h5
    #     995-2005/
    #     ...
    join = os.path.join
    refId, refStart, refEnd = refWindow
    refName = reference.idToName(refId)
    windowDirectory = join(evidenceDumpBaseDirectory,
                           refName,
                           "%d-%d" % (refStart, refEnd))
    logging.info("Dumping evidence to %s" % (windowDirectory,))

    if os.path.exists(windowDirectory):
        raise Exception(
            "Evidence dump does not expect directory %s to exist." % windowDirectory)
    os.makedirs(windowDirectory)
    refFasta       = FastaWriter(join(windowDirectory, "reference.fa"))
    readsFasta     = FastaWriter(join(windowDirectory, "reads.fa"))
    consensusFasta = FastaWriter(join(windowDirectory, "consensus.fa"))
    windowName = refName + (":%d-%d" % (refStart, refEnd))
    refFasta.writeRecord(windowName, refSequence)
    refFasta.close()

    consensusFasta.writeRecord(windowName + "|quiver",
                               quiverConsensus.sequence)
    consensusFasta.close()

    rowNames, columnNames, baselineScores, scores = scoreMatrix(quiverConsensus.mms)
    quiverScoreFile = h5py.File(join(windowDirectory, "quiver-scores.h5"))
    quiverScoreFile.create_dataset("Scores", data=scores)
    vlen_str = h5py.special_dtype(vlen=str)
    quiverScoreFile.create_dataset("RowNames", data=rowNames, dtype=vlen_str)
    quiverScoreFile.create_dataset("ColumnNames", data=columnNames, dtype=vlen_str)
    quiverScoreFile.create_dataset("BaselineScores", data=baselineScores)
    quiverScoreFile.close()

    for aln in alns:
        readsFasta.writeRecord(str(aln.rowNumber),
                               aln.read(orientation="genomic", aligned=False))
    readsFasta.close()
def split_imgt_drb( input_file ):
    drb3 = FastaWriter("DRB3_nuc.fasta")
    drb4 = FastaWriter("DRB4_nuc.fasta")
    drb5 = FastaWriter("DRB5_nuc.fasta")
    for record in FastaReader( input_file ):
        # Check that this is an IMGT-formatted FASTA record
        assert record.header.startswith('HLA:')
        # Split the locus name out of the header
        locus = record.header.strip().split('_')[1]
        # Write the record to the appropriate file
        if locus.startswith('DRB3'):
            drb3.writeRecord( record )
        elif locus.startswith('DRB4'):
            drb4.writeRecord( record )
        elif locus.startswith('DRB5'):
            drb5.writeRecord( record )
        else:
            raise Exception("Locus {0} is not DRB3, 4, or 5!".format(locus))
    # Close the writers so any buffered records are flushed to disk
    drb3.close()
    drb4.close()
    drb5.close()
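# A minimal, hedged usage sketch for split_imgt_drb: the input file name is a
# hypothetical placeholder; the function writes DRB3_nuc.fasta, DRB4_nuc.fasta,
# and DRB5_nuc.fasta into the current working directory.
if __name__ == '__main__':
    split_imgt_drb("DRB_nuc.fasta")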
def ice_fq2fa(in_fq, out_fa):
    handle = FastaWriter(out_fa)
    for r in FastqReader(in_fq):
        handle.writeRecord(r.name, r.sequence)
    # close so buffered records are flushed to disk
    handle.close()
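# A minimal, hedged usage sketch for ice_fq2fa: the file names are hypothetical
# placeholders. The conversion simply drops the FASTQ quality strings.
if __name__ == '__main__':
    ice_fq2fa(in_fq="polished_hq.fastq", out_fa="polished_hq.fasta")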
def test_writeFasta2(self):
    f = StringIO()
    w = FastaWriter(f)
    for record in FastaReader(self.fasta1):
        w.writeRecord(record.header, record.sequence)
    assert_equal(self.fasta1.getvalue(), f.getvalue())
        if len(insert) < 500:
            continue
        smrtBellReads = simSmrtBellReads(ccRng, seqParams, insert, readLength)
        for subreadId, subread in enumerate(smrtBellReads):
            yield moleculeId, subreadId, (strand + subreadId) % 2, \
                chromosomeId, tStart, tEnd, subread


if __name__ == '__main__':
    fw = FastaWriter("/tmp/reads.fa")
    csvOut = open("/tmp/reads.csv", "w")
    csvOut.write("MoleculeId,SubReadId,Chromosome,tStart,tEnd,Strand\n")
    # "~" is not expanded by open(), so expand it explicitly
    import os.path
    genome = [r.sequence for r in FastaReader(os.path.expanduser("~/Data/lambdaNEB.fa"))]
    #genome = [r.sequence for r in FastaReader("~/Data/Diploid/diploidLambda.fa")]
    seqParams = cc.SequencingParameters.C2()
    for readId, item in enumerate(simExperiment(42, seqParams, 4000, 1000, genome, 50000, False)):
        moleculeId, subreadId, strand, chromosomeId, tStart, tEnd, subread = item
        rStart = subreadId
        rEnd = subreadId
        readName = "mSimulator/%d" % moleculeId
        fw.writeRecord(readName, subread)
        csvOut.write("%d,%d,%d,%d,%d,%d" %
                     (moleculeId, subreadId, chromosomeId, tStart, tEnd, strand))
        csvOut.write("\n")
def test_writeFasta1(self):
    f = StringIO()
    w = FastaWriter(f)
    for record in FastaReader(self.fasta1):
        w.writeRecord(record)
    assert self.fasta1.getvalue() == f.getvalue()
class ResultCollector(object): """ Gathers results and writes to a file. """ def __init__(self, resultsQueue, algorithmName, algorithmConfig): self._resultsQueue = resultsQueue self._algorithmName = algorithmName self._algorithmConfig = algorithmConfig def _run(self): self.onStart() sentinelsReceived = 0 while sentinelsReceived < options.numWorkers: result = self._resultsQueue.get() if result is None: sentinelsReceived += 1 else: self.onResult(result) self.onFinish() def run(self): if options.doProfiling: cProfile.runctx("self._run()", globals=globals(), locals=locals(), filename=os.path.join( options.temporaryDirectory, "profile-%s.out" % (self.name))) else: self._run() # ================================== # Overridable interface begins here. # def onStart(self): self.referenceBasesProcessedById = OrderedDict() for refId in reference.byName: self.referenceBasesProcessedById[refId] = 0 self.variantsByRefId = defaultdict(list) self.consensusChunksByRefId = defaultdict(list) # open file writers self.fastaWriter = None self.fastqWriter = None self.gffWriter = None self.vcfWriter = None if options.fastaOutputFilename: self.fastaWriter = FastaWriter(options.fastaOutputFilename) if options.fastqOutputFilename: self.fastqWriter = FastqWriter(options.fastqOutputFilename) if options.gffOutputFilename: self.gffWriter = VariantsGffWriter(options.gffOutputFilename, vars(options), reference.byName.values()) if options.vcfOutputFilename: self.vcfWriter = VariantsVcfWriter(options.vcfOutputFilename, vars(options), reference.byName.values()) def onResult(self, result): window, cssAndVariants = result css, variants = cssAndVariants self._recordNewResults(window, css, variants) self._flushContigIfCompleted(window) def onFinish(self): logging.info("Analysis completed.") if self.fastaWriter: self.fastaWriter.close() if self.fastqWriter: self.fastqWriter.close() if self.gffWriter: self.gffWriter.close() if self.vcfWriter: self.vcfWriter.close() logging.info("Output files completed.") def _recordNewResults(self, window, css, variants): refId, refStart, refEnd = window self.consensusChunksByRefId[refId].append(css) self.variantsByRefId[refId] += variants self.referenceBasesProcessedById[refId] += (refEnd - refStart) def _flushContigIfCompleted(self, window): refId, _, _ = window refEntry = reference.byName[refId] refName = refEntry.fullName basesProcessed = self.referenceBasesProcessedById[refId] requiredBases = reference.numReferenceBases(refId, options.referenceWindows) if basesProcessed == requiredBases: # This contig is done, so we can dump to file and delete # the data structures. if self.gffWriter or self.vcfWriter: variants = sorted(self.variantsByRefId[refId]) if self.gffWriter: self.gffWriter.writeVariants(variants) if self.vcfWriter: self.vcfWriter.writeVariants(variants) del self.variantsByRefId[refId] # # If the user asked to analyze a window or a set of # windows, we output a FAST[AQ] contig per analyzed # window. Otherwise we output a fasta contig per # reference contig. # # We try to be intelligent about naming the output # contigs, to include window information where applicable. 
# for span in reference.enumerateSpans(refId, options.referenceWindows): _, s, e = span if (s == 0) and (e == refEntry.length): spanName = refName else: spanName = refName + "_%d_%d" % (s, e) cssName = consensus.consensusContigName( spanName, self._algorithmName) # Gather just the chunks pertaining to this span chunksThisSpan = [ chunk for chunk in self.consensusChunksByRefId[refId] if windows.windowsIntersect(chunk.refWindow, span) ] css = consensus.join(chunksThisSpan) if self.fastaWriter: self.fastaWriter.writeRecord(cssName, css.sequence) if self.fastqWriter: self.fastqWriter.writeRecord(cssName, css.sequence, css.confidence) del self.consensusChunksByRefId[refId]
def main(): parser = argparse.ArgumentParser() parser.add_argument("-a", "--cleaned", help="cleaned assembly") parser.add_argument("-f", "--scaffold", help="final scaffold file") parser.add_argument("-l", "--links", help="links sorted by score") parser.add_argument("-n", "--length", help="contig length") args = parser.parse_args() f = FastaReader(args.cleaned) for record in f: id = record.id print id id2seq[id] = record.sequence[0:-10] def break_cycle(nodes): nodeset = set() for node in nodes: nodeset.add(node.split(":")[0]) nodeset = list(nodeset) weight = "" chosen_edge = "" if len(nodeset) == 2: u = nodeset[0] v = nodeset[1] if u in nodes_to_edges: edges = nodes_to_edges[u] sorted_edges = sorted(edges, key=operator.itemgetter(2), reverse=True) #print sorted_edges for each in sorted_edges: if each[1].split(":")[0] == v: weight = each[2] chosen_edge = each if v in nodes_to_edges: edges = nodes_to_edges[v] sorted_edges = sorted(edges, key=operator.itemgetter(2), reverse=True) for each in sorted_edges: if each[1].split(":")[0] == u: if each[2] > weight: weight = each[2] chosen_edge = each if each[2] < weight: break if chosen_edge != "": start = chosen_edge[0] end = chosen_edge[1] path = [] if start.split(":")[1] == 'B': path.append(start.split(":")[0] + ":E") #path.append(start.split(":")[0]+":B") else: path.append(start.split(":")[0] + ":B") #path.append(start.split(":")[0]+":E") path.append(start) path.append(end) if end.split(":")[1] == 'B': path.append(end.split(":")[0] + ":E") #path.append(end.split(":")[0]+":B") else: path.append(end.split(":")[0] + ":B") #path.append(end.split(":")[0]+":E") return path with open(args.length, 'r') as f: lines = f.readlines() for line in lines: attrs = line.split() contig_length[attrs[0]] = long(attrs[-1]) contigs = set() with open(args.links, 'r') as f: for row in f: row = row.strip().split() v1, v2 = row[0:2] score = float(row[-1]) count = float(row[3]) c1 = v1.split(":")[0] c2 = v2.split(":")[0] contigs.add(c1) contigs.add(c2) if c1 not in nodes_to_edges: nodes_to_edges[c1] = [] if c2 not in nodes_to_edges: nodes_to_edges[c2] = [] nodes_to_edges[c1].append((v1, v2, float(row[3]))) key = c1 + '$' + c2 if count >= 60: H.add_edge(c1, c2, weight=int(row[-1])) oriented.add_edge(v1, v2, weight=int(row[-1])) #H.add_edge(c1,c2,weight=int(row[-1])) #oriented.add_edge(v1,v2,weight=int(row[-1]),count=float(row[3])) #print key if key not in edgemap: edgemap[key] = int(row[-1]) else: edgemap[key] += int(row[-1]) key = c2 + '$' + c1 if key not in edgemap: edgemap[key] = int(row[-1]) else: edgemap[key] += int(row[-1]) if v1 not in existing_nodes and v2 not in existing_nodes: if count < 150: continue G.add_edge(v1, v2, score=score, t="x") existing_nodes.add(v1) existing_nodes.add(v2) for ctg in list(contigs): G.add_edge(ctg + ":B", ctg + ":E", t="c", score=0) g_idx = 1 recs = [] to_merge = set() backbone_paths = {} path_id = 1 assigned = {} # for u,v,data in G.edges(data=True): # if data['score'] == 0: # G[u][v]['score'] = 1000000 # continue # G[u][v]['score'] = 1.0/data['score'] for subg in nx.connected_component_subgraphs(G): p0 = [] for v in subg.nodes(): if subg.degree(v) == 1: p0.append(v) if len(p0) != 2: path = break_cycle(subg.nodes()) if path != None: #print path if len(path) == 2: assigned[path[0].split(':')[0]] = False to_merge.add(path[0].split(':')[0]) continue backbone_paths[path_id] = path path_id += 1 else: #print 'here' #print subg.nodes() for each in subg.nodes(): to_merge.add(each.split(':')[0]) assigned[each.split(':')[0]] = False continue else: 
path = nx.shortest_path(subg, p0[0], p0[1]) if len(path) == 2: to_merge.add(path[0].split(':')[0]) continue backbone_paths[path_id] = path #print path path_id += 1 curr_contig = "" g_idx += 1 #now for each separate contig, find a maximum likely backbone path assignment = {} for each in to_merge: max_sum = -1 max_path = -1 for key in backbone_paths: path = backbone_paths[key] cur_sum = 0 cnt = 0 for node in path: if H.has_edge(each, node.split(':')[0]): cur_sum += H[each][node.split(':')[0]]['weight'] cnt += 1 if cnt != 0 and cur_sum > max_sum: max_sum = cur_sum max_path = key if max_sum != -1: assignment[each] = (max_path, max_sum, contig_length[each]) #now that we have found the path, try putting contig at best position in the path count = len(assignment) path_to_contig = {} for each in assignment: key = assignment[each][0] if key not in path_to_contig: path_to_contig[key] = [] path_to_contig[key].append( (each, assignment[each][1], assignment[each][2])) for each in path_to_contig: contigs = path_to_contig[each] contigs_sorted = sorted(contigs, key=operator.itemgetter(1), reverse=True) path_to_contig[each] = contigs_sorted #print contigs_sorted ofile = open('ambigous_contigs', 'w') for path_id in path_to_contig: path = backbone_paths[path_id] temp_path = list(path) contigs = path_to_contig[path_id] contigs = [str(i[0]) for i in contigs] explored = {} cnt = len(contigs) #print 'contig_length = ' + str(cnt) prev_len = -1 curr_len = 0 while True: final_max = -1 final_pos = -1 final_orient = '' final_contig = '' final_begin = '' final_end = '' #print len(explored) if len(explored) == len(contigs) or prev_len == len(explored): break prev_len = len(explored) for contig in contigs: if contig not in explored: begin = contig + ":B" end = contig + ":E" total_max = -1 orientation = '' pos = -1 #check for positions in the middle of the path for i in range(1, len(path) - 1, 2): score_fow = -1 score_rev = -1 if oriented.has_edge(path[i], begin) and oriented.has_edge( end, path[i + 1]): score_fow = oriented[ path[i]][begin]['weight'] + oriented[end][path[ i + 1]]['weight'] if oriented.has_edge(path[i], end) and oriented.has_edge( begin, path[i + 1]): score_rev = oriented[ path[i]][end]['weight'] + oriented[begin][path[ i + 1]]['weight'] if score_fow >= score_rev: if score_fow > total_max: total_max = score_fow orientation = 'fow' pos = i else: if score_rev > total_max: total_max = score_rev orientation = 'rev' pos = i #check for start and end if oriented.has_edge(begin, path[0]): score_fow = oriented[begin][path[0]]['weight'] if score_fow > total_max: total_max = score_fow orientation = 'fow' pos = 0 if oriented.has_edge(end, path[0]): score_rev = oriented[end][path[0]]['weight'] if score_rev > total_max: total_max = score_rev orientation = 'rev' pos = 0 if oriented.has_edge(path[-1], begin): score_fow = oriented[path[-1]][begin]['weight'] if score_fow > total_max: total_max = score_fow orientation = 'fow' pos = len(path) if oriented.has_edge(path[-1], end): score_rev = oriented[path[-1]][end]['weight'] if score_rev > total_max: total_max = score_rev orientation = 'rev' pos = len(path) if total_max > final_max: final_max = total_max final_pos = pos final_orient = orientation final_contig = contig final_begin = begin final_end = end if final_max > 70: #prev_len = len(explored) explored[final_contig] = 1 if final_orient == 'fow': if final_pos == 0: path.insert(0, final_begin) path.insert(0, final_end) else: if final_pos == len(path): path.append(final_begin) path.append(final_end) else: 
path.insert(final_pos + 1, final_begin) path.insert(final_pos + 2, final_end) else: if final_pos == 0: path.insert(0, final_begin) path.insert(0, final_end) else: if final_pos == len(path): path.append(final_end) path.append(final_begin) else: path.insert(final_pos + 1, final_end) path.insert(final_pos + 2, final_begin) else: explored[final_contig] = 1 backbone_paths[path_id] = path # for key in backbone_paths: # if len(backbone_paths[key]) >= 4: # print backbone_paths[key] # for key1 in backbone_paths: # max_weight = 0 # max_path = '' # for key2 in backbone_paths: # if key1 != key2: # path1 = backbone_paths[key1] # path2 = backbone_paths[key2] # weight = 0 # for contig1 in path1: # ctg1 = contig1.split(':')[0] # for contig2 in path2: # ctg2 = contig2.split(':')[0] # if H.has_edge(ctg1,ctg2): # weight += H[ctg1][ctg2]['weight'] # if weight > max_weight: # max_weight = weight # max_path = key2 # if max_path != '' and 1000 < max_weight < 4000: # print backbone_paths[key1], backbone_paths[max_path], max_weight c_id = 1 writer = FastaWriter(args.scaffold) for key in backbone_paths: if len(backbone_paths[key]) >= 4: path = backbone_paths[key] curr_contig = "" print c_id for i in range(0, len(path) - 1, 2): curr = path[i] next = path[i + 1] curr = curr.split(':') next = next.split(':') print curr if curr[1] == 'B' and next[1] == 'E': curr_contig += id2seq[curr[0]] if curr[1] == 'E' and next[1] == 'B': #print id2seq[curr[0]] curr_contig += revcompl(id2seq[curr[0]]) if i != len(path) - 2: for j in range(0, 500): curr_contig += 'N' # rec = SeqRecord(Seq(curr_contig,generic_dna),id='scaffold_'+str(c_id)) # recs.append(rec) print c_id writer.writeRecord('scaffold_' + str(c_id), curr_contig) c_id += 1
class ResultCollector(object): """ Gathers results and writes to a file. """ def __init__(self, resultsQueue, algorithmName, algorithmConfig): self._resultsQueue = resultsQueue self._algorithmName = algorithmName self._algorithmConfig = algorithmConfig def _run(self): self.onStart() sentinelsReceived = 0 while sentinelsReceived < options.numWorkers: result = self._resultsQueue.get() if result is None: sentinelsReceived += 1 else: self.onResult(result) self.onFinish() def run(self): if options.doProfiling: cProfile.runctx("self._run()", globals=globals(), locals=locals(), filename=os.path.join(options.temporaryDirectory, "profile-%s.out" % (self.name))) else: self._run() # ================================== # Overridable interface begins here. # def onStart(self): self.referenceBasesProcessedById = OrderedDict() for refId in reference.byName: self.referenceBasesProcessedById[refId] = 0 self.variantsByRefId = defaultdict(list) self.consensusChunksByRefId = defaultdict(list) # open file writers self.fastaWriter = self.fastqWriter = self.gffWriter = None if options.fastaOutputFilename: self.fastaWriter = FastaWriter(options.fastaOutputFilename) if options.fastqOutputFilename: self.fastqWriter = FastqWriter(options.fastqOutputFilename) if options.gffOutputFilename: self.gffWriter = VariantsGffWriter(options.gffOutputFilename, vars(options), reference.byName.values()) def onResult(self, result): window, cssAndVariants = result css, variants = cssAndVariants self._recordNewResults(window, css, variants) self._flushContigIfCompleted(window) def onFinish(self): logging.info("Analysis completed.") if self.fastaWriter: self.fastaWriter.close() if self.fastqWriter: self.fastqWriter.close() if self.gffWriter: self.gffWriter.close() logging.info("Output files completed.") def _recordNewResults(self, window, css, variants): refId, refStart, refEnd = window self.consensusChunksByRefId[refId].append(css) self.variantsByRefId[refId] += variants self.referenceBasesProcessedById[refId] += (refEnd - refStart) def _flushContigIfCompleted(self, window): refId, _, _ = window refEntry = reference.byName[refId] refName = refEntry.fullName basesProcessed = self.referenceBasesProcessedById[refId] requiredBases = reference.numReferenceBases(refId, options.referenceWindows) if basesProcessed == requiredBases: # This contig is done, so we can dump to file and delete # the data structures. if self.gffWriter: self.gffWriter.writeVariants(sorted(self.variantsByRefId[refId])) del self.variantsByRefId[refId] # # If the user asked to analyze a window or a set of # windows, we output a FAST[AQ] contig per analyzed # window. Otherwise we output a fasta contig per # reference contig. # # We try to be intelligent about naming the output # contigs, to include window information where applicable. # for span in reference.enumerateSpans(refId, options.referenceWindows): _, s, e = span if (s == 0) and (e == refEntry.length): spanName = refName else: spanName = refName + "_%d_%d" % (s, e) cssName = consensus.consensusContigName(spanName, self._algorithmName) # Gather just the chunks pertaining to this span chunksThisSpan = [ chunk for chunk in self.consensusChunksByRefId[refId] if windows.windowsIntersect(chunk.refWindow, span) ] css = consensus.join(chunksThisSpan) if self.fastaWriter: self.fastaWriter.writeRecord(cssName, css.sequence) if self.fastqWriter: self.fastqWriter.writeRecord(cssName, css.sequence, css.confidence) del self.consensusChunksByRefId[refId]
def pick_rep(isoform_filename, gff_filename,
             group_filename, output_filename,
             pick_least_err_instead=False, bad_gff_filename=None):
    """
    For each group of collapsed sam records, select the representative record.

    If is FASTA file -- then always pick the longest one
    If is FASTQ file -- then
        If pick_least_err_instead is True, pick the one w/ least number of expected base errors
        Else, pick the longest one
    """
    fd = None
    is_fq = False
    dummy_prefix, _suffix = parse_ds_filename(isoform_filename)
    if _suffix == "fasta":
        fd = FastaRandomReader(isoform_filename)
    elif _suffix == "fastq":
        fd = FastqRandomReader(isoform_filename)
        is_fq = True
    elif _suffix == "contigset.xml":
        fd = ContigSet(isoform_filename)
        _fns = fd.toExternalFiles()
        if len(_fns) == 1 and _fns[0].endswith(".fq") or _fns[0].endswith(".fastq"):
            fd = FastqRandomReader(_fns[0])
            is_fq = True
        else:
            if not fd.isIndexed:
                # Must be indexed FASTA, or exactly contains one FASTQ file
                raise IOError("%s must contain either indexed FASTA files or " % isoform_filename +
                              "contain exactly one FASTQ file!")
    else:
        raise IOError("Unable to recognize file type of %s." % isoform_filename)

    fa_out_fn, fq_out_fn, ds_out_fn = None, None, None

    _prefix, _suffix = parse_ds_filename(output_filename)
    if _suffix == "fasta":
        fa_out_fn = output_filename
    elif _suffix == "fastq":
        if not is_fq:
            raise ValueError("Input file %s is not FASTQ while output is." % isoform_filename)
        else:
            fq_out_fn = output_filename
    elif _suffix == "contigset.xml":  # output is contigset.xml
        ds_out_fn = output_filename
        fa_out_fn = _prefix + ".fasta"
        if is_fq:
            fq_out_fn = _prefix + ".fastq"
    else:
        raise IOError("Unable to recognize file type of %s." % output_filename)

    fa_writer = FastaWriter(fa_out_fn) if fa_out_fn is not None else None
    fq_writer = FastqWriter(fq_out_fn) if fq_out_fn is not None else None

    coords = {}
    for r in CollapseGffReader(gff_filename):
        tid = r.transcript_id
        coords[tid] = "{0}:{1}-{2}({3})".format(r.seqid, r.start, r.end, r.strand)

    if bad_gff_filename is not None:
        # also collect coordinates for records in the 'bad' GFF
        for r in CollapseGffReader(bad_gff_filename):
            tid = r.transcript_id
            coords[tid] = "{0}:{1}-{2}({3})".format(r.seqid, r.start, r.end, r.strand)

    for group in GroupReader(group_filename):
        pb_id, members = group.name, group.members
        if not pb_id in coords:
            raise ValueError("Could not find %s in %s and %s" %
                             (pb_id, gff_filename, bad_gff_filename))
        #logging.info("Picking representative sequence for %s", pb_id)
        best_id = None
        best_seq = None
        best_qual = None
        best_err = 9999999
        err = 9999999
        max_len = 0

        for x in members:
            if is_fq and pick_least_err_instead:
                # expected number of base errors from Phred qualities: sum of 10^(-Q/10)
                err = sum(10**-(i / 10.) for i in fd[x].quality)
            if (is_fq and pick_least_err_instead and err < best_err) or \
               ((not is_fq or not pick_least_err_instead) and len(fd[x].sequence) >= max_len):
                best_id = x
                best_seq = fd[x].sequence
                if is_fq:
                    best_qual = fd[x].quality
                    best_err = err
                max_len = len(fd[x].sequence)

        _id_ = "{0}|{1}|{2}".format(pb_id, coords[pb_id], best_id)
        _seq_ = best_seq
        if fq_writer is not None:
            fq_writer.writeRecord(_id_, _seq_, best_qual)
        if fa_writer is not None:
            fa_writer.writeRecord(_id_, _seq_)

    if fa_writer is not None:
        fa_writer.close()
    if fq_writer is not None:
        fq_writer.close()
    if ds_out_fn is not None:
        as_contigset(fa_out_fn, ds_out_fn)
def test_writeFasta2(self):
    f = StringIO()
    w = FastaWriter(f)
    for record in FastaReader(self.fasta2):
        w.writeRecord(record.header, record.sequence)
    assert self.fasta2.getvalue() == f.getvalue()
def test_writeFasta2(self):
    f = StringIO()
    w = FastaWriter(f)
    for record in FastaReader(self.fasta1):
        w.writeRecord(record.name, record.sequence)
    assert_equal(self.fasta1.getvalue(), f.getvalue())