def test_contigset_consolidate_int_names(self):
    # build set to merge
    outdir = tempfile.mkdtemp(suffix="dataset-unittest")
    inFas = os.path.join(outdir, 'infile.fasta')
    outFas1 = os.path.join(outdir, 'tempfile1.fasta')
    outFas2 = os.path.join(outdir, 'tempfile2.fasta')

    # copy fasta reference to hide fai and ensure FastaReader is used
    shutil.copyfile(
        ReferenceSet(data.getXml(8)).toExternalFiles()[0],
        inFas)
    rs1 = ContigSet(inFas)

    double = 'B.cereus.1'
    exp_double = rs1.get_contig(double)

    # TODO: modify the names first:
    with FastaWriter(outFas1) as writer:
        writer.writeRecord('5141', exp_double.sequence)
    with FastaWriter(outFas2) as writer:
        writer.writeRecord('5142', exp_double.sequence)

    exp_double_seqs = [exp_double.sequence, exp_double.sequence]
    exp_names = ['5141', '5142']

    obs_file = ContigSet(outFas1, outFas2)
    log.debug(obs_file.toExternalFiles())
    obs_file.consolidate()
    log.debug(obs_file.toExternalFiles())

    # open obs and compare to exp
    for name, seq in zip(exp_names, exp_double_seqs):
        assert obs_file.get_contig(name).sequence[:] == seq
def test_contigset_consolidate(self):
    # build set to merge
    outdir = tempfile.mkdtemp(suffix="dataset-unittest")
    inFas = os.path.join(outdir, 'infile.fasta')
    outFas1 = os.path.join(outdir, 'tempfile1.fasta')
    outFas2 = os.path.join(outdir, 'tempfile2.fasta')

    # copy fasta reference to hide fai and ensure FastaReader is used
    backticks('cp {i} {o}'.format(
        i=ReferenceSet(data.getXml(9)).toExternalFiles()[0],
        o=inFas))
    rs1 = ContigSet(inFas)

    singletons = ['A.baumannii.1', 'A.odontolyticus.1']
    double = 'B.cereus.1'
    reader = rs1.resourceReaders()[0]
    exp_double = rs1.get_contig(double)
    exp_singles = [rs1.get_contig(name) for name in singletons]

    # TODO: modify the names first:
    with FastaWriter(outFas1) as writer:
        writer.writeRecord(exp_singles[0])
        writer.writeRecord(exp_double.name + '_10_20', exp_double.sequence)
    with FastaWriter(outFas2) as writer:
        writer.writeRecord(exp_double.name + '_0_10',
                           exp_double.sequence + 'ATCGATCGATCG')
        writer.writeRecord(exp_singles[1])

    exp_double_seq = ''.join(
        [exp_double.sequence, 'ATCGATCGATCG', exp_double.sequence])
    exp_single_seqs = [rec.sequence for rec in exp_singles]

    acc_file = ContigSet(outFas1, outFas2)
    acc_file.induceIndices()
    log.debug(acc_file.toExternalFiles())
    self.assertEqual(len(acc_file), 4)
    self.assertEqual(len(list(acc_file)), 4)
    acc_file.consolidate()
    log.debug(acc_file.toExternalFiles())

    # open acc and compare to exp
    for name, seq in zip(singletons, exp_single_seqs):
        self.assertEqual(acc_file.get_contig(name).sequence[:], seq)
    self.assertEqual(
        acc_file.get_contig(double).sequence[:], exp_double_seq)

    self.assertEqual(len(acc_file._openReaders), 1)
    self.assertEqual(len(acc_file.index), 3)
    self.assertEqual(len(acc_file._indexMap), 3)
    self.assertEqual(len(acc_file), 3)
    self.assertEqual(len(list(acc_file)), 3)

    # test merge:
    acc1 = ContigSet(outFas1)
    acc2 = ContigSet(outFas2)
    acc3 = acc1 + acc2
def combine_polished_isoforms(split_indices, split_hq_fns, split_lq_fns,
                              combined_hq_fa, combined_hq_fq,
                              combined_lq_fa, combined_lq_fq,
                              hq_lq_prefix_dict_pickle, sample_name):
    """Combine split hq (lq) files and save to combined_dir.
    Dump the hq|lq prefix dictionary to a pickle.
    Return an instance of CombinedFiles.
    Parameters:
      split_indices -- indices of split cluster bins.
      split_hq_fns -- hq files, e.g., ['*/all_quivered_hq.100_30_0.99.fastq', ...]
      split_lq_fns -- lq files, e.g., ['all_quivered_lq.fastq', ...]
    """
    assert len(split_indices) == len(split_hq_fns)
    assert len(split_indices) == len(split_lq_fns)
    assert all([f.endswith(".fastq") for f in split_hq_fns + split_lq_fns])

    hq_pre_dict, lq_pre_dict = {}, {}

    hq_fa_writer = FastaWriter(combined_hq_fa)
    hq_fq_writer = FastqWriter(combined_hq_fq)
    lq_fa_writer = FastaWriter(combined_lq_fa)
    lq_fq_writer = FastqWriter(combined_lq_fq)

    for i, split_hq, split_lq in zip(split_indices, split_hq_fns,
                                     split_lq_fns):
        logging.debug("Adding prefix i%s_| to %s, %s",
                      str(i), split_hq, split_lq)
        hq_prefix = combined_prefix(cluster_bin_index=i, isoform_type="HQ",
                                    sample_name=sample_name)
        lq_prefix = combined_prefix(cluster_bin_index=i, isoform_type="LQ",
                                    sample_name=sample_name)

        hq_pre_dict[hq_prefix] = op.dirname(op.abspath(split_hq))
        lq_pre_dict[lq_prefix] = op.dirname(op.abspath(split_lq))

        with FastqReader(split_hq) as reader:
            for read in reader:
                name = combined_cid_hq_name(cluster_bin_index=i,
                                            name=read.name,
                                            sample_name=sample_name)
                hq_fa_writer.writeRecord(name, read.sequence[:])
                hq_fq_writer.writeRecord(name, read.sequence[:],
                                         read.quality)

        with FastqReader(split_lq) as reader:
            for read in reader:
                name = combined_cid_lq_name(cluster_bin_index=i,
                                            name=read.name,
                                            sample_name=sample_name)
                lq_fa_writer.writeRecord(name, read.sequence[:])
                lq_fq_writer.writeRecord(name, read.sequence[:],
                                         read.quality)

    hq_fa_writer.close()
    hq_fq_writer.close()
    lq_fa_writer.close()
    lq_fq_writer.close()

    logging.info("HQ polished output combined to: %s", combined_hq_fq)
    logging.info("LQ polished output combined to: %s", combined_lq_fq)

    logging.info("Dumping hq|lq prefix dictionary to: %s",
                 hq_lq_prefix_dict_pickle)
    with open(hq_lq_prefix_dict_pickle, 'wb') as writer:
        cPickle.dump({'HQ': hq_pre_dict, 'LQ': lq_pre_dict}, writer)
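# A minimal usage sketch for combine_polished_isoforms, assuming two cluster
# bins whose polished fastq outputs live under hypothetical directories 0/
# and 1/. All paths and the sample name below are illustrative, not part of
# the original code.
combine_polished_isoforms(
    split_indices=[0, 1],
    split_hq_fns=["0/all_quivered_hq.100_30_0.99.fastq",
                  "1/all_quivered_hq.100_30_0.99.fastq"],
    split_lq_fns=["0/all_quivered_lq.fastq", "1/all_quivered_lq.fastq"],
    combined_hq_fa="combined/all.polished_hq.fasta",
    combined_hq_fq="combined/all.polished_hq.fastq",
    combined_lq_fa="combined/all.polished_lq.fasta",
    combined_lq_fq="combined/all.polished_lq.fastq",
    hq_lq_prefix_dict_pickle="combined/hq_lq_prefix_dict.pickle",
    sample_name="mysample")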
def _updateChimeraInfo(self, suspicious_hits, in_read_fn, out_nc_fn,
                       out_c_fn, primer_report_fn,
                       write_report_header=True):
    """
    in_read_fn --- a FASTA of full-length reads or a FASTA of
    non-full-length reads.
    For each full-length read in the in_read_fn FASTA file, detect whether
    it is chimeric or not, and write its annotation to primer_report_fn.
    Return: (num_nc, num_c, num_nc_bases, num_c_bases)
    """
    logging.debug(
        "Update chimera info for reads in {f} ".format(f=in_read_fn))
    logging.debug(
        "Write primer report to {rpt}".format(rpt=primer_report_fn))

    out_nc_fn_fasta, out_c_fn_fasta = out_nc_fn, out_c_fn
    if out_nc_fn.endswith(".xml"):
        out_nc_fn_fasta = out_nc_fn[:-4] + ".fasta"
    if out_c_fn.endswith(".xml"):
        out_c_fn_fasta = out_c_fn[:-4] + ".fasta"

    num_nc, num_c, num_nc_bases, num_c_bases = 0, 0, 0, 0
    with ContigSetReaderWrapper(in_read_fn) as reader, \
            FastaWriter(out_nc_fn_fasta) as writer, \
            FastaWriter(out_c_fn_fasta) as writer_chimera, \
            open(primer_report_fn, 'w') as reporter:
        if write_report_header:
            reporter.write(ReadAnnotation.header(delimiter=",") + "\n")
        for r in reader:
            # e.g., r.name = "movie/zmw/0_100_CCS fiveend=1;threeend=100;"
            readid = r.name.split()[0]
            annotation = ReadAnnotation.fromString(
                r.name, ignore_polyA=self.ignore_polyA)
            if readid not in suspicious_hits:  # non-chimeric reads
                # The primer of a primer-trimmed read cannot be None.
                # assert(annotation.primer is not None)
                annotation.chimera = 0
                num_nc += 1
                num_nc_bases += len(r.sequence)
                writer.writeRecord(annotation.toAnnotation(),
                                   r.sequence[:])
            else:  # chimeric reads
                annotation.chimera = 1
                num_c += 1
                num_c_bases += len(r.sequence)
                writer_chimera.writeRecord(annotation.toAnnotation(),
                                           r.sequence[:])
            reporter.write(annotation.toReportRecord(delimitor=",") + "\n")
    return (num_nc, num_c, num_nc_bases, num_c_bases)
def dumpEvidence(evidenceDumpBaseDirectory,
                 refWindow, refSequence, alns,
                 quiverConsensus):
    """This will import h5py at runtime."""
    # Format of evidence dump:
    # evidence_dump/
    #   ref000001/
    #     0-1005/
    #       reference.fa
    #       reads.fa
    #       consensus.fa
    #       quiver-scores.h5
    #     995-2005/
    #     ...
    join = os.path.join
    refId, refStart, refEnd = refWindow
    refName = reference.idToName(refId)
    windowDirectory = join(evidenceDumpBaseDirectory,
                           refName,
                           "%d-%d" % (refStart, refEnd))
    logging.info("Dumping evidence to %s" % (windowDirectory,))

    if os.path.exists(windowDirectory):
        raise Exception(
            "Evidence dump does not expect directory %s to exist."
            % windowDirectory)
    os.makedirs(windowDirectory)
    refFasta = FastaWriter(join(windowDirectory, "reference.fa"))
    readsFasta = FastaWriter(join(windowDirectory, "reads.fa"))
    consensusFasta = FastaWriter(join(windowDirectory, "consensus.fa"))
    windowName = refName + (":%d-%d" % (refStart, refEnd))

    refFasta.writeRecord(windowName, refSequence)
    refFasta.close()

    consensusFasta.writeRecord(windowName + "|quiver",
                               quiverConsensus.sequence)
    consensusFasta.close()

    rowNames, columnNames, baselineScores, scores = \
        scoreMatrix(quiverConsensus.mms)

    import h5py
    quiverScoreFile = h5py.File(join(windowDirectory, "quiver-scores.h5"))
    quiverScoreFile.create_dataset("Scores", data=scores)
    vlen_str = h5py.special_dtype(vlen=str)
    quiverScoreFile.create_dataset("RowNames",
                                   data=rowNames,
                                   dtype=vlen_str)
    quiverScoreFile.create_dataset("ColumnNames",
                                   data=columnNames,
                                   dtype=vlen_str)
    quiverScoreFile.create_dataset("BaselineScores",
                                   data=baselineScores)
    quiverScoreFile.close()

    for aln in alns:
        readsFasta.writeRecord(str(aln.rowNumber),
                               aln.read(orientation="genomic",
                                        aligned=False))
    readsFasta.close()
def write_temporary_fasta(record_list):
    temp_fasta = tempfile.NamedTemporaryFile(suffix=".fasta", delete=False)
    with FastaWriter(temp_fasta.name) as handle:
        for record in record_list:
            rec = FastaRecord(record.name, record.sequence)
            handle.writeRecord(rec)
    return temp_fasta
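# A minimal usage sketch for write_temporary_fasta; the records here are
# hypothetical. Because the file is created with delete=False, the caller
# is responsible for removing it when done.
records = [FastaRecord("read1", "ACGT"), FastaRecord("read2", "GGCC")]
temp_fasta = write_temporary_fasta(records)
print temp_fasta.name  # path to the temporary FASTA
os.remove(temp_fasta.name)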
def reconstruct_ref_fa_for_clusters_in_bin(self, cids, refs):
    """
    Reconstruct ref_fa of the clusters in the new tmp_dir,
    e.g., self.g_consensus_ref_fa_of_cluster(cid)

    cids --- list[int(cid)], e.g., [10, 11, 12, ..., 20]
    refs --- dict{int(cid): ref_fa of cluster(cid)}
    """
    # Check existence the first time it is read.
    if not nfs_exists(self.final_consensus_fa):
        raise IOError("Final consensus FASTA file {f} does not exist."
                      .format(f=self.final_consensus_fa))

    self.add_log("Reconstructing g consensus files for clusters "
                 "[%d, %d] in %s" % (cids[0], cids[-1], self.tmp_dir),
                 level=logging.INFO)

    final_consensus_d = FastaRandomReader(self.final_consensus_fa)
    for ref_id in final_consensus_d.d.keys():
        # e.g., ref_id = c103/1/3708, cid = 103,
        # refs[cid] = ...tmp/0/c103/g_consensus_ref.fasta
        cid = int(ref_id.split('/')[0].replace('c', ''))
        if cid in cids:
            mkdir(self.cluster_dir(cid))
            ref_fa = op.join(self.cluster_dir(cid), op.basename(refs[cid]))
            refs[cid] = ref_fa
            with FastaWriter(ref_fa) as writer:
                self.add_log("Writing ref_fa %s" % refs[cid])
                writer.writeRecord(ref_id,
                                   final_consensus_d[ref_id].sequence[:])

    self.add_log("Reconstruction of g consensus files completed.",
                 level=logging.INFO)
def Write(self):
    """Clean up the sequences and write out one FASTA per exon"""
    sets = []
    writers = []
    for allele, seq in self._dict.iteritems():
        exons = seq.split("|")
        # Grow the per-exon writer/set lists to match the exon count
        while len(writers) < len(exons):
            fasta = "{0}_exon{1}.fasta".format(self._locus,
                                               len(writers) + 1)
            writers.append(FastaWriter(fasta))
            sets.append(set())
        for i, exon in enumerate(exons):
            # Remove inserts, exon/intron boundaries, and trimmed regions
            exon = re.sub("[.|*]", "", exon)
            if len(exon) == 0:
                continue
            # Skip exon sequences we have already written for this position
            if exon in sets[i]:
                continue
            record = FastaRecord(allele, exon)
            writers[i].writeRecord(record)
            sets[i].add(exon)
def __init__(self, isoseq_output_fn, reference_transcripts_fn,
             output_analysis_fn, min_true_positive, max_false_positive,
             min_seq_similarity, max_fuzzy_junction):
    self.isoseq_output_fn = isoseq_output_fn
    self.reference_transcripts_fn = reference_transcripts_fn
    self.output_analysis_fn = output_analysis_fn

    if isoseq_output_fn.endswith(".fasta") or \
            isoseq_output_fn.endswith(".fa"):
        self.isoforms = [r for r in FastaReader(isoseq_output_fn)]
        self.isoseq_output_fa = self.isoseq_output_fn
    elif isoseq_output_fn.endswith(".fastq") or \
            isoseq_output_fn.endswith(".fq"):
        self.isoforms = [r for r in FastqReader(isoseq_output_fn)]
        self.isoseq_output_fa = self.output_analysis_fn + ".isoseq.fa"
        with FastaWriter(self.isoseq_output_fa) as writer:
            for r in self.isoforms:
                writer.writeRecord(r.name, r.sequence)

    self.reference_transcripts = [
        r for r in FastaReader(reference_transcripts_fn)]

    self.min_true_positive = min_true_positive
    self.max_false_positive = max_false_positive
    self.min_seq_similarity = min_seq_similarity \
        if min_seq_similarity <= 1 else min_seq_similarity / 100.0
    self.max_fuzzy_junction = max_fuzzy_junction

    self.alns = self.filter_alns(
        self.map_isoforms_to_reference_transcripts())
def convert_to_dazz_fasta(self):
    """
    Convert the input fasta/fastq file to a daligner-compatible
    fasta with ids: <prefix>/<index>/0_<seqlen>
    Also write out the name mappings to a pickle.
    """
    log.debug("Converting %s to daligner compatible fasta %s.",
              self.input_filename, self.dazz_filename)
    reader = ContigSetReaderWrapper(self.input_filename)
    with FastaWriter(self.dazz_filename) as f:
        i = 1
        for r in reader:
            f.writeRecord("{p}/{i}/0_{len}".format(p=self.dazz_movie_name,
                                                   i=i,
                                                   len=len(r.sequence)),
                          r.sequence[:])
            self.dazz_mapping[i] = r.name
            i += 1
    reader.close()

    with open(self.pickle_filename, 'w') as f:
        dump(self.dazz_mapping, f)
def run(self):
    """Subset reads based on read annotation and subset rules."""
    infoMsg = "Extracting reads from {f} based on ".format(f=self.inFN)
    infoMsg += "rules(FullLength={fl}, nonChimeric={nc}).".format(
        fl="true" if self.rules.FL != 0 else "false",
        nc="true" if self.rules.nonChimeric != 0 else "false")
    logging.info(infoMsg)

    if not self.printReadLengthOnly:
        with FastaReader(self.inFN) as reader, \
                FastaWriter(self.outFN) as writer:
            for r in reader:
                annotation = ReadAnnotation.fromString(
                    r.name, self.ignore_polyA)
                if self.satisfy(annotation, self.rules):
                    writer.writeRecord(r.name, r.sequence)
    else:
        # Print read lengths only; don't print read names and sequences.
        with FastaReader(self.inFN) as reader, \
                open(self.outFN, 'w') as writer:
            for r in reader:
                annotation = ReadAnnotation.fromString(
                    r.name, self.ignore_polyA)
                if self.satisfy(annotation, self.rules):
                    writer.write("{rl}\n".format(rl=len(r.sequence)))
def run_fasta_filter(fasta_in, fasta_out, min_seq_length):
    with FastaWriter(fasta_out) as w:
        with FastaReader(fasta_in) as r:
            for record in r:
                if len(record.sequence) > min_seq_length:
                    w.writeRecord(record)
    return 0
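# A minimal usage sketch for run_fasta_filter; input.fasta and filtered.fasta
# are hypothetical paths. Note the comparison is strictly greater-than, so
# records of exactly min_seq_length bases are dropped.
run_fasta_filter("input.fasta", "filtered.fasta", min_seq_length=50)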
def split(self):
    """Split `input_fasta` into smaller files, each containing
    `reads_per_split` reads. Return the paths of the split FASTA files."""
    split_index = 0
    self.out_fns = []
    writer = FastaWriter(self._out_fn(split_index))
    self.out_fns.append(self._out_fn(split_index))
    with FastaReader(self.input_fasta) as reader:
        for ridx, r in enumerate(reader):
            # Roll over to a new output file every reads_per_split reads
            if ridx % self.reads_per_split == 0 and ridx != 0:
                split_index += 1
                writer.close()
                writer = FastaWriter(self._out_fn(split_index))
                self.out_fns.append(self._out_fn(split_index))
            writer.writeRecord(r.name, r.sequence)
    writer.close()
    return list(self.out_fns)
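# A hedged usage sketch: split() is a method of a splitter class; the class
# name FastaSplitter and its constructor arguments below are hypothetical,
# inferred from the attributes the method uses (input_fasta, reads_per_split).
splitter = FastaSplitter(input_fasta="reads.fasta", reads_per_split=1000)
out_fns = splitter.split()  # e.g., ['reads.0.fasta', 'reads.1.fasta', ...]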
def Write(self):
    """Clean up the sequences and write out a genomic FASTA"""
    filename = "{0}_genomic.fasta".format(self._locus)
    with FastaWriter(filename) as handle:
        for allele, seq in self._dict.iteritems():
            # Remove inserts, exon/intron boundaries, and trimmed regions
            seq = re.sub("[.|*]", "", seq)
            record = FastaRecord(allele, seq)
            handle.writeRecord(record)
def _fastq_to_fasta(fastq_path, fasta_path):
    """Convert a FASTQ file to a FASTA file"""
    with FastqReader(fastq_path) as r:
        with FastaWriter(fasta_path) as w:
            for fastq_record in r:
                fasta_record = FastaRecord(fastq_record.name,
                                           fastq_record.sequence)
                w.writeRecord(fasta_record)
    log.info("Completed converting {q} to {f}".format(q=fastq_path,
                                                      f=fasta_path))
    return 0
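# A minimal usage sketch for _fastq_to_fasta; both paths are hypothetical.
# Quality values are discarded in the conversion.
_fastq_to_fasta("reads.fastq", "reads.fasta")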
def write_temp_fasta(fastq_file):
    """
    Write a temporary FASTA file from a FASTQ
    """
    temp = tempfile.NamedTemporaryFile(suffix='.fasta', delete=False)
    with FastaWriter(temp.name) as handle:
        for record in FastqReader(fastq_file):
            temp_record = FastaRecord(record.name, record.sequence)
            handle.writeRecord(temp_record)
    return temp
def _writeFasta(filepath, records):
    """
    Attempt to write a list of records to a new reference FASTA
    """
    try:
        with FastaWriter(filepath) as handle:
            for record in records:
                handle.writeRecord(record)
    except Exception:
        # Catch Exception rather than using a bare "except:", which would
        # also swallow SystemExit and KeyboardInterrupt.
        raise ReferenceIOException(
            'Unable to write reference FASTA "{0}"'.format(filepath))
def split(self, reads_in_first_split=None):
    """Split `input_fasta` into smaller files, each containing
    `reads_per_split` reads (the first file may hold a different count,
    `reads_in_first_split`). Return the paths of the split FASTA files."""
    split_index = 0
    self.out_fns = []
    writer = FastaWriter(self._out_fn(split_index))
    self.out_fns.append(self._out_fn(split_index))
    if reads_in_first_split is None:
        reads_in_first_split = self.reads_per_split
    with ContigSetReaderWrapper(self.input_fasta) as reader:
        for ridx, r in enumerate(reader):
            if ((split_index == 0 and ridx == reads_in_first_split) or
                    (split_index > 0 and
                     ridx % self.reads_per_split == 0)) and ridx != 0:
                split_index += 1
                writer.close()
                writer = FastaWriter(self._out_fn(split_index))
                self.out_fns.append(self._out_fn(split_index))
            writer.writeRecord(r.name, r.sequence[:])
    writer.close()
    return list(self.out_fns)
def rename_imgt_fasta(input_file, output_file):
    with FastaWriter(output_file) as handle:
        for record in FastaReader(input_file):
            # Check that this is an IMGT-formatted FASTA record
            assert record.header.startswith('HLA:')

            # Extract the header and replace spaces with underscores
            new_header = record.header.strip().replace(' ', '_')

            # Create a new record with the same sequence and the
            # underscored header in place of its id.
            new_record = FastaRecord(new_header, record.sequence)
            handle.writeRecord(new_record)
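# A minimal usage sketch for rename_imgt_fasta; the file names are
# hypothetical. An IMGT/HLA-style header such as
#   "HLA:HLA00001 A*01:01:01:01 3503 bp"
# becomes "HLA:HLA00001_A*01:01:01:01_3503_bp".
rename_imgt_fasta("hla_gen.fasta", "hla_gen.renamed.fasta")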
def setUpClass(cls):
    with FastaWriter(cls.REFERENCE) as fasta_out:
        with FastaReader(TestCoverageRpt.REFERENCE) as fasta_in:
            for rec in fasta_in:
                header = rec.id + "|quiver"
                fasta_out.writeRecord(header, rec.sequence)
    with GffWriter(cls.GFF) as gff_out:
        with GffReader(TestCoverageRpt.GFF) as gff_in:
            for header in gff_in.headers:
                gff_out.writeHeader(header)
            for rec in gff_in:
                rec.seqid += "|quiver"
                gff_out.writeRecord(rec)
def test_contigset_write(self):
    fasta = upstreamData.getLambdaFasta()
    ds = ContigSet(fasta)
    assert isinstance(ds.resourceReaders()[0], IndexedFastaReader)
    outdir = tempfile.mkdtemp(suffix="dataset-unittest")
    outfn = os.path.join(outdir, 'test.fasta')
    w = FastaWriter(outfn)
    for rec in ds:
        w.writeRecord(rec)
    w.close()
    fas = FastaReader(outfn)
    for rec in fas:
        # make sure a __repr__ didn't slip through:
        assert not rec.sequence.startswith('<')
def resolved_tool_contract_to_args(resolved_tool_contract):
    """Convert resolved tool contract to args."""
    rtc = resolved_tool_contract
    args = [
        "--verbose",
        "classify",
        rtc.task.input_files[0],
        rtc.task.output_files[0],
        "--flnc", rtc.task.output_files[1],
        "--nfl", rtc.task.output_files[2],
        "--summary", rtc.task.output_files[3],  # JSON
        "--report", rtc.task.output_files[4],  # CSV
        "--min_seq_len", str(rtc.task.options[Constants.MIN_SEQ_LEN_ID]),
        "--cpus", str(rtc.task.nproc),
        "--outDir", op.dirname(rtc.task.output_files[0]),
        "--ignore-empty-output",
    ]
    if rtc.task.options[Constants.IGNORE_POLYA_ID]:
        args.append("--ignore_polyA")

    primers_str_obj = rtc.task.options[Constants.PRIMER_SEQUENCES_ID]
    primers_str = str(primers_str_obj).strip().translate(None, '\'\" ')
    if primers_str_obj is not None and primers_str not in ('None', ''):
        logging.info("Detected customer primer: %s", primers_str)
        # Save primer sequences to a fasta file under the output dir
        primer_fasta_records = parse_primer_sequences(
            primers_str=primers_str)
        d = op.dirname(rtc.task.output_files[2])
        mkdir(d)
        primer_fn = op.join(d, "customer_primers.fasta")
        with FastaWriter(primer_fn) as writer:
            for record in primer_fasta_records:
                writer.writeRecord(record)
        logging.info("Customer primer sequences written to file %s",
                     primer_fn)
        args.append("-p")
        args.append("%s" % primer_fn)
    else:
        logging.info("No customer primer detected.")
    return get_argument_parser().parse_args(args)
def onStart(self):
    self.referenceBasesProcessedById = OrderedDict()
    for refId in reference.byName:
        self.referenceBasesProcessedById[refId] = 0
    self.variantsByRefId = defaultdict(list)
    self.consensusChunksByRefId = defaultdict(list)

    # open file writers
    self.fastaWriter = self.fastqWriter = self.gffWriter = None
    if options.fastaOutputFilename:
        self.fastaWriter = FastaWriter(options.fastaOutputFilename)
    if options.fastqOutputFilename:
        self.fastqWriter = FastqWriter(options.fastqOutputFilename)
    if options.gffOutputFilename:
        self.gffWriter = VariantsGffWriter(options.gffOutputFilename,
                                           vars(options),
                                           reference.byName.values())
def writeSequenceRecords(filename, records, filetype=None):
    """
    Write the records out to file
    """
    fileType = filetype or getFileType(filename)
    if fileType == 'fasta':
        with FastaWriter(filename) as writer:
            for record in records:
                writer.writeRecord(record)
    elif fileType == 'fastq':
        with FastqWriter(filename) as writer:
            for record in records:
                writer.writeRecord(record)
    else:
        msg = 'Output filetype must be either FASTA or FASTQ'
        log.error(msg)
        raise TypeError(msg)
    return filename
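# A minimal usage sketch for writeSequenceRecords. The records and file names
# are hypothetical; the filetype is inferred from the extension via
# getFileType unless passed explicitly.
records = [FastaRecord("contig1", "ACGTACGT")]
writeSequenceRecords("contigs.fasta", records)  # filetype inferred
writeSequenceRecords("contigs.fa", records, filetype='fasta')  # explicit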
def run_main(input_file, output_file, min_sequence_length):
    """
    Main function entry point to your application (this should be imported
    from your library code)

    :rtype int:
    """
    _d = dict(i=input_file, a=min_sequence_length, o=output_file)
    msg = ("Running dev_app task with input:{i} output:{o} "
           "and min-length={a}").format(**_d)
    log.info(msg)
    with FastaWriter(output_file) as w:
        with FastaReader(input_file) as r:
            for record in r:
                if len(record.sequence) > min_sequence_length:
                    w.writeRecord(record)
    log.debug("completed running main.")
    return 0
def save(self, dir):
    """
    Save this ArrowEvidence to a directory. The directory will be
    *created* by this method.

    Format of evidence dump:
    evidence_dump/
      ref000001/
        0-1005/
          consensus.fa
          arrow-scores.h5
        995-2005/
        ...
    """
    logging.info("Dumping evidence to %s" % (dir,))
    join = os.path.join
    if os.path.exists(dir):
        raise Exception(
            "Evidence dump does not expect directory %s to exist." % dir)
    os.makedirs(dir)
    #refFasta = FastaWriter(join(dir, "reference.fa"))
    #readsFasta = FastaWriter(join(dir, "reads.fa"))
    consensusFasta = FastaWriter(join(dir, "consensus.fa"))
    windowName = self.refName + (":%d-%d" % (self.refStart, self.refEnd))
    #refFasta.writeRecord(windowName, self.refSequence)
    #refFasta.close()
    consensusFasta.writeRecord(windowName + "|arrow", self.consensus)
    consensusFasta.close()

    import h5py
    arrowScoreFile = h5py.File(join(dir, "arrow-scores.h5"))
    arrowScoreFile.create_dataset("Scores", data=self.scores)
    vlen_str = h5py.special_dtype(vlen=str)
    arrowScoreFile.create_dataset("RowNames",
                                  data=self.rowNames,
                                  dtype=vlen_str)
    arrowScoreFile.create_dataset("ColumnNames",
                                  data=self.colNames,
                                  dtype=vlen_str)
    arrowScoreFile.create_dataset("BaselineScores",
                                  data=self.baselineScores)
    arrowScoreFile.close()
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-b", "--breakpoint",
                        help="file containing breakpoints")
    parser.add_argument("-a", "--assembly",
                        help="fasta file containing contigs")
    parser.add_argument("-o", "--outfile", help="new assembly file")
    parser.add_argument("-l", "--lenfile", help="length of contigs")
    args = parser.parse_args()

    # Load the contigs, trimming the last 10 bases of each sequence
    id2seq = {}
    for record in FastaReader(args.assembly):
        id2seq[record.id] = record.sequence[0:-10]

    # Cut each contig at the breakpoints listed for it; the format of the
    # breakpoint file is shown in the usage sketch below.
    new_seq = {}
    lenmap = {}
    with open(args.breakpoint, 'r') as f:
        for line in f:
            attrs = line.split()
            if len(attrs) == 1:
                curr_contig = attrs[0]
                seq = id2seq[curr_contig]
            else:
                start = long(attrs[0])
                end = long(attrs[1])
                new_id = curr_contig + '_' + attrs[0] + '_' + attrs[1]
                new_seq[new_id] = seq[start:end]
                lenmap[new_id] = end - start + 1

    with FastaWriter(args.outfile) as writer:
        for key in new_seq:
            writer.writeRecord(key, new_seq[key])

    with open(args.lenfile, 'w') as lenfile:
        for key in lenmap:
            lenfile.write(key + "\t" + str(lenmap[key]) + '\n')
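# The breakpoint file interleaves contig names with start/end coordinate
# pairs; these example contents are hypothetical:
#
#   contig_1
#   0 15000
#   15000 31000
#   contig_2
#   0 8000
#
# Each "start end" pair cuts the most recently named contig into a new
# record called <contig>_<start>_<end>.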
def main(parser):
    args = parser.parse_args()

    # Get outfile name
    if args.outFile is None:
        outfile = 'nobarcode.fasta' if args.fasta else 'nobarcode.fastq'
    else:
        outfile = args.outFile

    # Input files
    barcodeFofn = (l.strip('\n') for l in args.barcode_fofn)
    ccsFofn = (l.strip('\n') for l in args.ccs_fofn)

    # Get the read names that are not barcoded
    no_barcode = set()
    for barcodeFile in barcodeFofn:
        bcH5 = BarcodeH5Reader(barcodeFile)
        for row in bcH5.bestDS:
            if row[3] / row[1] < args.minAvgBarcodeScore:
                no_barcode.add('%s/%d' % (bcH5.movieName, row[0]))

    if args.fasta:
        outh = FastaWriter(outfile)
    else:
        outh = FastqWriter(outfile)

    for ccsFile in ccsFofn:
        ccsH5 = BasH5Reader(ccsFile)
        for ccsRead in ccsH5.ccsReads():
            if ccsRead.zmw.zmwName in no_barcode:
                basecalls = ccsRead.basecalls()
                if len(basecalls) >= args.minMaxInsertLength:
                    if args.fasta:
                        outh.writeRecord(
                            FastaRecord(ccsRead.zmw.zmwName,
                                        ccsRead.basecalls()))
                    else:
                        outh.writeRecord(
                            FastqRecord(ccsRead.zmw.zmwName,
                                        ccsRead.basecalls(),
                                        ccsRead.QualityValue()))
    outh.close()
def combine_consensus_isoforms(split_indices, split_files,
                               combined_consensus_isoforms_fa,
                               sample_name):
    """
    Parameters:
      split_indices -- indices of split cluster bins.
      split_files -- consensus isoforms in each split cluster bin.
    """
    assert len(split_indices) == len(split_files)
    writer = FastaWriter(combined_consensus_isoforms_fa)
    for i, split_fn in zip(split_indices, split_files):
        logging.debug("Adding prefix i%s to %s.", str(i), split_fn)
        with ContigSetReaderWrapper(split_fn) as reader:
            for read in reader:
                name = combined_cid_ice_name(name=read.name,
                                             cluster_bin_index=i,
                                             sample_name=sample_name)
                writer.writeRecord(name, read.sequence[:])
    writer.close()
    logging.info("Consensus isoforms output combined to: %s",
                 combined_consensus_isoforms_fa)
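# A minimal usage sketch for combine_consensus_isoforms; the paths and the
# sample name are hypothetical, one consensus FASTA per cluster bin.
combine_consensus_isoforms(
    split_indices=[0, 1],
    split_files=["0/consensus_isoforms.fasta",
                 "1/consensus_isoforms.fasta"],
    combined_consensus_isoforms_fa="combined/all.consensus_isoforms.fasta",
    sample_name="mysample")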
def split_results(amp_analysis):
    """Split the output of an Amplicon Analysis job by barcode"""
    assert os.path.isdir(amp_analysis)

    sequence_path = os.path.join(amp_analysis, 'amplicon_analysis.fasta')
    check_output_file(sequence_path)
    print "Analyzing %s output sequences" % fasta_size(sequence_path)

    barcode_path = os.path.join(amp_analysis, 'by_barcode')
    create_directory(barcode_path)

    # Group the records by barcode
    records = list(FastaReader(sequence_path))
    barcodes = {get_barcode(r): [] for r in records}
    for r in records:
        barcodes[get_barcode(r)].append(r)

    # Write one FASTA per barcode
    barcode_files = {}
    for barcode, records in barcodes.iteritems():
        barcode_file = barcode + '.fasta'
        sample_path = os.path.join(barcode_path, barcode_file)
        with FastaWriter(sample_path) as handle:
            for record in records:
                handle.writeRecord(record)
        barcode_files[barcode] = sample_path
    return barcode_files
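# A minimal usage sketch for split_results; the job directory is
# hypothetical. The return value maps each barcode to its per-barcode
# FASTA under <job_dir>/by_barcode/.
barcode_files = split_results("amplicon_analysis_job")
for barcode, path in barcode_files.iteritems():
    print barcode, path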