def pick_longest_rep(fasta_filename, gff_filename, group_filename, output_filename):
    """
    For each group, select the representative record to be the longest
    """
    fastad = LazyFastaReader(fasta_filename)
    fout = FastaWriter(output_filename)

    coords = {}
    for line in open(gff_filename):
        # ex: chr1 PacBio transcript 27567 29336 . - . gene_id "PB.1"; transcript_id "PB.1.1";
        raw = line.strip().split("\t")
        if raw[2] == "transcript":
            tid = raw[-1].split("; ")[1].split()[1][1:-2]
            coords[tid] = "{0}:{1}-{2}({3})".format(raw[0], raw[3], raw[4], raw[6])

    for line in open(group_filename):
        pb_id, members = line.strip().split("\t")
        best_id = None
        best_seq = None
        max_len = 0
        for x in members.split(","):
            if len(fastad[x].sequence) >= max_len:
                best_id = x
                best_seq = fastad[x].sequence
                max_len = len(fastad[x].sequence)
        fout.writeRecord("{0}|{1}|{2}".format(pb_id, coords[pb_id], best_id), best_seq)
    fout.close()
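# --- Illustrative sketch (not from the original codebase) ---
# The chained split/slice in pick_longest_rep assumes the GFF attribute
# column is exactly 'gene_id "X"; transcript_id "Y";'. A regex is more
# tolerant of spacing and attribute order; the helper name below is an
# assumption for illustration only.
import re

_TRANSCRIPT_ID_RE = re.compile(r'transcript_id "([^"]+)"')

def _parse_transcript_id(attribute_field):
    """Return the transcript_id from a GTF-style attribute column, or None."""
    match = _TRANSCRIPT_ID_RE.search(attribute_field)
    return match.group(1) if match else None

# e.g. _parse_transcript_id('gene_id "PB.1"; transcript_id "PB.1.1";') -> 'PB.1.1'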
def main(argv):
    desc = 'A tool to filter Quiver output, removing contigs that are mostly lowercase'
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument('inputFile', help='input sequence')
    parser.add_argument('outputFile', help='output fasta')
    parser.add_argument(
        '--filt', default=0.5, dest='filt', type=float,
        help='proportion of lowercase bases a contig can have before being filtered out')
    args = parser.parse_args()

    writer = FastaWriter(args.outputFile)
    for record in FastaReader(args.inputFile):
        lower = float(sum(1 for c in record.sequence if c.islower()))
        pro = lower / float(len(record.sequence))
        print pro
        if pro < args.filt:
            writer.writeRecord(record)
def separate_listed_sequences(fasta_file, good_values, good_output, bad_output):
    """
    Separate a fasta file into two based on a supplied value list
    """
    with FastaWriter(good_output) as good_handle:
        with FastaWriter(bad_output) as bad_handle:
            for record in FastaReader(fasta_file):
                name = get_base_sequence_name(record.name)
                if name in good_values:
                    good_handle.writeRecord(record)
                else:
                    bad_handle.writeRecord(record)
def separate_aligned_sequences(fasta_file, dictionary, good_values, good_output, bad_output):
    """
    Separate a fasta file into two based on a supplied dictionary and value list
    """
    with FastaWriter(good_output) as good_handle:
        with FastaWriter(bad_output) as bad_handle:
            for record in FastaReader(fasta_file):
                name = get_base_sequence_name(record.name)
                value = dictionary.get(name, "Unmapped")
                if value in good_values:
                    good_handle.writeRecord(record)
                else:
                    bad_handle.writeRecord(record)
def _updateChimeraInfo(self, suspicous_hits, in_read_fn, out_nc_fn, out_c_fn,
                       primer_report_fn, write_report_header=True):
    """
    in_read_fn --- a fasta of full-length reads or a fasta of
    non-full-length reads.

    For each read in the in_read_fn FASTA file, detect whether it is
    chimeric or not, and write its annotation to primer_report_fn.

    Return: (num_nc, num_c, num_nc_bases, num_c_bases)
    """
    logging.debug("Update chimera info for reads in {f}".format(f=in_read_fn))
    logging.debug("Write primer report to {rpt}".format(rpt=primer_report_fn))

    num_nc, num_c, num_nc_bases, num_c_bases = 0, 0, 0, 0
    with FastaReader(in_read_fn) as reader, \
         FastaWriter(out_nc_fn) as writer, \
         FastaWriter(out_c_fn) as writer_chimera, \
         open(primer_report_fn, 'w') as reporter:
        if write_report_header:
            reporter.write(ReadAnnotation.header(delimiter=",") + "\n")
        for r in reader:
            # e.g. r.name="movie/zmw/0_100_CCS fiveend=1;threeend=100;"
            readid = r.name.split()[0]
            annotation = ReadAnnotation.fromString(
                r.name, ignore_polyA=self.ignore_polyA)
            if readid not in suspicous_hits:  # Non-chimeric reads
                # Primer of a primer-trimmed read can not be None.
                # assert(annotation.primer is not None)
                annotation.chimera = 0
                num_nc += 1
                num_nc_bases += len(r.sequence)
                writer.writeRecord(annotation.toAnnotation(), r.sequence)
            else:  # chimeric reads
                annotation.chimera = 1
                num_c += 1
                num_c_bases += len(r.sequence)
                writer_chimera.writeRecord(annotation.toAnnotation(), r.sequence)
            reporter.write(annotation.toReportRecord(delimitor=",") + "\n")
    return (num_nc, num_c, num_nc_bases, num_c_bases)
def writerProcess(outDir):
    # makes output directories
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    fastOutDir = os.path.join(outDir, "Demultiplexed/")
    if not os.path.exists(fastOutDir):
        os.makedirs(fastOutDir)
    # opens files
    csvOut = open(os.path.join(outDir, "Report.csv"), "w")
    csvOut.write("Name,Barcode,NumPasses,Coverage,AvgConfidence,MinConfidence,TrimFail,MappingFail\n")
    writers = {}
    for writecount in range(totalNumber):
        result = resultQueue.get()
        csvOut.write("%s,%s,%d,%d,%0.6f,%0.6f,%s,%s\n" % (
            result.name, result.barcode, result.numPasses, result.coverage,
            result.predictedAccuracy, result.minConfidence,
            result.trimFail, result.mappingFail))
        if result.barcode not in writers:
            if args.fastq:
                writers[result.barcode] = FastqWriter(
                    os.path.join(fastOutDir, result.barcode + ".fastq"))
            else:
                writers[result.barcode] = FastaWriter(
                    os.path.join(fastOutDir, result.barcode + ".fasta"))
        if not any((result.minNumPassesFail, result.mappingFail,
                    result.trimFail, result.minCoverageFail,
                    result.minAvgConfidenceFail, result.minConfidenceFail)):
            if args.fastq:
                writers[result.barcode].writeRecord(result.name, result.seq, result.qual)
            else:
                writers[result.barcode].writeRecord(result.name, result.seq)
def _write_assigned_reads(input_fasta, assignments):
    """
    Write out subreads to the appropriate file
    """
    log.info("Separating subreads based on their amplicon assignments")
    output_files = []
    writers = {}
    root_name = '.'.join(input_fasta.split('.')[:-1])
    # Open up output writers for each group
    for group in assignments:
        output_file = "%s_%s.fasta" % (root_name, group)
        output_files.append(output_file)
        writers[group] = FastaWriter(output_file)
    # Write each record to its first matching group
    for record in FastaReader(input_fasta):
        name = record.name.split()[0]
        for group in assignments:
            if name in assignments[group]:
                writers[group].writeRecord(record)
                break
    # Close all of the output writers
    for group in writers:
        writers[group].close()
    return output_files
def open_writer(self):
    if self.filetype == 'fasta':
        output_file = '%s.trim.fasta' % self.prefix
        self.writer = FastaWriter(output_file)
    elif self.filetype == 'fastq':
        output_file = '%s.trim.fastq' % self.prefix
        self.writer = FastqWriter(output_file)
def add_writer(self, group):
    if self.filetype == 'fasta':
        output_file = '%s.g%s.fasta' % (self.prefix, group)
        self.writers[group] = FastaWriter(output_file)
    elif self.filetype == 'fastq':
        output_file = '%s.g%s.fastq' % (self.prefix, group)
        self.writers[group] = FastqWriter(output_file)
def writeSequenceData(self, sequenceData):
    outputFile = 'temp_%s.fasta' % self.counter
    with FastaWriter(outputFile) as handle:
        for record in sequenceData:
            handle.writeRecord(record)
    self.tempFiles.append(outputFile)
    return outputFile
def pick_rep(fa_fq_filename, gff_filename, group_filename, output_filename,
             is_fq=False, pick_least_err_instead=True):
    """
    For each group, select the representative record

    If is FASTA file (is_fq False) -- then always pick the longest one
    If is FASTQ file (is_fq True) -- then
        If pick_least_err_instead is True, pick the one w/ least number of expected base errors
        Else, pick the longest one
    """
    if is_fq:
        fd = LazyFastqReader(fa_fq_filename)
        fout = FastqWriter(output_filename)
    else:
        fd = LazyFastaReader(fa_fq_filename)
        fout = FastaWriter(output_filename)

    coords = {}
    for line in open(gff_filename):
        # ex: chr1 PacBio transcript 27567 29336 . - . gene_id "PB.1"; transcript_id "PB.1.1";
        raw = line.strip().split('\t')
        if raw[2] == 'transcript':
            tid = raw[-1].split('; ')[1].split()[1][1:-2]
            coords[tid] = "{0}:{1}-{2}({3})".format(raw[0], raw[3], raw[4], raw[6])

    for line in open(group_filename):
        pb_id, members = line.strip().split('\t')
        print >> sys.stderr, "Picking representative sequence for", pb_id
        best_id = None
        best_seq = None
        best_qual = None
        best_err = 9999999
        err = 9999999
        max_len = 0
        for x in members.split(','):
            if is_fq and pick_least_err_instead:
                # expected number of base errors: sum of 10^(-QV/10) over all bases
                err = sum(10**-(i / 10.) for i in fd[x].quality)
            if (is_fq and pick_least_err_instead and err < best_err) or \
               ((not is_fq or not pick_least_err_instead) and len(fd[x].sequence) >= max_len):
                best_id = x
                best_seq = fd[x].sequence
                if is_fq:
                    best_qual = fd[x].quality
                    best_err = err
                max_len = len(fd[x].sequence)
        _id_ = "{0}|{1}|{2}".format(pb_id, coords[pb_id], best_id)
        _seq_ = best_seq
        if is_fq:
            fout.writeRecord(_id_, _seq_, best_qual)
        else:
            fout.writeRecord(_id_, _seq_)
    fout.close()
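# --- Illustrative sketch (not from the original codebase) ---
# Worked example of the expected-error criterion in pick_rep above: a Phred
# quality value q corresponds to a per-base error probability of 10^(-q/10),
# so the expected number of errors in a read is the sum over its QVs.
def _expected_errors(qualities):
    """Expected number of base errors for a list of Phred QVs."""
    return sum(10 ** -(q / 10.0) for q in qualities)

# e.g. a 3-base read with QVs [10, 20, 30]:
#   10^-1 + 10^-2 + 10^-3 = 0.1 + 0.01 + 0.001 = 0.111 expected errors
assert abs(_expected_errors([10, 20, 30]) - 0.111) < 1e-9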
def write_references(reference_file, references):
    for i, ref in enumerate(references):
        for record in FastaReader(reference_file):
            name = record.name.split()[0]
            if name == ref:
                filename = 'reference_%s.fasta' % (i + 1)
                with FastaWriter(filename) as writer:
                    writer.writeRecord(record)
def subset_references(reference_file, reference_names):
    output = 'references.fasta'
    with FastaWriter(output) as writer:
        for record in FastaReader(reference_file):
            name = record.name.split()[0]
            if name in reference_names:
                writer.writeRecord(record)
    return output
def get_temp_fasta(record):
    """
    Create a temporary Fasta file for Blasr/HMMsearch/etc
    """
    temp_record = get_temp_fasta_record(record)
    temp_fasta = NamedTemporaryFile(suffix='.fasta')
    with FastaWriter(temp_fasta.name) as handle:
        handle.writeRecord(temp_record)
    return temp_fasta
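# --- Illustrative sketch (not from the original codebase) ---
# NamedTemporaryFile deletes the file as soon as the handle is closed (or
# garbage collected), so callers of get_temp_fasta must keep the returned
# handle alive for as long as the .fasta path is in use by external tools.
from tempfile import NamedTemporaryFile

def _demo_tempfile_lifetime():
    tmp = NamedTemporaryFile(suffix='.fasta')
    path = tmp.name   # the file exists here and can be handed to a tool
    tmp.close()       # after this, the file at `path` has been deleted
    return path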
def combine_fasta(fasta_files, destination):
    with FastaWriter(destination) as handle:
        for fasta in fasta_files:
            try:
                for record in FastaReader(fasta):
                    handle.writeRecord(record)
            except Exception:
                log.warn('Could not open "%s" as Fasta' % fasta)
    check_output_file(destination)
def write_fasta(records, output_file): """ Write a FastaRecord, or a list of FastaRecords, out to file """ with FastaWriter(output_file) as handle: for record in records: assert isinstance(record, FastaRecord) handle.writeRecord(record) check_output_file(output_file) return output_file
def extract_subreads(input_file, output_file, min_length, max_length,
                     min_score, min_snr, max_count, white_list=None):
    """
    Extract, filter and subset subreads from Bas/Bax/Fofn Files
    """
    log.info('Extracting subreads from %s' % os.path.basename(input_file))
    log.debug('\tMinimum Length:\t%s' % min_length)
    log.debug('\tMaximum Length:\t%s' % max_length)
    log.debug('\tMinimum Score:\t%s' % min_score)
    log.debug('\tMinimum SNR:\t%s' % min_snr)
    log.debug('\tMax Count:\t%s' % max_count)
    log.debug('\tWhitelisted ZMWs:\t%s' % white_list)

    if white_list:
        white_list = set(_parse_white_list(white_list))

    output_prefix = os.path.dirname(output_file)
    output_file_list = []
    subread_count = 0
    for i, filename in enumerate(_iterate_input_files(input_file)):
        curr_output = os.path.join(output_prefix, 'subreads_%s.fasta' % (i + 1))
        if filename.endswith('.bas.h5') or filename.endswith('bax.h5'):
            subreads = _extract_from_bash5(filename, min_length, max_length,
                                           min_score, min_snr, white_list)
        elif filename.endswith('.fa') or filename.endswith('.fasta'):
            subreads = _extract_from_fasta(filename, min_length, max_length)
        with FastaWriter(curr_output) as writer:
            for record in subreads:
                writer.writeRecord(record)
        subread_count += len(subreads)
        output_file_list.append(curr_output)
    log.info("Extracted %s subreads from %s files" % (subread_count, i + 1))

    log.info("Writing FOFN of subread files")
    with open(output_file, 'w') as handle:
        for filename in output_file_list:
            handle.write(filename + '\n')

    # TODO: Fix MaxCount function
    #if max_count:
    #    subreads = _subset_subreads( subreads, max_count )

    log.info("Finished extracting subreads")
    return output_file
def make_current_fasta(icec_obj, flnc_filename, root_dir):
    """
    The current fasta consists of all ids. However, if this was an already
    finished run and we are adding more input, then newids is empty; in that
    case we set newids to everything that has no affiliation, or more than
    one affiliated cluster, in d.
    """
    with FastaWriter(current_fasta(root_dir)) as f:
        for r in FastaReader(flnc_filename):
            f.writeRecord(r)
def _open_output_handle(output_file, output_type):
    """
    Open an appropriate output handle to record the exon sequences
    """
    if output_type == 'fasta':
        return FastaWriter(output_file)
    elif output_type == 'fastq':
        return FastqWriter(output_file)
    msg = 'Output type must be Fasta or Fastq'
    log.error(msg)
    raise TypeError(msg)
def outputReferenceFasta(self, reference, count):
    print "Creating reference sequence for Cluster #%s" % count
    referenceFile = 'cluster%s_ref.fasta' % count
    reference_desc = 'cluster{0}_reference\t{1}'.format(count, reference.name)
    if os.path.exists(referenceFile):
        return referenceFile
    with FastaWriter(referenceFile) as handle:
        referenceFasta = FastaRecord(reference_desc, reference.sequence)
        handle.writeRecord(referenceFasta)
    return referenceFile
def outputClusterFasta(self, reads, count):
    fastaFile = 'cluster%s.fasta' % count
    if os.path.exists(fastaFile):
        return fastaFile
    # Convert each FastqRecord to a FastaRecord and write it out
    with FastaWriter(fastaFile) as handle:
        for fastqRecord in reads:
            fastaRecord = FastaRecord(fastqRecord.name, fastqRecord.sequence)
            handle.writeRecord(fastaRecord)
    return fastaFile
def convert_to_dazz_fasta(self):
    """
    Convert input fasta/fastq file to daligner-compatible fasta with ids:
    <prefix>/<index>/0_<seqlen>

    Also write out mappings to pickle
    """
    i = 1
    reader = FastaReader(self.input_filename) if self.filetype == "fasta" \
        else FastqReader(self.input_filename)
    f = FastaWriter(self.dazz_filename)
    for r in reader:
        f.writeRecord("{p}/{i}/0_{len}".format(
            p=self.dazz_movie_name, i=i, len=len(r.sequence)), r.sequence)
        self.dazz_mapping[i] = r.id
        i += 1
    f.close()

    with open(self.dazz_filename + ".pickle", "w") as f:
        dump(self.dazz_mapping, f)
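# --- Illustrative sketch (not from the original codebase) ---
# Round-trip of the daligner id scheme used in convert_to_dazz_fasta: each
# record is renamed '<prefix>/<index>/0_<seqlen>' and the pickled dict maps
# index back to the original id. Assumes the prefix contains no '/'.
def _original_id(dazz_name, mapping):
    """Map a '<prefix>/<index>/0_<seqlen>' name back to the original read id."""
    index = int(dazz_name.split('/')[1])
    return mapping[index]

# e.g. _original_id('movie/1/0_1500', {1: 'm1/54/ccs'}) -> 'm1/54/ccs'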
def runBlasr(cls, fastqRecord, alignedRecord):
    # Write the query and reference records to file
    tempId = str(int(random() * 10000000))
    tempRef = 'temp_ref_%s.fasta' % tempId
    with FastaWriter(tempRef) as handle:
        fastaRecord = cls.convertFastqToFasta(fastqRecord)
        handle.writeRecord(fastaRecord)
    tempQuery = 'temp_query_%s.fasta' % tempId
    with FastaWriter(tempQuery) as handle:
        handle.writeRecord(alignedRecord)
    # Create and run the command-line
    tempOut = 'temp_%s.m1' % tempId
    cline = 'blasr %s %s -m 1 -bestn 1 -out %s' % (tempQuery, tempRef, tempOut)
    p = subprocess.Popen(cline.split())
    stdout, stderr = p.communicate()
    # Parse and return the best hit and remove temp files
    bestHit = cls.readBestBlasrHit(tempOut)
    os.remove(tempRef)
    os.remove(tempQuery)
    os.remove(tempOut)
    return bestHit
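# --- Illustrative sketch (not from the original codebase) ---
# runBlasr derives temp file names from random(), which can collide when
# several processes run concurrently. A collision-safe alternative using the
# standard library: tempfile.mkstemp atomically creates a unique file.
import os
import tempfile

def _make_temp_path(suffix='.fasta'):
    """Return the path of a unique, empty temporary file."""
    fd, path = tempfile.mkstemp(suffix=suffix)
    os.close(fd)  # caller re-opens/overwrites the file and removes it later
    return path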
def combine_fasta(sequence_files, output_file):
    """
    Combine a series of sequence files into one Fasta
    """
    with FastaWriter(output_file) as handle:
        for filename in sequence_files:
            try:
                for record in FastaReader(filename):
                    handle.writeRecord(record)
            except Exception:
                log.warn('Could not open "%s" as Fasta' % filename)
    check_output_file(output_file)
    return output_file
def output_final_sequences(self, finalSequenceList):
    outputFile = self.process_setup(finalSequenceList, 'SequenceWriter', suffix='fasta')
    if self.output_files_exist(output_file=outputFile):
        return outputFile
    with FastaWriter(outputFile) as writer:
        with open(finalSequenceList) as handle:
            for line in handle:
                sequenceFile = line.strip()
                copy_fasta_sequences(sequenceFile, writer)
    self.process_cleanup(output_file=outputFile)
    return outputFile
def write_fasta(fasta_records, output_file):
    """
    Write a FastaRecord, or list of records, out to file
    """
    with FastaWriter(output_file) as handle:
        if isinstance(fasta_records, FastaRecord):
            handle.writeRecord(fasta_records)
        elif isinstance(fasta_records, list):
            for record in fasta_records:
                handle.writeRecord(record)
        else:
            msg = "Input Record(s) type not recognized"
            log.error(msg)
            raise TypeError(msg)
    check_output_file(output_file)
def separate_sequences(fasta_file, dictionary, prefix=''):
    """
    Separate a fasta file into multiple groups based on some dict
    """
    file_handles = {}
    for record in FastaReader(fasta_file):
        name = get_base_sequence_name(record.name)
        group = dictionary.get(name, "Unmapped")
        group_file = prefix + '_' + group + '.fasta'
        try:
            file_handles[group_file].writeRecord(record)
        except KeyError:
            file_handles[group_file] = FastaWriter(group_file)
            file_handles[group_file].writeRecord(record)
    return closed_file_handles(file_handles)
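# --- Illustrative sketch (not from the original codebase) ---
# The try/except KeyError in separate_sequences is a lazy-open pattern: a
# writer is created only when its group first appears. The same idea with
# plain file handles; all names below are illustrative assumptions.
def _route_lines(named_lines, group_of, prefix='out'):
    """Append each (name, line) pair to '<prefix>_<group>.txt', opening lazily."""
    handles = {}
    for name, line in named_lines:
        group = group_of.get(name, 'Unmapped')
        path = '%s_%s.txt' % (prefix, group)
        if path not in handles:
            handles[path] = open(path, 'w')
        handles[path].write(line + '\n')
    for handle in handles.values():
        handle.close()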
def _updateChimeraInfo(self, suspicous_hits, in_read_fn, out_flnc_fn, out_flc_fn,
                       primer_report_fl_fn):
    """
    in_read_fn --- a fasta of full-length reads

    For each full-length read in in_read_fn FASTA file, detect whether it is
    chimeric or not, and write its annotation to primer_report_fl_fn.
    """
    logging.info("Update chimera info to reads annotations " +
                 "in the output FASTA file and the primer report.")
    with FastaReader(in_read_fn) as reader, \
         FastaWriter(out_flnc_fn) as writer, \
         FastaWriter(out_flc_fn) as writer_chimera, \
         open(primer_report_fl_fn, 'w') as reporter:
        reporter.write("\t".join(ReadAnnotation.fieldsNames()) + "\n")
        for r in reader:
            # e.g. r.name="movie/zmw/0_100_CCS fiveend=1;threeend=100;"
            readid = r.name.split()[0]
            annotation = ReadAnnotation.fromString(
                r.name, ignore_polyA=self.ignore_polyA)
            if readid not in suspicous_hits:  # Non-chimeric reads
                # Primer of a primer-trimmed read can not be None.
                # assert(annotation.primer is not None)
                annotation.chimera = 0
                assert annotation.isFullLength
                self.summary.num_flnc += 1
                self.summary.num_flnc_bases += len(r.sequence)
                writer.writeRecord(annotation.toAnnotation(), r.sequence)
            else:  # chimeric reads
                annotation.chimera = 1
                self.summary.num_flc += 1
                writer_chimera.writeRecord(annotation.toAnnotation(), r.sequence)
            reporter.write(annotation.toReportRecord() + "\n")
def extract_exons(afa_file, info_file):
    locus = afa_file.split('_')[0]
    output_fofn = '%s_exons.fofn' % locus
    records = list(FastaReader(afa_file))
    regions = list(_parse_info_file(info_file))
    with open(output_fofn, 'w') as fofn_handle:
        for exon in _select_exons(regions):
            output_file = _get_output_file(locus, exon)
            with FastaWriter(output_file) as output:
                for record in _extract_fasta_region(records, exon):
                    # skip records whose extracted region is a single repeated
                    # character (e.g. all gaps)
                    if len(set(record.sequence)) == 1:
                        continue
                    output.writeRecord(record)
            fofn_handle.write(os.path.abspath(output_file) + '\n')
def rename_fasta(input_file, output_file, name_key):
    """
    Rename a single Fasta of subreads
    """
    renaming_dict = read_dict_file(name_key)
    with FastaWriter(output_file) as writer:
        for record in FastaReader(input_file):
            old_name = record.name.split()[0]
            try:
                new_name = renaming_dict[old_name]
            except KeyError:
                msg = 'Sequence name "%s" not found in name key!' % old_name
                log.error(msg)
                raise KeyError(msg)
            new_record = FastaRecord(new_name, record.sequence)
            writer.writeRecord(new_record)
    check_output_file(output_file)
    return output_file
def extract_subreads(input_file, output_file, min_length, max_length,
                     min_score, min_snr, max_count, white_list=None):
    """
    Extract, filter and subset subreads from Bas/Bax/Fofn Files
    """
    log.info('Extracting subreads from %s' % os.path.basename(input_file))
    log.debug('\tMinimum Length:\t%s' % min_length)
    log.debug('\tMaximum Length:\t%s' % max_length)
    log.debug('\tMinimum Score:\t%s' % min_score)
    log.debug('\tMinimum SNR:\t%s' % min_snr)
    log.debug('\tMax Count:\t%s' % max_count)
    log.debug('\tWhitelisted ZMWs:\t%s' % white_list)

    if white_list:
        white_list = set(_parse_white_list(white_list))

    subreads = []
    for i, filename in enumerate(_iterate_input_files(input_file)):
        if filename.endswith('.bas.h5') or filename.endswith('bax.h5'):
            subreads += _extract_from_bash5(filename, min_length, max_length,
                                            min_score, min_snr, white_list)
        elif filename.endswith('.fa') or filename.endswith('.fasta'):
            subreads += _extract_from_fasta(filename, min_length, max_length)
    log.info("Extracted %s subreads from %s files" % (len(subreads), i + 1))

    if max_count:
        subreads = _subset_subreads(subreads, max_count)

    with FastaWriter(output_file) as writer:
        for record in subreads:
            writer.writeRecord(record)
    log.info("Finished extracting subreads")
    return output_file
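# --- Illustrative sketch (not from the original codebase) ---
# A plausible shape for the max_count subsetting step used above, assuming
# simple random downsampling; the real _subset_subreads may differ.
import random

def _subset_sketch(records, max_count):
    """Return at most max_count records, sampled without replacement."""
    if len(records) <= max_count:
        return records
    return random.sample(records, max_count)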