def __init__(self, root, gopt, popt, max_isize, pairing_batch_size, visitor=None, mmap_enabled=False):
    """
    Record the alignment settings and start with no reference loaded.

    Every argument is stored on the instance under the same name.  When
    no visitor is given, a QuietMonitor is created and used instead.
    """
    # Fall back to a QuietMonitor when the caller does not supply a visitor.
    self.visitor = QuietMonitor() if visitor is None else visitor

    self.root = root
    self.gopt = gopt
    self.popt = popt
    self.max_isize = max_isize
    self.pairing_batch_size = pairing_batch_size
    self.mmap_enabled = mmap_enabled

    # Reference structures: all start out empty.
    self.__bwts = None    # the reference index
    self.__pacseq = None  # the reference itself
    self.__bnsp = None

    # Placeholder only; clean_isize_statistics() installs the real value.
    self.__last_ii = None
    self.clean_isize_statistics()
def __init__(self):
    """
    Set up an aligner with default settings, an empty work batch, and
    neither a reference nor a hit visitor configured.
    """
    self.event_monitor = QuietMonitor()
    self.logger = QuietMonitor()

    # BWA option structures, initialized by the bwa module.
    self.__gopt = bwa.gap_init_opt()
    self.__popt = bwa.pe_init_opt()

    # Read pairs queued for the next alignment run.
    self.__batch = []

    # Public, user-adjustable settings.
    self.qformat = "fastq-illumina"
    self.max_isize = 1000
    self.nthreads = 1
    self.trim_qual = 0
    self.mmap_enabled = False

    # .reference is the path to the indexed BWA reference to use, given as
    # the "root name" of the reference files, i.e. the file names without
    # their extensions.  A trailing dot, if included, will be removed.
    # e.g.  my_references/hg_18.bwt
    #       my_references/hg_18.rsax
    #       my_references/hg_18.sax   =>  "my_references/hg_18"
    #       my_references/hg_18.pac
    #       ...
    self.reference = None
    self.hit_visitor = None

    # No iterator exists yet.
    self.__iterator = None
class BWAIterator(object):
    """
    Runs the BWA alignment steps over a batch of sequence pairs and lets
    the caller iterate over the resulting BwaMapping objects.

    The reference at ``root`` is loaded the first time an analysis runs
    and stays loaded until unload_reference() is called.
    """

    def __init__(self, root, gopt, popt, max_isize, pairing_batch_size, visitor=None, mmap_enabled=False):
        """
        Record the alignment settings and start with no reference loaded.

        Every argument is stored on the instance under the same name.  When
        no visitor is given, a QuietMonitor is created and used instead.
        """
        self.visitor = QuietMonitor() if visitor is None else visitor

        self.root = root
        self.gopt = gopt
        self.popt = popt
        self.max_isize = max_isize
        self.pairing_batch_size = pairing_batch_size
        self.mmap_enabled = mmap_enabled

        # Reference structures, loaded on demand.
        self.__bwts = None    # the reference index
        self.__pacseq = None  # the reference itself
        self.__bnsp = None

        # Placeholder only; clean_isize_statistics() installs the real value.
        self.__last_ii = None
        self.clean_isize_statistics()

    def clean_isize_statistics(self):
        """Replace the remembered insert-size info with a fresh one whose avg is -1.0."""
        self.__last_ii = bwa.isize_info_t()
        self.__last_ii.avg = -1.0

    def reference_loaded(self):
        """Return True if a reference is currently loaded."""
        # A single variable (__bwts) tells us whether the reference
        # has been loaded.
        return self.__bwts is not None

    def load_reference(self):
        """
        Load the reference at self.root.

        Remember to release it with unload_reference() once finished!
        Raises RuntimeError if a reference is already loaded.  If loading
        fails, whatever was loaded so far is released and the original
        error propagates.
        """
        if self.reference_loaded():
            raise RuntimeError("A reference is already loaded!")

        completed = False
        try:
            with self.visitor.time_block("restore_index"):
                self.__bwts = bwa.restore_index(self.root, self.mmap_enabled)
            with self.visitor.time_block("restore_reference"):
                self.__bnsp, self.__pacseq = bwa.restore_reference(self.root, self.mmap_enabled)
            completed = True
        finally:
            # Any failure (of any exception type) leaves us partially
            # loaded: clean up and let the error continue upwards.
            if not completed:
                self.unload_reference()

    def unload_reference(self):
        """
        Free an allocated reference, if any.

        Sets __bwts, __pacseq, and __bnsp to None.
        """
        if self.__bwts:
            for idx in (0, 1):
                bwa.bwt_destroy(self.__bwts[idx], self.mmap_enabled)
        self.__bwts = None

        if self.__pacseq:
            del self.__pacseq
        self.__pacseq = None

        if self.__bnsp:
            bwa.bns_destroy(self.__bnsp, self.mmap_enabled)
        self.__bnsp = None

    def __paired_sw(self, bwsa, seq_pairs_read, isize_info):
        """Run bwa.paired_sw over all the pairs, in batches when a usable batch size is set."""
        self.visitor.start("paired_sw")
        batch_size = self.pairing_batch_size
        if 0 < batch_size < seq_pairs_read:
            for start in xrange(0, seq_pairs_read, batch_size):
                # The last batch may be shorter than batch_size.
                count = min(batch_size, seq_pairs_read - start)
                self.visitor.start("paired_sw_batch")
                bwa.paired_sw(self.__bnsp, self.__pacseq, count, bwsa, self.popt, isize_info, start)
                self.visitor.stop_batch("paired_sw_batch", start, count)
        else:
            bwa.paired_sw(self.__bnsp, self.__pacseq, seq_pairs_read, bwsa, self.popt, isize_info)
        self.visitor.stop("paired_sw")

    def __analyze(self, bwsa, seq_pairs_read):
        """
        Perform the actual analysis work on the first seq_pairs_read pairs.

        BWA writes its results directly into the bwsa structures, so this
        method returns nothing.  The reference is loaded first if needed.
        """
        if not self.reference_loaded():
            self.load_reference()

        isize_info = bwa.isize_info_t()

        self.visitor.start("cal_sa_reg_gap")
        for read_idx in (0, 1):
            bwa.cal_sa_reg_gap_mt(self.__bwts, seq_pairs_read, bwsa[read_idx], self.gopt)
        self.visitor.stop("cal_sa_reg_gap")

        self.visitor.start("cal_pac_pos_pe")
        bwa.cal_pac_pos_pe(self.__bwts, seq_pairs_read, bwsa, isize_info, self.popt, self.gopt, self.__last_ii)
        self.visitor.stop("cal_pac_pos_pe")

        # Remember this run's insert-size info; it is handed to
        # cal_pac_pos_pe on the next call.
        self.__last_ii = isize_info

        if isize_info.avg > self.max_isize:
            self.visitor.log_warning("skipping S-W, isize is too big (%.3f)" % isize_info.avg)
        else:
            self.__paired_sw(bwsa, seq_pairs_read, isize_info)

        self.visitor.start("refine_gapped")
        for read_idx in (0, 1):
            bwa.refine_gapped(self.__bnsp, seq_pairs_read, bwsa[read_idx], self.__pacseq)
        self.visitor.stop("refine_gapped")

    def analyze(self, bwsa, seq_pairs_read):
        """
        Align the seq_pairs_read sequence pairs in bwsa and iterate through
        the matches one read at a time.

        This is a generator, so the alignment itself only runs once
        iteration starts.  For each pair it yields two BwaMapping objects
        in turn: one built from (read 1, read 2) and then one built from
        (read 2, read 1).
        """
        self.__analyze(bwsa, seq_pairs_read)
        for pair_idx in xrange(seq_pairs_read):
            for this_read, other_read in ((0, 1), (1, 0)):
                yield BwaMapping(self.gopt[0], self.__bnsp[0], bwsa[this_read][pair_idx], bwsa[other_read][pair_idx])

    def analyze_pairs(self, bwsa, seq_pairs_read):
        """
        Align the seq_pairs_read sequence pairs in bwsa and iterate through
        the matches two reads at a time.

        This is a generator, so the alignment itself only runs once
        iteration starts.  For each pair in bwsa it yields a tuple of two
        BwaMapping objects, built from (read 1, read 2) and from
        (read 2, read 1) respectively.

        To analyze the match objects use bwa.analyze_hit
        """
        self.__analyze(bwsa, seq_pairs_read)
        for pair_idx in xrange(seq_pairs_read):
            # XXX LP: every pair is yielded, including those where a read
            # has type bwa.BWA_TYPE_NO_MATCH.  Filtering such pairs out here
            # would make our filtered map counts wrong, and skipping them
            # doesn't seem to make much of a difference in speed.
            first = BwaMapping(self.gopt[0], self.__bnsp[0], bwsa[0][pair_idx], bwsa[1][pair_idx])
            second = BwaMapping(self.gopt[0], self.__bnsp[0], bwsa[1][pair_idx], bwsa[0][pair_idx])
            yield first, second
class BwaAligner510(object):
    """
    Object oriented interface to perform bwa aln + bwa sampe using
    libbwa and the bwa module.

    Set ``reference`` and ``hit_visitor``, queue read pairs with
    load_pair_record(), then call run_alignment().  Call
    release_resources() to unload the reference when done.
    """

    def __init__(self):
        """Create an aligner with default settings and an empty batch."""
        self.event_monitor = QuietMonitor()
        self.logger = QuietMonitor()
        self.__gopt = bwa.gap_init_opt()
        self.__popt = bwa.pe_init_opt()
        self.__batch = []

        self.qformat = "fastq-illumina"
        self.max_isize = 1000
        self.nthreads = 1
        self.trim_qual = 0
        self.mmap_enabled = False

        # .reference is the path to the indexed BWA reference to use, given
        # as the "root name" of the reference files, i.e. the file names
        # without their extensions.  A trailing dot, if included, will be
        # removed.
        # e.g.  my_references/hg_18.bwt
        #       my_references/hg_18.rsax
        #       my_references/hg_18.sax   =>  "my_references/hg_18"
        #       my_references/hg_18.pac
        #       ...
        self.reference = None
        self.hit_visitor = None
        # Created by the first run_alignment() call and reused afterwards.
        self.__iterator = None

    def load_pair_record(self, record):
        """
        Append a tuple of the format (id, seq1, qual1, seq2, qual2) to
        this aligner's work batch.

        Returns the batch size after the append.
        """
        self.__batch.append(record)
        return len(self.__batch)

    def get_batch_size(self):
        """Return the number of records currently in the work batch."""
        return len(self.__batch)

    def clear_batch(self):
        """Discard the current work batch and start a new, empty one."""
        self.__batch = []

    def run_alignment(self):
        """
        Align the current batch and pass each resulting pair of hits, as
        a (hit1, hit2) tuple, to self.hit_visitor.process.

        Raises ValueError if the reference path or the hit visitor has not
        been set, or if the reference index files are incomplete.

        The sequence structures built for the batch are always released
        with bwa.free_seq, even if the alignment or the hit visitor raises.
        """
        if not self.reference:
            raise ValueError("You must set the reference path before calling run_alignment")
        if not self.hit_visitor:
            raise ValueError("You must set the hit_visitor before calling run_alignment (else you'll lose the alignment results)")
        self.__check_reference()

        # update __gopt to reflect any changes to our public attributes.
        self.__gopt.contents.n_threads = self.nthreads
        self.event_monitor.log_debug("BWA using %d threads", self.__gopt.contents.n_threads)

        # Capture the batch size once so that the count handed to
        # bwa.free_seq always matches the number of pairs actually built,
        # even if the batch is modified while the hits are being visited.
        n_pairs = len(self.__batch)

        if self.__iterator is None:
            self.__iterator = BWAIterator(self.reference, self.__gopt, self.__popt, self.max_isize, n_pairs, visitor=self.event_monitor, mmap_enabled=self.mmap_enabled)

        status = "aligning against %s" % os.path.basename(self.reference)
        self.event_monitor.new_status(status)
        self.event_monitor.log_debug(status)
        self.event_monitor.count("reads processed", 2 * n_pairs)

        with self.event_monitor.time_block("build_bwsa"):
            # we need to rebuild sequences each time, bwa functions modify them.
            bwsa = bwa.build_bws_array(self.__batch, qtype=self.qformat, trim_qual=self.trim_qual)

        try:
            self.__count_bases(bwsa, n_pairs)

            # analyze_pairs performs the alignment, then lets us iterate
            # over the results.
            with self.event_monitor.time_block("analyze_pairs (cal_+sw+refgap+process)"):
                # The python speed optimization tips suggest removing
                # non-local variable look-ups from loops.
                # http://wiki.python.org/moin/PythonSpeed/PerformanceTips
                h_proc = self.hit_visitor.process
                for hit1, hit2 in self.__iterator.analyze_pairs(bwsa, n_pairs):
                    h_proc((hit1, hit2))
        finally:
            # Release the sequences on the error path too; otherwise a
            # failure above would leak them.
            with self.event_monitor.time_block("destroy_sequences"):
                for j in 0, 1:
                    bwa.free_seq(n_pairs, bwsa[j])

    def release_resources(self):
        """Unload the reference held by the iterator, if one was created."""
        if self.__iterator:
            self.__iterator.unload_reference()

    def __count_bases(self, bwsa, bwsa_size):
        """
        Report to the event monitor the total and trimmed base counts of
        the first bwsa_size sequences of both reads in bwsa.
        """
        self.event_monitor.start("count_bases")
        for r in 0, 1:
            for i in xrange(bwsa_size):
                seq = bwsa[r][i]
                self.event_monitor.count("total bases", seq.full_len)
                self.event_monitor.count("trimmed bases", seq.full_len - seq.clip_len)
        self.event_monitor.stop("count_bases")

    def __check_reference(self):
        """
        Checks that the indexed reference at path self.reference includes
        files with all the expected extensions.

        As a side effect, a trailing '.' is stripped from self.reference.
        Raises ValueError if something is missing.
        """
        if self.reference.endswith('.'):
            self.reference = self.reference[:-1]  # remove the trailing '.', if any

        ref_extensions = set(
            os.path.splitext(path)[1].lstrip('.')
            for path in glob.iglob(self.reference + ".*"))
        # only the extensions pertaining to the index
        index_extensions = set(e for e in ref_extensions if e in BWA_INDEX_EXT)

        missing = BWA_INDEX_MANDATORY_EXT - index_extensions
        if missing:
            raise ValueError("Missing BWA index file types: %s" % ', '.join(sorted(missing)))

        # The remaining required files depend on whether mmap is in use.
        # Extensions are sorted so the error message is deterministic.
        if self.mmap_enabled:
            missing = BWA_INDEX_MMAP_EXT - index_extensions
            if missing:
                raise ValueError("Missing BWA mmap index files: %s" % ', '.join(sorted(missing)))
        else:
            missing = BWA_INDEX_NORM_EXT - index_extensions
            if missing:
                raise ValueError("Missing BWA index files: %s" % ', '.join(sorted(missing)))
class BwaAligner510(object):
    """
    Object oriented interface to perform bwa aln + bwa sampe using
    libbwa and the bwa module.

    Set ``reference`` and ``hit_visitor``, queue read pairs with
    load_pair_record(), then call run_alignment().  Call
    release_resources() to unload the reference when done.
    """

    def __init__(self):
        """Create an aligner with default settings and an empty batch."""
        self.event_monitor = QuietMonitor()
        self.logger = QuietMonitor()
        self.__gopt = bwa.gap_init_opt()
        self.__popt = bwa.pe_init_opt()
        self.__batch = []

        self.qformat = "fastq-illumina"
        self.max_isize = 1000
        self.nthreads = 1
        self.trim_qual = 0
        self.mmap_enabled = False

        # .reference is the path to the indexed BWA reference to use, given
        # as the "root name" of the reference files, i.e. the file names
        # without their extensions.  A trailing dot, if included, will be
        # removed.
        # e.g.  my_references/hg_18.bwt
        #       my_references/hg_18.rsax
        #       my_references/hg_18.sax   =>  "my_references/hg_18"
        #       my_references/hg_18.pac
        #       ...
        self.reference = None
        self.hit_visitor = None
        # Created by the first run_alignment() call and reused afterwards.
        self.__iterator = None

    def load_pair_record(self, record):
        """
        Append a tuple of the format (id, seq1, qual1, seq2, qual2) to
        this aligner's work batch.

        Returns the batch size after the append.
        """
        self.__batch.append(record)
        return len(self.__batch)

    def get_batch_size(self):
        """Return the number of records currently in the work batch."""
        return len(self.__batch)

    def clear_batch(self):
        """Discard the current work batch and start a new, empty one."""
        self.__batch = []

    def run_alignment(self):
        """
        Align the current batch and pass each resulting pair of hits, as
        a (hit1, hit2) tuple, to self.hit_visitor.process.

        Raises ValueError if the reference path or the hit visitor has not
        been set, or if the reference index files are incomplete.

        The sequence structures built for the batch are always released
        with bwa.free_seq, even if the alignment or the hit visitor raises.
        """
        if not self.reference:
            raise ValueError("You must set the reference path before calling run_alignment")
        if not self.hit_visitor:
            raise ValueError("You must set the hit_visitor before calling run_alignment (else you'll lose the alignment results)")
        self.__check_reference()

        # update __gopt to reflect any changes to our public attributes.
        self.__gopt.contents.n_threads = self.nthreads
        self.event_monitor.log_debug("BWA using %d threads", self.__gopt.contents.n_threads)

        # Capture the batch size once so that the count handed to
        # bwa.free_seq always matches the number of pairs actually built,
        # even if the batch is modified while the hits are being visited.
        n_pairs = len(self.__batch)

        if self.__iterator is None:
            self.__iterator = BWAIterator(self.reference, self.__gopt, self.__popt, self.max_isize, n_pairs, visitor=self.event_monitor, mmap_enabled=self.mmap_enabled)

        status = "aligning against %s" % os.path.basename(self.reference)
        self.event_monitor.new_status(status)
        self.event_monitor.log_debug(status)
        self.event_monitor.count("reads processed", 2 * n_pairs)

        with self.event_monitor.time_block("build_bwsa"):
            # we need to rebuild sequences each time, bwa functions modify them.
            bwsa = bwa.build_bws_array(self.__batch, qtype=self.qformat, trim_qual=self.trim_qual)

        try:
            self.__count_bases(bwsa, n_pairs)

            # analyze_pairs performs the alignment, then lets us iterate
            # over the results.
            with self.event_monitor.time_block("analyze_pairs (cal_+sw+refgap+process)"):
                # The python speed optimization tips suggest removing
                # non-local variable look-ups from loops.
                # http://wiki.python.org/moin/PythonSpeed/PerformanceTips
                h_proc = self.hit_visitor.process
                for hit1, hit2 in self.__iterator.analyze_pairs(bwsa, n_pairs):
                    h_proc((hit1, hit2))
        finally:
            # Release the sequences on the error path too; otherwise a
            # failure above would leak them.
            with self.event_monitor.time_block("destroy_sequences"):
                for j in 0, 1:
                    bwa.free_seq(n_pairs, bwsa[j])

    def release_resources(self):
        """Unload the reference held by the iterator, if one was created."""
        if self.__iterator:
            self.__iterator.unload_reference()

    def __count_bases(self, bwsa, bwsa_size):
        """
        Report to the event monitor the total and trimmed base counts of
        the first bwsa_size sequences of both reads in bwsa.
        """
        self.event_monitor.start("count_bases")
        for r in 0, 1:
            for i in xrange(bwsa_size):
                seq = bwsa[r][i]
                self.event_monitor.count("total bases", seq.full_len)
                self.event_monitor.count("trimmed bases", seq.full_len - seq.clip_len)
        self.event_monitor.stop("count_bases")

    def __check_reference(self):
        """
        Checks that the indexed reference at path self.reference includes
        files with all the expected extensions.

        As a side effect, a trailing '.' is stripped from self.reference.
        Raises ValueError if something is missing.
        """
        if self.reference.endswith('.'):
            self.reference = self.reference[:-1]  # remove the trailing '.', if any

        ref_extensions = set(
            os.path.splitext(path)[1].lstrip('.')
            for path in glob.iglob(self.reference + ".*"))
        # only the extensions pertaining to the index
        index_extensions = set(e for e in ref_extensions if e in BWA_INDEX_EXT)

        missing = BWA_INDEX_MANDATORY_EXT - index_extensions
        if missing:
            raise ValueError("Missing BWA index file types: %s" % ', '.join(sorted(missing)))

        # The remaining required files depend on whether mmap is in use.
        # Extensions are sorted so the error message is deterministic.
        if self.mmap_enabled:
            missing = BWA_INDEX_MMAP_EXT - index_extensions
            if missing:
                raise ValueError("Missing BWA mmap index files: %s" % ', '.join(sorted(missing)))
        else:
            missing = BWA_INDEX_NORM_EXT - index_extensions
            if missing:
                raise ValueError("Missing BWA index files: %s" % ', '.join(sorted(missing)))
class BWAIterator(object):
    """
    Runs the BWA alignment steps over a batch of sequence pairs and lets
    the caller iterate over the resulting BwaMapping objects.

    The reference at ``root`` is loaded the first time an analysis runs
    and stays loaded until unload_reference() is called.
    """

    def __init__(self, root, gopt, popt, max_isize, pairing_batch_size, visitor=None, mmap_enabled=False):
        """
        Record the alignment settings and start with no reference loaded.

        Every argument is stored on the instance under the same name.  When
        no visitor is given, a QuietMonitor is created and used instead.
        """
        self.visitor = QuietMonitor() if visitor is None else visitor

        self.root = root
        self.gopt = gopt
        self.popt = popt
        self.max_isize = max_isize
        self.pairing_batch_size = pairing_batch_size
        self.mmap_enabled = mmap_enabled

        # Reference structures, loaded on demand.
        self.__bwts = None    # the reference index
        self.__pacseq = None  # the reference itself
        self.__bnsp = None

        # Placeholder only; clean_isize_statistics() installs the real value.
        self.__last_ii = None
        self.clean_isize_statistics()

    def clean_isize_statistics(self):
        """Replace the remembered insert-size info with a fresh one whose avg is -1.0."""
        self.__last_ii = bwa.isize_info_t()
        self.__last_ii.avg = -1.0

    def reference_loaded(self):
        """Return True if a reference is currently loaded."""
        # A single variable (__bwts) tells us whether the reference
        # has been loaded.
        return self.__bwts is not None

    def load_reference(self):
        """
        Load the reference at self.root.

        Remember to release it with unload_reference() once finished!
        Raises RuntimeError if a reference is already loaded.  If loading
        fails, whatever was loaded so far is released and the original
        error propagates.
        """
        if self.reference_loaded():
            raise RuntimeError("A reference is already loaded!")

        completed = False
        try:
            with self.visitor.time_block("restore_index"):
                self.__bwts = bwa.restore_index(self.root, self.mmap_enabled)
            with self.visitor.time_block("restore_reference"):
                self.__bnsp, self.__pacseq = bwa.restore_reference(self.root, self.mmap_enabled)
            completed = True
        finally:
            # Any failure (of any exception type) leaves us partially
            # loaded: clean up and let the error continue upwards.
            if not completed:
                self.unload_reference()

    def unload_reference(self):
        """
        Free an allocated reference, if any.

        Sets __bwts, __pacseq, and __bnsp to None.
        """
        if self.__bwts:
            for idx in (0, 1):
                bwa.bwt_destroy(self.__bwts[idx], self.mmap_enabled)
        self.__bwts = None

        if self.__pacseq:
            del self.__pacseq
        self.__pacseq = None

        if self.__bnsp:
            bwa.bns_destroy(self.__bnsp, self.mmap_enabled)
        self.__bnsp = None

    def __paired_sw(self, bwsa, seq_pairs_read, isize_info):
        """Run bwa.paired_sw over all the pairs, in batches when a usable batch size is set."""
        self.visitor.start("paired_sw")
        batch_size = self.pairing_batch_size
        if 0 < batch_size < seq_pairs_read:
            for start in xrange(0, seq_pairs_read, batch_size):
                # The last batch may be shorter than batch_size.
                count = min(batch_size, seq_pairs_read - start)
                self.visitor.start("paired_sw_batch")
                bwa.paired_sw(self.__bnsp, self.__pacseq, count, bwsa, self.popt, isize_info, start)
                self.visitor.stop_batch("paired_sw_batch", start, count)
        else:
            bwa.paired_sw(self.__bnsp, self.__pacseq, seq_pairs_read, bwsa, self.popt, isize_info)
        self.visitor.stop("paired_sw")

    def __analyze(self, bwsa, seq_pairs_read):
        """
        Perform the actual analysis work on the first seq_pairs_read pairs.

        BWA writes its results directly into the bwsa structures, so this
        method returns nothing.  The reference is loaded first if needed.
        """
        if not self.reference_loaded():
            self.load_reference()

        isize_info = bwa.isize_info_t()

        self.visitor.start("cal_sa_reg_gap")
        for read_idx in (0, 1):
            bwa.cal_sa_reg_gap_mt(self.__bwts, seq_pairs_read, bwsa[read_idx], self.gopt)
        self.visitor.stop("cal_sa_reg_gap")

        self.visitor.start("cal_pac_pos_pe")
        bwa.cal_pac_pos_pe(self.__bwts, seq_pairs_read, bwsa, isize_info, self.popt, self.gopt, self.__last_ii)
        self.visitor.stop("cal_pac_pos_pe")

        # Remember this run's insert-size info; it is handed to
        # cal_pac_pos_pe on the next call.
        self.__last_ii = isize_info

        if isize_info.avg > self.max_isize:
            self.visitor.log_warning("skipping S-W, isize is too big (%.3f)" % isize_info.avg)
        else:
            self.__paired_sw(bwsa, seq_pairs_read, isize_info)

        self.visitor.start("refine_gapped")
        for read_idx in (0, 1):
            bwa.refine_gapped(self.__bnsp, seq_pairs_read, bwsa[read_idx], self.__pacseq)
        self.visitor.stop("refine_gapped")

    def analyze(self, bwsa, seq_pairs_read):
        """
        Align the seq_pairs_read sequence pairs in bwsa and iterate through
        the matches one read at a time.

        This is a generator, so the alignment itself only runs once
        iteration starts.  For each pair it yields two BwaMapping objects
        in turn: one built from (read 1, read 2) and then one built from
        (read 2, read 1).
        """
        self.__analyze(bwsa, seq_pairs_read)
        for pair_idx in xrange(seq_pairs_read):
            for this_read, other_read in ((0, 1), (1, 0)):
                yield BwaMapping(self.gopt[0], self.__bnsp[0], bwsa[this_read][pair_idx], bwsa[other_read][pair_idx])

    def analyze_pairs(self, bwsa, seq_pairs_read):
        """
        Align the seq_pairs_read sequence pairs in bwsa and iterate through
        the matches two reads at a time.

        This is a generator, so the alignment itself only runs once
        iteration starts.  For each pair in bwsa it yields a tuple of two
        BwaMapping objects, built from (read 1, read 2) and from
        (read 2, read 1) respectively.

        To analyze the match objects use bwa.analyze_hit
        """
        self.__analyze(bwsa, seq_pairs_read)
        for pair_idx in xrange(seq_pairs_read):
            # XXX LP: every pair is yielded, including those where a read
            # has type bwa.BWA_TYPE_NO_MATCH.  Filtering such pairs out here
            # would make our filtered map counts wrong, and skipping them
            # doesn't seem to make much of a difference in speed.
            first = BwaMapping(self.gopt[0], self.__bnsp[0], bwsa[0][pair_idx], bwsa[1][pair_idx])
            second = BwaMapping(self.gopt[0], self.__bnsp[0], bwsa[1][pair_idx], bwsa[0][pair_idx])
            yield first, second