Example #1
0
 def __init__(self, root, gopt, popt, max_isize, pairing_batch_size,
              visitor=None, mmap_enabled=False):
     """
     Record the alignment settings; no reference data is loaded here.

     Every argument is stored unchanged on the instance.  When visitor is
     None a new QuietMonitor is created and used in its place.
     """
     self.visitor = QuietMonitor() if visitor is None else visitor
     self.root, self.gopt, self.popt = root, gopt, popt
     self.max_isize = max_isize
     self.pairing_batch_size = pairing_batch_size
     self.mmap_enabled = mmap_enabled
     # Reference structures are loaded on demand, so they all start empty:
     # __bwts is the reference index, __pacseq and __bnsp the reference itself.
     self.__bwts = None
     self.__pacseq = None
     self.__bnsp = None
     # Defined up front so the attribute exists before
     # clean_isize_statistics() runs.
     self.__last_ii = None
     self.clean_isize_statistics()
Example #2
0
    def __init__(self):
        """
        Create an aligner with default settings and an empty work batch.

        reference and hit_visitor start out as None.
        """
        self.event_monitor = QuietMonitor()
        self.logger = QuietMonitor()
        self.__gopt = bwa.gap_init_opt()
        self.__popt = bwa.pe_init_opt()
        self.__batch = list()
        self.__iterator = None

        # Default settings.
        self.qformat, self.trim_qual = "fastq-illumina", 0
        self.max_isize, self.nthreads = 1000, 1
        self.mmap_enabled = False

        # reference: path to the indexed BWA reference, given as the "root
        # name" shared by the index files (the file name minus its extension).
        # A trailing dot, if present, gets stripped.  For instance the files
        #      my_references/hg_18.bwt
        #      my_references/hg_18.rsax
        #      my_references/hg_18.sax
        #      my_references/hg_18.pac
        #      ...
        # are all addressed as "my_references/hg_18".
        self.reference = None
        self.hit_visitor = None
Example #3
0
 def __init__(self, root, gopt, popt, max_isize, pairing_batch_size,
              visitor=None, mmap_enabled=False):
   """
   Record the alignment settings; no reference data is loaded here.

   Every argument is stored unchanged on the instance.  When visitor is
   None a new QuietMonitor is created and used in its place.
   """
   self.visitor = QuietMonitor() if visitor is None else visitor
   self.root, self.gopt, self.popt = root, gopt, popt
   self.max_isize = max_isize
   self.pairing_batch_size = pairing_batch_size
   self.mmap_enabled = mmap_enabled
   # Reference structures are loaded on demand, so they all start empty:
   # __bwts is the reference index, __pacseq and __bnsp the reference itself.
   self.__bwts = None
   self.__pacseq = None
   self.__bnsp = None
   # Defined up front so the attribute exists before
   # clean_isize_statistics() runs.
   self.__last_ii = None
   self.clean_isize_statistics()
Example #4
0
  def __init__(self):
    """
    Create an aligner with default settings and an empty work batch.

    reference and hit_visitor start out as None.
    """
    self.event_monitor = QuietMonitor()
    self.logger = QuietMonitor()
    self.__gopt = bwa.gap_init_opt()
    self.__popt = bwa.pe_init_opt()
    self.__batch = list()
    self.__iterator = None

    # Default settings.
    self.qformat, self.trim_qual = "fastq-illumina", 0
    self.max_isize, self.nthreads = 1000, 1
    self.mmap_enabled = False

    # reference: path to the indexed BWA reference, given as the "root
    # name" shared by the index files (the file name minus its extension).
    # A trailing dot, if present, gets stripped.  For instance the files
    #      my_references/hg_18.bwt
    #      my_references/hg_18.rsax
    #      my_references/hg_18.sax
    #      my_references/hg_18.pac
    #      ...
    # are all addressed as "my_references/hg_18".
    self.reference = None
    self.hit_visitor = None
Example #5
0
class BWAIterator(object):
  """
  Drives the bwa module's paired-end alignment calls over an array of read
  pairs and exposes the results as BwaMapping objects.

  The reference is loaded the first time an analysis runs and stays
  allocated until unload_reference() is called.
  """

  def __init__(self, root, gopt, popt, max_isize, pairing_batch_size,
               visitor=None, mmap_enabled=False):
    """
    Record the alignment settings; no reference data is loaded here.

    Every argument is stored unchanged on the instance.  When visitor is
    None a new QuietMonitor is created and used in its place.
    """
    self.visitor = QuietMonitor() if visitor is None else visitor
    self.root, self.gopt, self.popt = root, gopt, popt
    self.max_isize = max_isize
    self.pairing_batch_size = pairing_batch_size
    self.mmap_enabled = mmap_enabled
    # Reference structures are loaded on demand, so they all start empty:
    # __bwts is the reference index, __pacseq and __bnsp the reference itself.
    self.__bwts = None
    self.__pacseq = None
    self.__bnsp = None
    self.__last_ii = None
    self.clean_isize_statistics()

  def clean_isize_statistics(self):
    """Forget the remembered insert-size info: install a fresh
    bwa.isize_info_t whose avg is -1.0."""
    fresh = bwa.isize_info_t()
    fresh.avg = -1.0
    self.__last_ii = fresh

  def reference_loaded(self):
    """Return True when a reference is currently loaded."""
    # __bwts alone serves as the "loaded" flag.
    return not (self.__bwts is None)

  def load_reference(self):
    """
    Load the reference found at self.root.

    Raises RuntimeError if one is already loaded.  If loading fails part
    way, whatever was allocated is released and the error is re-raised.
    Call unload_reference() once you are done with it!
    """
    if self.reference_loaded():
      raise RuntimeError("A reference is already loaded!")
    monitor = self.visitor
    try:
      with monitor.time_block("restore_index"):
        self.__bwts = bwa.restore_index(self.root, self.mmap_enabled)

      with monitor.time_block("restore_reference"):
        restored = bwa.restore_reference(self.root, self.mmap_enabled)
        self.__bnsp, self.__pacseq = restored
    except:
      # deliberately catches everything: clean up, then let the error through
      self.unload_reference()
      raise

  def unload_reference(self):
    """
    Release whatever reference structures are allocated.

    Afterwards __bwts, __pacseq and __bnsp are all None.  Safe to call
    when nothing is loaded.
    """
    if self.__bwts:
      for which in (0, 1):
        bwa.bwt_destroy(self.__bwts[which], self.mmap_enabled)
      self.__bwts = None

    if self.__pacseq:
      # dropping our reference is all that is done for the sequence
      self.__pacseq = None

    if self.__bnsp:
      bwa.bns_destroy(self.__bnsp, self.mmap_enabled)
      self.__bnsp = None

  def __run_paired_sw(self, bwsa, seq_pairs_read, ii):
    # Paired S-W stage of __analyze.  When pairing_batch_size is positive
    # and smaller than the number of pairs, the work is handed to bwa in
    # slices of that size; otherwise it goes through in a single call.
    monitor = self.visitor
    chunk = self.pairing_batch_size
    monitor.start("paired_sw")
    if 0 < chunk < seq_pairs_read:
      for offset in xrange(0, seq_pairs_read, chunk):
        # the last slice may be shorter than the others
        nseq = min(chunk, seq_pairs_read - offset)
        monitor.start("paired_sw_batch")
        bwa.paired_sw(self.__bnsp, self.__pacseq, nseq, bwsa, self.popt,
                      ii, offset)
        monitor.stop_batch("paired_sw_batch", offset, nseq)
    else:
      bwa.paired_sw(self.__bnsp, self.__pacseq, seq_pairs_read, bwsa,
                    self.popt, ii)
    monitor.stop("paired_sw")

  def __analyze(self, bwsa, seq_pairs_read):
    # The real alignment work happens here; the reference is loaded first
    # if needed.  Nothing is returned: BWA writes its results straight
    # into the bwsa structures.
    if not self.reference_loaded():
      self.load_reference()

    monitor = self.visitor
    ii = bwa.isize_info_t()

    monitor.start("cal_sa_reg_gap")
    for read_end in (0, 1):
      bwa.cal_sa_reg_gap_mt(self.__bwts, seq_pairs_read, bwsa[read_end],
                            self.gopt)
    monitor.stop("cal_sa_reg_gap")

    monitor.start("cal_pac_pos_pe")
    previous_ii = self.__last_ii
    bwa.cal_pac_pos_pe(self.__bwts, seq_pairs_read, bwsa, ii, self.popt,
                       self.gopt, previous_ii)
    monitor.stop("cal_pac_pos_pe")
    # this run's insert-size info is handed to the next run
    self.__last_ii = ii

    if ii.avg > self.max_isize:
      self.visitor.log_warning("skipping S-W, isize is too big (%.3f)" % ii.avg)
    else:
      self.__run_paired_sw(bwsa, seq_pairs_read, ii)

    monitor.start("refine_gapped")
    for read_end in (0, 1):
      bwa.refine_gapped(self.__bnsp, seq_pairs_read, bwsa[read_end],
                        self.__pacseq)
    monitor.stop("refine_gapped")

  def analyze(self, bwsa, seq_pairs_read):
    """
    Align the seq_pairs_read sequence pairs in bwsa, then iterate over
    the results one read at a time.

    For every pair two BwaMapping objects are yielded in turn: the first
    built from (read 1, read 2), the second from (read 2, read 1).
    This is a generator, so the alignment itself only runs once the first
    item is requested.
    """
    self.__analyze(bwsa, seq_pairs_read)
    for pair_idx in xrange(seq_pairs_read):
      for this_end, mate_end in ((0, 1), (1, 0)):
        yield BwaMapping(self.gopt[0], self.__bnsp[0],
                         bwsa[this_end][pair_idx], bwsa[mate_end][pair_idx])

  def analyze_pairs(self, bwsa, seq_pairs_read):
    """
    Align the seq_pairs_read sequence pairs in bwsa, then iterate over
    the results two reads at a time.

    For every pair a single tuple of two BwaMapping objects is yielded:
    one built from (read 1, read 2) and one from (read 2, read 1).
    This is a generator, so the alignment itself only runs once the first
    item is requested.

    To analyze the match objects use bwa.analyze_hit
    """
    self.__analyze(bwsa, seq_pairs_read)
    for pair_idx in xrange(seq_pairs_read):
      # XXX LP: a filter on bwa.BWA_TYPE_NO_MATCH used to guard this yield.
      # It is disabled, so every pair is yielded; the original note says
      # filtering here would make the filtered map counts wrong.
      forward = BwaMapping(self.gopt[0], self.__bnsp[0],
                           bwsa[0][pair_idx], bwsa[1][pair_idx])
      swapped = BwaMapping(self.gopt[0], self.__bnsp[0],
                           bwsa[1][pair_idx], bwsa[0][pair_idx])
      yield forward, swapped
Example #6
0
class BwaAligner510(object):
    """
    Object oriented interface to perform bwa aln + bwa sampe
    using libbwa and the bwa module.

    Usage as shown by this class: set ``reference`` and ``hit_visitor``,
    queue read pairs with load_pair_record(), call run_alignment(), and
    call release_resources() when finished.
    """

    def __init__(self):
        """Create an aligner with default settings and an empty work batch."""
        self.event_monitor = QuietMonitor()
        self.logger = QuietMonitor()
        self.__gopt = bwa.gap_init_opt()
        self.__popt = bwa.pe_init_opt()
        self.__batch = []

        self.qformat = "fastq-illumina"
        self.max_isize = 1000
        self.nthreads = 1
        self.trim_qual = 0
        self.mmap_enabled = False
        # .reference is the path to the indexed BWA reference to use.  Include the
        # "root name" of the reference files--i.e. the names without the file extensions.
        # The trailing dot, if included, will be removed.
        # e.g. my_references/hg_18.bwt
        #      my_references/hg_18.rsax
        #      my_references/hg_18.sax   => "my_references/hg_18"
        #      my_references/hg_18.pac
        #      ...
        self.reference = None
        # object whose process((hit1, hit2)) method receives each result pair
        self.hit_visitor = None
        # BWAIterator, created on the first run_alignment() call
        self.__iterator = None

    def load_pair_record(self, record):
        """
        Append a tuple of the format (id, seq1, qual1, seq2, qual2) to this
        aligner's work batch.

        Returns the batch size after the append.
        """
        self.__batch.append(record)
        return len(self.__batch)

    def get_batch_size(self):
        """Return the number of records currently in the work batch."""
        return len(self.__batch)

    def clear_batch(self):
        """Drop all queued records by starting a new, empty batch."""
        self.__batch = []

    def run_alignment(self):
        """
        Align the current batch and pass every resulting pair of hits to
        hit_visitor.process as a (hit1, hit2) tuple.

        The batch itself is left untouched; call clear_batch() to empty it.

        Raises ValueError if reference or hit_visitor is not set, or if the
        reference index files are incomplete.  Any error raised while
        aligning or while the visitor processes hits propagates to the
        caller, after the native sequence arrays have been freed.

        NOTE(review): the iterator is created only on the first call, so
        later changes to reference, max_isize, mmap_enabled or the batch
        size are not passed on to it -- confirm this is intended.
        """
        if not self.reference:
            raise ValueError(
                "You must set the reference path before calling run_alignment")
        if not self.hit_visitor:
            raise ValueError(
                "You must set the hit_visitor before calling run_alignment (else you'll lose the alignment results)"
            )

        self.__check_reference()

        # update __gopt to reflect any changes to our public attributes.
        self.__gopt.contents.n_threads = self.nthreads
        self.event_monitor.log_debug("BWA using %d threads",
                                     self.__gopt.contents.n_threads)

        # Snapshot the batch size once: the sequence arrays are built for
        # exactly this many pairs and must be freed with the same count,
        # even if the visitor queues new records while we iterate.
        n_pairs = len(self.__batch)

        if self.__iterator is None:
            self.__iterator = BWAIterator(self.reference,
                                          self.__gopt,
                                          self.__popt,
                                          self.max_isize,
                                          n_pairs,
                                          visitor=self.event_monitor,
                                          mmap_enabled=self.mmap_enabled)

        status = "aligning against %s" % os.path.basename(self.reference)
        self.event_monitor.new_status(status)
        self.event_monitor.log_debug(status)

        self.event_monitor.count("reads processed", 2 * n_pairs)

        with self.event_monitor.time_block("build_bwsa"):
            # we need to rebuild sequences each time, bwa functions modify them.
            bwsa = bwa.build_bws_array(self.__batch,
                                       qtype=self.qformat,
                                       trim_qual=self.trim_qual)

        try:
            self.__count_bases(bwsa, n_pairs)

            # bwa_iterator.analyze_pairs performs the alignment, then lets
            # you iterate over the results
            with self.event_monitor.time_block(
                    "analyze_pairs (cal_+sw+refgap+process)"):
                # The python speed optimization tips suggest removing
                # non-local variable look-ups from loops.
                # http://wiki.python.org/moin/PythonSpeed/PerformanceTips
                h_proc = self.hit_visitor.process
                for hit1, hit2 in self.__iterator.analyze_pairs(
                        bwsa, n_pairs):
                    h_proc((hit1, hit2))
        finally:
            # Free the sequences even when alignment or the visitor raised;
            # otherwise they would never be released.
            with self.event_monitor.time_block("destroy_sequences"):
                for j in 0, 1:
                    bwa.free_seq(n_pairs, bwsa[j])

    def release_resources(self):
        """Unload the reference held by the iterator, if one was created."""
        if self.__iterator:
            self.__iterator.unload_reference()

    def __count_bases(self, bwsa, bwsa_size):
        """
        Report base counts for both reads of the first bwsa_size pairs:
        full_len goes to "total bases", full_len - clip_len to
        "trimmed bases".
        """
        self.event_monitor.start("count_bases")
        for r in 0, 1:
            for i in xrange(bwsa_size):
                seq = bwsa[r][i]
                self.event_monitor.count("total bases", seq.full_len)
                self.event_monitor.count("trimmed bases",
                                         seq.full_len - seq.clip_len)
        self.event_monitor.stop("count_bases")

    def __check_reference(self):
        """
        Checks that the indexed reference at path self.reference includes
        files with all the expected extensions.

        As a side effect a trailing '.' is stripped from self.reference.
        Raises ValueError if something is missing.
        """
        # endswith() also copes with an empty string, which [-1] would not
        if self.reference.endswith('.'):
            self.reference = self.reference[:-1]
        ref_extensions = set(
            os.path.splitext(path)[1].lstrip('.')
            for path in glob.iglob(self.reference + ".*"))
        # only extensions pertaining to index
        index_extensions = set(
            e for e in ref_extensions if e in BWA_INDEX_EXT)

        missing = BWA_INDEX_MANDATORY_EXT - index_extensions
        if missing:
            # sorted so the message is the same on every run
            raise ValueError("Missing BWA index file types: %s" %
                             ', '.join(sorted(missing)))
        if self.mmap_enabled:
            missing = BWA_INDEX_MMAP_EXT - index_extensions
            if missing:
                raise ValueError("Missing BWA mmap index files: %s" %
                                 ', '.join(sorted(missing)))
        else:
            missing = BWA_INDEX_NORM_EXT - index_extensions
            if missing:
                raise ValueError("Missing BWA index files: %s" %
                                 ', '.join(sorted(missing)))
Example #7
0
class BwaAligner510(object):
  """
  Object oriented interface to perform bwa aln + bwa sampe
  using libbwa and the bwa module.

  Usage as shown by this class: set ``reference`` and ``hit_visitor``,
  queue read pairs with load_pair_record(), call run_alignment(), and
  call release_resources() when finished.
  """
  def __init__(self):
    """Create an aligner with default settings and an empty work batch."""
    self.event_monitor = QuietMonitor()
    self.logger = QuietMonitor()
    self.__gopt = bwa.gap_init_opt()
    self.__popt = bwa.pe_init_opt()
    self.__batch = []

    self.qformat = "fastq-illumina"
    self.max_isize = 1000
    self.nthreads = 1
    self.trim_qual = 0
    self.mmap_enabled = False
    # .reference is the path to the indexed BWA reference to use.  Include the
    # "root name" of the reference files--i.e. the names without the file extensions.
    # The trailing dot, if included, will be removed.
    # e.g. my_references/hg_18.bwt
    #      my_references/hg_18.rsax
    #      my_references/hg_18.sax   => "my_references/hg_18"
    #      my_references/hg_18.pac
    #      ...
    self.reference = None
    # object whose process((hit1, hit2)) method receives each result pair
    self.hit_visitor = None
    # BWAIterator, created on the first run_alignment() call
    self.__iterator = None

  def load_pair_record(self, record):
    """
    Append a tuple of the format (id, seq1, qual1, seq2, qual2) to this
    aligner's work batch.

    Returns the batch size after the append.
    """
    self.__batch.append(record)
    return len(self.__batch)

  def get_batch_size(self):
    """Return the number of records currently in the work batch."""
    return len(self.__batch)

  def clear_batch(self):
    """Drop all queued records by starting a new, empty batch."""
    self.__batch = []

  def run_alignment(self):
    """
    Align the current batch and pass every resulting pair of hits to
    hit_visitor.process as a (hit1, hit2) tuple.

    The batch itself is left untouched; call clear_batch() to empty it.

    Raises ValueError if reference or hit_visitor is not set, or if the
    reference index files are incomplete.  Any error raised while
    aligning or while the visitor processes hits propagates to the
    caller, after the native sequence arrays have been freed.

    NOTE(review): the iterator is created only on the first call, so
    later changes to reference, max_isize, mmap_enabled or the batch
    size are not passed on to it -- confirm this is intended.
    """
    if not self.reference:
      raise ValueError("You must set the reference path before calling run_alignment")
    if not self.hit_visitor:
      raise ValueError("You must set the hit_visitor before calling run_alignment (else you'll lose the alignment results)")

    self.__check_reference()

    # update __gopt to reflect any changes to our public attributes.
    self.__gopt.contents.n_threads = self.nthreads
    self.event_monitor.log_debug("BWA using %d threads", self.__gopt.contents.n_threads)

    # Snapshot the batch size once: the sequence arrays are built for
    # exactly this many pairs and must be freed with the same count,
    # even if the visitor queues new records while we iterate.
    n_pairs = len(self.__batch)

    if self.__iterator is None:
      self.__iterator = BWAIterator(self.reference, self.__gopt, self.__popt,
                                    self.max_isize, n_pairs,
                                    visitor=self.event_monitor,
                                    mmap_enabled=self.mmap_enabled)

    status = "aligning against %s" % os.path.basename(self.reference)
    self.event_monitor.new_status(status)
    self.event_monitor.log_debug(status)

    self.event_monitor.count("reads processed", 2*n_pairs)

    with self.event_monitor.time_block("build_bwsa"):
      # we need to rebuild sequences each time, bwa functions modify them.
      bwsa = bwa.build_bws_array(self.__batch, qtype=self.qformat, trim_qual=self.trim_qual)

    try:
      self.__count_bases(bwsa, n_pairs)

      # bwa_iterator.analyze_pairs performs the alignment, then lets you iterate
      # over the results
      with self.event_monitor.time_block("analyze_pairs (cal_+sw+refgap+process)"):
        # The python speed optimization tips suggest removing non-local variable look-ups
        # from loops.
        # http://wiki.python.org/moin/PythonSpeed/PerformanceTips
        h_proc = self.hit_visitor.process
        for hit1, hit2 in self.__iterator.analyze_pairs(bwsa, n_pairs):
          h_proc( (hit1, hit2) )
    finally:
      # Free the sequences even when alignment or the visitor raised;
      # otherwise they would never be released.
      with self.event_monitor.time_block("destroy_sequences"):
        for j in 0, 1:
          bwa.free_seq(n_pairs, bwsa[j])

  def release_resources(self):
    """Unload the reference held by the iterator, if one was created."""
    if self.__iterator:
      self.__iterator.unload_reference()

  def __count_bases(self, bwsa, bwsa_size):
    """
    Report base counts for both reads of the first bwsa_size pairs:
    full_len goes to "total bases", full_len - clip_len to "trimmed bases".
    """
    self.event_monitor.start("count_bases")
    for r in 0,1:
      for i in xrange(bwsa_size):
        seq = bwsa[r][i]
        self.event_monitor.count("total bases", seq.full_len)
        self.event_monitor.count("trimmed bases", seq.full_len - seq.clip_len)
    self.event_monitor.stop("count_bases")


  def __check_reference(self):
    """
    Checks that the indexed reference at path self.reference includes
    files with all the expected extensions.

    As a side effect a trailing '.' is stripped from self.reference.
    Raises ValueError if something is missing.
    """
    # endswith() also copes with an empty string, which [-1] would not
    if self.reference.endswith('.'):
      self.reference = self.reference[:-1]
    ref_extensions = set(os.path.splitext(path)[1].lstrip('.')
                         for path in glob.iglob(self.reference + ".*"))
    # only extensions pertaining to index
    index_extensions = set(e for e in ref_extensions if e in BWA_INDEX_EXT)

    missing = BWA_INDEX_MANDATORY_EXT - index_extensions
    if missing:
      # sorted so the message is the same on every run
      raise ValueError("Missing BWA index file types: %s" % ', '.join(sorted(missing)))
    if self.mmap_enabled:
      missing = BWA_INDEX_MMAP_EXT - index_extensions
      if missing:
        raise ValueError("Missing BWA mmap index files: %s" % ', '.join(sorted(missing)))
    else:
      missing = BWA_INDEX_NORM_EXT - index_extensions
      if missing:
        raise ValueError("Missing BWA index files: %s" % ', '.join(sorted(missing)))
Example #8
0
class BWAIterator(object):
    """
    Drives the bwa module's paired-end alignment calls over an array of
    read pairs and exposes the results as BwaMapping objects.

    The reference is loaded the first time an analysis runs and stays
    allocated until unload_reference() is called.
    """

    def __init__(self, root, gopt, popt, max_isize, pairing_batch_size,
                 visitor=None, mmap_enabled=False):
        """
        Record the alignment settings; no reference data is loaded here.

        Every argument is stored unchanged on the instance.  When visitor
        is None a new QuietMonitor is created and used in its place.
        """
        self.visitor = QuietMonitor() if visitor is None else visitor
        self.root, self.gopt, self.popt = root, gopt, popt
        self.max_isize = max_isize
        self.pairing_batch_size = pairing_batch_size
        self.mmap_enabled = mmap_enabled
        # Reference structures are loaded on demand, so they all start
        # empty: __bwts is the reference index, __pacseq and __bnsp the
        # reference itself.
        self.__bwts = None
        self.__pacseq = None
        self.__bnsp = None
        self.__last_ii = None
        self.clean_isize_statistics()

    def clean_isize_statistics(self):
        """Forget the remembered insert-size info: install a fresh
        bwa.isize_info_t whose avg is -1.0."""
        fresh = bwa.isize_info_t()
        fresh.avg = -1.0
        self.__last_ii = fresh

    def reference_loaded(self):
        """Return True when a reference is currently loaded."""
        # __bwts alone serves as the "loaded" flag.
        return not (self.__bwts is None)

    def load_reference(self):
        """
        Load the reference found at self.root.

        Raises RuntimeError if one is already loaded.  If loading fails
        part way, whatever was allocated is released and the error is
        re-raised.  Call unload_reference() once you are done with it!
        """
        if self.reference_loaded():
            raise RuntimeError("A reference is already loaded!")
        monitor = self.visitor
        try:
            with monitor.time_block("restore_index"):
                self.__bwts = bwa.restore_index(self.root, self.mmap_enabled)

            with monitor.time_block("restore_reference"):
                restored = bwa.restore_reference(self.root, self.mmap_enabled)
                self.__bnsp, self.__pacseq = restored
        except:
            # deliberately catches everything: clean up, then let the
            # error through
            self.unload_reference()
            raise

    def unload_reference(self):
        """
        Release whatever reference structures are allocated.

        Afterwards __bwts, __pacseq and __bnsp are all None.  Safe to call
        when nothing is loaded.
        """
        if self.__bwts:
            for which in (0, 1):
                bwa.bwt_destroy(self.__bwts[which], self.mmap_enabled)
            self.__bwts = None

        if self.__pacseq:
            # dropping our reference is all that is done for the sequence
            self.__pacseq = None

        if self.__bnsp:
            bwa.bns_destroy(self.__bnsp, self.mmap_enabled)
            self.__bnsp = None

    def __run_paired_sw(self, bwsa, seq_pairs_read, ii):
        # Paired S-W stage of __analyze.  When pairing_batch_size is
        # positive and smaller than the number of pairs, the work is handed
        # to bwa in slices of that size; otherwise it goes through in a
        # single call.
        monitor = self.visitor
        chunk = self.pairing_batch_size
        monitor.start("paired_sw")
        if 0 < chunk < seq_pairs_read:
            for offset in xrange(0, seq_pairs_read, chunk):
                # the last slice may be shorter than the others
                nseq = min(chunk, seq_pairs_read - offset)
                monitor.start("paired_sw_batch")
                bwa.paired_sw(self.__bnsp, self.__pacseq, nseq, bwsa,
                              self.popt, ii, offset)
                monitor.stop_batch("paired_sw_batch", offset, nseq)
        else:
            bwa.paired_sw(self.__bnsp, self.__pacseq, seq_pairs_read, bwsa,
                          self.popt, ii)
        monitor.stop("paired_sw")

    def __analyze(self, bwsa, seq_pairs_read):
        # The real alignment work happens here; the reference is loaded
        # first if needed.  Nothing is returned: BWA writes its results
        # straight into the bwsa structures.
        if not self.reference_loaded():
            self.load_reference()

        monitor = self.visitor
        ii = bwa.isize_info_t()

        monitor.start("cal_sa_reg_gap")
        for read_end in (0, 1):
            bwa.cal_sa_reg_gap_mt(self.__bwts, seq_pairs_read,
                                  bwsa[read_end], self.gopt)
        monitor.stop("cal_sa_reg_gap")

        monitor.start("cal_pac_pos_pe")
        previous_ii = self.__last_ii
        bwa.cal_pac_pos_pe(self.__bwts, seq_pairs_read, bwsa, ii, self.popt,
                           self.gopt, previous_ii)
        monitor.stop("cal_pac_pos_pe")
        # this run's insert-size info is handed to the next run
        self.__last_ii = ii

        if ii.avg > self.max_isize:
            self.visitor.log_warning("skipping S-W, isize is too big (%.3f)" %
                                     ii.avg)
        else:
            self.__run_paired_sw(bwsa, seq_pairs_read, ii)

        monitor.start("refine_gapped")
        for read_end in (0, 1):
            bwa.refine_gapped(self.__bnsp, seq_pairs_read, bwsa[read_end],
                              self.__pacseq)
        monitor.stop("refine_gapped")

    def analyze(self, bwsa, seq_pairs_read):
        """
        Align the seq_pairs_read sequence pairs in bwsa, then iterate over
        the results one read at a time.

        For every pair two BwaMapping objects are yielded in turn: the
        first built from (read 1, read 2), the second from (read 2, read 1).
        This is a generator, so the alignment itself only runs once the
        first item is requested.
        """
        self.__analyze(bwsa, seq_pairs_read)
        for pair_idx in xrange(seq_pairs_read):
            for this_end, mate_end in ((0, 1), (1, 0)):
                yield BwaMapping(self.gopt[0], self.__bnsp[0],
                                 bwsa[this_end][pair_idx],
                                 bwsa[mate_end][pair_idx])

    def analyze_pairs(self, bwsa, seq_pairs_read):
        """
        Align the seq_pairs_read sequence pairs in bwsa, then iterate over
        the results two reads at a time.

        For every pair a single tuple of two BwaMapping objects is yielded:
        one built from (read 1, read 2) and one from (read 2, read 1).
        This is a generator, so the alignment itself only runs once the
        first item is requested.

        To analyze the match objects use bwa.analyze_hit
        """
        self.__analyze(bwsa, seq_pairs_read)
        for pair_idx in xrange(seq_pairs_read):
            # XXX LP: a filter on bwa.BWA_TYPE_NO_MATCH used to guard this
            # yield.  It is disabled, so every pair is yielded; the original
            # note says filtering here would make the filtered map counts
            # wrong.
            forward = BwaMapping(self.gopt[0], self.__bnsp[0],
                                 bwsa[0][pair_idx], bwsa[1][pair_idx])
            swapped = BwaMapping(self.gopt[0], self.__bnsp[0],
                                 bwsa[1][pair_idx], bwsa[0][pair_idx])
            yield forward, swapped