Example #1
0
    def __init__(self, ctx):
        """Wire up configuration, logging, the aligner and its hit chain.

        ``ctx`` is forwarded to the parent constructor, to the event monitor
        and to the emitting link, and is queried for the job configuration
        and for the input split.
        """
        # NOTE(review): super(type(self), self) recurses without end as soon
        # as this class gets subclassed; naming the class explicitly in the
        # super() call would avoid that.
        super(type(self), self).__init__(ctx)
        self.__get_configuration(ctx)
        logging.basicConfig(level=self.log_level)

        counter_class = self.COUNTER_CLASS
        mapper_logger = logging.getLogger("mapper")
        self.event_monitor = HadoopEventMonitor(counter_class, mapper_logger, ctx)

        # Build the aligner and copy over the settings held on this instance.
        aligner = self.aligner = BwaAligner()
        aligner.event_monitor = self.event_monitor
        aligner.qformat = self.format
        aligner.max_isize = self.max_isize
        aligner.nthreads = self.nthreads
        aligner.trim_qual = self.trim_qual
        aligner.mmap_enabled = True

        # Hit processor chain: a filter link followed by one emitting link,
        # whose class depends on the map-only flag.
        hit_chain = FilterLink(self.event_monitor)
        hit_chain.remove_unmapped = self.remove_unmapped
        hit_chain.min_hit_quality = self.min_hit_quality
        emitter_class = EmitSamLink if self.__map_only else MarkDuplicatesEmitter
        hit_chain.set_next(emitter_class(ctx, self.event_monitor))
        aligner.hit_visitor = hit_chain

        # Path to the reference index, derived from the job configuration.
        self.ref_archive = utils.get_ref_archive(ctx.getJobConf())
        aligner.reference = self.get_reference_root(self.ref_archive)

        # Part of the workaround for accumulating records, see #331:
        # remember where this task's input split ends.
        input_split = InputSplit(ctx.getInputSplit())
        self.split_end = input_split.offset + input_split.length
Example #2
0
def run_bwa_py_sampe(refseq_fname, read_fname, mate_fname,
                     log_level=logging.INFO, pairing_batch_size=None,
                     seq_list_len=None, fastq_subfmt="fastq-illumina"):
  """Align paired FASTQ reads with BwaAligner and return the hits as dicts.

  Parameters:
    refseq_fname: value assigned to the aligner's ``reference`` attribute.
    read_fname, mate_fname: paths of the two FASTQ files, read in lockstep.
    log_level: level set on the "PY" logger.
    pairing_batch_size: accepted for interface compatibility; not used here.
    seq_list_len: number of pairs pulled from the input per batch; ``None``
      pulls everything in a single batch.
    fastq_subfmt: format name handed to ``Bio.SeqIO.parse``.

  Returns:
    A list with one dict (keys ``name``, ``aux``, ``seq``) per mapping
    collected from the aligner, read first and mate second for each pair.

  Note: only the first five pairs of every batch are printed and aligned;
  the rest of the batch is read and discarded.
  """
  logger = logging.getLogger("PY")
  logger.setLevel(log_level)
  logger.info("RUNNING PYTHON VERSION")

  class ResultCollector(object):
    """Hit visitor that stores both mappings of every processed pair."""
    def __init__(self):
      self.result = []
    def process(self, pair):
      self.result.append(pair[0])
      self.result.append(pair[1])
  result = ResultCollector()

  # The parsers are lazy, so both files must stay open for the whole loop;
  # the with-block guarantees they are closed afterwards, even on error.
  with open(read_fname) as read_file, open(mate_fname) as mate_file:
    read_flow = Bio.SeqIO.parse(read_file, fastq_subfmt)
    mate_flow = Bio.SeqIO.parse(mate_file, fastq_subfmt)
    pairs_flow = it.izip(read_flow, mate_flow)

    while True:
      start = time.time()
      pairs = list(it.islice(pairs_flow, 0, seq_list_len))
      if not pairs:
        break
      # turn the biopython SeqRecords into simple tuples
      tuples = [(read.name, read.seq.tostring(), None,
                 mate.seq.tostring(), None)
                for read, mate in pairs[0:5]]
      for t in tuples:
        # Parenthesised single argument: prints the same under the Python 2
        # print statement and the print function.
        print(t)
      logger.info('reading seqs %f sec', time.time() - start)

      start = time.time()
      aligner = BwaAligner()
      aligner.reference = refseq_fname
      aligner.hit_visitor = result
      for t in tuples:
        aligner.load_pair_record(t)
      aligner.run_alignment()
      aligner.clear_batch()
      logger.info('alignment %f sec', time.time() - start)

  # map bwa mappings to dictionaries
  def bwam_to_hash(bwa_m):
    return dict(
        name=bwa_m.name,
        aux=bwa_m.tags,
        seq=bwa_m.get_seq_5()
        )

  return [bwam_to_hash(m) for m in result.result]
Example #3
0
    def setUp(self):
        """Build the reference index, create the aligner and load the pairs.

        ``self.pairs`` ends up holding one list of tab-separated fields per
        line of the ``pairs.txt`` fixture, skipping lines that start with
        ``#``.
        """
        utils.build_ref_index()
        aligner = self.aligner = BwaAligner()
        aligner.reference = utils.reference
        aligner.hit_visitor = type(self).SimpleVisitor()

        self.pairs = []
        fixture_path = utils.get_fixture_path("pairs.txt")
        with open(fixture_path) as fixture:
            # '#'-lines are reserved for comments in the fixture file
            self.pairs.extend(
                row.rstrip("\r\n").split("\t")
                for row in fixture
                if not row.startswith("#")
            )
Example #4
0
    def setUp(self):
        """Create an aligner on the mini reference fixture plus one sample pair.

        The reference path is resolved three directories above this file,
        under ``seal/mini_ref_fixture/mini_ref.fasta``.
        """
        aligner = self.aligner = BwaAligner()

        this_dir = os.path.dirname(__file__)
        test_dir = os.path.abspath(os.path.join(this_dir, '..', '..', '..'))
        fixture_parts = ('seal', 'mini_ref_fixture', 'mini_ref.fasta')
        aligner.reference = os.path.join(test_dir, *fixture_parts)

        aligner.hit_visitor = MappingsCollector()
        aligner.qformat = "fastq-sanger"

        # Presumably (name, read bases, read qualities, mate bases,
        # mate qualities) -- TODO confirm against the aligner's record format.
        pair_name = "HWI-ST301L:236:C0EJ5ACXX:1:1101:18292:2904"
        read_bases = "GGGAGGTGTTAGGGACAAGCCTGGAGGCAGCATGCGTCACTCCCATGCAGAGTCCATTGGCCAATGCTGGCTCCGATGGCCACATCTCACTCCAGGGGCAG"
        read_quals = "?@@B?<=AADFCFH@FB?EFEGAAFGEEGEGHCGEGIGH?B?CGEFHGIIGAEEEEHEAEEEH937;;@3=;>@8;?8;9A:<A#################"
        mate_bases = "AATAGAATGTAATATAATATATGTAAAACACCAGGTGCCTAACCTGGCACAGAGCAGGAGGGCTAAGCATGACATCCAGCACGTGGTCAGTGGAATCCAGT"
        mate_quals = "@@@DFDDDBHDD<EHEHIFEEB<IHIEGHDFEH?B:CBEHICEGCGGIIGFGCFCE@FAFEGAAGHIIHF;A?DBDFB);@@35;?,;@35(:5:ACCC<>"
        self.pair = (pair_name, read_bases, read_quals, mate_bases, mate_quals)