Example #1
    def __init__(self, ctx):
        super(type(self), self).__init__(ctx)
        self.__get_configuration(ctx)
        logging.basicConfig(level=self.log_level)
        self.event_monitor = HadoopEventMonitor(self.COUNTER_CLASS,
                                                logging.getLogger("mapper"),
                                                ctx)

        self.aligner = BwaAligner()
        self.aligner.event_monitor = self.event_monitor
        self.aligner.qformat = self.format
        self.aligner.max_isize = self.max_isize
        self.aligner.nthreads = self.nthreads
        self.aligner.trim_qual = self.trim_qual
        self.aligner.mmap_enabled = True

        ######## assemble hit processor chain
        chain = FilterLink(self.event_monitor)
        chain.remove_unmapped = self.remove_unmapped
        chain.min_hit_quality = self.min_hit_quality
        if self.__map_only:
            chain.set_next(EmitSamLink(ctx, self.event_monitor))
        else:
            chain.set_next(MarkDuplicatesEmitter(ctx, self.event_monitor))
        self.aligner.hit_visitor = chain

        ######## set the path to the reference index
        self.ref_archive = utils.get_ref_archive(ctx.getJobConf())
        self.aligner.reference = self.get_reference_root(self.ref_archive)

        # part of the code is a workaround for accumulating records, see #331
        isplit = InputSplit(ctx.getInputSplit())
        self.split_end = isplit.offset + isplit.length
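
The aligner hands each aligned pair to whatever object is assigned to hit_visitor; above, that object is a chain of links (FilterLink followed by EmitSamLink or MarkDuplicatesEmitter) wired together with set_next. The sketch below illustrates the shape such a link plausibly has, assuming, as the test visitors later on this page suggest, that a link only needs a process(pair) method and forwards the pair to its successor. The class name and logging behaviour are illustrative, not part of Seal's API.

import logging

class LoggingLink(object):
    """Illustrative pass-through link for the hit-processor chain (not Seal code)."""

    def __init__(self):
        self.log = logging.getLogger("mapper")
        self.next_link = None

    def set_next(self, link):
        # mirrors the chain.set_next(...) calls used when the chain is assembled above
        self.next_link = link
        return link

    def process(self, pair):
        # 'pair' is assumed to be a sequence of hit objects, as in the test visitors
        self.log.debug("forwarding a pair of %d hits", len(pair))
        if self.next_link is not None:
            self.next_link.process(pair)
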
Example #2
def run_bwa_py_sampe(refseq_fname, read_fname, mate_fname,
                     log_level=logging.INFO, pairing_batch_size=None,
                     seq_list_len=None, fastq_subfmt="fastq-illumina"):
  logger = logging.getLogger("PY")
  logger.setLevel(log_level)
  logger.info("RUNNING PYTHON VERSION")
  # debugging helper for dumping alignment records (not called in the flow below)
  def debug_dump(seq, state):
    logger.debug("%s: name=%s" % (state, seq.get_name()))
    logger.debug("%s: qual=%s" % (state, seq.get_qual_string()))
    logger.debug("%s: strand=%d" % (state, seq.strand))
    logger.debug("%s: pos=%d" % (state, seq.pos))
    logger.debug("%s: mapQ=%d" % (state, seq.mapQ))

  read_flow = Bio.SeqIO.parse(open(read_fname), fastq_subfmt)
  mate_flow = Bio.SeqIO.parse(open(mate_fname), fastq_subfmt)
  pairs_flow = it.izip(read_flow, mate_flow)

  class ResultCollector(object):
    def __init__(self):
      self.result = []
    def process(self, pair):
      self.result.append(pair[0])
      self.result.append(pair[1])
  result = ResultCollector()

  while True:
    start = time.time()
    pairs = list(it.islice(pairs_flow, 0, seq_list_len))
    if len(pairs) == 0:
      break
    # turn the biopython SeqRecords into simple 5-field tuples
    # (note: only the first five pairs of each batch are converted and printed)
    tuples = map(lambda t: (t[0].name, t[0].seq.tostring(), None,
                            t[1].seq.tostring(), None),
                 pairs[0:5])
    for t in tuples:
      print t
    logger.info('reading seqs %f sec' % (time.time() - start))

    start = time.time()
    aligner = BwaAligner()
    aligner.reference = refseq_fname
    aligner.hit_visitor = result
    for t in tuples:  # 'tuples' already holds at most five pairs
      aligner.load_pair_record(t)
    aligner.run_alignment()
    aligner.clear_batch()
    logger.info('alignment %f sec' % (time.time() - start))

  # map bwa mappings to dictionaries
  def bwam_to_hash(bwa_m):
    h = dict(
        name=bwa_m.name,
        aux=bwa_m.tags,
        seq=bwa_m.get_seq_5()
        )
    return h

  return map(bwam_to_hash, result.result)
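
For context, here is a plausible way to invoke the function above; the file names are placeholders, and the keyword arguments are simply those already declared in its signature. The returned value is the list of dictionaries built by bwam_to_hash.

# Hypothetical invocation; reference/read/mate file names are placeholders.
hits = run_bwa_py_sampe("hg18.fasta", "reads_1.fq", "reads_2.fq",
                        log_level=logging.DEBUG,
                        seq_list_len=10000,
                        fastq_subfmt="fastq-illumina")
for h in hits[:3]:
    print h["name"], h["seq"]
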
Example #3
    def setUp(self):
        utils.build_ref_index()
        self.aligner = BwaAligner()
        self.aligner.reference = utils.reference
        self.aligner.hit_visitor = type(self).SimpleVisitor()

        self.pairs = []
        with open(utils.get_fixture_path("pairs.txt")) as f:
            for line in f:
                if not line.startswith("#"):  # leave #-lines for comments
                    self.pairs.append(line.rstrip("\r\n").split("\t"))
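
Each non-comment line of the pairs.txt fixture is expected to split into the same five tab-separated fields documented for the mapper's input records (ID, read_seq, read_qual, mate_seq, mate_qual). The line below is a made-up illustration of that format, not the actual fixture content.

# Hypothetical fixture line and the record that setUp() would build from it.
line = "pair_0001\tACGTACGTACGT\tIIIIIIIIIIII\tTTGCAAGGTTCA\tIIIIIIIIIIII\n"
record = line.rstrip("\r\n").split("\t")
assert len(record) == 5  # ID, read_seq, read_qual, mate_seq, mate_qual
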
Example #4
File: mapper.py Project: pinno/seal
    def __init__(self, ctx):
        super(type(self), self).__init__(ctx)
        self.__get_configuration(ctx)
        logging.basicConfig(level=self.log_level)
        self.event_monitor = HadoopEventMonitor(self.COUNTER_CLASS, logging.getLogger("mapper"), ctx)

        self.aligner = BwaAligner()
        self.aligner.event_monitor = self.event_monitor
        self.aligner.qformat = self.format
        self.aligner.max_isize = self.max_isize
        self.aligner.nthreads = self.nthreads
        self.aligner.trim_qual = self.trim_qual
        self.aligner.mmap_enabled = True

        ######## assemble hit processor chain
        chain = FilterLink(self.event_monitor)
        chain.remove_unmapped = self.remove_unmapped
        chain.min_hit_quality = self.min_hit_quality
        if self.__map_only:
            chain.set_next( EmitSamLink(ctx, self.event_monitor) )
        else:
            chain.set_next( MarkDuplicatesEmitter(ctx, self.event_monitor) )
        self.aligner.hit_visitor = chain

        ######## set the path to the reference index
        self.ref_archive = utils.get_ref_archive(ctx.getJobConf())
        self.aligner.reference = self.get_reference_root(self.ref_archive)

        # part of the code is a workaround for accumulating records, see #331
        isplit = InputSplit(ctx.getInputSplit())
        self.split_end = isplit.offset + isplit.length
Example #5
    def setUp(self):
        self.aligner = BwaAligner()
        test_dir = os.path.abspath(
            os.path.join(os.path.dirname(__file__), '..', '..', '..'))
        self.aligner.reference = os.path.join(test_dir, 'seal',
                                              'mini_ref_fixture',
                                              'mini_ref.fasta')
        self.aligner.hit_visitor = MappingsCollector()
        self.aligner.qformat = "fastq-sanger"

        self.pair = (
            "HWI-ST301L:236:C0EJ5ACXX:1:1101:18292:2904",
            "GGGAGGTGTTAGGGACAAGCCTGGAGGCAGCATGCGTCACTCCCATGCAGAGTCCATTGGCCAATGCTGGCTCCGATGGCCACATCTCACTCCAGGGGCAG",
            "?@@B?<=AADFCFH@FB?EFEGAAFGEEGEGHCGEGIGH?B?CGEFHGIIGAEEEEHEAEEEH937;;@3=;>@8;?8;9A:<A#################",
            "AATAGAATGTAATATAATATATGTAAAACACCAGGTGCCTAACCTGGCACAGAGCAGGAGGGCTAAGCATGACATCCAGCACGTGGTCAGTGGAATCCAGT",
            "@@@DFDDDBHDD<EHEHIFEEB<IHIEGHDFEH?B:CBEHICEGCGGIIGFGCFCE@FAFEGAAGHIIHF;A?DBDFB);@@35;?,;@35(:5:ACCC<>"
        )
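
MappingsCollector is referenced by several of these tests but not shown anywhere on this page. From the way it is used (hit_visitor.mappings is later read as a list of SAM-formatted strings), it plausibly looks like the sketch below, which reuses the SamFormatter seen in the SimpleVisitor further down; treat this as an assumption about its shape, not its actual definition.

class MappingsCollector(object):
    """Assumed shape of the hit_visitor collector used in these tests."""

    def __init__(self):
        self.sam = SamFormatter()
        self.mappings = []

    def process(self, pair):
        # format each hit as a SAM record and keep it for later assertions
        for hit in pair:
            self.mappings.append(self.sam.format(hit))
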
Example #8
class TestBwaAligner(unittest.TestCase):
    def setUp(self):
        self.aligner = BwaAligner()
        test_dir = os.path.abspath(
            os.path.join(os.path.dirname(__file__), '..', '..', '..'))
        self.aligner.reference = os.path.join(test_dir, 'seal',
                                              'mini_ref_fixture',
                                              'mini_ref.fasta')
        self.aligner.hit_visitor = MappingsCollector()
        self.aligner.qformat = "fastq-sanger"

        self.pair = (
            "HWI-ST301L:236:C0EJ5ACXX:1:1101:18292:2904",
            "GGGAGGTGTTAGGGACAAGCCTGGAGGCAGCATGCGTCACTCCCATGCAGAGTCCATTGGCCAATGCTGGCTCCGATGGCCACATCTCACTCCAGGGGCAG",
            "?@@B?<=AADFCFH@FB?EFEGAAFGEEGEGHCGEGIGH?B?CGEFHGIIGAEEEEHEAEEEH937;;@3=;>@8;?8;9A:<A#################",
            "AATAGAATGTAATATAATATATGTAAAACACCAGGTGCCTAACCTGGCACAGAGCAGGAGGGCTAAGCATGACATCCAGCACGTGGTCAGTGGAATCCAGT",
            "@@@DFDDDBHDD<EHEHIFEEB<IHIEGHDFEH?B:CBEHICEGCGGIIGFGCFCE@FAFEGAAGHIIHF;A?DBDFB);@@35;?,;@35(:5:ACCC<>"
        )

    def test_pair(self):
        self.aligner.load_pair_record(self.pair)
        self.aligner.run_alignment()
        self.aligner.clear_batch()

        results = sorted(self.aligner.hit_visitor.mappings)
        self.assertEqual(
            "HWI-ST301L:236:C0EJ5ACXX:1:1101:18292:2904	133	chr1	24762	0	*	=	24762	0	AATAGAATGTAATATAATATATGTAAAACACCAGGTGCCTAACCTGGCACAGAGCAGGAGGGCTAAGCATGACATCCAGCACGTGGTCAGTGGAATCCAGT	@@@DFDDDBHDD<EHEHIFEEB<IHIEGHDFEH?B:CBEHICEGCGGIIGFGCFCE@FAFEGAAGHIIHF;A?DBDFB);@@35;?,;@35(:5:ACCC<>",
            results[0])
        self.assertEqual(
            "HWI-ST301L:236:C0EJ5ACXX:1:1101:18292:2904	73	chr1	24762	37	101M	=	24762	0	GGGAGGTGTTAGGGACAAGCCTGGAGGCAGCATGCGTCACTCCCATGCAGAGTCCATTGGCCAATGCTGGCTCCGATGGCCACATCTCACTCCAGGGGCAG	?@@B?<=AADFCFH@FB?EFEGAAFGEEGEGHCGEGIGH?B?CGEFHGIIGAEEEEHEAEEEH937;;@3=;>@8;?8;9A:<A#################	XT:A:U	NM:i:2	SM:i:37	AM:i:0	X0:i:1	X1:i:0	XM:i:2	XO:i:0	XG:i:0	MD:Z:7T83G9",
            results[1])
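
The expected records above carry SAM flags 133 and 73. Decoding them with the standard SAM flag bits (plain SAM semantics, nothing Seal-specific) explains the output: 133 = paired + segment unmapped + second in pair, which is why that read has MAPQ 0, CIGAR * and is placed at its mate's position, while 73 = paired + mate unmapped + first in pair.

# Standard SAM flag bits, used to decode the flags in the expected output.
SAM_FLAGS = {0x1: "paired", 0x2: "proper_pair", 0x4: "unmapped",
             0x8: "mate_unmapped", 0x10: "reverse", 0x20: "mate_reverse",
             0x40: "first_in_pair", 0x80: "second_in_pair"}

def describe_flag(flag):
    return [name for bit, name in sorted(SAM_FLAGS.items()) if flag & bit]

print describe_flag(133)  # ['paired', 'unmapped', 'second_in_pair']
print describe_flag(73)   # ['paired', 'mate_unmapped', 'first_in_pair']
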
Example #10
File: mapper.py Project: pinno/seal
class mapper(Mapper):
    """
    Aligns sequences to a reference genome.

    @input-record: C{key} does not matter (standard LineRecordReader);
    C{value} is a tab-separated text line with 5 fields: ID, read_seq,
    read_qual, mate_seq, mate_qual.

    @output-record: protobuf-serialized mapped pairs (map-reduce job) or alignment
    records in SAM format (map-only job).

    @jobconf-param: C{mapred.reduce.tasks} number of Hadoop reduce tasks to launch.
    If the value of this property is set to 0, then the mapper will directly output
    the mappings in SAM format, like BWA.  If set to a value > 0 the mapper will output
    mappings in the protobuf serialized format for the rmdup reducer.

    @jobconf-param: C{seal.seqal.log.level} logging level,
    specified as a logging module literal.

    @jobconf-param: C{mapred.cache.archives} distributed
    cache entry for the bwa index archive. The entry
    is of the form HDFS_PATH#LINK_NAME. The archive for a given
    chromosome must contain (at the top level, i.e., no directories) all
    files generated by 'bwa index' for that chromosome.

    @jobconf-param: C{seal.seqal.alignment.max.isize}: if the
    inferred isize is greater than this value, Smith-Waterman alignment
    for unmapped reads will be skipped.

    @jobconf-param: C{seal.seqal.pairing.batch.size}: how many
    sequences should be processed at a time by the pairing
    function. Status will be updated at each new batch: therefore,
    lowering this value can help avoid timeouts.

    @jobconf-param: C{seal.seqal.fastq-subformat} Specifies base quality
    score encoding.  Supported types are: 'fastq-sanger' and 'fastq-illumina'.

    @jobconf-param: C{mapred.create.symlink} must be set to 'yes'.

    @jobconf-param: C{seal.seqal.min_hit_quality} mapping quality
    threshold below which the mapping will be discarded.
    """
    SUPPORTED_FORMATS = "fastq-illumina", "fastq-sanger"
    DEFAULT_FORMAT = "fastq-sanger"
    COUNTER_CLASS = "SEQAL"
    DeprecationMap = {
      "seal.seqal.log.level":           "bl.seqal.log.level",
      "seal.seqal.alignment.max.isize": "bl.seqal.alignment.max.isize",
      "seal.seqal.pairing.batch.size":  "bl.seqal.pairing.batch.size",
      "seal.seqal.fastq-subformat":     "bl.seqal.fastq-subformat",
      "seal.seqal.min_hit_quality":     "bl.seqal.min_hit_quality",
      "seal.seqal.remove_unmapped":     "bl.seqal.remove_unmapped",
      "seal.seqal.discard_duplicates":  "bl.seqal.discard_duplicates",
      "seal.seqal.nthreads":            "bl.seqal.nthreads",
      "seal.seqal.trim.qual":           "bl.seqal.trim.qual",
    }

    def __get_configuration(self, ctx):
        # TODO:  refactor settings common to mapper and reducer
        jc = ctx.getJobConf()

        logger = logging.getLogger("seqal")
        jobconf = deprecation_utils.convert_job_conf(jc, self.DeprecationMap, logger)

        jc_configure(self, jobconf, 'seal.seqal.log.level', 'log_level', 'INFO')
        jc_configure(self, jobconf, "seal.seqal.fastq-subformat", "format", self.DEFAULT_FORMAT)
        jc_configure_int(self, jobconf, 'seal.seqal.alignment.max.isize', 'max_isize', 1000)
        jc_configure_int(self, jobconf, 'seal.seqal.pairing.batch.size', 'batch_size', 10000)
        jc_configure_int(self, jobconf, 'seal.seqal.min_hit_quality', 'min_hit_quality', 0)
        jc_configure_bool(self, jobconf, 'seal.seqal.remove_unmapped', 'remove_unmapped', False)
        jc_configure_int(self, jobconf, 'seal.seqal.nthreads', 'nthreads', 1)
        jc_configure_int(self, jobconf, 'seal.seqal.trim.qual', 'trim_qual', 0)

        try:
            self.log_level = getattr(logging, self.log_level)
        except AttributeError:
            raise ValueError("Unsupported log level: %r" % self.log_level)

        if self.format not in self.SUPPORTED_FORMATS:
            raise_pydoop_exception(
              "seal.seqal.fastq-subformat must be one of %r" %
              (self.SUPPORTED_FORMATS,)
              )

        if self.max_isize <= 0:
            raise ValueError("'seal.seqal.alignment.max.isize' must be > 0, if specified [1000]")

        if self.batch_size <= 0:
            raise ValueError("'seal.seqal.pairing.batch.size' must be > 0, if specified [10000]")

        # minimum quality value required for a hit to be kept.  By default all
        # the hits BWA returns are output.
        if self.min_hit_quality < 0:
            raise ValueError("'seal.seqal.min_hit_quality' must be >= 0, if specified [0]")

        # number of concurrent threads for main alignment operation
        if self.nthreads <= 0:
            raise ValueError("'seal.seqal.nthreads' must be > 0, if specified [1]")

        # trim quality parameter used by BWA for read trimming.  Equivalent to
        # the -q parameter of bwa aln
        if self.trim_qual < 0:
            raise ValueError("'seal.seqal.trim.qual' must be >= 0, if specified [0]")

        if jc.hasKey('mapred.reduce.tasks') and jc.getInt('mapred.reduce.tasks') > 0:
            self.__map_only = False
        else:
            self.__map_only = True

    def __is_last_record(self, k, v):
        return k + len(v) + 2 >= self.split_end


    def get_reference_root(self, ref_dir):
        """
        Given a directory containing a BWA-indexed reference, where all the
        index files share a common basename (differing only in extension),
        return the path to the reference including that common basename.
         e.g. my_reference/hg_18.bwt
              my_reference/hg_18.rsax
              my_reference/hg_18.sax   => "my_reference/hg_18"
              my_reference/hg_18.pac
              my_reference/irrelevant_file
        """
        index_paths = filter(lambda tpl: tpl[1].lstrip('.') in BWA_INDEX_EXT,
                                map(os.path.splitext, os.listdir(ref_dir)))
        if not index_paths:
            raise ValueError("Missing reference.  Didn't find any files with required extensions (%s) at path %s" % (BWA_INDEX_EXT, ref_dir))
        roots = set(zip(*index_paths)[0])
        if len(roots) != 1:
            raise ValueError("Multiple references? Found reference roots %s" % (roots,))

        return os.path.join(ref_dir, tuple(roots)[0])

    def __init__(self, ctx):
        super(type(self), self).__init__(ctx)
        self.__get_configuration(ctx)
        logging.basicConfig(level=self.log_level)
        self.event_monitor = HadoopEventMonitor(self.COUNTER_CLASS, logging.getLogger("mapper"), ctx)

        self.aligner = BwaAligner()
        self.aligner.event_monitor = self.event_monitor
        self.aligner.qformat = self.format
        self.aligner.max_isize = self.max_isize
        self.aligner.nthreads = self.nthreads
        self.aligner.trim_qual = self.trim_qual
        self.aligner.mmap_enabled = True

        ######## assemble hit processor chain
        chain = FilterLink(self.event_monitor)
        chain.remove_unmapped = self.remove_unmapped
        chain.min_hit_quality = self.min_hit_quality
        if self.__map_only:
            chain.set_next( EmitSamLink(ctx, self.event_monitor) )
        else:
            chain.set_next( MarkDuplicatesEmitter(ctx, self.event_monitor) )
        self.aligner.hit_visitor = chain

        ######## set the path to the reference index
        self.ref_archive = utils.get_ref_archive(ctx.getJobConf())
        self.aligner.reference = self.get_reference_root(self.ref_archive)

        # part of the code is a workaround for accumulating records, see #331
        isplit = InputSplit(ctx.getInputSplit())
        self.split_end = isplit.offset + isplit.length

    def map(self, ctx):
        # Accumulate read pairs in the aligner's batch until the batch size is
        # reached or the input split is finished.  At that point run the
        # alignment and emit the output.
        k = struct.unpack(">q", ctx.getInputKey())[0]
        v = ctx.getInputValue()
        self.aligner.load_pair_record(v.split("\t"))
        is_last_record = self.__is_last_record(k, v)
        if self.aligner.get_batch_size() >= self.batch_size or is_last_record:
            self.aligner.run_alignment()
            self.aligner.clear_batch()

        if is_last_record:
            self.aligner.release_resources()
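
The class docstring lists the job configuration properties that __get_configuration reads. One plausible set of values is shown below, expressed simply as a Python dict; how they are actually passed (for example as -D options to the job launcher) depends on the Hadoop setup and is not part of this snippet, and the HDFS path in mapred.cache.archives is a placeholder.

# Illustrative jobconf values matching the defaults and checks above.
example_jobconf = {
    "mapred.reduce.tasks": "0",            # 0 => map-only job, emit SAM directly
    "mapred.create.symlink": "yes",
    "mapred.cache.archives": "hdfs:///refs/hg18_bwa_index.tar#reference",
    "seal.seqal.log.level": "INFO",
    "seal.seqal.fastq-subformat": "fastq-sanger",
    "seal.seqal.alignment.max.isize": "1000",
    "seal.seqal.pairing.batch.size": "10000",
    "seal.seqal.min_hit_quality": "0",
    "seal.seqal.remove_unmapped": "false",
    "seal.seqal.nthreads": "1",
    "seal.seqal.trim.qual": "0",
}
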
Example #11
class mapper(Mapper):
    """
	Aligns sequences to a reference genome.

	@input-record: C{key} does not matter (standard LineRecordReader);
	C{value} is a tab-separated text line with 5 fields: ID, read_seq,
	read_qual, mate_seq, mate_qual.

	@output-record: protobuf-serialized mapped pairs (map-reduce job) or alignment
	records in SAM format (map-only job).

	@jobconf-param: C{mapred.reduce.tasks} number of Hadoop reduce tasks to launch.
	If the value of this property is set to 0, then the mapper will directly output
	the mappings in SAM format, like BWA.  If set to a value > 0 the mapper will output
	mappings in the protobuf serialized format for the rmdup reducer.

	@jobconf-param: C{seal.seqal.log.level} logging level,
	specified as a logging module literal.

	@jobconf-param: C{mapred.cache.archives} distributed
	cache entry for the bwa index archive. The entry
	is of the form HDFS_PATH#LINK_NAME. The archive for a given
	chromosome must contain (at the top level, i.e., no directories) all
	files generated by 'bwa index' for that chromosome.

	@jobconf-param: C{seal.seqal.alignment.max.isize}: if the
	inferred isize is greater than this value, Smith-Waterman alignment
	for unmapped reads will be skipped.

	@jobconf-param: C{seal.seqal.pairing.batch.size}: how many
	sequences should be processed at a time by the pairing
	function. Status will be updated at each new batch: therefore,
	lowering this value can help avoid timeouts.

	@jobconf-param: C{seal.seqal.fastq-subformat} Specifies base quality
	score encoding.  Supported types are: 'fastq-sanger' and 'fastq-illumina'.

	@jobconf-param: C{mapred.create.symlink} must be set to 'yes'.

	@jobconf-param: C{seal.seqal.min_hit_quality} mapping quality
	threshold below which the mapping will be discarded.
	"""
    SUPPORTED_FORMATS = "fastq-illumina", "fastq-sanger"
    DEFAULT_FORMAT = "fastq-sanger"
    COUNTER_CLASS = "SEQAL"
    DeprecationMap = {
        "seal.seqal.log.level": "bl.seqal.log.level",
        "seal.seqal.alignment.max.isize": "bl.seqal.alignment.max.isize",
        "seal.seqal.pairing.batch.size": "bl.seqal.pairing.batch.size",
        "seal.seqal.fastq-subformat": "bl.seqal.fastq-subformat",
        "seal.seqal.min_hit_quality": "bl.seqal.min_hit_quality",
        "seal.seqal.remove_unmapped": "bl.seqal.remove_unmapped",
        "seal.seqal.discard_duplicates": "bl.seqal.discard_duplicates",
        "seal.seqal.nthreads": "bl.seqal.nthreads",
        "seal.seqal.trim.qual": "bl.seqal.trim.qual",
    }

    def __get_configuration(self, ctx):
        # TODO:  refactor settings common to mapper and reducer
        jc = ctx.getJobConf()

        logger = logging.getLogger("seqal")
        jobconf = deprecation_utils.convert_job_conf(jc, self.DeprecationMap,
                                                     logger)

        jc_configure(self, jobconf, 'seal.seqal.log.level', 'log_level',
                     'INFO')
        jc_configure(self, jobconf, "seal.seqal.fastq-subformat", "format",
                     self.DEFAULT_FORMAT)
        jc_configure_int(self, jobconf, 'seal.seqal.alignment.max.isize',
                         'max_isize', 1000)
        jc_configure_int(self, jobconf, 'seal.seqal.pairing.batch.size',
                         'batch_size', 10000)
        jc_configure_int(self, jobconf, 'seal.seqal.min_hit_quality',
                         'min_hit_quality', 0)
        jc_configure_bool(self, jobconf, 'seal.seqal.remove_unmapped',
                          'remove_unmapped', False)
        jc_configure_int(self, jobconf, 'seal.seqal.nthreads', 'nthreads', 1)
        jc_configure_int(self, jobconf, 'seal.seqal.trim.qual', 'trim_qual', 0)

        try:
            self.log_level = getattr(logging, self.log_level)
        except AttributeError:
            raise ValueError("Unsupported log level: %r" % self.log_level)

        if self.format not in self.SUPPORTED_FORMATS:
            raise_pydoop_exception(
                "seal.seqal.fastq-subformat must be one of %r" %
                (self.SUPPORTED_FORMATS, ))

        if self.max_isize <= 0:
            raise ValueError(
                "'seal.seqal.alignment.max.isize' must be > 0, if specified [1000]"
            )

        if self.batch_size <= 0:
            raise ValueError(
                "'seal.seqal.pairing.batch.size' must be > 0, if specified [10000]"
            )

        # minimum quality value required for a hit to be kept.  By default all
        # the hits BWA returns are output.
        if self.min_hit_quality < 0:
            raise ValueError(
                "'seal.seqal.min_hit_quality' must be >= 0, if specified [0]")

        # number of concurrent threads for main alignment operation
        if self.nthreads <= 0:
            raise ValueError(
                "'seal.seqal.nthreads' must be > 0, if specified [1]")

        # trim quality parameter used by BWA for read trimming.  Equivalent to
        # the -q parameter of bwa aln
        if self.trim_qual < 0:
            raise ValueError(
                "'seal.seqal.trim.qual' must be >= 0, if specified [0]")

        if jc.hasKey('mapred.reduce.tasks'
                     ) and jc.getInt('mapred.reduce.tasks') > 0:
            self.__map_only = False
        else:
            self.__map_only = True

    def __is_last_record(self, k, v):
        return k + len(v) + 2 >= self.split_end

    def get_reference_root(self, ref_dir):
        """
        Given a directory containing a BWA-indexed reference, where all the
        index files share a common basename (differing only in extension),
        return the path to the reference including that common basename.
         e.g. my_reference/hg_18.bwt
              my_reference/hg_18.rsax
              my_reference/hg_18.sax   => "my_reference/hg_18"
              my_reference/hg_18.pac
              my_reference/irrelevant_file
        """
        index_paths = filter(lambda tpl: tpl[1].lstrip('.') in BWA_INDEX_EXT,
                             map(os.path.splitext, os.listdir(ref_dir)))
        if not index_paths:
            raise ValueError(
                "Missing reference.  Didn't find any files with required extensions (%s) at path %s"
                % (BWA_INDEX_EXT, ref_dir))
        roots = set(zip(*index_paths)[0])
        if len(roots) != 1:
            raise ValueError(
                "Multiple references? Found reference roots %s" % (roots, ))

        return os.path.join(ref_dir, tuple(roots)[0])

    def __init__(self, ctx):
        super(type(self), self).__init__(ctx)
        self.__get_configuration(ctx)
        logging.basicConfig(level=self.log_level)
        self.event_monitor = HadoopEventMonitor(self.COUNTER_CLASS,
                                                logging.getLogger("mapper"),
                                                ctx)

        self.aligner = BwaAligner()
        self.aligner.event_monitor = self.event_monitor
        self.aligner.qformat = self.format
        self.aligner.max_isize = self.max_isize
        self.aligner.nthreads = self.nthreads
        self.aligner.trim_qual = self.trim_qual
        self.aligner.mmap_enabled = True

        ######## assemble hit processor chain
        chain = FilterLink(self.event_monitor)
        chain.remove_unmapped = self.remove_unmapped
        chain.min_hit_quality = self.min_hit_quality
        if self.__map_only:
            chain.set_next(EmitSamLink(ctx, self.event_monitor))
        else:
            chain.set_next(MarkDuplicatesEmitter(ctx, self.event_monitor))
        self.aligner.hit_visitor = chain

        ######## set the path to the reference index
        self.ref_archive = utils.get_ref_archive(ctx.getJobConf())
        self.aligner.reference = self.get_reference_root(self.ref_archive)

        # part of the code is a workaround for accumulating records, see #331
        isplit = InputSplit(ctx.getInputSplit())
        self.split_end = isplit.offset + isplit.length

    def map(self, ctx):
        # Accumulate read pairs in the aligner's batch until the batch size is
        # reached or the input split is finished.  At that point run the
        # alignment and emit the output.
        k = struct.unpack(">q", ctx.getInputKey())[0]
        v = ctx.getInputValue()
        self.aligner.load_pair_record(v.split("\t"))
        is_last_record = self.__is_last_record(k, v)
        if self.aligner.get_batch_size() >= self.batch_size or is_last_record:
            self.aligner.run_alignment()
            self.aligner.clear_batch()

        if is_last_record:
            self.aligner.release_resources()
Example #12
 def setUp(self):
   self.aligner = BwaAligner()
   test_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..'))
   self.aligner.reference = os.path.join(test_dir, 'seal', 'mini_ref_fixture', 'mini_ref.fasta')
   self.aligner.hit_visitor = MappingsCollector()
   self.aligner.qformat = "fastq-sanger"
Example #13
class TestBwaAligner(unittest.TestCase):

  def setUp(self):
    self.aligner = BwaAligner()
    test_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..'))
    self.aligner.reference = os.path.join(test_dir, 'seal', 'mini_ref_fixture', 'mini_ref.fasta')
    self.aligner.hit_visitor = MappingsCollector()
    self.aligner.qformat = "fastq-sanger"

  def test_pair(self):
    pair = (
      "HWI-ST301L:236:C0EJ5ACXX:1:1101:18292:2904",
      "GGGAGGTGTTAGGGACAAGCCTGGAGGCAGCATGCGTCACTCCCATGCAGAGTCCATTGGCCAATGCTGGCTCCGATGGCCACATCTCACTCCAGGGGCAG",
      "?@@B?<=AADFCFH@FB?EFEGAAFGEEGEGHCGEGIGH?B?CGEFHGIIGAEEEEHEAEEEH937;;@3=;>@8;?8;9A:<A#################",
      "AATAGAATGTAATATAATATATGTAAAACACCAGGTGCCTAACCTGGCACAGAGCAGGAGGGCTAAGCATGACATCCAGCACGTGGTCAGTGGAATCCAGT",
      "@@@DFDDDBHDD<EHEHIFEEB<IHIEGHDFEH?B:CBEHICEGCGGIIGFGCFCE@FAFEGAAGHIIHF;A?DBDFB);@@35;?,;@35(:5:ACCC<>")
    results = self._align_pair(pair)
    self.assertEqual(
     "HWI-ST301L:236:C0EJ5ACXX:1:1101:18292:2904	133	chr1	24762	0	*	=	24762	0	AATAGAATGTAATATAATATATGTAAAACACCAGGTGCCTAACCTGGCACAGAGCAGGAGGGCTAAGCATGACATCCAGCACGTGGTCAGTGGAATCCAGT	@@@DFDDDBHDD<EHEHIFEEB<IHIEGHDFEH?B:CBEHICEGCGGIIGFGCFCE@FAFEGAAGHIIHF;A?DBDFB);@@35;?,;@35(:5:ACCC<>",
     results[0])
    self.assertEqual(
      "HWI-ST301L:236:C0EJ5ACXX:1:1101:18292:2904	73	chr1	24762	37	101M	=	24762	0	GGGAGGTGTTAGGGACAAGCCTGGAGGCAGCATGCGTCACTCCCATGCAGAGTCCATTGGCCAATGCTGGCTCCGATGGCCACATCTCACTCCAGGGGCAG	?@@B?<=AADFCFH@FB?EFEGAAFGEEGEGHCGEGIGH?B?CGEFHGIIGAEEEEHEAEEEH937;;@3=;>@8;?8;9A:<A#################	XT:A:U	NM:i:2	SM:i:37	AM:i:0	X0:i:1	X1:i:0	XM:i:2	XO:i:0	XG:i:0	MD:Z:7T83G9",
      results[1])

  def test_easy_align(self):
    pair = (
      "HWI-ST301L:236:C0EJ5ACXX:1:1101:18292:2904",
      # pos: 361
      "TAACCCTAACCCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC",
      "?@@B?<=AADFCFH@FB?EFEGAAFGEEGEGHCGEGIGH?B?CGEFHGIIGAEEEEHEAE",
      # pos: 541
      "AACGCAGCTCCGCCCTCGCGGTGCTCTCCGGGTCTGTGCTGAGGAGAACGCAACTCCGCC",
      "@@@DFDDDBHDD<EHEHIFEEB<IHIEGHDFEH?B:CBEHICEGCGGIIGFGCFCE@FAF")
    results = self._align_pair(pair)
    self.assertEqual(
      ["HWI-ST301L:236:C0EJ5ACXX:1:1101:18292:2904",
        "129","chr1","541","37","60M","=","361","-180",
        "AACGCAGCTCCGCCCTCGCGGTGCTCTCCGGGTCTGTGCTGAGGAGAACGCAACTCCGCC",
        "@@@DFDDDBHDD<EHEHIFEEB<IHIEGHDFEH?B:CBEHICEGCGGIIGFGCFCE@FAF"],
      self._get_sam_fields(results[0]))
    self.assertEqual(
      ["HWI-ST301L:236:C0EJ5ACXX:1:1101:18292:2904",
        "65","chr1","361","37","60M","=","541","180",
        "TAACCCTAACCCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC",
        "?@@B?<=AADFCFH@FB?EFEGAAFGEEGEGHCGEGIGH?B?CGEFHGIIGAEEEEHEAE"],
      self._get_sam_fields(results[1]))

  def test_align_pair_of_rev_complements(self):
    # These are the same reads as the above, but reversed
    # and complemented. Remember that the above were taken
    # directly from the mini reference sequence.
    pair = (
      "HWI-ST301L:236:C0EJ5ACXX:1:1101:18292:2904",
      # pos: 361
      "GTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGGTTAGGGGTTAGGGTTA",
      "EAEHEEEEAGIIGHFEGC?B?HGIGEGCHGEGEEGFAAGEFE?BF@HFCFDAA=<?B@@?",
      # pos: 541
      "GGCGGAGTTGCGTTCTCCTCAGCACAGACCCGGAGAGCACCGCGAGGGCGGAGCTGCGTT",
      "FAF@ECFCGFGIIGGCGECIHEBC:B?HEFDHGEIHI<BEEFIHEHE<DDHBDDDFD@@@")
    results = self._align_pair(pair)
    self.assertEqual(
      ['HWI-ST301L:236:C0EJ5ACXX:1:1101:18292:2904',
        '113','chr1','361','37','60M','=','541','180',
        'TAACCCTAACCCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC',
        '?@@B?<=AADFCFH@FB?EFEGAAFGEEGEGHCGEGIGH?B?CGEFHGIIGAEEEEHEAE'],
      self._get_sam_fields(results[0])
    )
    self.assertEqual(
      ['HWI-ST301L:236:C0EJ5ACXX:1:1101:18292:2904',
        '177','chr1','541','37','60M','=','361','-180',
        'AACGCAGCTCCGCCCTCGCGGTGCTCTCCGGGTCTGTGCTGAGGAGAACGCAACTCCGCC',
        '@@@DFDDDBHDD<EHEHIFEEB<IHIEGHDFEH?B:CBEHICEGCGGIIGFGCFCE@FAF'],
      self._get_sam_fields(results[1])
    )


  def _align_pair(self, pair):
    self.aligner.load_pair_record(pair)
    self.aligner.run_alignment()
    self.aligner.clear_batch()
    return sorted(self.aligner.hit_visitor.mappings)

  @staticmethod
  def _get_sam_fields(sam_record):
    return sam_record.split('\t')[0:11]
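
test_align_pair_of_rev_complements states that its reads are the reverse complements of the ones used in test_easy_align. The small check below (plain string manipulation, independent of Seal) makes that relationship concrete for the first read.

# Reverse-complement the forward read from test_easy_align and compare it
# with the first read used in test_align_pair_of_rev_complements.
COMPLEMENT = {"A": "T", "C": "G", "G": "C", "T": "A", "N": "N"}

def revcomp(seq):
    return "".join(COMPLEMENT[base] for base in reversed(seq))

forward = "TAACCCTAACCCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC"
revcomped = "GTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGGTTAGGGGTTAGGGTTA"
assert revcomp(forward) == revcomped
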
Example #15
class TestBwaAligner(unittest.TestCase):
    REFERENCE = os.path.join(os.path.dirname(__file__), 'fixtures/foobar.fa')
    BUILD_REFERENCE = True

    class SimpleVisitor(object):
        def __init__(self):
            self.sam = SamFormatter()
            self.output = StringIO.StringIO()

        def process(self, pair):
            for hit in pair:
                print >> self.output, self.sam.format(hit)

    def setUp(self):
        utils.build_ref_index()
        self.aligner = BwaAligner()
        self.aligner.reference = utils.reference
        self.aligner.hit_visitor = type(self).SimpleVisitor()

        self.pairs = []
        with open(utils.get_fixture_path("pairs.txt")) as f:
            for line in f:
                if not line.startswith("#"):  # leave #-lines for comments
                    self.pairs.append(line.rstrip("\r\n").split("\t"))

    def tearDown(self):
        utils.remove_ref_index()

    def test_load_clear_batch(self):
        for row in self.pairs:
            self.aligner.load_pair_record(row)
        self.assertEqual(len(self.pairs), self.aligner.get_batch_size())
        self.aligner.clear_batch()
        self.assertEqual(0, self.aligner.get_batch_size())

    def test_defaults(self):
        self.assertEqual("fastq-illumina", self.aligner.qformat)
        self.assertEqual(1000, self.aligner.max_isize)
        self.assertEqual(1, self.aligner.nthreads)
        self.assertEqual(0, self.aligner.trim_qual)

    def test_alignment(self):
        for row in self.pairs:
            self.aligner.load_pair_record(row)
        self.aligner.run_alignment()
        # TODO:  write a more useful test, but for that we'll need a complete test fixture
        self.assertTrue(len(self.aligner.hit_visitor.output.getvalue()) > 0)

    def test_alignment_mmap(self):
        self.aligner.mmap_enabled = True
        # Generate the .sax and .rsax indices. They will be removed by tearDown.
        bwa.make_suffix_arrays_for_mmap(utils.reference)
        for row in self.pairs:
            self.aligner.load_pair_record(row)
        self.aligner.run_alignment()
        # TODO:  write a more useful test, but for that we'll need a complete test fixture
        self.assertTrue(len(self.aligner.hit_visitor.output.getvalue()) > 0)

    def test_missing_mmap_index(self):
        self.aligner.mmap_enabled = True
        for row in self.pairs:
            self.aligner.load_pair_record(row)
        self.assertRaises(ValueError, self.aligner.run_alignment)
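
The helpers referenced throughout these tests (utils.reference, utils.build_ref_index, utils.remove_ref_index, utils.get_fixture_path) are not shown on this page. A rough sketch of what such a module might contain follows; the fixture paths, file names and the exact bwa invocation are assumptions for illustration only.

# Hypothetical sketch of the test utils module used above.
import os
import subprocess

FIXTURE_DIR = os.path.join(os.path.dirname(__file__), "fixtures")
reference = os.path.join(FIXTURE_DIR, "mini_ref.fasta")  # assumed fixture name

def get_fixture_path(name):
    return os.path.join(FIXTURE_DIR, name)

def build_ref_index():
    # assumes a 'bwa' executable on the PATH; 'bwa index' writes the
    # .amb/.ann/.bwt/.pac/.sa files next to the FASTA
    subprocess.check_call(["bwa", "index", reference])

def remove_ref_index():
    # also clean up the Seal-specific .sax/.rsax suffix arrays, if present
    for ext in (".amb", ".ann", ".bwt", ".pac", ".sa", ".sax", ".rsax"):
        path = reference + ext
        if os.path.exists(path):
            os.remove(path)
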