Esempio n. 1
0
class TestHiRapiProperties(unittest.TestCase):
    """
    Checks on a freshly created HiRapiAligner: default property values,
    plugin metadata and read-back of values written to ``opts``.
    """

    def setUp(self):
        """Create the 'rapi_bwa' aligner used by every test."""
        self.hi = HiRapiAligner('rapi_bwa')

    def tearDown(self):
        """Release the aligner's resources after each test."""
        self.hi.release_resources()

    def test_defaults(self):
        """A new aligner is paired and uses the Sanger quality encoding."""
        self.assertTrue(self.hi.paired)
        self.assertEqual(pyrapi.rapi.QENC_SANGER, self.hi.q_offset)

    def test_get_plugin_info(self):
        """The plugin reports its aligner name and non-empty version info."""
        self.assertEqual('bwa-mem', self.hi.aligner_name)
        self.assertTrue(self.hi.aligner_version)
        self.assertTrue(self.hi.plugin_version)

    def test_set_some_options(self):
        """Each option reads back the value that was just assigned to it."""
        self.hi.opts.n_threads = 11
        self.assertEqual(11, self.hi.opts.n_threads)

        self.hi.opts.mapq_min = 5
        self.assertEqual(5, self.hi.opts.mapq_min)

        self.hi.opts.isize_min = 250
        self.assertEqual(250, self.hi.opts.isize_min)

        self.hi.opts.isize_max = 500
        self.assertEqual(500, self.hi.opts.isize_max)
Esempio n. 2
0
class TestHiRapiProperties(unittest.TestCase):
    """
    Checks on a freshly created HiRapiAligner: default property values,
    plugin metadata and read-back of values written to ``opts``.
    """

    def setUp(self):
        """Create the 'rapi_bwa' aligner used by every test."""
        self.hi = HiRapiAligner('rapi_bwa')

    def tearDown(self):
        """Release the aligner's resources after each test."""
        self.hi.release_resources()

    def test_defaults(self):
        """A new aligner is paired and uses the Sanger quality encoding."""
        self.assertTrue(self.hi.paired)
        self.assertEqual(pyrapi.rapi.QENC_SANGER, self.hi.q_offset)

    def test_get_plugin_info(self):
        """The plugin reports its aligner name and non-empty version info."""
        self.assertEqual('bwa-mem', self.hi.aligner_name)
        self.assertTrue(self.hi.aligner_version)
        self.assertTrue(self.hi.plugin_version)

    def test_set_some_options(self):
        """Each option reads back the value that was just assigned to it."""
        self.hi.opts.n_threads = 11
        self.assertEqual(11, self.hi.opts.n_threads)

        self.hi.opts.mapq_min = 5
        self.assertEqual(5, self.hi.opts.mapq_min)

        self.hi.opts.isize_min = 250
        self.assertEqual(250, self.hi.opts.isize_min)

        self.hi.opts.isize_max = 500
        self.assertEqual(500, self.hi.opts.isize_max)
Esempio n. 3
0
 def setUp(self):
     """
     Create the 'rapi_bwa' aligner and feed it every record returned by
     test_utils.get_mini_ref_seqs() through load_pair.

     Raises RuntimeError as soon as a record does not have 5 fields.
     """
     self.hi = HiRapiAligner('rapi_bwa')
     self.reads = test_utils.get_mini_ref_seqs()
     for record in self.reads:
         if len(record) == 5:
             self.hi.load_pair(*record)
         else:
             raise RuntimeError(
                 "Unexpected number of fields in mini_ref read record")
Esempio n. 4
0
 def setUp(self):
     """
     Create the 'rapi_bwa' aligner and feed it every record returned by
     test_utils.get_mini_ref_seqs() through load_pair.

     Raises RuntimeError as soon as a record does not have 5 fields.
     """
     self.hi = HiRapiAligner('rapi_bwa')
     self.reads = test_utils.get_mini_ref_seqs()
     for record in self.reads:
         if len(record) == 5:
             self.hi.load_pair(*record)
         else:
             raise RuntimeError("Unexpected number of fields in mini_ref read record")
Esempio n. 5
0
    def test_base_quality(self):
        """
        Load the same read twice, once with Sanger-encoded qualities and once
        with the qualities shifted to the Illumina offset, and check that the
        aligner stores identical quality strings for both.
        """
        hi = HiRapiAligner('rapi_bwa', paired=False)
        # This aligner is local to the test, so release it explicitly; the
        # cleanup also runs when one of the assertions below fails.
        self.addCleanup(hi.release_resources)

        one_read = self.reads[0][0:3]
        hi.q_offset = self.hi.Qenc_Sanger
        hi.load_read('sanger_read', one_read[1], one_read[2])

        # 64:  Illumina base quality offset
        # 33:  Sanger base quality offset
        ill_quality = ''.join(chr(ord(c) + (64 - 33)) for c in one_read[2])
        hi.q_offset = self.hi.Qenc_Illumina
        hi.load_read('illumina_read', one_read[1], ill_quality)

        loaded_qualities = [frag[0].qual for frag in hi.ifragments()]
        self.assertEqual(2, len(loaded_qualities))
        self.assertEqual(loaded_qualities[0], loaded_qualities[1])
Esempio n. 6
0
    def __init__(self, ctx):
        """
        Configure the mapper from the job configuration, create and set up
        the HiRapiAligner, load the reference and build the hit processor
        chain.

        @param ctx: the task context; its job configuration is read for the
        settings and for the reference archive.

        @raise NotImplementedError: if the job is not map-only.
        """
        super(mapper, self).__init__(ctx)
        self.logger = logging.getLogger("seqal")
        self.__get_configuration(ctx)
        logging.basicConfig(level=self.log_level)
        self.event_monitor = HadoopEventMonitor(self.COUNTER_CLASS, logging.getLogger("mapper"), ctx)

        pe = True  # single-end sequence alignment not yet supported by Seqal
        self.hi_rapi = HiRapiAligner('rapi_bwa', paired=pe)

        # opts
        self.hi_rapi.opts.n_threads = self.nthreads
        self.hi_rapi.opts.isize_max = self.max_isize
        if self.min_isize is not None:
            self.hi_rapi.opts.isize_min = self.min_isize
        # The aligner attribute is `q_offset`; assigning to a misspelled name
        # would silently create a new attribute and leave the encoding unset.
        if self.format == "fastq-illumina":
            self.hi_rapi.q_offset = self.hi_rapi.Qenc_Illumina
        else:
            self.hi_rapi.q_offset = self.hi_rapi.Qenc_Sanger
        # end opts

        self.logger.info("Using the %s aligner plugin, aligner version %s, plugin version %s",
                self.hi_rapi.aligner_name, self.hi_rapi.aligner_version, self.hi_rapi.plugin_version)
        self.logger.info("Working in %s mode", 'paired-end' if pe else 'single-end')

        # allocate space for reads
        self.logger.debug("Reserving batch space for %s reads", self.batch_size)
        self.hi_rapi.reserve_space(self.batch_size)

        # load reference
        reference_root = self.get_reference_root_from_archive(utils.get_ref_archive(ctx.getJobConf()))
        self.logger.info("Full reference path (prefix): %s", reference_root)
        with self.event_monitor.time_block("Loading reference %s" % reference_root):
            self.hi_rapi.load_ref(reference_root)

        ######## assemble hit processor chain
        chain = RapiFilterLink(self.event_monitor)
        chain.remove_unmapped = self.remove_unmapped
        chain.min_hit_quality = self.min_hit_quality
        if self.__map_only:
            chain.set_next(RapiEmitSamLink(ctx, self.event_monitor, self.hi_rapi))
        else:
            raise NotImplementedError("Only mapping mode is supported at the moment")
        self.hit_visitor_chain = chain
Esempio n. 7
0
    def test_base_quality(self):
        """
        Load the same read twice, once with Sanger-encoded qualities and once
        with the qualities shifted to the Illumina offset, and check that the
        aligner stores identical quality strings for both.
        """
        hi = HiRapiAligner('rapi_bwa', paired=False)
        # This aligner is local to the test, so release it explicitly; the
        # cleanup also runs when one of the assertions below fails.
        self.addCleanup(hi.release_resources)

        one_read = self.reads[0][0:3]
        hi.q_offset = self.hi.Qenc_Sanger
        hi.load_read('sanger_read', one_read[1], one_read[2])

        # 64:  Illumina base quality offset
        # 33:  Sanger base quality offset
        ill_quality = ''.join(chr(ord(c) + (64 - 33)) for c in one_read[2])
        hi.q_offset = self.hi.Qenc_Illumina
        hi.load_read('illumina_read', one_read[1], ill_quality)

        loaded_qualities = [frag[0].qual for frag in hi.ifragments()]
        self.assertEqual(2, len(loaded_qualities))
        self.assertEqual(loaded_qualities[0], loaded_qualities[1])
Esempio n. 8
0
class TestHiRapiBatch(unittest.TestCase):
    """
    Tests of HiRapiAligner batch handling: iterating over loaded fragments,
    clearing the batch and decoding base qualities.
    """

    def setUp(self):
        """Create the aligner and load every mini_ref record as a read pair."""
        self.hi = HiRapiAligner('rapi_bwa')
        self.reads = test_utils.get_mini_ref_seqs()
        for row in self.reads:
            if len(row) != 5:
                raise RuntimeError("Unexpected number of fields in mini_ref read record")
            self.hi.load_pair(*row)

    def tearDown(self):
        """Release the aligner's resources after each test."""
        self.hi.release_resources()

    def test_fragment_iteration(self):
        """Iterating the batch yields 5 distinct read ids, each seen twice."""
        read_id_counts = dict()
        for frag in self.hi.ifragments():
            for read in frag:
                read_id = read.id
                read_id_counts[read_id] = 1 + read_id_counts.get(read_id, 0)

        # 5 pairs
        self.assertEqual(5, len(read_id_counts))
        unique_counts = set(read_id_counts.values())
        # all ids appearing twice
        self.assertEqual(1, len(unique_counts))
        self.assertEqual(2, unique_counts.pop())

    def test_batch_management(self):
        """clear_batch() empties the batch; an empty batch can still be aligned."""
        self.assertEqual(10, self.hi.batch_size)
        self.hi.clear_batch()
        self.assertEqual(0, self.hi.batch_size)
        self.hi.load_ref(test_utils.MiniRefMemPath)
        self.hi.align_batch()  # should not raise just because it's empty

        for _ in self.hi.ifragments():
            self.fail("iterating over an empty batch!")

    def test_base_quality(self):
        """
        Load the same read twice, once with Sanger-encoded qualities and once
        with the qualities shifted to the Illumina offset, and check that the
        aligner stores identical quality strings for both.
        """
        hi = HiRapiAligner('rapi_bwa', paired=False)
        # tearDown only releases self.hi, so register this second aligner
        # for release; the cleanup also runs when an assertion fails.
        self.addCleanup(hi.release_resources)

        one_read = self.reads[0][0:3]
        hi.q_offset = self.hi.Qenc_Sanger
        hi.load_read('sanger_read', one_read[1], one_read[2])

        # 64:  Illumina base quality offset
        # 33:  Sanger base quality offset
        ill_quality = ''.join(chr(ord(c) + (64 - 33)) for c in one_read[2])
        hi.q_offset = self.hi.Qenc_Illumina
        hi.load_read('illumina_read', one_read[1], ill_quality)

        loaded_qualities = [frag[0].qual for frag in hi.ifragments()]
        self.assertEqual(2, len(loaded_qualities))
        self.assertEqual(loaded_qualities[0], loaded_qualities[1])
Esempio n. 9
0
 def setUp(self):
     """
     Create the 'rapi_bwa' aligner, store it on the test case, then run
     the _align_mini_ref_seqs helper before each test.
     """
     aligner = HiRapiAligner('rapi_bwa')
     self.hi = aligner
     self._align_mini_ref_seqs()
Esempio n. 10
0
class TestHiRapiAlignments(unittest.TestCase):
    """
    Tests that run on an aligner whose batch of mini_ref read pairs has
    already been aligned in setUp.
    """

    def setUp(self):
        """Create the aligner and align the mini_ref reads."""
        self.hi = HiRapiAligner('rapi_bwa')
        self._align_mini_ref_seqs()

    def tearDown(self):
        """Release the aligner's resources after each test."""
        self.hi.release_resources()

    def test_load_reference_again(self):
        """Loading the reference a second time must not raise."""
        # should "just work"
        self.hi.load_ref(test_utils.MiniRefMemPath)

    def test_sam(self):
        """The header-less SAM output matches the expected mini_ref SAM text."""
        # Named sam_buffer rather than `io` to avoid shadowing the stdlib module.
        sam_buffer = StringIO()
        self.hi.write_sam(sam_buffer, include_header=False)
        sam = sam_buffer.getvalue()
        expected_sam = test_utils.rapi_mini_ref_seqs_sam_no_header()
        self.assertEqual(expected_sam, sam)

    def _align_mini_ref_seqs(self):
        """
        Load the mini reference, load every mini_ref record as a read pair
        and align the resulting batch.

        Raises RuntimeError if a record does not have 5 fields.
        """
        self.hi.load_ref(test_utils.MiniRefMemPath)
        reads = test_utils.get_mini_ref_seqs()
        for row in reads:
            if len(row) != 5:
                raise RuntimeError("Unexpected number of fields in mini_ref read record")
            self.hi.load_pair(*row)
        self.hi.align_batch()
Esempio n. 11
0
 def setUp(self):
     """Create the 'rapi_bwa' aligner and store it on the test case."""
     aligner = HiRapiAligner('rapi_bwa')
     self.hi = aligner
Esempio n. 12
0
class TestHiRapiBatch(unittest.TestCase):
    """
    Tests of HiRapiAligner batch handling: iterating over loaded fragments,
    clearing the batch and decoding base qualities.
    """

    def setUp(self):
        """Create the aligner and load every mini_ref record as a read pair."""
        self.hi = HiRapiAligner('rapi_bwa')
        self.reads = test_utils.get_mini_ref_seqs()
        for row in self.reads:
            if len(row) != 5:
                raise RuntimeError(
                    "Unexpected number of fields in mini_ref read record")
            self.hi.load_pair(*row)

    def tearDown(self):
        """Release the aligner's resources after each test."""
        self.hi.release_resources()

    def test_fragment_iteration(self):
        """Iterating the batch yields 5 distinct read ids, each seen twice."""
        read_id_counts = dict()
        for frag in self.hi.ifragments():
            for read in frag:
                read_id = read.id
                read_id_counts[read_id] = 1 + read_id_counts.get(read_id, 0)

        # 5 pairs
        self.assertEqual(5, len(read_id_counts))
        unique_counts = set(read_id_counts.values())
        # all ids appearing twice
        self.assertEqual(1, len(unique_counts))
        self.assertEqual(2, unique_counts.pop())

    def test_batch_management(self):
        """clear_batch() empties the batch; an empty batch can still be aligned."""
        self.assertEqual(10, self.hi.batch_size)
        self.hi.clear_batch()
        self.assertEqual(0, self.hi.batch_size)
        self.hi.load_ref(test_utils.MiniRefMemPath)
        self.hi.align_batch()  # should not raise just because it's empty

        for _ in self.hi.ifragments():
            self.fail("iterating over an empty batch!")

    def test_base_quality(self):
        """
        Load the same read twice, once with Sanger-encoded qualities and once
        with the qualities shifted to the Illumina offset, and check that the
        aligner stores identical quality strings for both.
        """
        hi = HiRapiAligner('rapi_bwa', paired=False)
        # tearDown only releases self.hi, so register this second aligner
        # for release; the cleanup also runs when an assertion fails.
        self.addCleanup(hi.release_resources)

        one_read = self.reads[0][0:3]
        hi.q_offset = self.hi.Qenc_Sanger
        hi.load_read('sanger_read', one_read[1], one_read[2])

        # 64:  Illumina base quality offset
        # 33:  Sanger base quality offset
        ill_quality = ''.join(chr(ord(c) + (64 - 33)) for c in one_read[2])
        hi.q_offset = self.hi.Qenc_Illumina
        hi.load_read('illumina_read', one_read[1], ill_quality)

        loaded_qualities = [frag[0].qual for frag in hi.ifragments()]
        self.assertEqual(2, len(loaded_qualities))
        self.assertEqual(loaded_qualities[0], loaded_qualities[1])
Esempio n. 13
0
 def setUp(self):
     """
     Create the 'rapi_bwa' aligner, store it on the test case, then run
     the _align_mini_ref_seqs helper before each test.
     """
     aligner = HiRapiAligner('rapi_bwa')
     self.hi = aligner
     self._align_mini_ref_seqs()
Esempio n. 14
0
class TestHiRapiAlignments(unittest.TestCase):
    """
    Tests that run on an aligner whose batch of mini_ref read pairs has
    already been aligned in setUp.
    """

    def setUp(self):
        """Create the aligner and align the mini_ref reads."""
        self.hi = HiRapiAligner('rapi_bwa')
        self._align_mini_ref_seqs()

    def tearDown(self):
        """Release the aligner's resources after each test."""
        self.hi.release_resources()

    def test_load_reference_again(self):
        """Loading the reference a second time must not raise."""
        # should "just work"
        self.hi.load_ref(test_utils.MiniRefMemPath)

    def test_sam(self):
        """The header-less SAM output matches the expected mini_ref SAM text."""
        # Named sam_buffer rather than `io` to avoid shadowing the stdlib module.
        sam_buffer = StringIO()
        self.hi.write_sam(sam_buffer, include_header=False)
        sam = sam_buffer.getvalue()
        expected_sam = test_utils.rapi_mini_ref_seqs_sam_no_header()
        self.assertEqual(expected_sam, sam)

    def _align_mini_ref_seqs(self):
        """
        Load the mini reference, load every mini_ref record as a read pair
        and align the resulting batch.

        Raises RuntimeError if a record does not have 5 fields.
        """
        self.hi.load_ref(test_utils.MiniRefMemPath)
        reads = test_utils.get_mini_ref_seqs()
        for row in reads:
            if len(row) != 5:
                raise RuntimeError(
                    "Unexpected number of fields in mini_ref read record")
            self.hi.load_pair(*row)
        self.hi.align_batch()
Esempio n. 15
0
 def setUp(self):
     """Create the 'rapi_bwa' aligner and store it on the test case."""
     aligner = HiRapiAligner('rapi_bwa')
     self.hi = aligner
Esempio n. 16
0
class mapper(Mapper):
    """
    Aligns sequences to a reference genome.

    @input-record: C{key} does not matter (standard LineRecordReader);
    C{value} is a tab-separated text line with 5 fields: ID, read_seq,
    read_qual, mate_seq, mate_qual.

    @output-record: protobuf-serialized mapped pairs (map-reduce job) or alignment
    records in SAM format (map-only job).

    @jobconf-param: C{mapred.reduce.tasks} number of Hadoop reduce tasks to launch.
    If the value of this property is set to 0, then the mapper will directly output
    the mappings in SAM format, like BWA.  If set to a value > 0 the mapper will output
    mappings in the protobuf serialized format for the rmdup reducer.

    @jobconf-param: C{seal.seqal.log.level} logging level,
    specified as a logging module literal.

    @jobconf-param: C{mapred.cache.archives} distributed
    cache entry for the bwa index archive. The entry
    is of the form HDFS_PATH#LINK_NAME. The archive for a given
    chromosome must contain (at the top level, i.e., no directories) all
    files generated by 'bwa index' for that chromosome.

    @jobconf-param: C{seal.seqal.alignment.max.isize}: if the
    inferred isize is greater than this value, Smith-Waterman alignment
    for unmapped reads will be skipped.

    @jobconf-param: C{seal.seqal.alignment.min.isize}: optional; when set,
    the value is copied to the aligner's C{isize_min} option.

    @jobconf-param: C{seal.seqal.pairing.batch.size}: how many
    sequences should be processed at a time by the pairing
    function. Status will be updated at each new batch: therefore,
    lowering this value can help avoid timeouts.

    @jobconf-param: C{seal.seqal.fastq-subformat} Specifies base quality
    score encoding.  Supported types are: 'fastq-sanger' and 'fastq-illumina'.

    @jobconf-param: C{mapred.create.symlink} must be set to 'yes'.

    @jobconf-param: C{seal.seqal.min_hit_quality} mapping quality
    threshold below which the mapping will be discarded.

    @jobconf-param: C{seal.seqal.nthreads} number of concurrent threads for
    the main alignment operation (default 1).

    @note: this implementation currently supports map-only jobs only; the
    constructor raises C{NotImplementedError} when C{mapred.reduce.tasks} > 0.
    """
    SUPPORTED_FORMATS = "fastq-illumina", "fastq-sanger"
    DEFAULT_FORMAT = "fastq-sanger"
    COUNTER_CLASS = "SEQAL"
    DeprecationMap = {
      "seal.seqal.log.level":           "bl.seqal.log.level",
      "seal.seqal.alignment.max.isize": "bl.seqal.alignment.max.isize",
      "seal.seqal.pairing.batch.size":  "bl.seqal.pairing.batch.size",
      "seal.seqal.fastq-subformat":     "bl.seqal.fastq-subformat",
      "seal.seqal.min_hit_quality":     "bl.seqal.min_hit_quality",
      "seal.seqal.remove_unmapped":     "bl.seqal.remove_unmapped",
      "seal.seqal.discard_duplicates":  "bl.seqal.discard_duplicates",
      "seal.seqal.nthreads":            "bl.seqal.nthreads",
      "seal.seqal.trim.qual":           "bl.seqal.trim.qual",
    }

    def __get_configuration(self, ctx):
        """
        Read the seqal settings from the job configuration, store them as
        attributes on C{self} and validate them.

        @raise ValueError: for an unknown log level or an out-of-range
        numeric setting.
        @raise NotImplementedError: for settings that are recognized but
        not currently supported (remove_unmapped, min_hit_quality > 0,
        trim_qual > 0).
        """
        # TODO:  refactor settings common to mapper and reducer
        jc = ctx.getJobConf()

        jobconf = deprecation_utils.convert_job_conf(jc, self.DeprecationMap, self.logger)

        jc_configure(self, jobconf, 'seal.seqal.log.level', 'log_level', 'INFO')
        jc_configure(self, jobconf, "seal.seqal.fastq-subformat", "format", self.DEFAULT_FORMAT)
        jc_configure_int(self, jobconf, 'seal.seqal.alignment.max.isize', 'max_isize', 1000)
        jc_configure_int(self, jobconf, 'seal.seqal.alignment.min.isize', 'min_isize', None)
        jc_configure_int(self, jobconf, 'seal.seqal.pairing.batch.size', 'batch_size', 10000)
        jc_configure_int(self, jobconf, 'seal.seqal.min_hit_quality', 'min_hit_quality', 0)
        jc_configure_bool(self, jobconf, 'seal.seqal.remove_unmapped', 'remove_unmapped', False)
        jc_configure_int(self, jobconf, 'seal.seqal.nthreads', 'nthreads', 1)
        jc_configure_int(self, jobconf, 'seal.seqal.trim.qual', 'trim_qual', 0)

        # Turn the level name (e.g. 'INFO') into the logging module constant.
        try:
            self.log_level = getattr(logging, self.log_level)
        except AttributeError:
            raise ValueError("Unsupported log level: %r" % self.log_level)

        if self.format not in self.SUPPORTED_FORMATS:
            raise_pydoop_exception(
              "seal.seqal.fastq-subformat must be one of %r" %
              (self.SUPPORTED_FORMATS,)
              )

        if self.remove_unmapped:
            raise NotImplementedError("seal.seqal.remove_unmapped is currently unsupported")
        if self.min_hit_quality > 0:
            raise NotImplementedError("seal.seqal.min_hit_quality is currently unsupported")
        if self.trim_qual > 0:
            raise NotImplementedError("seal.seqal.trim_qual is currently unsupported")

        if self.max_isize <= 0:
            raise ValueError("'seal.seqal.alignment.max.isize' must be > 0, if specified [1000]")

        if self.batch_size <= 0:
            raise ValueError("'seal.seqal.pairing.batch.size' must be > 0, if specified [10000]")

        # minimum qual value required for a hit to be kept.  By default outputs all the
        # hits BWA returns.
        if self.min_hit_quality < 0:
            raise ValueError("'seal.seqal.min_hit_quality' must be >= 0, if specified [0]")

        # number of concurrent threads for main alignment operation
        if self.nthreads <= 0:
            raise ValueError("'seal.seqal.nthreads' must be > 0, if specified [1]")

        # trim quality parameter used by BWA from read trimming.  Equivalent to
        # the -q parameter for bwa align
        if self.trim_qual < 0:
            raise ValueError("'seal.seqal.trim.qual' must be >= 0, if specified [0]")

        # The job is map-only unless a positive number of reduce tasks is set.
        has_reducers = jc.hasKey('mapred.reduce.tasks') and jc.getInt('mapred.reduce.tasks') > 0
        self.__map_only = not has_reducers

    def get_reference_root_from_archive(self, ref_dir):
        """
        Given a directory containing an indexed reference,
        such that all its files have a common name (except the extension),
        this method finds the path to the reference including the common name.
         e.g. my_reference/hg_18.bwt
              my_reference/hg_18.rsax
              my_reference/hg_18.sax   => "my_reference/hg_18"
              my_reference/hg_18.pac
              my_reference/.irrelevant_file

        @raise RuntimeError: if the index files share no common prefix
        (including the case where no index file is found).
        """
        file_list = os.listdir(ref_dir)
        self.logger.debug("file_list extracted from reference archive: %s", file_list)

        # Keep only visible files whose extension is a known BWA index extension.
        filtered_file_list = [
            p for p in file_list
            if not p.startswith('.') and os.path.splitext(p)[1].lstrip('.') in _BWA_INDEX_EXT
        ]
        prefix = os.path.commonprefix(filtered_file_list).rstrip('.')
        if not prefix:
            # Build the (possibly truncated) file listing first: applying `%`
            # directly to a conditional expression would drop the message text
            # whenever the list is long.
            if len(filtered_file_list) < 15:
                shown_files = ', '.join(filtered_file_list)
            else:
                shown_files = "{}, ...".format(', '.join(filtered_file_list[0:15]))
            raise RuntimeError(
                "Could not determine common prefix from list of files (%s)" % shown_files)
        full_prefix = os.path.join(ref_dir, prefix)
        return full_prefix

    def __init__(self, ctx):
        """
        Configure the mapper from the job configuration, create and set up
        the HiRapiAligner, load the reference and build the hit processor
        chain.

        @param ctx: the task context; its job configuration is read for the
        settings and for the reference archive.

        @raise NotImplementedError: if the job is not map-only.
        """
        super(mapper, self).__init__(ctx)
        self.logger = logging.getLogger("seqal")
        self.__get_configuration(ctx)
        logging.basicConfig(level=self.log_level)
        self.event_monitor = HadoopEventMonitor(self.COUNTER_CLASS, logging.getLogger("mapper"), ctx)

        pe = True  # single-end sequence alignment not yet supported by Seqal
        self.hi_rapi = HiRapiAligner('rapi_bwa', paired=pe)

        # opts
        self.hi_rapi.opts.n_threads = self.nthreads
        self.hi_rapi.opts.isize_max = self.max_isize
        if self.min_isize is not None:
            self.hi_rapi.opts.isize_min = self.min_isize
        # The aligner attribute is `q_offset`; assigning to a misspelled name
        # would silently create a new attribute and leave the encoding unset.
        if self.format == "fastq-illumina":
            self.hi_rapi.q_offset = self.hi_rapi.Qenc_Illumina
        else:
            self.hi_rapi.q_offset = self.hi_rapi.Qenc_Sanger
        # end opts

        self.logger.info("Using the %s aligner plugin, aligner version %s, plugin version %s",
                self.hi_rapi.aligner_name, self.hi_rapi.aligner_version, self.hi_rapi.plugin_version)
        self.logger.info("Working in %s mode", 'paired-end' if pe else 'single-end')

        # allocate space for reads
        self.logger.debug("Reserving batch space for %s reads", self.batch_size)
        self.hi_rapi.reserve_space(self.batch_size)

        # load reference
        reference_root = self.get_reference_root_from_archive(utils.get_ref_archive(ctx.getJobConf()))
        self.logger.info("Full reference path (prefix): %s", reference_root)
        with self.event_monitor.time_block("Loading reference %s" % reference_root):
            self.hi_rapi.load_ref(reference_root)

        ######## assemble hit processor chain
        chain = RapiFilterLink(self.event_monitor)
        chain.remove_unmapped = self.remove_unmapped
        chain.min_hit_quality = self.min_hit_quality
        if self.__map_only:
            chain.set_next(RapiEmitSamLink(ctx, self.event_monitor, self.hi_rapi))
        else:
            raise NotImplementedError("Only mapping mode is supported at the moment")
        self.hit_visitor_chain = chain

    def _visit_hits(self):
        """Pass every fragment in the aligner batch to the hit processor chain."""
        for read_tpl in self.hi_rapi.ifragments():
            self.hit_visitor_chain.process(read_tpl)

    def _align_and_emit_batch(self):
        """Align the current batch, process its fragments, then clear the batch."""
        self.hi_rapi.align_batch()
        self._visit_hits()
        self.hi_rapi.clear_batch()

    def map(self, ctx):
        """
        Load one input record (a tab-separated read pair) into the aligner
        batch; once the batch reaches the configured size, align it and
        emit the results.
        """
        v = ctx.value
        f_id, r1, q1, r2, q2 = v.split("\t")
        self.hi_rapi.load_pair(f_id, r1, q1, r2, q2)
        if self.hi_rapi.batch_size >= self.batch_size:
            self._align_and_emit_batch()

    def close(self):
        """
        Align and emit any reads still left in the aligner batch, then
        release the aligner's resources.
        """
        try:
            # If there are any reads left in the aligner batch,
            # align them too
            if self.hi_rapi.batch_size > 0:
                self._align_and_emit_batch()
        finally:
            # Release the aligner even when the final alignment fails.
            self.hi_rapi.release_resources()