def __init__(self, ctx):
    super(type(self), self).__init__(ctx)
    self.__get_configuration(ctx)
    logging.basicConfig(level=self.log_level)
    self.event_monitor = HadoopEventMonitor(self.COUNTER_CLASS, logging.getLogger("mapper"), ctx)

    self.aligner = BwaAligner()
    self.aligner.event_monitor = self.event_monitor
    self.aligner.qformat = self.format
    self.aligner.max_isize = self.max_isize
    self.aligner.nthreads = self.nthreads
    self.aligner.trim_qual = self.trim_qual
    self.aligner.mmap_enabled = True

    ######## assemble hit processor chain
    chain = FilterLink(self.event_monitor)
    chain.remove_unmapped = self.remove_unmapped
    chain.min_hit_quality = self.min_hit_quality
    if self.__map_only:
        chain.set_next(EmitSamLink(ctx, self.event_monitor))
    else:
        chain.set_next(MarkDuplicatesEmitter(ctx, self.event_monitor))
    self.aligner.hit_visitor = chain

    ######## set the path to the reference index
    self.ref_archive = utils.get_ref_archive(ctx.getJobConf())
    self.aligner.reference = self.get_reference_root(self.ref_archive)

    # part of the code is a workaround for accumulating records, see #331
    isplit = InputSplit(ctx.getInputSplit())
    self.split_end = isplit.offset + isplit.length
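# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original Seal code): the chain assembled
# above relies only on a small protocol -- each link exposes set_next()/next_link
# and a process(pair) method that may forward the pair to the next link, as the
# TestEmitSamLink tests below exercise. Assuming HitProcessorChainLink can be
# subclassed this way, a hypothetical pass-through link could be spliced into
# the chain much like FilterLink or MarkDuplicatesEmitter. The class name and
# the counter label are made up for illustration.
class CountingLink(HitProcessorChainLink):
    """Hypothetical link that counts pairs and forwards them unchanged."""

    def __init__(self, monitor):
        super(CountingLink, self).__init__()
        self.monitor = monitor

    def process(self, pair):
        self.monitor.count("pairs seen")  # same count(name) call used by the reducer below
        if self.next_link:
            self.next_link.process(pair)  # forward the pair, as test_forward_pair expects of EmitSamLink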
class TestEmitSamLink(unittest.TestCase):

    def setUp(self):
        self.map_ctx = map_context(None, None)
        self.count_group = "Test"
        self.logger = SavingLogger()
        self.monitor = HadoopEventMonitor(self.count_group, self.logger, self.map_ctx)
        self.emitter = EmitSamLink(self.map_ctx, self.monitor)
        # create two mappings, m1, m2. We put them in self.pair
        # m1 has:
        #   name = first
        #   tid = tid1
        # m2 has:
        #   name = second
        #   tid = tid2
        self.pair = [SimpleMapping(), SimpleMapping()]
        self.m1, self.m2 = self.pair
        self.m1.set_name("first")
        self.m1.tid = "tid1"
        self.m2.set_name("second")
        self.m2.tid = "tid2"

    def test_constructor_link(self):
        h = EmitSamLink(self.map_ctx, self.monitor)
        self.assertTrue(h.next_link is None)
        other = HitProcessorChainLink()
        h = EmitSamLink(self.map_ctx, self.monitor, other)
        self.assertEqual(other, h.next_link)

    def test_process(self):
        self.emitter.process(self.pair)
        self.assertEqual(["first", "second"], sorted(self.map_ctx.emitted.keys()))
        self.assertEqual(1, len(self.map_ctx.emitted["first"]))
        self.assertTrue(re.search("tid1", self.map_ctx.emitted["first"][0]))
        self.assertEqual(1, len(self.map_ctx.emitted["second"]))
        self.assertTrue(re.search("tid2", self.map_ctx.emitted["second"][0]))

    def test_emitted_type(self):
        self.emitter.process(self.pair)
        for k in self.map_ctx.emitted.keys():
            self.assertTrue(isinstance(k, str))
        for v in [item for ary in self.map_ctx.emitted.values() for item in ary]:
            self.assertTrue(isinstance(v, str))

    def test_first_null(self):
        self.pair[0] = None
        self.emitter.process(self.pair)
        self.assertEqual(["second"], self.map_ctx.emitted.keys())
        self.assertEqual(1, len(self.map_ctx.emitted["second"]))
        self.assertTrue(re.search("tid2", self.map_ctx.emitted["second"][0]))

    def test_second_null(self):
        self.pair[1] = None
        self.emitter.process(self.pair)
        self.assertEqual(["first"], self.map_ctx.emitted.keys())
        self.assertEqual(1, len(self.map_ctx.emitted["first"]))
        self.assertTrue(re.search("tid1", self.map_ctx.emitted["first"][0]))

    def test_forward_pair(self):
        class Receiver(object):
            def process(self, pair):
                self.received = pair
        receiver = Receiver()
        self.emitter.set_next(receiver)
        self.emitter.process(self.pair)
        self.assertEqual(self.pair, receiver.received)
class reducer(Reducer):
    COUNTER_CLASS = "SEQAL"
    # TODO: refactor so that mapper and reducer have a common place for things like this constant
    DeprecationMap = {
        'seal.seqal.log.level': 'bl.seqal.log.level',
        'seal.seqal.discard_duplicates': 'bl.seqal.discard_duplicates',
    }

    def __init__(self, ctx):
        super(reducer, self).__init__(ctx)
        jc = ctx.getJobConf()
        logger = logging.getLogger("seqal")
        jobconf = deprecation_utils.convert_job_conf(jc, self.DeprecationMap, logger)
        jc_configure(self, jobconf, 'seal.seqal.log.level', 'log_level', 'INFO')
        jc_configure_bool(self, jobconf, 'seal.seqal.discard_duplicates', 'discard_duplicates', False)
        logging.basicConfig(level=self.log_level)
        self.event_monitor = HadoopEventMonitor(self.COUNTER_CLASS, logging.getLogger("reducer"), ctx)
        self.__output_sink = EmitSamLink(ctx, self.event_monitor)

    def __process_unmapped_pairs(self, ctx):
        while ctx.nextValue():
            value = ctx.getInputValue()
            pair = protobuf_mapping.unserialize_pair(value)
            self.__output_sink.process(pair)

    def reduce(self, ctx):
        # create the "workspace"
        self.__pairs = []
        self.__unpaired = []

        # gather input
        key_values = ctx.getInputKey().split(':')
        if key_values[0] == seqal_app.UNMAPPED_STRING:
            # pair of unmapped sequences
            self.__process_unmapped_pairs(ctx)
        else:
            if len(key_values) != 3:
                raise RuntimeError("Unexpected key length %d. Expected key format is ref_id:pos:orient" % len(key_values))
            # convert key values and make it a tuple
            key = (int(key_values[0]), int(key_values[1]), key_values[2] == 'R')  # last value is True if reverse strand
            have_pairs = False  # keep track of whether we have at least one real pair

            # load mappings
            while ctx.nextValue():
                value = ctx.getInputValue()
                if value == seqal_app.PAIR_STRING:
                    have_pairs = True
                else:
                    pair = protobuf_mapping.unserialize_pair(value)
                    if pair[0] is None or pair[0].is_unmapped():
                        # Sanity check. pair[0] should never be None or unmapped here.
                        raise ValueError("Error! Got None or unmapped in first read for key %s. pair: %s" % (key, pair))
                    if pair[1] and pair[1].is_unmapped():
                        self.__output_sink.process((pair[1], None))
                        self.__unpaired.append((pair[0], None))
                    elif pair[1] is None:
                        self.__unpaired.append(pair)
                    else:
                        # Two mapped reads.
                        # pair[0] should never be unmapped. That case should be handled by
                        # __process_unmapped_pairs.
                        self.__pairs.append(pair)
                        have_pairs = True

            self.__process_pairs()
            self.__process_fragments(have_pairs)

        # clean-up the workspace
        self.__pairs = None
        self.__unpaired = None

    def __process_pairs(self):
        # All pairs whose 5'-most coordinate matches the key,
        # and are not duplicate pairs, will be emitted.
        keep_pairs = dict()
        for p in self.__pairs:
            p_key = get_pair_key(p)  # makes the key on which we base the comparison between pairs
            # If we already have a pair with this key, then keep the one with the highest score.
            # If we haven't already seen the key, put the pair in the hash.
            if p_key in keep_pairs:
                if get_map_pair_score(keep_pairs[p_key]) < get_map_pair_score(p):
                    dup_pair = keep_pairs[p_key]
                    keep_pairs[p_key] = p
                else:
                    dup_pair = p
                self.event_monitor.count("duplicate pairs")
                if not self.discard_duplicates:
                    # emit the duplicates if we need to
                    for r in dup_pair:
                        r.set_duplicate(True)
                    self.__output_sink.process(dup_pair)
            else:
                keep_pairs[p_key] = p
        # finally, emit the pairs that we've kept
        self.event_monitor.count("rmdup unique pairs", len(keep_pairs))
        for pair in keep_pairs.itervalues():
            self.__output_sink.process(pair)

    def __process_fragments(self, with_pairs):
        # All fragments that are not the duplicate of another
        # fragment, be it in a pair or alone, will be emitted.
        #
        # All fragments we analyze here will have been emitted for the same coordinate
        # (the one referenced by the key). Therefore, they automatically have a
        # duplicate in any pairs we have received. As a consequence, we only look at
        # them if we haven't seen any pairs.
        #
        # with_pairs => implies we have proper pairs for the key position,
        # so all lone fragments are to be discarded as duplicates.
        #
        # not with_pairs => we have no proper pairs for the key position.
        # Duplicates will be selected by quality.
        if with_pairs:
            # all fragments are duplicates
            self.event_monitor.count("duplicate fragments", len(self.__unpaired))
            if not self.discard_duplicates:
                for dup in self.__unpaired:  # for each unpaired fragment
                    dup[0].set_duplicate(True)
                    self.__output_sink.process(dup)
        else:
            fragments = dict()
            for m, none in self.__unpaired:  # for each unpaired fragment
                k = get_mapping_key(m)
                if k in fragments:
                    if get_map_score(fragments[k]) < get_map_score(m):
                        dup = fragments[k]
                        fragments[k] = m
                    else:
                        dup = m
                    self.event_monitor.count("duplicate fragments")
                    if not self.discard_duplicates:
                        dup.set_duplicate(True)
                        self.__output_sink.process((dup, None))
                else:
                    fragments[k] = m
            # now emit the remaining fragments
            self.event_monitor.count("rmdup unique fragments", len(fragments))
            for m in fragments.itervalues():
                self.__output_sink.process((m, None))
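# ---------------------------------------------------------------------------
# A self-contained sketch (illustration only, not Seal API) of the selection
# pattern that __process_pairs and __process_fragments above both apply:
# entries sharing the same positional key compete, the highest-scoring entry
# is kept, and the others are reported as duplicates. The helper name and the
# (key, score, payload) tuples are hypothetical stand-ins for get_pair_key /
# get_map_pair_score and the real mapping objects.
def keep_best_by_key(entries):
    """entries: iterable of (key, score, payload) tuples.

    Returns (kept_payloads, duplicate_payloads)."""
    best = {}        # key -> (score, payload) of the best entry seen so far
    duplicates = []  # payloads that lost the comparison
    for key, score, payload in entries:
        if key in best:
            if best[key][0] < score:
                duplicates.append(best[key][1])
                best[key] = (score, payload)
            else:
                duplicates.append(payload)
        else:
            best[key] = (score, payload)
    return [p for _, p in best.values()], duplicates

# e.g. keep_best_by_key([(('chr1', 100), 60, 'a'), (('chr1', 100), 37, 'b')])
# returns (['a'], ['b'])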