def __get_configuration(self, ctx):
    """Load the seqal settings from the job configuration and validate them.

    The job configuration obtained from ``ctx`` is passed through
    ``deprecation_utils.convert_job_conf`` and each setting is then stored
    on ``self`` through the ``jc_configure*`` helpers.  Afterwards
    ``self.log_level`` is replaced by the numeric constant of the same name
    from the ``logging`` module and ``self.__map_only`` records whether the
    job has no reduce tasks.

    Raises:
        ValueError: if the log level name is not a ``logging`` level, or a
            numeric setting is outside its allowed range.
        NotImplementedError: if ``remove_unmapped``, ``min_hit_quality`` or
            ``trim.qual`` is enabled; these are currently unsupported.

    An unsupported fastq sub-format is reported through
    ``raise_pydoop_exception``.
    """
    # TODO: refactor settings common to mapper and reducer
    jc = ctx.getJobConf()
    jobconf = deprecation_utils.convert_job_conf(jc, self.DeprecationMap, self.logger)

    jc_configure(self, jobconf, 'seal.seqal.log.level', 'log_level', 'INFO')
    jc_configure(self, jobconf, "seal.seqal.fastq-subformat", "format", self.DEFAULT_FORMAT)
    jc_configure_int(self, jobconf, 'seal.seqal.alignment.max.isize', 'max_isize', 1000)
    jc_configure_int(self, jobconf, 'seal.seqal.alignment.min.isize', 'min_isize', None)
    jc_configure_int(self, jobconf, 'seal.seqal.pairing.batch.size', 'batch_size', 10000)
    jc_configure_int(self, jobconf, 'seal.seqal.min_hit_quality', 'min_hit_quality', 0)
    jc_configure_bool(self, jobconf, 'seal.seqal.remove_unmapped', 'remove_unmapped', False)
    jc_configure_int(self, jobconf, 'seal.seqal.nthreads', 'nthreads', 1)
    jc_configure_int(self, jobconf, 'seal.seqal.trim.qual', 'trim_qual', 0)

    # Resolve the level name to its numeric constant.  Checking only that the
    # attribute exists is not enough: names such as "info" or "Logger" are
    # attributes of the logging module too, but they are functions/classes,
    # not levels, so anything that is not a plain integer is rejected.
    level = getattr(logging, self.log_level, None)
    if not isinstance(level, int) or isinstance(level, bool):
        raise ValueError("Unsupported log level: %r" % self.log_level)
    self.log_level = level

    if self.format not in self.SUPPORTED_FORMATS:
        raise_pydoop_exception(
            "seal.seqal.fastq-subformat must be one of %r" % (self.SUPPORTED_FORMATS,)
        )

    # Options that are recognised but not implemented yet.
    if self.remove_unmapped:
        raise NotImplementedError("seal.seqal.remove_unmapped is currently unsupported")
    if self.min_hit_quality > 0:
        raise NotImplementedError("seal.seqal.min_hit_quality is currently unsupported")
    if self.trim_qual > 0:
        # The message names the property exactly as it is read above.
        raise NotImplementedError("seal.seqal.trim.qual is currently unsupported")

    if self.max_isize <= 0:
        raise ValueError("'seal.seqal.alignment.max.isize' must be > 0, if specified [1000]")

    if self.batch_size <= 0:
        raise ValueError("'seal.seqal.pairing.batch.size' must be > 0, if specified [10000]")

    # minimum qual value required for a hit to be kept.  By default outputs
    # all the hits BWA returns.
    if self.min_hit_quality < 0:
        raise ValueError("'seal.seqal.min_hit_quality' must be >= 0, if specified [0]")

    # number of concurrent threads for main alignment operation
    if self.nthreads <= 0:
        raise ValueError("'seal.seqal.nthreads' must be > 0, if specified [1]")

    # trim quality parameter used by BWA from read trimming.  Equivalent to
    # the -q parameter for bwa align
    if self.trim_qual < 0:
        raise ValueError("'seal.seqal.trim.qual' must be >= 0, if specified [0]")

    # The job is map-only unless a positive number of reduce tasks is set.
    self.__map_only = not (
        jc.hasKey('mapred.reduce.tasks') and jc.getInt('mapred.reduce.tasks') > 0
    )
def __init__(self, ctx):
    """Initialise the reducer from the task context ``ctx``.

    Reads the log level and the ``discard_duplicates`` flag from the job
    configuration, configures the root logger, and creates the event
    monitor and the SAM output sink used by this reducer.
    """
    super(reducer, self).__init__(ctx)

    # The raw job configuration goes through convert_job_conf together with
    # this class's DeprecationMap (presumably to translate deprecated
    # property names -- confirm against deprecation_utils).
    converted_conf = deprecation_utils.convert_job_conf(
        ctx.getJobConf(), self.DeprecationMap, logging.getLogger("seqal")
    )

    jc_configure(self, converted_conf, 'seal.seqal.log.level', 'log_level', 'INFO')
    jc_configure_bool(
        self, converted_conf, 'seal.seqal.discard_duplicates', 'discard_duplicates', False
    )

    # self.log_level is handed to basicConfig exactly as configured above.
    logging.basicConfig(level=self.log_level)

    reducer_logger = logging.getLogger("reducer")
    self.event_monitor = HadoopEventMonitor(self.COUNTER_CLASS, reducer_logger, ctx)

    # The sink receives the monitor created just above.
    self.__output_sink = EmitSamLink(ctx, self.event_monitor)