def _configure(self):
    """Load logging and task settings from the job configuration.

    Options read from ``self.ctx.getJobConf()``:

    * ``bl.mr.loglevel`` -> ``log_level`` (default ``"INFO"``)
    * ``mapred.task.timeout`` -> ``timeout`` (no default is passed here)
    * ``bl.hdfs.user`` -> ``user`` (default ``""``)

    Also binds ``self.logger`` to the ``"mapper"`` logger and applies the
    configured level to it.
    """
    job_conf = self.ctx.getJobConf()

    # The log level has to be resolved before the logger is configured,
    # because the logger's level is taken from ``self.log_level``.
    pu.jc_configure_log_level(
        self, job_conf, "bl.mr.loglevel", "log_level", "INFO"
    )
    self.logger = logging.getLogger("mapper")
    self.logger.setLevel(self.log_level)

    pu.jc_configure_int(self, job_conf, "mapred.task.timeout", "timeout")
    pu.jc_configure(self, job_conf, "bl.hdfs.user", "user", "")
def __init__(self, context):
    """Initialise the writer and open this task's output file on HDFS.

    The task partition (``part``), the work output directory (``outdir``),
    the key/value separator (``sep``, default: tab) and the HDFS user
    (``hdfs_user``) are read from the job configuration.  The output file
    name is ``<outdir>/part-<part zero-padded to 5 digits>``.

    :param context: task context; must provide ``getJobConf()``.
    """
    super(Writer, self).__init__(context)
    self.logger = logging.getLogger("Writer")

    conf = context.getJobConf()
    jc_configure_int(self, conf, "mapred.task.partition", "part")

    # (key, attribute[, default]) -- entries without a default are passed
    # through without one, exactly as written.
    # NOTE(review): ``None`` is passed explicitly as the default for the
    # HDFS user -- confirm jc_configure treats that as "optional".
    string_options = (
        ("mapred.work.output.dir", "outdir"),
        ("mapred.textoutputformat.separator", "sep", "\t"),
        ("pydoop.hdfs.user", "hdfs_user", None),
    )
    for option in string_options:
        jc_configure(self, conf, *option)

    self.outfn = "%s/part-%05d" % (self.outdir, self.part)
    # The handle is kept open on the instance; it is not closed here.
    self.file = hdfs.open(self.outfn, "w", user=self.hdfs_user)
def __get_tiget_conf(self, jc):
    """Read the ``bl.mr.seq.tiget.*`` options from *jc* into attributes.

    Attributes set (with the defaults passed here):

    * ``max_hits`` (10) and ``max_start`` (4), as ints
    * ``min_identity`` (95.0), as a float, left as configured
    * ``min_al2seq`` (15.0), as a float, then divided by 100
    * ``min_score_diff`` (20.0), as a float

    :param jc: job configuration object to read from.
    """
    int_options = (
        ('bl.mr.seq.tiget.max.hits', 'max_hits', 10),
        ('bl.mr.seq.tiget.max.start', 'max_start', 4),
    )
    for key, attr, default in int_options:
        pu.jc_configure_int(self, jc, key, attr, default)

    pu.jc_configure_float(
        self, jc, 'bl.mr.seq.tiget.min.identity.percent', 'min_identity', 95.0
    )

    pu.jc_configure_float(
        self, jc, 'bl.mr.seq.tiget.min.al2seq.percent', 'min_al2seq', 15.0
    )
    # The option is given as a percentage; keep it as a fraction.
    self.min_al2seq /= 100

    pu.jc_configure_float(
        self, jc, 'bl.mr.seq.tiget.min.score.diff', 'min_score_diff', 20.0
    )
def __get_blastall_conf(self, jc):
    """Read the ``bl.mr.seq.blastall.*`` options from *jc* into attributes.

    Attributes set (with the defaults passed here):

    * ``blastall_exe`` (``'/usr/bin/blastall'``), ``program`` (``'blastn'``)
      and ``db_name`` (no default is passed), as strings
    * ``evalue`` (1.0), as a float
    * ``gap_cost`` (1) and ``word_size`` (20), as ints
    * ``filter`` (False), as a bool

    :param jc: job configuration object to read from.
    """
    # (key, attribute[, default]); db.name is deliberately passed without
    # a default, as in the direct call form.
    string_options = (
        ('bl.mr.seq.blastall.exe', 'blastall_exe', '/usr/bin/blastall'),
        ('bl.mr.seq.blastall.program', 'program', 'blastn'),
        ('bl.mr.seq.blastall.db.name', 'db_name'),
    )
    for option in string_options:
        pu.jc_configure(self, jc, *option)

    pu.jc_configure_float(
        self, jc, 'bl.mr.seq.blastall.evalue', 'evalue', 1.0
    )

    int_options = (
        ('bl.mr.seq.blastall.gap.cost', 'gap_cost', 1),
        ('bl.mr.seq.blastall.word.size', 'word_size', 20),
    )
    for key, attr, default in int_options:
        pu.jc_configure_int(self, jc, key, attr, default)

    pu.jc_configure_bool(
        self, jc, 'bl.mr.seq.blastall.filter', 'filter', False
    )
def __get_configuration(self, ctx):
    """Read and validate the ``seal.seqal.*`` options for this task.

    The job configuration from *ctx* is first passed through
    ``deprecation_utils.convert_job_conf`` with ``self.DeprecationMap``;
    the options are then read from the converted configuration, and the
    log level name is replaced by the matching ``logging`` attribute.

    :param ctx: task context; must provide ``getJobConf()``.
    :raises ValueError: if the log level name is not an attribute of
        ``logging``, or if a numeric option is below its minimum.

    An unsupported fastq sub-format is reported through
    ``raise_pydoop_exception``.
    """
    # TODO: refactor settings common to mapper and reducer
    jc = ctx.getJobConf()
    seqal_logger = logging.getLogger("seqal")
    jobconf = deprecation_utils.convert_job_conf(
        jc, self.DeprecationMap, seqal_logger
    )

    jc_configure(self, jobconf, 'seal.seqal.log.level', 'log_level', 'INFO')
    jc_configure(
        self, jobconf, "seal.seqal.fastq-subformat", "format",
        self.DEFAULT_FORMAT
    )
    for key, attr, default in (
        ('seal.seqal.alignment.max.isize', 'max_isize', 1000),
        ('seal.seqal.pairing.batch.size', 'batch_size', 10000),
        ('seal.seqal.min_hit_quality', 'min_hit_quality', 0),
    ):
        jc_configure_int(self, jobconf, key, attr, default)
    jc_configure_bool(
        self, jobconf, 'seal.seqal.remove_unmapped', 'remove_unmapped', False
    )
    for key, attr, default in (
        ('seal.seqal.nthreads', 'nthreads', 1),
        ('seal.seqal.trim.qual', 'trim_qual', 0),
    ):
        jc_configure_int(self, jobconf, key, attr, default)

    # Swap the level name for the value exposed by the logging module.
    level_name = self.log_level
    try:
        self.log_level = getattr(logging, level_name)
    except AttributeError:
        raise ValueError("Unsupported log level: %r" % level_name)

    if self.format not in self.SUPPORTED_FORMATS:
        raise_pydoop_exception(
            "seal.seqal.fastq-subformat must be one of %r"
            % (self.SUPPORTED_FORMATS,)
        )

    # (attribute, zero allowed?, error message), checked in this order.
    range_checks = (
        ("max_isize", False,
         "'seal.seqal.alignment.max.isize' must be > 0, if specified [1000]"),
        ("batch_size", False,
         "'seal.seqal.pairing.batch.size' must be > 0, if specified [10000]"),
        # minimum qual value required for a hit to be kept.  By default
        # outputs all the hits BWA returns.
        ("min_hit_quality", True,
         "'seal.seqal.min_hit_quality' must be >= 0, if specified [0]"),
        # number of concurrent threads for main alignment operation
        ("nthreads", False,
         "'seal.seqal.nthreads' must be > 0, if specified [1]"),
        # trim quality parameter used by BWA from read trimming.
        # Equivalent to the -q parameter for bwa align
        ("trim_qual", True,
         "'seal.seqal.trim.qual' must be >= 0, if specified [0]"),
    )
    for attr, zero_ok, message in range_checks:
        value = getattr(self, attr)
        out_of_range = value < 0 if zero_ok else value <= 0
        if out_of_range:
            raise ValueError(message)

    # Map-only unless a positive number of reduce tasks is configured.
    # This reads the original job conf, not the converted one.
    has_reducers = (
        jc.hasKey('mapred.reduce.tasks')
        and jc.getInt('mapred.reduce.tasks') > 0
    )
    self.__map_only = not has_reducers
def __get_configuration(self, jc):
    """Read the ``bl.mr.fasta-reader.*`` options from *jc* into attributes.

    * ``log_level``: level name (default ``self.DEFAULT_LOG_LEVEL``),
      replaced by the matching attribute of ``logging``; an unknown name
      is reported through ``raise_pydoop_exception``.
    * ``libhdfs_opts`` (``bl.libhdfs.opts``, default ``""``): when
      non-empty it is exported as the ``LIBHDFS_OPTS`` environment
      variable.
    * ``compress_header`` (False) and ``compress_seq`` (True), as bools.
    * ``compression_level`` (6), as an int.

    :param jc: job configuration object to read from.
    """
    jc_configure(
        self, jc, 'bl.mr.fasta-reader.log.level', 'log_level',
        self.DEFAULT_LOG_LEVEL
    )
    level_name = self.log_level
    try:
        self.log_level = getattr(logging, level_name)
    except AttributeError:
        raise_pydoop_exception("Unsupported log level: %r" % level_name)

    jc_configure(self, jc, "bl.libhdfs.opts", "libhdfs_opts", "")
    if self.libhdfs_opts:
        # Side effect on the process environment.
        os.environ["LIBHDFS_OPTS"] = self.libhdfs_opts

    bool_options = (
        ('bl.mr.fasta-reader.compress.header', 'compress_header', False),
        ('bl.mr.fasta-reader.compress.seq', 'compress_seq', True),
    )
    for key, attr, default in bool_options:
        jc_configure_bool(self, jc, key, attr, default)

    jc_configure_int(
        self, jc, 'bl.mr.fasta-reader.compression.level',
        'compression_level', 6
    )
def __get_configuration(self, jc):
    """Read the ``bl.mr.seq.blastall.*`` options from *jc* into attributes.

    The log level name (default ``'WARNING'``) is replaced by the matching
    attribute of ``logging``.  The remaining options are ``blastall_exe``,
    ``program`` and ``db_name`` (strings; no default is passed for
    ``db_name``), ``evalue`` (float, 1.0), ``gap_cost`` (int, 1),
    ``word_size`` (int, 20) and ``filter`` (bool, False).

    :param jc: job configuration object to read from.
    :raises ValueError: if the log level name is not an attribute of
        ``logging``.
    """
    pu.jc_configure(
        self, jc, 'bl.mr.seq.blastall.log.level', 'log_level', 'WARNING'
    )
    level_name = self.log_level
    try:
        self.log_level = getattr(logging, level_name)
    except AttributeError:
        raise ValueError("Unsupported log level: %r" % level_name)

    # (key, attribute[, default]); db.name is passed without a default.
    string_options = (
        ('bl.mr.seq.blastall.exe', 'blastall_exe', '/usr/bin/blastall'),
        ('bl.mr.seq.blastall.program', 'program', 'blastn'),
        ('bl.mr.seq.blastall.db.name', 'db_name'),
    )
    for option in string_options:
        pu.jc_configure(self, jc, *option)

    pu.jc_configure_float(
        self, jc, 'bl.mr.seq.blastall.evalue', 'evalue', 1.0
    )

    int_options = (
        ('bl.mr.seq.blastall.gap.cost', 'gap_cost', 1),
        ('bl.mr.seq.blastall.word.size', 'word_size', 20),
    )
    for key, attr, default in int_options:
        pu.jc_configure_int(self, jc, key, attr, default)

    pu.jc_configure_bool(
        self, jc, 'bl.mr.seq.blastall.filter', 'filter', False
    )
def test_jc_configure_default(self):
    """Check that each ``jc_configure*`` helper falls back to its default.

    A JobConf is built from ``CONFIGURE_EXAMPLES`` (each entry is indexed
    as ``[type tag, raw string value]``).  For every entry the helper
    matching the type tag is called with a key that is asserted to be
    absent from the JobConf, passing the example value as the default;
    the attribute set on the target object must equal that default.

    ``'float'`` entries are exercised too (compared with
    ``assertAlmostEqual``), so they are no longer silently skipped.
    """
    examples = CONFIGURE_EXAMPLES
    jc = pp.get_JobConf_object(
        dict((name, spec[1]) for name, spec in examples.items())
    )
    o = Obj()
    for k, spec in examples.items():
        kind, raw = spec[0], spec[1]
        # A key that is guaranteed to be missing forces the default path.
        nk = 'not-here-%s' % k
        self.assertFalse(jc.hasKey(nk))
        if kind == 'str':
            pu.jc_configure(o, jc, nk, k, raw)
            self.assertEqual(getattr(o, k), raw)
        elif kind == 'int':
            pu.jc_configure_int(o, jc, nk, k, int(raw))
            self.assertEqual(getattr(o, k), int(raw))
        elif kind == 'bool':
            pu.jc_configure_bool(o, jc, nk, k, raw == 'true')
            self.assertEqual(getattr(o, k), raw == 'true')
        elif kind == 'float':
            pu.jc_configure_float(o, jc, nk, k, float(raw))
            self.assertAlmostEqual(getattr(o, k), float(raw))
        elif kind == 'log_level':
            # The default is given as a level name; the stored value is
            # expected to be the corresponding logging attribute.
            pu.jc_configure_log_level(o, jc, nk, k, raw)
            self.assertEqual(getattr(o, k), getattr(logging, raw))
def test_jc_configure_default(self):
    """Verify the default-value path of the ``jc_configure*`` helpers.

    Every entry of ``CONFIGURE_EXAMPLES`` (indexed as ``[type tag, raw
    string value]``) is configured through a key that is asserted to be
    missing from the JobConf, with the example value supplied as the
    default.  The resulting attribute must match that default.

    ``'float'`` entries are covered as well, using ``assertAlmostEqual``;
    without that branch they would be skipped without any check.
    """
    w = CONFIGURE_EXAMPLES
    d = dict((key, entry[1]) for key, entry in w.items())
    jc = pp.get_JobConf_object(d)
    o = Obj()
    for k in w:
        type_tag = w[k][0]
        text = w[k][1]
        # Prefixing the key makes sure the lookup misses and the default
        # is what ends up on the object.
        nk = 'not-here-%s' % k
        self.assertFalse(jc.hasKey(nk))
        if type_tag == 'str':
            pu.jc_configure(o, jc, nk, k, text)
            self.assertEqual(getattr(o, k), text)
        elif type_tag == 'int':
            expected = int(text)
            pu.jc_configure_int(o, jc, nk, k, expected)
            self.assertEqual(getattr(o, k), expected)
        elif type_tag == 'bool':
            expected = text == 'true'
            pu.jc_configure_bool(o, jc, nk, k, expected)
            self.assertEqual(getattr(o, k), expected)
        elif type_tag == 'float':
            expected = float(text)
            pu.jc_configure_float(o, jc, nk, k, expected)
            self.assertAlmostEqual(getattr(o, k), expected)
        elif type_tag == 'log_level':
            # Default is a level name; the attribute should hold the
            # matching value from the logging module.
            pu.jc_configure_log_level(o, jc, nk, k, text)
            self.assertEqual(getattr(o, k), getattr(logging, text))
def test_jc_configure_plain(self):
    """Check the ``jc_configure*`` helpers when the key is present.

    A JobConf is built from ``CONFIGURE_EXAMPLES`` (each entry is indexed
    as ``[type tag, raw string value]``).  For every entry the helper
    matching the type tag is called without a default, and the attribute
    it sets is compared with the example value converted to that type.
    """
    examples = CONFIGURE_EXAMPLES
    raw_values = dict((name, examples[name][1]) for name in examples.keys())
    jc = pp.get_JobConf_object(raw_values)
    o = Obj()
    for k in examples.keys():
        self.assertTrue(jc.hasKey(k))
        kind = examples[k][0]
        if kind == 'str':
            pu.jc_configure(o, jc, k, k)
            self.assertEqual(getattr(o, k), examples[k][1])
        elif kind == 'int':
            pu.jc_configure_int(o, jc, k, k)
            self.assertEqual(getattr(o, k), int(examples[k][1]))
        elif kind == 'bool':
            pu.jc_configure_bool(o, jc, k, k)
            self.assertEqual(getattr(o, k), examples[k][1] == 'true')
        elif kind == 'float':
            pu.jc_configure_float(o, jc, k, k)
            self.assertAlmostEqual(getattr(o, k), float(examples[k][1]))
        elif kind == 'log_level':
            # The stored value is expected to be the logging attribute
            # named by the raw string.
            pu.jc_configure_log_level(o, jc, k, k)
            self.assertEqual(
                getattr(o, k), getattr(logging, examples[k][1])
            )
def test_jc_configure_plain(self):
    """Verify the ``jc_configure*`` helpers for keys that exist.

    Every entry of ``CONFIGURE_EXAMPLES`` (indexed as ``[type tag, raw
    string value]``) is stored in a JobConf under its own name, then read
    back through the helper matching its type tag, with no default given.
    The attribute set on the target object must equal the raw value
    converted to the tagged type.
    """
    w = CONFIGURE_EXAMPLES
    d = {}
    for name in w.keys():
        d[name] = w[name][1]
    jc = pp.get_JobConf_object(d)
    o = Obj()
    for k in w.keys():
        self.assertTrue(jc.hasKey(k))
        type_tag = w[k][0]
        text = w[k][1]
        if type_tag == 'str':
            pu.jc_configure(o, jc, k, k)
            self.assertEqual(getattr(o, k), text)
        elif type_tag == 'int':
            pu.jc_configure_int(o, jc, k, k)
            self.assertEqual(getattr(o, k), int(text))
        elif type_tag == 'bool':
            pu.jc_configure_bool(o, jc, k, k)
            self.assertEqual(getattr(o, k), text == 'true')
        elif type_tag == 'float':
            pu.jc_configure_float(o, jc, k, k)
            self.assertAlmostEqual(getattr(o, k), float(text))
        elif type_tag == 'log_level':
            # Compare against the logging attribute named by the text.
            pu.jc_configure_log_level(o, jc, k, k)
            self.assertEqual(getattr(o, k), getattr(logging, text))
def __init__(self, context):
    """Initialise the mapper and read its occurrence threshold.

    The ``filter.occurrence.threshold`` job option is configured as an
    int into ``threshold``; no default is passed here.

    :param context: task context; must provide ``getJobConf()``.
    """
    super(FilterMapper, self).__init__(context)
    job_conf = context.getJobConf()
    jc_configure_int(
        self, job_conf, "filter.occurrence.threshold", "threshold"
    )
def __get_configuration(self, ctx):
    """Load the ``seal.seqal.*`` settings for this task and validate them.

    The job configuration obtained from *ctx* is converted with
    ``deprecation_utils.convert_job_conf`` (using ``self.DeprecationMap``)
    before the options are read.  The configured log level name is
    replaced by the attribute of ``logging`` with that name.

    :param ctx: task context; must provide ``getJobConf()``.
    :raises ValueError: if the log level name is not an attribute of
        ``logging``, or if a numeric option is below its minimum.

    An unsupported fastq sub-format is reported through
    ``raise_pydoop_exception``.
    """

    def reject_if(violated, message):
        # Local guard: raise ValueError(message) when the check fails.
        if violated:
            raise ValueError(message)

    # TODO: refactor settings common to mapper and reducer
    jc = ctx.getJobConf()
    logger = logging.getLogger("seqal")
    jobconf = deprecation_utils.convert_job_conf(
        jc, self.DeprecationMap, logger
    )

    jc_configure(
        self, jobconf, 'seal.seqal.log.level', 'log_level', 'INFO'
    )
    jc_configure(
        self, jobconf, "seal.seqal.fastq-subformat", "format",
        self.DEFAULT_FORMAT
    )
    jc_configure_int(
        self, jobconf, 'seal.seqal.alignment.max.isize', 'max_isize', 1000
    )
    jc_configure_int(
        self, jobconf, 'seal.seqal.pairing.batch.size', 'batch_size', 10000
    )
    jc_configure_int(
        self, jobconf, 'seal.seqal.min_hit_quality', 'min_hit_quality', 0
    )
    jc_configure_bool(
        self, jobconf, 'seal.seqal.remove_unmapped', 'remove_unmapped', False
    )
    jc_configure_int(self, jobconf, 'seal.seqal.nthreads', 'nthreads', 1)
    jc_configure_int(self, jobconf, 'seal.seqal.trim.qual', 'trim_qual', 0)

    # Turn the level name into the value exposed by the logging module.
    requested_level = self.log_level
    try:
        self.log_level = getattr(logging, requested_level)
    except AttributeError:
        raise ValueError("Unsupported log level: %r" % requested_level)

    if self.format not in self.SUPPORTED_FORMATS:
        raise_pydoop_exception(
            "seal.seqal.fastq-subformat must be one of %r"
            % (self.SUPPORTED_FORMATS,)
        )

    reject_if(
        self.max_isize <= 0,
        "'seal.seqal.alignment.max.isize' must be > 0, if specified [1000]",
    )
    reject_if(
        self.batch_size <= 0,
        "'seal.seqal.pairing.batch.size' must be > 0, if specified [10000]",
    )
    # minimum qual value required for a hit to be kept.  By default
    # outputs all the hits BWA returns.
    reject_if(
        self.min_hit_quality < 0,
        "'seal.seqal.min_hit_quality' must be >= 0, if specified [0]",
    )
    # number of concurrent threads for main alignment operation
    reject_if(
        self.nthreads <= 0,
        "'seal.seqal.nthreads' must be > 0, if specified [1]",
    )
    # trim quality parameter used by BWA from read trimming.  Equivalent
    # to the -q parameter for bwa align
    reject_if(
        self.trim_qual < 0,
        "'seal.seqal.trim.qual' must be >= 0, if specified [0]",
    )

    # The job is map-only unless a positive reducer count is configured;
    # this is read from the original job conf, not the converted one.
    reducers_configured = (
        jc.hasKey('mapred.reduce.tasks')
        and jc.getInt('mapred.reduce.tasks') > 0
    )
    self.__map_only = not reducers_configured