Example #1
0
 def _configure(self):
     """Set up the mapper logger and read task settings from the job conf.

     The log level is configured from "bl.mr.loglevel" (default "INFO")
     and applied to the logger named "mapper".  "mapred.task.timeout" is
     read as an integer with no default supplied, and "bl.hdfs.user" is
     read with an empty-string default.
     """
     job_conf = self.ctx.getJobConf()
     pu.jc_configure_log_level(
         self, job_conf, "bl.mr.loglevel", "log_level", "INFO"
     )
     self.logger = mapper_logger = logging.getLogger("mapper")
     mapper_logger.setLevel(self.log_level)
     # Remaining task-level settings.
     pu.jc_configure_int(self, job_conf, "mapred.task.timeout", "timeout")
     pu.jc_configure(self, job_conf, "bl.hdfs.user", "user", "")
Example #2
0
 def __init__(self, context):
   """Read the output settings from the job conf and open the part file.

   The partition number ("mapred.task.partition") and output directory
   ("mapred.work.output.dir") are requested without a default; the field
   separator defaults to a tab and the HDFS user to None.  The output
   file "<outdir>/part-<5-digit partition>" is then opened for writing
   through hdfs.open.
   """
   super(Writer, self).__init__(context)
   self.logger = logging.getLogger("Writer")
   job_conf = context.getJobConf()
   jc_configure_int(self, job_conf, "mapred.task.partition", "part")
   jc_configure(self, job_conf, "mapred.work.output.dir", "outdir")
   # Settings that come with a fallback value: (key, attribute, default).
   optional_settings = (
     ("mapred.textoutputformat.separator", "sep", "\t"),
     ("pydoop.hdfs.user", "hdfs_user", None),
   )
   for key, attr, default in optional_settings:
     jc_configure(self, job_conf, key, attr, default)
   self.outfn = "%s/part-%05d" % (self.outdir, self.part)
   self.file = hdfs.open(self.outfn, "w", user=self.hdfs_user)
Example #3
0
File: mapper.py Project: crs4/vispa
 def __get_tiget_conf(self, jc):
   """Load the tiget thresholds from the job configuration ``jc``.

   Integer settings: max_hits (default 10) and max_start (default 4).
   Float settings: min_identity (default 95.0), min_al2seq (default 15.0)
   and min_score_diff (default 20.0).  min_al2seq is divided by 100 after
   being read; min_identity is left as configured.
   """
   # (job conf key, attribute name, default) for the integer settings.
   int_settings = (
     ('bl.mr.seq.tiget.max.hits', 'max_hits', 10),
     ('bl.mr.seq.tiget.max.start', 'max_start', 4),
   )
   for key, attr, default in int_settings:
     pu.jc_configure_int(self, jc, key, attr, default)
   pu.jc_configure_float(
     self, jc, 'bl.mr.seq.tiget.min.identity.percent', 'min_identity', 95.0
   )
   pu.jc_configure_float(
     self, jc, 'bl.mr.seq.tiget.min.al2seq.percent', 'min_al2seq', 15.0
   )
   # Rescale the configured percentage by a factor of 100.
   self.min_al2seq /= 100
   pu.jc_configure_float(
     self, jc, 'bl.mr.seq.tiget.min.score.diff', 'min_score_diff', 20.0
   )
Example #4
0
 def __init__(self, context):
     """Configure the writer from the job conf and open its output file.

     Reads the task partition and work output directory (no defaults
     supplied), the text separator (default tab) and the HDFS user
     (default None), then opens "<outdir>/part-<5-digit partition>" for
     writing via hdfs.open.
     """
     super(Writer, self).__init__(context)
     self.logger = logging.getLogger("Writer")
     conf = context.getJobConf()
     # Partition number and output directory are passed without a default.
     jc_configure_int(self, conf, "mapred.task.partition", "part")
     jc_configure(self, conf, "mapred.work.output.dir", "outdir")
     # Separator and HDFS user fall back to a tab and None respectively.
     jc_configure(self, conf, "mapred.textoutputformat.separator", "sep", "\t")
     jc_configure(self, conf, "pydoop.hdfs.user", "hdfs_user", None)
     part_path = "%s/part-%05d" % (self.outdir, self.part)
     self.outfn = part_path
     self.file = hdfs.open(part_path, "w", user=self.hdfs_user)
Example #5
0
File: mapper.py Project: crs4/vispa
 def __get_blastall_conf(self, jc):
   """Load the blastall settings from the job configuration ``jc``.

   Each entry below is applied in order by calling the listed pu helper
   with (self, jc, key, attribute[, default]).  The database name is the
   only setting requested without a default.
   """
   settings = (
     (pu.jc_configure,
      ('bl.mr.seq.blastall.exe', 'blastall_exe', '/usr/bin/blastall')),
     (pu.jc_configure,
      ('bl.mr.seq.blastall.program', 'program', 'blastn')),
     (pu.jc_configure,
      ('bl.mr.seq.blastall.db.name', 'db_name')),
     (pu.jc_configure_float,
      ('bl.mr.seq.blastall.evalue', 'evalue', 1.0)),
     (pu.jc_configure_int,
      ('bl.mr.seq.blastall.gap.cost', 'gap_cost', 1)),
     (pu.jc_configure_int,
      ('bl.mr.seq.blastall.word.size', 'word_size', 20)),
     (pu.jc_configure_bool,
      ('bl.mr.seq.blastall.filter', 'filter', False)),
   )
   for configure, args in settings:
     configure(self, jc, *args)
Example #6
0
File: mapper.py Project: pinno/seal
    def __get_configuration(self, ctx):
        """Read and validate the seqal settings from the job configuration.

        The job conf obtained from ``ctx`` is first passed through
        deprecation_utils.convert_job_conf; the seqal settings are read from
        the converted conf, while 'mapred.reduce.tasks' is read from the
        original one.

        Raises ValueError when the log level name does not resolve to a
        numeric logging level or when a numeric setting is out of range.
        An unsupported fastq subformat is reported through
        raise_pydoop_exception.
        """
        # TODO:  refactor settings common to mapper and reducer
        jc = ctx.getJobConf()

        logger = logging.getLogger("seqal")
        jobconf = deprecation_utils.convert_job_conf(jc, self.DeprecationMap, logger)

        jc_configure(self, jobconf, 'seal.seqal.log.level', 'log_level', 'INFO')
        jc_configure(self, jobconf, "seal.seqal.fastq-subformat", "format", self.DEFAULT_FORMAT)
        jc_configure_int(self, jobconf, 'seal.seqal.alignment.max.isize', 'max_isize', 1000)
        jc_configure_int(self, jobconf, 'seal.seqal.pairing.batch.size', 'batch_size', 10000)
        jc_configure_int(self, jobconf, 'seal.seqal.min_hit_quality', 'min_hit_quality', 0)
        jc_configure_bool(self, jobconf, 'seal.seqal.remove_unmapped', 'remove_unmapped', False)
        jc_configure_int(self, jobconf, 'seal.seqal.nthreads', 'nthreads', 1)
        jc_configure_int(self, jobconf, 'seal.seqal.trim.qual', 'trim_qual', 0)

        # A bare getattr would also accept non-level attributes of the
        # logging module (e.g. the function logging.info for "info"), so
        # only integer values are accepted as log levels.
        level = getattr(logging, self.log_level, None)
        if not isinstance(level, int):
            raise ValueError("Unsupported log level: %r" % self.log_level)
        self.log_level = level

        if self.format not in self.SUPPORTED_FORMATS:
            raise_pydoop_exception(
              "seal.seqal.fastq-subformat must be one of %r" %
              (self.SUPPORTED_FORMATS,)
              )

        if self.max_isize <= 0:
            raise ValueError("'seal.seqal.alignment.max.isize' must be > 0, if specified [1000]")

        if self.batch_size <= 0:
            raise ValueError("'seal.seqal.pairing.batch.size' must be > 0, if specified [10000]")

        # minimum qual value required for a hit to be kept.  By default outputs all the
        # hits BWA returns.
        if self.min_hit_quality < 0:
            raise ValueError("'seal.seqal.min_hit_quality' must be >= 0, if specified [0]")

        # number of concurrent threads for main alignment operation
        if self.nthreads <= 0:
            raise ValueError("'seal.seqal.nthreads' must be > 0, if specified [1]")

        # trim quality parameter used by BWA from read trimming.  Equivalent to
        # the -q parameter for bwa align
        if self.trim_qual < 0:
            raise ValueError("'seal.seqal.trim.qual' must be >= 0, if specified [0]")

        # Map-only unless a positive number of reduce tasks is configured.
        has_reducers = jc.hasKey('mapred.reduce.tasks') and jc.getInt('mapred.reduce.tasks') > 0
        self.__map_only = not has_reducers
Example #7
0
 def __get_configuration(self, jc):
     """Load the fasta-reader settings from the job configuration ``jc``.

     Resolves the configured log level name to its numeric ``logging``
     value, exports "bl.libhdfs.opts" as the LIBHDFS_OPTS environment
     variable when it is non-empty, and reads the compression options
     (compress_header default False, compress_seq default True,
     compression_level default 6).  An unsupported log level is reported
     through raise_pydoop_exception.
     """
     jc_configure(self, jc, 'bl.mr.fasta-reader.log.level', 'log_level',
                  self.DEFAULT_LOG_LEVEL)
     # A bare getattr would also accept non-level attributes of the logging
     # module (e.g. the function logging.info for "info"), so only integer
     # values are accepted as log levels.
     level = getattr(logging, self.log_level, None)
     if isinstance(level, int):
         self.log_level = level
     else:
         raise_pydoop_exception("Unsupported log level: %r" %
                                self.log_level)
     jc_configure(self, jc, "bl.libhdfs.opts", "libhdfs_opts", "")
     if self.libhdfs_opts:
         os.environ["LIBHDFS_OPTS"] = self.libhdfs_opts
     jc_configure_bool(self, jc, 'bl.mr.fasta-reader.compress.header',
                       'compress_header', False)
     jc_configure_bool(self, jc, 'bl.mr.fasta-reader.compress.seq',
                       'compress_seq', True)
     jc_configure_int(self, jc, 'bl.mr.fasta-reader.compression.level',
                      'compression_level', 6)
Example #8
0
 def __get_configuration(self, jc):
     """Load the blastall settings from the job configuration ``jc``.

     Resolves the configured log level name (default 'WARNING') to its
     numeric ``logging`` value and reads the blastall executable, program,
     database name (no default supplied), e-value, gap cost, word size and
     filter flag.

     Raises ValueError when the log level name does not resolve to a
     numeric logging level.
     """
     pu.jc_configure(self, jc, 'bl.mr.seq.blastall.log.level', 'log_level',
                     'WARNING')
     # A bare getattr would also accept non-level attributes of the logging
     # module (e.g. the function logging.info for "info"), so only integer
     # values are accepted as log levels.
     level = getattr(logging, self.log_level, None)
     if not isinstance(level, int):
         raise ValueError("Unsupported log level: %r" % self.log_level)
     self.log_level = level
     pu.jc_configure(self, jc, 'bl.mr.seq.blastall.exe', 'blastall_exe',
                     '/usr/bin/blastall')
     pu.jc_configure(self, jc, 'bl.mr.seq.blastall.program', 'program',
                     'blastn')
     pu.jc_configure(self, jc, 'bl.mr.seq.blastall.db.name', 'db_name')
     pu.jc_configure_float(self, jc, 'bl.mr.seq.blastall.evalue', 'evalue',
                           1.0)
     pu.jc_configure_int(self, jc, 'bl.mr.seq.blastall.gap.cost',
                         'gap_cost', 1)
     pu.jc_configure_int(self, jc, 'bl.mr.seq.blastall.word.size',
                         'word_size', 20)
     pu.jc_configure_bool(self, jc, 'bl.mr.seq.blastall.filter', 'filter',
                          False)
Example #9
0
 def test_jc_configure_default(self):
   """Check that each jc_configure* helper falls back to its default.

   Every example is looked up under a key that is absent from the job
   configuration ('not-here-<name>'), so the attribute set on the target
   object must equal the default that was passed in.  Examples whose kind
   is not handled below are skipped.
   """
   w = CONFIGURE_EXAMPLES
   jc = pp.get_JobConf_object(dict((k, w[k][1]) for k in w))
   o = Obj()
   for k in w:
     kind, raw = w[k][0], w[k][1]
     nk = 'not-here-%s' % k
     self.assertFalse(jc.hasKey(nk))
     if kind == 'str':
       pu.jc_configure(o, jc, nk, k, raw)
       self.assertEqual(getattr(o, k), raw)
     elif kind == 'int':
       pu.jc_configure_int(o, jc, nk, k, int(raw))
       self.assertEqual(getattr(o, k), int(raw))
     elif kind == 'bool':
       pu.jc_configure_bool(o, jc, nk, k, raw == 'true')
       self.assertEqual(getattr(o, k), raw == 'true')
     elif kind == 'float':
       # Compare floats approximately to tolerate representation error.
       pu.jc_configure_float(o, jc, nk, k, float(raw))
       self.assertAlmostEqual(getattr(o, k), float(raw))
     elif kind == 'log_level':
       pu.jc_configure_log_level(o, jc, nk, k, raw)
       self.assertEqual(getattr(o, k), getattr(logging, raw))
Example #10
0
 def test_jc_configure_default(self):
     """Check that each jc_configure* helper falls back to its default.

     Every example is looked up under a key that is absent from the job
     configuration ('not-here-<name>'), so the attribute set on the target
     object must equal the default that was passed in.  Examples whose
     kind is not handled below are skipped.
     """
     w = CONFIGURE_EXAMPLES
     jc = pp.get_JobConf_object(dict((k, w[k][1]) for k in w))
     o = Obj()
     for k in w:
         kind, raw = w[k][0], w[k][1]
         nk = 'not-here-%s' % k
         self.assertFalse(jc.hasKey(nk))
         if kind == 'str':
             pu.jc_configure(o, jc, nk, k, raw)
             self.assertEqual(getattr(o, k), raw)
         elif kind == 'int':
             pu.jc_configure_int(o, jc, nk, k, int(raw))
             self.assertEqual(getattr(o, k), int(raw))
         elif kind == 'bool':
             pu.jc_configure_bool(o, jc, nk, k, raw == 'true')
             self.assertEqual(getattr(o, k), raw == 'true')
         elif kind == 'float':
             # Compare floats approximately to tolerate representation error.
             pu.jc_configure_float(o, jc, nk, k, float(raw))
             self.assertAlmostEqual(getattr(o, k), float(raw))
         elif kind == 'log_level':
             pu.jc_configure_log_level(o, jc, nk, k, raw)
             self.assertEqual(getattr(o, k), getattr(logging, raw))
Example #11
0
 def test_jc_configure_plain(self):
   """Check that each jc_configure* helper reads a key that is present.

   A job configuration is built from the raw value of every example; each
   value is then read back through the helper matching the example's kind
   and compared with the expected converted value.  No default is passed.
   """
   examples = CONFIGURE_EXAMPLES
   conf_values = dict((name, examples[name][1]) for name in examples)
   jc = pp.get_JobConf_object(conf_values)
   target = Obj()
   for name in examples:
     kind, raw = examples[name][0], examples[name][1]
     self.assertTrue(jc.hasKey(name))
     if kind == 'str':
       pu.jc_configure(target, jc, name, name)
       self.assertEqual(getattr(target, name), raw)
     elif kind == 'int':
       pu.jc_configure_int(target, jc, name, name)
       self.assertEqual(getattr(target, name), int(raw))
     elif kind == 'bool':
       pu.jc_configure_bool(target, jc, name, name)
       self.assertEqual(getattr(target, name), raw == 'true')
     elif kind == 'float':
       pu.jc_configure_float(target, jc, name, name)
       self.assertAlmostEqual(getattr(target, name), float(raw))
     elif kind == 'log_level':
       pu.jc_configure_log_level(target, jc, name, name)
       self.assertEqual(getattr(target, name), getattr(logging, raw))
Example #12
0
 def test_jc_configure_plain(self):
     """Check that each jc_configure* helper reads a key that is present.

     A job configuration is built from the raw value of every example;
     each value is then read back through the helper matching the
     example's kind and compared with the expected converted value.  No
     default is passed.
     """
     examples = CONFIGURE_EXAMPLES
     conf_values = dict((name, examples[name][1]) for name in examples)
     jc = pp.get_JobConf_object(conf_values)
     target = Obj()
     for name in examples:
         kind, raw = examples[name][0], examples[name][1]
         self.assertTrue(jc.hasKey(name))
         if kind == 'str':
             pu.jc_configure(target, jc, name, name)
             self.assertEqual(getattr(target, name), raw)
         elif kind == 'int':
             pu.jc_configure_int(target, jc, name, name)
             self.assertEqual(getattr(target, name), int(raw))
         elif kind == 'bool':
             pu.jc_configure_bool(target, jc, name, name)
             self.assertEqual(getattr(target, name), raw == 'true')
         elif kind == 'float':
             pu.jc_configure_float(target, jc, name, name)
             self.assertAlmostEqual(getattr(target, name), float(raw))
         elif kind == 'log_level':
             pu.jc_configure_log_level(target, jc, name, name)
             self.assertEqual(getattr(target, name), getattr(logging, raw))
Example #13
0
 def __init__(self, context):
     """Initialise the mapper and read its occurrence threshold.

     The "filter.occurrence.threshold" job configuration entry is read
     through jc_configure_int with "threshold" as the target attribute
     name; no default is supplied here.
     """
     super(FilterMapper, self).__init__(context)
     jc_configure_int(
         self, context.getJobConf(), "filter.occurrence.threshold", "threshold"
     )
Example #14
0
 def __init__(self, context):
   """Initialise the mapper and read its occurrence threshold.

   The "filter.occurrence.threshold" job configuration entry is read
   through jc_configure_int with "threshold" as the target attribute name;
   no default is supplied here.
   """
   super(FilterMapper, self).__init__(context)
   job_conf = context.getJobConf()
   threshold_key = "filter.occurrence.threshold"
   jc_configure_int(self, job_conf, threshold_key, "threshold")
Example #15
0
    def __get_configuration(self, ctx):
        """Read and validate the seqal settings from the job configuration.

        The job conf obtained from ``ctx`` is first passed through
        deprecation_utils.convert_job_conf; the seqal settings are read from
        the converted conf, while 'mapred.reduce.tasks' is read from the
        original one.

        Raises ValueError when the log level name does not resolve to a
        numeric logging level or when a numeric setting is out of range.
        An unsupported fastq subformat is reported through
        raise_pydoop_exception.
        """
        # TODO:  refactor settings common to mapper and reducer
        jc = ctx.getJobConf()

        logger = logging.getLogger("seqal")
        jobconf = deprecation_utils.convert_job_conf(jc, self.DeprecationMap,
                                                     logger)

        jc_configure(self, jobconf, 'seal.seqal.log.level', 'log_level',
                     'INFO')
        jc_configure(self, jobconf, "seal.seqal.fastq-subformat", "format",
                     self.DEFAULT_FORMAT)
        jc_configure_int(self, jobconf, 'seal.seqal.alignment.max.isize',
                         'max_isize', 1000)
        jc_configure_int(self, jobconf, 'seal.seqal.pairing.batch.size',
                         'batch_size', 10000)
        jc_configure_int(self, jobconf, 'seal.seqal.min_hit_quality',
                         'min_hit_quality', 0)
        jc_configure_bool(self, jobconf, 'seal.seqal.remove_unmapped',
                          'remove_unmapped', False)
        jc_configure_int(self, jobconf, 'seal.seqal.nthreads', 'nthreads', 1)
        jc_configure_int(self, jobconf, 'seal.seqal.trim.qual', 'trim_qual', 0)

        # A bare getattr would also accept non-level attributes of the
        # logging module (e.g. the function logging.info for "info"), so
        # only integer values are accepted as log levels.
        level = getattr(logging, self.log_level, None)
        if not isinstance(level, int):
            raise ValueError("Unsupported log level: %r" % self.log_level)
        self.log_level = level

        if self.format not in self.SUPPORTED_FORMATS:
            raise_pydoop_exception(
                "seal.seqal.fastq-subformat must be one of %r" %
                (self.SUPPORTED_FORMATS, ))

        if self.max_isize <= 0:
            raise ValueError(
                "'seal.seqal.alignment.max.isize' must be > 0, if specified [1000]"
            )

        if self.batch_size <= 0:
            raise ValueError(
                "'seal.seqal.pairing.batch.size' must be > 0, if specified [10000]"
            )

        # minimum qual value required for a hit to be kept.  By default outputs all the
        # hits BWA returns.
        if self.min_hit_quality < 0:
            raise ValueError(
                "'seal.seqal.min_hit_quality' must be >= 0, if specified [0]")

        # number of concurrent threads for main alignment operation
        if self.nthreads <= 0:
            raise ValueError(
                "'seal.seqal.nthreads' must be > 0, if specified [1]")

        # trim quality parameter used by BWA from read trimming.  Equivalent to
        # the -q parameter for bwa align
        if self.trim_qual < 0:
            raise ValueError(
                "'seal.seqal.trim.qual' must be >= 0, if specified [0]")

        # Map-only unless a positive number of reduce tasks is configured.
        has_reducers = (jc.hasKey('mapred.reduce.tasks')
                        and jc.getInt('mapred.reduce.tasks') > 0)
        self.__map_only = not has_reducers