Ejemplo n.º 1
0
 def steps(self):
     """Return this job's two steps: an MRStep followed by a JarStep.

     The MRStep uses ``self._yield_none`` as ``mapper_init``, ``cat`` as
     the mapper pre-filter and ``wc -l`` as the reducer command.  The
     JarStep references a jar through an ``s3://`` URI.
     """
     streaming_step = MRStep(
         mapper_init=self._yield_none,
         mapper_pre_filter='cat',
         reducer_cmd='wc -l',
     )
     jar_step = JarStep(jar='s3://bookat/binks_jar.jar')
     return [streaming_step, jar_step]
Ejemplo n.º 2
0
 def steps(self):
     """Return a single JarStep that invokes ``script-runner.jar``.

     The jar URI is formatted with ``self.options.jar_region``; the only
     argument passed to the jar is the ``s3://`` URI of a shell script.
     """
     script_runner_jar = (
         's3://{}.elasticmapreduce/libs/script-runner/script-runner.jar'
     ).format(self.options.jar_region)
     script_uri = 's3://tg-cloud-store-dev/admin/postgres-hive-metastore.sh'
     return [JarStep(jar=script_runner_jar, args=[script_uri])]
Ejemplo n.º 3
0
 def steps(self):
     """Return a one-element list holding the WARC indexer JarStep.

     The step runs ``uk.bl.wa.hadoop.indexer.WARCIndexerRunner`` from
     ``self.jar_path``.  Its command-line arguments are filled in from
     ``self.options`` (annotations, oa_surts, config, warclist,
     num_reducers, solr_zookeepers, solr_collection).
     """
     # Compress intermediate results but not the (brief) output:
     jobconf = {
         'mapred.compress.map.output': 'true',
         'mapred.map.output.compression.codec':
             'org.apache.hadoop.io.compress.GzipCodec',
         'mapred.output.compress': 'false',
         'mapred.reduce.max.attempts': '2',
     }
     jar_path = self.jar_path

     # Comma-separated pair of 'path#name' entries passed after -files.
     side_files = '%s#annotations.json,%s#openAccessSurts.txt' % (
         self.options.annotations, self.options.oa_surts)

     indexer_args = [
         GENERIC_ARGS,  # This ensures the various jobconf etc. params are included.
         "-files", side_files,
         "-c", self.options.config,
         "-i", self.options.warclist,  # Always use local file path.
         "-o", OUTPUT,
         "-a",  # Apply annotations
         "-w",  # Wait while the job runs
         "--num-reducers",
         str(self.options.num_reducers),  # An 'int' fails to run!
         "--solr-zookeepers", self.options.solr_zookeepers,
         "--solr-collection", self.options.solr_collection,
     ]

     indexer_step = JarStep(
         jobconf=jobconf,
         jar=jar_path,
         main_class='uk.bl.wa.hadoop.indexer.WARCIndexerRunner',
         args=indexer_args,
     )
     return [indexer_step]
Ejemplo n.º 4
0
 def steps(self):
     """Return a one-element list holding the HdfsFileHasher JarStep.

     The step runs ``uk.bl.wa.hadoop.mapreduce.hash.HdfsFileHasher`` from
     ``self.jar_path``, with ``self.options.filelist`` after ``-i`` and
     ``self.options.num_reducers`` after ``-r``.
     """
     jobconf = {
         'mapred.compress.map.output': 'true',
         'mapred.output.compress': 'true',
         'mapred.output.compression.codec':
             'org.apache.hadoop.io.compress.GzipCodec',
     }
     jar_path = self.jar_path

     hasher_args = [
         GENERIC_ARGS,  # This ensures the various jobconf etc. params are included.
         '-i', str(self.options.filelist),
         '-o', OUTPUT,
         # n.b. if you pass as int it fails mysteriously, use str!
         '-r', str(self.options.num_reducers),
     ]

     hasher_step = JarStep(
         jobconf=jobconf,
         jar=jar_path,
         main_class='uk.bl.wa.hadoop.mapreduce.hash.HdfsFileHasher',
         args=hasher_args,
     )
     return [hasher_step]
Ejemplo n.º 5
0
 def steps(self):
     """Return a one-element list holding the ArchiveCDXGenerator JarStep.

     The step runs ``uk.bl.wa.hadoop.mapreduce.cdx.ArchiveCDXGenerator``
     from ``self.jar_path``.  ``self.options.num_reducers`` follows ``-r``
     and ``self.options.cdx_endpoint`` follows ``-t``.
     """
     jobconf = {
         'mapred.compress.map.output': 'true',
         'mapred.output.compress': 'true',
         'mapred.output.compression.codec':
             'org.apache.hadoop.io.compress.GzipCodec',
     }
     jar_path = self.jar_path

     cdx_args = [
         GENERIC_ARGS,  # This ensures the various jobconf etc. params are included.
         '-i', INPUT,
         '-o', OUTPUT,
         # n.b. if you pass as int it fails mysteriously, use str!
         '-r', str(self.options.num_reducers),
         '-w',
         '-h',
         '-m', '',
         '-t', self.options.cdx_endpoint,
         '-c', "CDX N b a m s k r M S V g",
     ]

     cdx_step = JarStep(
         jobconf=jobconf,
         jar=jar_path,
         main_class='uk.bl.wa.hadoop.mapreduce.cdx.ArchiveCDXGenerator',
         args=cdx_args,
     )
     return [cdx_step]
Ejemplo n.º 6
0
    def jar(cls, *args, **kwargs):
        """Alias for :py:class:`~mrjob.step.JarStep`.

        Logs a deprecation warning, then forwards every positional and
        keyword argument to ``JarStep`` and returns the result.

        .. deprecated:: 0.4.2
        """
        deprecation_notice = (
            'MRJob.jar() is deprecated and will be removed in'
            ' v0.5.0. Use mrjob.step.JarStep directly.'
        )
        log.warning(deprecation_notice)
        step = JarStep(*args, **kwargs)
        return step
Ejemplo n.º 7
0
 def steps(self):
     """Return a JarStep followed by an MRStep.

     The JarStep runs ``self.options.jar`` with the arguments
     ``'stuff'``, ``INPUT`` and ``OUTPUT``; the MRStep uses this job's
     ``mapper`` and ``reducer`` methods.
     """
     jar_step = JarStep(
         jar=self.options.jar,
         args=['stuff', INPUT, OUTPUT],
     )
     streaming_step = MRStep(mapper=self.mapper, reducer=self.reducer)
     return [jar_step, streaming_step]
Ejemplo n.º 8
0
 def steps(self):
     """Return a JarStep followed by an MRStep.

     The JarStep runs ``HADOOP_EXAMPLES_JAR`` with the arguments
     ``'wordcount'``, ``INPUT`` and ``OUTPUT``.  The MRStep uses
     ``self.mapper`` and reuses ``self.reducer`` as both combiner and
     reducer.
     """
     wordcount_args = ['wordcount', INPUT, OUTPUT]
     jar_step = JarStep(jar=HADOOP_EXAMPLES_JAR, args=wordcount_args)
     streaming_step = MRStep(
         mapper=self.mapper,
         combiner=self.reducer,
         reducer=self.reducer,
     )
     return [jar_step, streaming_step]
Ejemplo n.º 9
0
 def test_steps(self):
     # The job under test is instantiated with --no-conf.
     job = self.SteppyJob(['--no-conf'])

     # Step 0 must equal an MRStep built from the job's own _yield_none.
     actual_first = job.steps()[0]
     expected_first = MRStep(
         mapper_init=job._yield_none,
         mapper_pre_filter='cat',
         reducer_cmd='wc -l',
     )
     self.assertEqual(actual_first, expected_first)

     # Step 1 must equal the JarStep pointing at the s3:// jar.
     actual_second = job.steps()[1]
     expected_second = JarStep(jar='s3://bookat/binks_jar.jar')
     self.assertEqual(actual_second, expected_second)
Ejemplo n.º 10
0
    def test_jar(self):
        # Keyword arguments handed unchanged to both constructors.
        kwargs = dict(
            jar='binks.jar.jar',
            main_class='MyMainMan',
            args=['argh', 'argh'],
        )

        # 'mrjob.job' logging is disabled while both steps are built and
        # compared.
        with logger_disabled('mrjob.job'):
            via_alias = MRJob.jar(**kwargs)
            direct = JarStep(**kwargs)
            self.assertEqual(via_alias, direct)
Ejemplo n.º 11
0
 def test_middle_jar_step(self):
     # Descriptions of three steps, with a JarStep in the middle.
     first = MRStep(mapper=self._yield_none,
                    combiner=self._yield_none).description(0)
     middle = JarStep(jar='binks_jar.jar').description(1)
     last = MRStep(reducer=self._yield_none).description(2)

     # Protocol pairs handed to the helper as its second argument.
     expected_protocols = [
         (PickleProtocol, JSONProtocol),
         (JSONProtocol, JSONProtocol),
         (RawValueProtocol, RawValueProtocol),
         (JSONProtocol, JSONValueProtocol),
     ]

     self._assert_script_protocols([first, middle, last],
                                   expected_protocols)
Ejemplo n.º 12
0
 def steps(self):
     """Return a JarStep followed by a step built with ``self.mr``.

     The JarStep is given an empty ``name``, ``self.options.jar`` as its
     jar, and ``step_args`` of ``'stuff'`` plus the ``JarStep.INPUT`` and
     ``JarStep.OUTPUT`` placeholders.
     """
     # NOTE(review): name= / step_args= / self.mr look like the older
     # mrjob API -- confirm against the mrjob version in use.
     jar_step = JarStep(
         name='',
         jar=self.options.jar,
         step_args=['stuff', JarStep.INPUT, JarStep.OUTPUT],
     )
     streaming_step = self.mr(mapper=self.mapper, reducer=self.reducer)
     return [jar_step, streaming_step]
Ejemplo n.º 13
0
 def steps(self):
     """Return a JarStep followed by a step built with ``self.mr``.

     The JarStep runs ``HADOOP_EXAMPLES_JAR`` with an empty ``name`` and
     ``step_args`` of ``'wordcount'`` plus the ``JarStep.INPUT`` and
     ``JarStep.OUTPUT`` placeholders.  The second step uses
     ``self.mapper`` and reuses ``self.reducer`` as combiner and reducer.
     """
     # NOTE(review): name= / step_args= / self.mr look like the older
     # mrjob API -- confirm against the mrjob version in use.
     jar_step = JarStep(
         jar=HADOOP_EXAMPLES_JAR,
         name='',
         step_args=['wordcount', JarStep.INPUT, JarStep.OUTPUT],
     )
     streaming_step = self.mr(
         mapper=self.mapper,
         combiner=self.reducer,
         reducer=self.reducer,
     )
     return [jar_step, streaming_step]
Ejemplo n.º 14
0
 def test_all(self):
     # Every constructor keyword is supplied explicitly.
     kwargs = dict(
         jar='binks.jar.jar',
         main_class='MyMainMan',
         args=['argh', 'argh'],
     )

     # The description should echo the kwargs plus a 'type' entry.
     expected = dict(kwargs, type='jar')

     actual = JarStep(**kwargs).description(0)
     self.assertEqual(actual, expected)
Ejemplo n.º 15
0
 def test_middle_jar_step(self):
     # Three steps, with a JarStep in the middle.
     step_list = [
         MRStep(mapper=self._yield_none, combiner=self._yield_none),
         JarStep(jar='binks_jar.jar'),
         MRStep(reducer=self._yield_none),
     ]

     # One entry per step; the JarStep's entry is None.
     expected_protocols = [
         {
             'mapper': (PickleProtocol, JSONProtocol),
             'combiner': (JSONProtocol, JSONProtocol),
         },
         None,
         {'reducer': (JSONProtocol, JSONValueProtocol)},
     ]

     self._assert_script_protocols(step_list, expected_protocols)
Ejemplo n.º 16
0
 def test_some(self):
     # Only the jar is supplied to the constructor.
     kwargs = dict(jar='binks.jar.jar')

     # The description is expected to carry these values for the
     # omitted fields, plus the 'type' entry.
     expected = dict(kwargs, type='jar', main_class=None, args=[])

     actual = JarStep(**kwargs).description(0)
     self.assertEqual(actual, expected)
Ejemplo n.º 17
0
 def test_multistep(self):
     # reducer is only script step so it uses INPUT_PROTOCOL and
     # OUTPUT_PROTOCOL
     step_list = [
         MRStep(mapper_cmd='cat', reducer=self._yield_none),
         JarStep(jar='binks_jar.jar'),
         MRStep(mapper=self._yield_none),
     ]

     # One entry per step; the JarStep's entry is None, and so is the
     # entry for the command mapper.
     expected_protocols = [
         {'mapper': None, 'reducer': (PickleProtocol, JSONProtocol)},
         None,
         {'mapper': (JSONProtocol, JSONValueProtocol)},
     ]

     self._assert_script_protocols(step_list, expected_protocols)
Ejemplo n.º 18
0
 def test_multistep(self):
     # reducer is only script step so it uses INPUT_PROTOCOL and
     # OUTPUT_PROTOCOL
     first = MRStep(mapper_cmd='cat',
                    reducer=self._yield_none).description(0)
     middle = JarStep(jar='binks_jar.jar').description(1)
     last = MRStep(mapper=self._yield_none).description(2)

     # Protocol pairs handed to the helper as its second argument.
     expected_protocols = [
         (RawValueProtocol, RawValueProtocol),
         (PickleProtocol, JSONProtocol),
         (RawValueProtocol, RawValueProtocol),
         (JSONProtocol, JSONValueProtocol),
     ]

     self._assert_script_protocols([first, middle, last],
                                   expected_protocols)
Ejemplo n.º 19
0
    def steps(self):
        """Return a JarStep for ``self.options.examples_jar`` plus an MRStep.

        When ``self.options.use_main_class`` is truthy, the JarStep names
        ``_WORDCOUNT_MAIN_CLASS`` as its main class.  Otherwise
        ``'wordcount'`` is passed as the first argument instead.  Both
        forms pass ``GENERIC_ARGS``, ``INPUT`` and ``OUTPUT``.
        """
        jar = self.options.examples_jar

        # Only the args / main_class differ between the two invocations.
        if self.options.use_main_class:
            jar_kwargs = dict(
                args=[GENERIC_ARGS, INPUT, OUTPUT],
                main_class=_WORDCOUNT_MAIN_CLASS,
            )
        else:
            jar_kwargs = dict(
                args=['wordcount', GENERIC_ARGS, INPUT, OUTPUT],
            )
        jar_step = JarStep(jar=jar, **jar_kwargs)

        streaming_step = MRStep(
            mapper=self.mapper,
            combiner=self.reducer,
            reducer=self.reducer,
        )
        return [jar_step, streaming_step]
Ejemplo n.º 20
0
    def steps(self):
        """Return a JarStep for the runner's examples jar plus an MRStep.

        The jar is looked up in ``_RUNNER_TO_EXAMPLES_JAR`` by
        ``self.options.runner``.  When ``self.options.use_main_class`` is
        truthy, the JarStep names ``_WORDCOUNT_MAIN_CLASS`` as its main
        class.  Otherwise ``'wordcount'`` is passed as the first argument.
        """
        jar = _RUNNER_TO_EXAMPLES_JAR[self.options.runner]

        if not self.options.use_main_class:
            # No main class given: the first argument is 'wordcount'.
            jar_step = JarStep(
                jar=jar,
                args=['wordcount', INPUT, OUTPUT],
            )
        else:
            jar_step = JarStep(
                jar=jar,
                args=[INPUT, OUTPUT],
                main_class=_WORDCOUNT_MAIN_CLASS,
            )

        streaming_step = MRStep(
            mapper=self.mapper,
            combiner=self.reducer,
            reducer=self.reducer,
        )
        return [jar_step, streaming_step]
Ejemplo n.º 21
0
 def _jar_step(self, n, *args, **kwargs):
     """Build a JarStep from the extra arguments; return its ``description(n)``."""
     step = JarStep(*args, **kwargs)
     return step.description(n)
Ejemplo n.º 22
0
 def steps(self):
     """Return a single JarStep built from ``''`` and ``self.options.jar``.

     Both values are passed positionally, in that order.
     """
     # NOTE(review): presumably the legacy (name, jar) positional form --
     # confirm against the mrjob version in use.
     jar_step = JarStep('', self.options.jar)
     return [jar_step]
Ejemplo n.º 23
0
 def steps(self):
     """Return a single JarStep with ``GENERIC_ARGS`` between two literals.

     The jar and main class come from ``self.options.jar`` and
     ``self.options.main_class``.
     """
     jar_args = ['before', GENERIC_ARGS, 'after']
     jar_step = JarStep(
         args=jar_args,
         jar=self.options.jar,
         main_class=self.options.main_class,
     )
     return [jar_step]
Ejemplo n.º 24
0
 def steps(self):
     """Return a single JarStep using the jar and main class from ``self.options``."""
     jar_step = JarStep(
         jar=self.options.jar,
         main_class=self.options.main_class,
     )
     return [jar_step]
Ejemplo n.º 25
0
 def test_positional(self):
     # Four positional arguments should give a step equal to the keyword
     # form; the leading 'foo' has no counterpart in the keyword form.
     with logger_disabled('mrjob.step'):
         positional = JarStep('foo', 'bell.jar', 'First', ['one', '2'])
         keyword = JarStep(
             jar='bell.jar',
             main_class='First',
             args=['one', '2'],
         )
         self.assertEqual(positional, keyword)
Ejemplo n.º 26
0
 def test_name_kwarg(self):
     # A step built with name= should equal one built without it.
     with logger_disabled('mrjob.step'):
         with_name = JarStep(jar='pickle.jar', name='Bubbies')
         without_name = JarStep(jar='pickle.jar')
         self.assertEqual(with_name, without_name)
Ejemplo n.º 27
0
 def test_step_args_kwarg(self):
     # step_args= should give a step equal to one built with args=.
     with logger_disabled('mrjob.step'):
         via_step_args = JarStep(jar='bell.jar', step_args=['5', 'six'])
         via_args = JarStep(jar='bell.jar', args=['5', 'six'])
         self.assertEqual(via_step_args, via_args)
Ejemplo n.º 28
0
 def test_mixed(self):
     # One leading positional argument mixed with keywords should give a
     # step equal to the keyword-only form.
     with logger_disabled('mrjob.step'):
         mixed = JarStep('foo', jar='bell.jar', args=['3', 'four'])
         keyword_only = JarStep(jar='bell.jar', args=['3', 'four'])
         self.assertEqual(mixed, keyword_only)