def steps(self):
    """Return the job's steps: an MRStep followed by a JarStep."""
    first_step = MRStep(
        mapper_init=self._yield_none,
        mapper_pre_filter='cat',
        reducer_cmd='wc -l',
    )
    second_step = JarStep(jar='s3://bookat/binks_jar.jar')
    return [first_step, second_step]
def steps(self):
    """Return a single JarStep that passes a shell-script URI to script-runner.jar.

    The jar URI is built from ``self.options.jar_region``.
    """
    runner_jar = (
        's3://{}.elasticmapreduce/libs/script-runner/script-runner.jar'
    ).format(self.options.jar_region)
    script_uri = 's3://tg-cloud-store-dev/admin/postgres-hive-metastore.sh'

    return [JarStep(jar=runner_jar, args=[script_uri])]
def steps(self):
    """Return the single JarStep that runs ``WARCIndexerRunner``.

    Arguments are assembled from ``self.options`` and the module-level
    ``GENERIC_ARGS`` / ``OUTPUT`` placeholders.
    """
    # Compress intermediate results but not the (brief) output:
    conf = {
        'mapred.compress.map.output': 'true',
        'mapred.map.output.compression.codec':
            'org.apache.hadoop.io.compress.GzipCodec',
        'mapred.output.compress': 'false',
        'mapred.reduce.max.attempts': '2',
    }
    jar = self.jar_path

    opts = self.options
    side_files = '%s#annotations.json,%s#openAccessSurts.txt' % (
        opts.annotations, opts.oa_surts)
    runner_args = [
        # This ensures the various jobconf etc. params are included.
        GENERIC_ARGS,
        "-files", side_files,
        "-c", opts.config,
        # Always use local file path.
        "-i", opts.warclist,
        "-o", OUTPUT,
        # Apply annotations
        "-a",
        # Wait while the job runs
        "-w",
        # An 'int' fails to run!
        "--num-reducers", str(opts.num_reducers),
        "--solr-zookeepers", opts.solr_zookeepers,
        "--solr-collection", opts.solr_collection,
    ]

    indexer_step = JarStep(
        jobconf=conf,
        jar=jar,
        main_class='uk.bl.wa.hadoop.indexer.WARCIndexerRunner',
        args=runner_args,
    )
    return [indexer_step]
def steps(self):
    """Return the single JarStep that runs ``HdfsFileHasher``.

    Both map output and final output compression are switched on in the
    jobconf, with Gzip as the output codec.
    """
    conf = {
        'mapred.compress.map.output': 'true',
        'mapred.output.compress': 'true',
        'mapred.output.compression.codec':
            'org.apache.hadoop.io.compress.GzipCodec',
    }
    jar = self.jar_path

    file_list = str(self.options.filelist)
    # n.b. if you pass as int it fails mysteriously, use str!
    reducer_count = str(self.options.num_reducers)
    hasher_args = [
        # This ensures the various jobconf etc. params are included.
        GENERIC_ARGS,
        '-i', file_list,
        '-o', OUTPUT,
        '-r', reducer_count,
    ]

    hasher_step = JarStep(
        jobconf=conf,
        jar=jar,
        main_class='uk.bl.wa.hadoop.mapreduce.hash.HdfsFileHasher',
        args=hasher_args,
    )
    return [hasher_step]
def steps(self):
    """Return the single JarStep that runs ``ArchiveCDXGenerator``.

    Both map output and final output compression are switched on in the
    jobconf, with Gzip as the output codec.
    """
    conf = {
        'mapred.compress.map.output': 'true',
        'mapred.output.compress': 'true',
        'mapred.output.compression.codec':
            'org.apache.hadoop.io.compress.GzipCodec',
    }
    jar = self.jar_path

    # n.b. if you pass as int it fails mysteriously, use str!
    reducer_count = str(self.options.num_reducers)
    cdx_args = [
        # This ensures the various jobconf etc. params are included.
        GENERIC_ARGS,
        '-i', INPUT,
        '-o', OUTPUT,
        '-r', reducer_count,
        '-w',
        '-h',
        '-m', '',
        '-t', self.options.cdx_endpoint,
        '-c', "CDX N b a m s k r M S V g",
    ]

    cdx_step = JarStep(
        jobconf=conf,
        jar=jar,
        main_class='uk.bl.wa.hadoop.mapreduce.cdx.ArchiveCDXGenerator',
        args=cdx_args,
    )
    return [cdx_step]
def jar(cls, *args, **kwargs):
    """Build a :py:class:`~mrjob.step.JarStep` from the given arguments.

    Logs a deprecation warning, then forwards all positional and keyword
    arguments unchanged.

    .. deprecated:: 0.4.2
    """
    deprecation_msg = ('MRJob.jar() is deprecated and will be removed in'
                       ' v0.5.0. Use mrjob.step.JarStep directly.')
    log.warning(deprecation_msg)

    step = JarStep(*args, **kwargs)
    return step
def steps(self):
    """Return a JarStep (jar taken from options) followed by an MRStep."""
    jar_step = JarStep(
        jar=self.options.jar,
        args=['stuff', INPUT, OUTPUT],
    )
    streaming_step = MRStep(mapper=self.mapper, reducer=self.reducer)
    return [jar_step, streaming_step]
def steps(self):
    """Return a 'wordcount' JarStep followed by an MRStep.

    The MRStep uses ``self.reducer`` as both combiner and reducer.
    """
    wordcount_step = JarStep(
        jar=HADOOP_EXAMPLES_JAR,
        args=['wordcount', INPUT, OUTPUT],
    )
    streaming_step = MRStep(
        mapper=self.mapper,
        combiner=self.reducer,
        reducer=self.reducer,
    )
    return [wordcount_step, streaming_step]
def test_steps(self):
    """SteppyJob.steps() yields the expected MRStep and then the JarStep."""
    job = self.SteppyJob(['--no-conf'])

    actual_first = job.steps()[0]
    expected_first = MRStep(
        mapper_init=job._yield_none,
        mapper_pre_filter='cat',
        reducer_cmd='wc -l',
    )
    self.assertEqual(actual_first, expected_first)

    actual_second = job.steps()[1]
    expected_second = JarStep(jar='s3://bookat/binks_jar.jar')
    self.assertEqual(actual_second, expected_second)
def test_jar(self):
    """MRJob.jar() returns a step equal to JarStep built from the same kwargs."""
    step_kwargs = dict(
        jar='binks.jar.jar',
        main_class='MyMainMan',
        args=['argh', 'argh'],
    )

    with logger_disabled('mrjob.job'):
        via_alias = MRJob.jar(**step_kwargs)
        direct = JarStep(**step_kwargs)
        self.assertEqual(via_alias, direct)
def test_middle_jar_step(self):
    """Check script protocols when a JarStep sits between two MRSteps."""
    step_descriptions = [
        MRStep(mapper=self._yield_none,
               combiner=self._yield_none).description(0),
        JarStep(jar='binks_jar.jar').description(1),
        MRStep(reducer=self._yield_none).description(2),
    ]
    expected_protocols = [
        (PickleProtocol, JSONProtocol),
        (JSONProtocol, JSONProtocol),
        (RawValueProtocol, RawValueProtocol),
        (JSONProtocol, JSONValueProtocol),
    ]
    self._assert_script_protocols(step_descriptions, expected_protocols)
def steps(self):
    """Return a JarStep (jar taken from options) followed by a ``self.mr`` step."""
    jar_step = JarStep(
        name='',
        jar=self.options.jar,
        step_args=['stuff', JarStep.INPUT, JarStep.OUTPUT],
    )
    streaming_step = self.mr(mapper=self.mapper, reducer=self.reducer)
    return [jar_step, streaming_step]
def steps(self):
    """Return a 'wordcount' JarStep followed by a ``self.mr`` step.

    The second step uses ``self.reducer`` as both combiner and reducer.
    """
    wordcount_step = JarStep(
        jar=HADOOP_EXAMPLES_JAR,
        name='',
        step_args=['wordcount', JarStep.INPUT, JarStep.OUTPUT],
    )
    streaming_step = self.mr(
        mapper=self.mapper,
        combiner=self.reducer,
        reducer=self.reducer,
    )
    return [wordcount_step, streaming_step]
def test_all(self):
    """description(0) echoes jar, main_class and args, plus type='jar'."""
    step_kwargs = dict(
        jar='binks.jar.jar',
        main_class='MyMainMan',
        args=['argh', 'argh'],
    )
    # Same keys as the constructor kwargs, with the step type added.
    expected = dict(step_kwargs, type='jar')

    described = JarStep(**step_kwargs).description(0)
    self.assertEqual(described, expected)
def test_middle_jar_step(self):
    """Check script protocols when a JarStep sits between two MRSteps.

    The expected entry for the JarStep itself is ``None``.
    """
    steps = [
        MRStep(mapper=self._yield_none, combiner=self._yield_none),
        JarStep(jar='binks_jar.jar'),
        MRStep(reducer=self._yield_none),
    ]
    expected_protocols = [
        {
            'mapper': (PickleProtocol, JSONProtocol),
            'combiner': (JSONProtocol, JSONProtocol),
        },
        None,
        {'reducer': (JSONProtocol, JSONValueProtocol)},
    ]
    self._assert_script_protocols(steps, expected_protocols)
def test_some(self):
    """With only ``jar`` given, description(0) has main_class None and args []."""
    step_kwargs = {'jar': 'binks.jar.jar'}
    expected = dict(step_kwargs, type='jar', main_class=None, args=[])

    described = JarStep(**step_kwargs).description(0)
    self.assertEqual(described, expected)
def test_multistep(self):
    """Check script protocols for MRStep / JarStep / MRStep with a command mapper."""
    # reducer is only script step so it uses INPUT_PROTOCOL and
    # OUTPUT_PROTOCOL
    steps = [
        MRStep(mapper_cmd='cat', reducer=self._yield_none),
        JarStep(jar='binks_jar.jar'),
        MRStep(mapper=self._yield_none),
    ]
    expected_protocols = [
        {'mapper': None, 'reducer': (PickleProtocol, JSONProtocol)},
        None,
        {'mapper': (JSONProtocol, JSONValueProtocol)},
    ]
    self._assert_script_protocols(steps, expected_protocols)
def test_multistep(self):
    """Check script protocols for described MRStep / JarStep / MRStep steps."""
    # reducer is only script step so it uses INPUT_PROTOCOL and
    # OUTPUT_PROTOCOL
    step_descriptions = [
        MRStep(mapper_cmd='cat', reducer=self._yield_none).description(0),
        JarStep(jar='binks_jar.jar').description(1),
        MRStep(mapper=self._yield_none).description(2),
    ]
    expected_protocols = [
        (RawValueProtocol, RawValueProtocol),
        (PickleProtocol, JSONProtocol),
        (RawValueProtocol, RawValueProtocol),
        (JSONProtocol, JSONValueProtocol),
    ]
    self._assert_script_protocols(step_descriptions, expected_protocols)
def steps(self):
    """Return a word-count JarStep followed by an MRStep.

    When ``self.options.use_main_class`` is set, the JarStep names the main
    class explicitly; otherwise 'wordcount' is passed as the first argument.
    """
    examples_jar = self.options.examples_jar

    if self.options.use_main_class:
        jar_kwargs = dict(
            args=[GENERIC_ARGS, INPUT, OUTPUT],
            main_class=_WORDCOUNT_MAIN_CLASS,
        )
    else:
        jar_kwargs = dict(
            args=['wordcount', GENERIC_ARGS, INPUT, OUTPUT],
        )
    wordcount_step = JarStep(jar=examples_jar, **jar_kwargs)

    streaming_step = MRStep(
        mapper=self.mapper,
        combiner=self.reducer,
        reducer=self.reducer,
    )
    return [wordcount_step, streaming_step]
def steps(self):
    """Return a word-count JarStep followed by an MRStep.

    The jar is looked up in ``_RUNNER_TO_EXAMPLES_JAR`` by
    ``self.options.runner``. When ``self.options.use_main_class`` is set,
    the JarStep names the main class explicitly; otherwise 'wordcount' is
    passed as the first argument.
    """
    examples_jar = _RUNNER_TO_EXAMPLES_JAR[self.options.runner]

    if self.options.use_main_class:
        jar_kwargs = dict(
            args=[INPUT, OUTPUT],
            main_class=_WORDCOUNT_MAIN_CLASS,
        )
    else:
        jar_kwargs = dict(
            args=['wordcount', INPUT, OUTPUT],
        )
    wordcount_step = JarStep(jar=examples_jar, **jar_kwargs)

    streaming_step = MRStep(
        mapper=self.mapper,
        combiner=self.reducer,
        reducer=self.reducer,
    )
    return [wordcount_step, streaming_step]
def _jar_step(self, n, *args, **kwargs):
    """Build a JarStep from the given arguments and return ``description(n)``."""
    step = JarStep(*args, **kwargs)
    return step.description(n)
def steps(self):
    """Return a single JarStep built positionally from '' and the jar option."""
    only_step = JarStep('', self.options.jar)
    return [only_step]
def steps(self):
    """Return a single JarStep with GENERIC_ARGS placed between two literal args."""
    step_args = ['before', GENERIC_ARGS, 'after']
    only_step = JarStep(
        args=step_args,
        jar=self.options.jar,
        main_class=self.options.main_class,
    )
    return [only_step]
def steps(self):
    """Return a single JarStep using the jar and main class from options."""
    only_step = JarStep(
        jar=self.options.jar,
        main_class=self.options.main_class,
    )
    return [only_step]
def test_positional(self):
    """A JarStep built from four positionals equals its keyword-built equivalent."""
    with logger_disabled('mrjob.step'):
        positional = JarStep('foo', 'bell.jar', 'First', ['one', '2'])
        keyword = JarStep(
            jar='bell.jar',
            main_class='First',
            args=['one', '2'],
        )
        self.assertEqual(positional, keyword)
def test_name_kwarg(self):
    """Passing ``name`` does not change equality with a JarStep lacking it."""
    with logger_disabled('mrjob.step'):
        with_name = JarStep(jar='pickle.jar', name='Bubbies')
        without_name = JarStep(jar='pickle.jar')
        self.assertEqual(with_name, without_name)
def test_step_args_kwarg(self):
    """A JarStep given ``step_args`` equals one given the same list as ``args``."""
    with logger_disabled('mrjob.step'):
        via_step_args = JarStep(jar='bell.jar', step_args=['5', 'six'])
        via_args = JarStep(jar='bell.jar', args=['5', 'six'])
        self.assertEqual(via_step_args, via_args)
def test_mixed(self):
    """A leading positional plus keywords equals the keyword-only JarStep."""
    with logger_disabled('mrjob.step'):
        mixed = JarStep('foo', jar='bell.jar', args=['3', 'four'])
        keyword_only = JarStep(jar='bell.jar', args=['3', 'four'])
        self.assertEqual(mixed, keyword_only)