def test_generate_cmd_with_partitioner(self):
    _job_name = "test_streaming_job_%s" % uuid.uuid4()
    _expected_command = 'hadoop jar ' \
                        '{0}/resources/mapreduce/hadoop-streaming.jar ' \
                        '-D mapreduce.job.name={1} ' \
                        '-D map.output.key.field.separator=| ' \
                        '-D mapreduce.partition.keypartitioner.options=-k1,2 ' \
                        '-mapper mapper.py ' \
                        '-reducer reducer.py ' \
                        '-numReduceTasks 0 ' \
                        '-input data ' \
                        '-output output.txt ' \
                        '-partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner' \
        .format(os.path.dirname(os.path.realpath(__file__)), _job_name)
    MapReduce.prepare_streaming_job(
        jar='{0}/resources/mapreduce/hadoop-streaming.jar'
            .format(os.path.dirname(os.path.realpath(__file__))),
        name=_job_name,
        executor=self.assert_generated_command(_expected_command)
    ).take('data').process_with(
        mapper='mapper.py', reducer='reducer.py', reducer_num=0
    ).use(
        partitioner='org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner'
    ).save('output.txt').with_config_option(
        key='map.output.key.field.separator', value='|'
    ).with_config_option(
        key='mapreduce.partition.keypartitioner.options', value='-k1,2'
    ).run()
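# Every command-generation test in this suite passes
# `self.assert_generated_command(<expected>)` as the job executor. That helper
# is defined elsewhere in the suite's base test class; a minimal sketch of how
# such a helper could work (an assumption about its signature, not the suite's
# actual implementation) is:

def assert_generated_command_sketch(test_case, expected_command):
    # Hypothetical helper: returns a fake executor that, instead of shelling
    # out, re-joins the command with its arguments and compares the result
    # against the expected command line.
    def _executor(command, *args):
        test_case.assertEqual(expected_command,
                              ' '.join([command] + [str(arg) for arg in args]))
    return _executor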
def test_streaming_map_only_job_generation(self):
    _config_file = os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        'resources', 'mapreduce', 'mapreduce_streaming_job.ini')
    metastore = IniFileMetaStore(file=_config_file)
    _config = Configuration.load(metastore=metastore)
    _job_name = 'streaming_test_job_map_only'
    _expected_command = 'hadoop jar ' \
                        '{0}/resources/mapreduce/hadoop-streaming.jar ' \
                        '-D mapreduce.job.name={1} ' \
                        '-D value.delimiter.char=, ' \
                        '-D partition.to.process=20142010 ' \
                        '-mapper smapper.py ' \
                        '-reducer NONE ' \
                        '-numReduceTasks 0 ' \
                        '-input /raw/20102014 ' \
                        '-output /core/20102014' \
        .format(os.path.dirname(os.path.realpath(__file__)), _job_name)
    MapReduce.prepare_streaming_job(
        jar='{0}/resources/mapreduce/hadoop-streaming.jar'.format(
            os.path.dirname(os.path.realpath(__file__))),
        config=_config,
        name=_job_name,
        executor=self.assert_generated_command(_expected_command)).run()
def test_generate_job_cmd_with_config_injections(self):
    _job_name = "test_streaming_job_%s" % uuid.uuid4()
    _expected_command = 'hadoop jar ' \
                        '{0}/resources/mapreduce/hadoop-streaming.jar ' \
                        '-D mapreduce.job.name={1} ' \
                        '-D value.delimiter.char=, ' \
                        '-D partition.to.process=20142010 ' \
                        '-mapper mapper.py ' \
                        '-reducer reducer.py ' \
                        '-input data ' \
                        '-output output.txt' \
        .format(os.path.dirname(os.path.realpath(__file__)), _job_name)
    MapReduce.prepare_streaming_job(
        jar='{0}/resources/mapreduce/hadoop-streaming.jar'
            .format(os.path.dirname(os.path.realpath(__file__))),
        name=_job_name,
        executor=self.assert_generated_command(_expected_command)
    ).take(
        'data'
    ).process_with(
        mapper='mapper.py', reducer='reducer.py'
    ).save(
        'output.txt'
    ).with_config_option(
        key='value.delimiter.char', value=','
    ).with_config_option(
        key='partition.to.process', value='20142010'
    ).run()
def test_streaming_job_with_multiple_inputs(self):
    _job_name = 'streaming_test_job_with_multiple_inputs'
    _expected_command = 'hadoop jar ' \
                        '{0}/resources/mapreduce/hadoop-streaming.jar ' \
                        '-D mapreduce.job.name={1} ' \
                        '-files dim1.txt ' \
                        '-libjars mr_001.jar,mr_002.jar ' \
                        '-mapper smapper.py ' \
                        '-reducer sreducer.py ' \
                        '-numReduceTasks 100 ' \
                        '-input /raw/20102014 ' \
                        '-input /raw/21102014 ' \
                        '-input /raw/22102014 ' \
                        '-output /core/20102014 ' \
                        '-inputformat \'org.mr.CustomInputFormat\' ' \
                        '-outputformat \'org.mr.CustomOutputFormat\' ' \
                        '-cmdenv JAVA_HOME=/java ' \
                        '-cmdenv tmp.dir=/tmp/streaming_test_job_with_multiple_inputs' \
        .format(os.path.dirname(os.path.realpath(__file__)), _job_name)
    MapReduce.prepare_streaming_job(
        jar='{0}/resources/mapreduce/hadoop-streaming.jar'.format(
            os.path.dirname(os.path.realpath(__file__))),
        config=self._config,
        name=_job_name,
        executor=self.assert_generated_command(_expected_command)).run()
def test_mr_job_command_generation(self):
    _job_name = "test_mr_job_%s" % uuid.uuid4()
    _expected_command = "hadoop jar " \
                        "{0}/resources/mapreduce/hadoop-mapreduce-examples.jar " \
                        "demo.mr.Driver " \
                        "-D mapreduce.job.name={1}" \
        .format(os.path.dirname(os.path.realpath(__file__)), _job_name)
    MapReduce.prepare_mapreduce_job(
        jar="{0}/resources/mapreduce/hadoop-mapreduce-examples.jar".format(
            os.path.dirname(os.path.realpath(__file__))),
        main_class="demo.mr.Driver",
        name=_job_name,
        executor=self.assert_generated_command(_expected_command)).run()
def test_mr_job_command_generation(self):
    _expected_command = 'hadoop jar ' \
                        '{0}/resources/mapreduce/hadoop-mapreduce-examples.jar ' \
                        'test.mr.Driver ' \
                        '-D mapreduce.job.name=simple_mr_job ' \
                        '-D value.delimiter.char=, ' \
                        '-D partition.to.process=20142010 ' \
                        '/input/dir ' \
                        '/output/dir'.format(os.path.dirname(os.path.realpath(__file__)))
    MapReduce.prepare_mapreduce_job(
        jar="{0}/resources/mapreduce/hadoop-mapreduce-examples.jar".format(
            os.path.dirname(os.path.realpath(__file__))),
        main_class="test.mr.Driver",
        config=self._config,
        name='simple_mr_job',
        executor=self.assert_generated_command(_expected_command)).run()
def test_streaming_job_generation(self):
    _job_name = 'streaming_test_job'
    _expected_command = 'hadoop jar ' \
                        '{0}/resources/mapreduce/hadoop-streaming.jar ' \
                        '-D mapreduce.job.name={1} ' \
                        '-mapper smapper.py ' \
                        '-reducer sreducer.py ' \
                        '-input /raw/20102014 ' \
                        '-output /core/20102014' \
        .format(os.path.dirname(os.path.realpath(__file__)), _job_name)
    MapReduce.prepare_streaming_job(
        jar='{0}/resources/mapreduce/hadoop-streaming.jar'.format(
            os.path.dirname(os.path.realpath(__file__))),
        config=self._config,
        name=_job_name,
        executor=self.assert_generated_command(_expected_command)).run()
def test_mr_job_command_generation_with_configurations(self):
    _job_name = "test_mr_job_%s" % uuid.uuid4()
    _expected_command = "hadoop jar " \
                        "{0}/resources/mapreduce/hadoop-mapreduce-examples.jar " \
                        "demo.mr.Driver " \
                        "-D mapreduce.job.name={1} " \
                        "-D job.input=/data/raw/24102014 " \
                        "-D mapreduce.job.reduces=10" \
        .format(os.path.dirname(os.path.realpath(__file__)), _job_name)
    MapReduce.prepare_mapreduce_job(
        jar="{0}/resources/mapreduce/hadoop-mapreduce-examples.jar"
            .format(os.path.dirname(os.path.realpath(__file__))),
        main_class="demo.mr.Driver",
        name=_job_name,
        executor=self.assert_generated_command(_expected_command)
    ).with_config_option("job.input", "/data/raw/24102014") \
        .with_number_of_reducers(10) \
        .run()
def test_streaming_job_without_reducers(self):
    _job_name = "test_streaming_job_%s" % uuid.uuid4()
    _expected_command = 'hadoop jar ' \
                        '{0}/resources/mapreduce/hadoop-streaming.jar ' \
                        '-D mapreduce.job.name={1} ' \
                        '-mapper mapper.py ' \
                        '-reducer NONE ' \
                        '-numReduceTasks 0 ' \
                        '-input data ' \
                        '-output output.txt' \
        .format(os.path.dirname(os.path.realpath(__file__)), _job_name)
    MapReduce.prepare_streaming_job(
        jar='{0}/resources/mapreduce/hadoop-streaming.jar'.format(
            os.path.dirname(os.path.realpath(__file__))),
        name=_job_name,
        executor=self.assert_generated_command(_expected_command)
    ).take("data").map_with(
        mapper="mapper.py"
    ).disable_reducers().save("output.txt").run()
def test_load_streaming_job_with_config_injections(self):
    _job_name = 'streaming_test_job_with_custom_configurations'
    _expected_command = 'hadoop jar ' \
                        '{0}/resources/mapreduce/hadoop-streaming.jar ' \
                        '-D mapreduce.job.name={1} ' \
                        '-D value.delimiter.char=, ' \
                        '-D partition.to.process=20142010 ' \
                        '-mapper smapper.py ' \
                        '-reducer sreducer.py ' \
                        '-input /raw/20102014 ' \
                        '-output /core/20102014' \
        .format(os.path.dirname(os.path.realpath(__file__)), _job_name)
    MapReduce.prepare_streaming_job(
        jar='{0}/resources/mapreduce/hadoop-streaming.jar'.format(
            os.path.dirname(os.path.realpath(__file__))),
        config=self._config,
        name=_job_name,
        executor=self.assert_generated_command(_expected_command)).run()
def test_streaming_job(self):
    _job_name = "test_streaming_job_%s" % uuid.uuid4()
    _expected_command = 'hadoop jar ' \
                        '{0}/resources/mapreduce/hadoop-streaming.jar ' \
                        '-D mapreduce.job.name={1} ' \
                        '-mapper mapper.py ' \
                        '-reducer reducer.py ' \
                        '-input data ' \
                        '-output output.txt' \
        .format(os.path.dirname(os.path.realpath(__file__)), _job_name)
    MapReduce.prepare_streaming_job(
        jar='{0}/resources/mapreduce/hadoop-streaming.jar'
            .format(os.path.dirname(os.path.realpath(__file__))),
        name=_job_name,
        executor=self.assert_generated_command(_expected_command)
    ).take(
        "data"
    ).process_with(
        mapper="mapper.py", reducer="reducer.py"
    ).save("output.txt").run()
def test_mr_job_command_generation_with_arguments(self):
    _job_name = "test_mr_job_%s" % uuid.uuid4()
    _expected_command = "hadoop jar " \
                        "{0}/resources/mapreduce/hadoop-mapreduce-examples.jar " \
                        "wordcount " \
                        "-D mapreduce.job.name={1} " \
                        "-D split.by='\\t' " \
                        "-D mapreduce.job.reduces=3 " \
                        "/user/vagrant/dmode.txt " \
                        "/tmp/test".format(os.path.dirname(os.path.realpath(__file__)),
                                           _job_name)
    MapReduce.prepare_mapreduce_job(
        jar="{0}/resources/mapreduce/hadoop-mapreduce-examples.jar"
            .format(os.path.dirname(os.path.realpath(__file__))),
        main_class="wordcount",
        name=_job_name,
        executor=self.assert_generated_command(_expected_command)
    ).with_config_option("split.by", "'\\t'") \
        .with_number_of_reducers(3) \
        .with_arguments() \
        .run("/user/vagrant/dmode.txt", "/tmp/test")
def process_data(context):
    # Configure and run MapReduce job
    _mapreduce_job_ = MapReduce.prepare_mapreduce_job(
        jar=os.path.join(os.path.dirname(__file__),
                         'resources/WordsCount-1.0-SNAPSHOT.jar'),
        main_class="WordsCountJob",
        name="MAPREDUCE_Counting"
    ).with_config_option("input", "{0}/data_from_import".format(BASE_DIR)) \
        .with_config_option("output", "{0}/data_to_export".format(BASE_DIR))
    _mapreduce_job_.run()
    status = _mapreduce_job_.status()
    if not status.is_succeeded():
        raise MapReduceJobException("MapReduce job failed: {}".format(
            status.failure_reason() if status.is_failed() else 'NONE'))
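# `MapReduceJobException` and `BASE_DIR` are used above but are not defined in
# this snippet; both are assumed to live elsewhere in the project. A minimal
# placeholder for the exception (an assumption, not necessarily the project's
# real definition) would be:

class MapReduceJobException(Exception):
    """Raised when a submitted MapReduce job does not finish successfully."""
    pass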
def test_generate_cmd_with_input_output_format(self):
    _job_name = "test_streaming_job_%s" % uuid.uuid4()
    _expected_command = 'hadoop jar ' \
                        '{0}/resources/mapreduce/hadoop-streaming.jar ' \
                        '-D mapreduce.job.name={1} ' \
                        '-mapper mapper.py ' \
                        '-reducer reducer.py ' \
                        '-numReduceTasks 10 ' \
                        '-input data ' \
                        '-output output.txt ' \
                        '-inputformat org.apache.hadoop.mapred.KeyValueTextInputFormat ' \
                        '-outputformat org.apache.hadoop.mapred.SequenceFileOutputFormat' \
        .format(os.path.dirname(os.path.realpath(__file__)), _job_name)
    MapReduce.prepare_streaming_job(
        jar='{0}/resources/mapreduce/hadoop-streaming.jar'
            .format(os.path.dirname(os.path.realpath(__file__))),
        name=_job_name,
        executor=self.assert_generated_command(_expected_command)
    ).process_with(
        mapper='mapper.py', reducer='reducer.py', reducer_num=10
    ).use(
        inputformat='org.apache.hadoop.mapred.KeyValueTextInputFormat',
        outputformat='org.apache.hadoop.mapred.SequenceFileOutputFormat'
    ).take('data').save('output.txt').run()
def test_mr_job_command_generation_with_arguments(self):
    _job_name = "test_mr_job_%s" % uuid.uuid4()
    _base_dir = HDFS(os.path.join("/tmp", _job_name))
    _base_dir.create_directory()
    try:
        jar = os.path.join(os.path.dirname(__file__),
                           'resources', 'mapreduce', 'hadoop-mapreduce-examples.jar')
        # configure job inputs
        _job_input = HDFS(os.path.join(_base_dir.path, "input"))
        _job_input.create_directory()
        LocalFS(os.path.join(
            os.path.dirname(__file__), 'resources', 'mapreduce', 'raw-data.txt')
        ).copy_to_hdfs(_job_input.path)
        # configure job output
        _job_output = HDFS(os.path.join(_base_dir.path, "output"))
        if not os.path.exists(jar):
            self.skipTest("'%s' not found" % jar)
        job = MapReduce.prepare_mapreduce_job(jar=jar,
                                              main_class="wordcount",
                                              name=_job_name) \
            .with_config_option("split.by", "'\\t'") \
            .with_number_of_reducers(3) \
            .with_arguments(_job_input.path, _job_output.path)
        _command_submission_result = job.run()
        _command_submission_result.if_failed_raise(AssertionError("Cannot run MR job"))
        _job_status = job.status()
        self.assertTrue(_job_status is not None and _job_status.is_succeeded(),
                        "MR job Failed")
        self.assertTrue(_job_output.exists(), "Error: empty job output")
        # check counters
        self.assertEqual(6, _job_status.counter(group='File System Counters',
                                                counter='HDFS: Number of write operations'))
        self.assertEqual(1, _job_status.counter(group='Job Counters',
                                                counter='Launched map tasks'))
        self.assertEqual(3, _job_status.counter(group='Job Counters',
                                                counter='Launched reduce tasks'))
        self.assertEqual(2168, _job_status.counter(group='File Input Format Counters',
                                                   counter='Bytes Read'))
    finally:
        _base_dir.delete_directory()
def _template_streaming_job_(self, base_dir="/tmp", map_only_job=False):
    # unittest.TestCase has no skip() method; skipTest() is the correct call
    if not os.path.exists(HADOOP_STREAMING_JAR):
        self.skipTest("Cannot allocate %s" % HADOOP_STREAMING_JAR)
    _hdfs_base_dir = HDFS(base_dir)
    if not _hdfs_base_dir.exists():
        _hdfs_base_dir.create_directory()
    _job_input = HDFS(os.path.join(_hdfs_base_dir.path, "input"))
    _job_input.create_directory()
    _job_output = HDFS(os.path.join(_hdfs_base_dir.path, "output"))
    home = os.path.dirname(__file__)
    _mapper = os.path.join(home, 'resources', 'mapreduce', 'mapper.py')
    _reducer = os.path.join(home, 'resources', 'mapreduce', 'reducer.py')
    LocalFS(
        os.path.join(home, 'resources', 'mapreduce', 'raw-data.txt')
    ).copy_to_hdfs(_job_input.path)
    # a map-only job is requested by passing reducer=None
    return MapReduce.prepare_streaming_job(
        name="test-mr-streaming-job{}".format(str(uuid.uuid4())),
        jar=HADOOP_STREAMING_JAR) \
        .take(_job_input.path) \
        .process_with(mapper=_mapper,
                      reducer=None if map_only_job else _reducer) \
        .save(_job_output.path)
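# A hypothetical end-to-end caller for the template above (illustrative only:
# the test name and assertions are assumptions, not part of the original
# suite). It drives the prepared job with the same run()/status() API the
# tests above already rely on:

def test_streaming_job_end_to_end_sketch(self):
    _job = self._template_streaming_job_(base_dir="/tmp/streaming_e2e")
    _submission = _job.run()
    _submission.if_failed_raise(AssertionError("Cannot run MR streaming job"))
    _job_status = _job.status()
    self.assertTrue(_job_status is not None and _job_status.is_succeeded(),
                    "MR streaming job failed")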