Example #1
    def test_generate_cmd_with_partitioner(self):
        _job_name = "test_streaming_job_%s" % uuid.uuid4()
        _expected_command = 'hadoop jar ' \
                            '{0}/resources/mapreduce/hadoop-streaming.jar ' \
                            '-D mapreduce.job.name={1} ' \
                            '-D map.output.key.field.separator=| ' \
                            '-D mapreduce.partition.keypartitioner.options=-k1,2 ' \
                            '-mapper mapper.py ' \
                            '-reducer reducer.py ' \
                            '-numReduceTasks 0 ' \
                            '-input data ' \
                            '-output output.txt ' \
                            '-partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner'\
            .format(os.path.dirname(os.path.realpath(__file__)), _job_name)

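        # The injected executor is a test double: instead of launching Hadoop,
        # assert_generated_command checks that run() produced _expected_command.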
        MapReduce.prepare_streaming_job(
            jar='{0}/resources/mapreduce/hadoop-streaming.jar'
            .format(os.path.dirname(os.path.realpath(__file__))),
            name=_job_name,
            executor=self.assert_generated_command(_expected_command)
        ).take('data').process_with(
            mapper='mapper.py', reducer='reducer.py', reducer_num=0
        ).use(
            partitioner='org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner'
        ).save('output.txt').with_config_option(
            key='map.output.key.field.separator',
            value='|'
        ).with_config_option(
            key='mapreduce.partition.keypartitioner.options',
            value='-k1,2'
        ).run()
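Throughout these examples the fluent builder mirrors the hadoop-streaming CLI: take() adds -input, save() adds -output, process_with() adds -mapper/-reducer/-numReduceTasks, use() adds -partitioner or -inputformat/-outputformat, and with_config_option() adds -D key=value pairs.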
Example #2
 def test_streaming_map_only_job_generation(self):
     _config_file = os.path.join(
         os.path.dirname(os.path.realpath(__file__)), 'resources',
         'mapreduce', 'mapreduce_streaming_job.ini')
     metastore = IniFileMetaStore(file=_config_file)
     _config = Configuration.load(metastore=metastore)
     _job_name = 'streaming_test_job_map_only'
     _expected_command = 'hadoop jar ' \
                         '{0}/resources/mapreduce/hadoop-streaming.jar ' \
                         '-D mapreduce.job.name={1} ' \
                         '-D value.delimiter.char=, ' \
                         '-D partition.to.process=20142010 ' \
                         '-mapper smapper.py ' \
                         '-reducer NONE ' \
                         '-numReduceTasks 0 ' \
                         '-input /raw/20102014 ' \
                         '-output /core/20102014'\
          .format(os.path.dirname(os.path.realpath(__file__)), _job_name)
     MapReduce.prepare_streaming_job(
         jar='{0}/resources/mapreduce/hadoop-streaming.jar'.format(
             os.path.dirname(os.path.realpath(__file__))),
         config=_config,
         name=_job_name,
         executor=self.assert_generated_command(_expected_command)).run()
Example #3
 def test_generate_job_cmd_with_config_injections(self):
     _job_name = "test_streaming_job_%s" % uuid.uuid4()
     _expected_command = 'hadoop jar ' \
                         '{0}/resources/mapreduce/hadoop-streaming.jar ' \
                         '-D mapreduce.job.name={1} ' \
                         '-D value.delimiter.char=, ' \
                         '-D partition.to.process=20142010 ' \
                         '-mapper mapper.py ' \
                         '-reducer reducer.py ' \
                         '-input data ' \
                         '-output output.txt'\
          .format(os.path.dirname(os.path.realpath(__file__)), _job_name)
     MapReduce.prepare_streaming_job(
         jar='{0}/resources/mapreduce/hadoop-streaming.jar'
         .format(os.path.dirname(os.path.realpath(__file__))),
         name=_job_name,
         executor=self.assert_generated_command(_expected_command)
     ).take(
         'data'
     ).process_with(
         mapper='mapper.py',
         reducer='reducer.py'
     ).save(
         'output.txt'
     ).with_config_option(
         key='value.delimiter.char',
         value=','
     ).with_config_option(
         key='partition.to.process',
         value='20142010'
     ).run()
Example #4
 def test_streaming_map_only_job_generation(self):
     _config_file = os.path.join(
         os.path.dirname(os.path.realpath(__file__)),
         'resources',
         'mapreduce',
         'mapreduce_streaming_job.ini')
     metastore = IniFileMetaStore(file=_config_file)
     _config = Configuration.load(metastore=metastore)
     _job_name = 'streaming_test_job_map_only'
     _expected_command = 'hadoop jar ' \
                         '{0}/resources/mapreduce/hadoop-streaming.jar ' \
                         '-D mapreduce.job.name={1} ' \
                         '-D value.delimiter.char=, ' \
                         '-D partition.to.process=20142010 ' \
                         '-mapper smapper.py ' \
                         '-reducer NONE ' \
                         '-numReduceTasks 0 ' \
                         '-input /raw/20102014 ' \
                         '-output /core/20102014'\
          .format(os.path.dirname(os.path.realpath(__file__)), _job_name)
     MapReduce.prepare_streaming_job(
         jar='{0}/resources/mapreduce/hadoop-streaming.jar'
         .format(os.path.dirname(os.path.realpath(__file__))),
         config=_config,
         name=_job_name,
         executor=self.assert_generated_command(_expected_command)
     ).run()
Example #5
 def test_streaming_job_with_multiple_inputs(self):
     _job_name = 'streaming_test_job_with_multiple_inputs'
     _expected_command = 'hadoop jar ' \
                         '{0}/resources/mapreduce/hadoop-streaming.jar ' \
                         '-D mapreduce.job.name={1} ' \
                         '-files dim1.txt ' \
                         '-libjars mr_001.jar,mr_002.jar ' \
                         '-mapper smapper.py ' \
                         '-reducer sreducer.py ' \
                         '-numReduceTasks 100 ' \
                         '-input /raw/20102014 ' \
                         '-input /raw/21102014 ' \
                         '-input /raw/22102014 ' \
                         '-output /core/20102014 ' \
                         '-inputformat \'org.mr.CustomInputFormat\' ' \
                         '-outputformat \'org.mr.CustomOutputFormat\' ' \
                         '-cmdenv JAVA_HOME=/java ' \
                         '-cmdenv tmp.dir=/tmp/streaming_test_job_with_multiple_inputs'\
          .format(os.path.dirname(os.path.realpath(__file__)), _job_name)
     MapReduce.prepare_streaming_job(
         jar='{0}/resources/mapreduce/hadoop-streaming.jar'.format(
             os.path.dirname(os.path.realpath(__file__))),
         config=self._config,
         name=_job_name,
         executor=self.assert_generated_command(_expected_command)).run()
Example #6
 def test_streaming_job_with_multiple_inputs(self):
     _job_name = 'streaming_test_job_with_multiple_inputs'
     _expected_command = 'hadoop jar ' \
                         '{0}/resources/mapreduce/hadoop-streaming.jar ' \
                         '-D mapreduce.job.name={1} ' \
                         '-files dim1.txt ' \
                         '-libjars mr_001.jar,mr_002.jar ' \
                         '-mapper smapper.py ' \
                         '-reducer sreducer.py ' \
                         '-numReduceTasks 100 ' \
                         '-input /raw/20102014 ' \
                         '-input /raw/21102014 ' \
                         '-input /raw/22102014 ' \
                         '-output /core/20102014 ' \
                         '-inputformat \'org.mr.CustomInputFormat\' ' \
                         '-outputformat \'org.mr.CustomOutputFormat\' ' \
                         '-cmdenv JAVA_HOME=/java ' \
                         '-cmdenv tmp.dir=/tmp/streaming_test_job_with_multiple_inputs'\
          .format(os.path.dirname(os.path.realpath(__file__)), _job_name)
     MapReduce.prepare_streaming_job(
         jar='{0}/resources/mapreduce/hadoop-streaming.jar'
         .format(os.path.dirname(os.path.realpath(__file__))),
         config=self._config,
         name=_job_name,
         executor=self.assert_generated_command(_expected_command)
     ).run()
Example #7
    def test_generate_cmd_with_partitioner(self):
        _job_name = "test_streaming_job_%s" % uuid.uuid4()
        _expected_command = 'hadoop jar ' \
                            '{0}/resources/mapreduce/hadoop-streaming.jar ' \
                            '-D mapreduce.job.name={1} ' \
                            '-D map.output.key.field.separator=| ' \
                            '-D mapreduce.partition.keypartitioner.options=-k1,2 ' \
                            '-mapper mapper.py ' \
                            '-reducer reducer.py ' \
                            '-numReduceTasks 0 ' \
                            '-input data ' \
                            '-output output.txt ' \
                            '-partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner'\
            .format(os.path.dirname(os.path.realpath(__file__)), _job_name)

        MapReduce.prepare_streaming_job(
            jar='{0}/resources/mapreduce/hadoop-streaming.jar'.format(
                os.path.dirname(os.path.realpath(__file__))),
            name=_job_name,
            executor=self.assert_generated_command(_expected_command)
        ).take('data').process_with(
            mapper='mapper.py', reducer='reducer.py', reducer_num=0
        ).use(
            partitioner='org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner'
        ).save('output.txt').with_config_option(
            key='map.output.key.field.separator',
            value='|').with_config_option(
                key='mapreduce.partition.keypartitioner.options',
                value='-k1,2').run()
Example #8
 def test_mr_job_command_generation(self):
     _job_name = "test_mr_job_%s" % uuid.uuid4()
      _expected_command = "hadoop jar " \
                          "{0}/resources/mapreduce/hadoop-mapreduce-examples.jar " \
                          "demo.mr.Driver " \
                          "-D mapreduce.job.name={1}"\
          .format(os.path.dirname(os.path.realpath(__file__)), _job_name)
     MapReduce.prepare_mapreduce_job(
         jar="{0}/resources/mapreduce/hadoop-mapreduce-examples.jar".format(
             os.path.dirname(os.path.realpath(__file__))),
         main_class="demo.mr.Driver",
         name=_job_name,
         executor=self.assert_generated_command(_expected_command)).run()
Example #9
 def test_mr_job_command_generation(self):
     _job_name = "test_mr_job_%s" % uuid.uuid4()
      _expected_command = "hadoop jar " \
                          "{0}/resources/mapreduce/hadoop-mapreduce-examples.jar " \
                          "demo.mr.Driver " \
                          "-D mapreduce.job.name={1}"\
          .format(os.path.dirname(os.path.realpath(__file__)), _job_name)
     MapReduce.prepare_mapreduce_job(
         jar="{0}/resources/mapreduce/hadoop-mapreduce-examples.jar"
         .format(os.path.dirname(os.path.realpath(__file__))),
         main_class="demo.mr.Driver",
         name=_job_name,
         executor=self.assert_generated_command(_expected_command)).run()
Example #10
 def test_mr_job_command_generation(self):
     _expected_command = 'hadoop jar {0}/resources/mapreduce/hadoop-mapreduce-examples.jar ' \
                         'test.mr.Driver ' \
                         '-D mapreduce.job.name=simple_mr_job ' \
                         '-D value.delimiter.char=, ' \
                         '-D partition.to.process=20142010 ' \
                         '/input/dir ' \
                         '/output/dir'.format(os.path.dirname(os.path.realpath(__file__)))
     MapReduce.prepare_mapreduce_job(
         jar="{0}/resources/mapreduce/hadoop-mapreduce-examples.jar".format(
             os.path.dirname(os.path.realpath(__file__))),
         main_class="test.mr.Driver",
         config=self._config,
         name='simple_mr_job',
         executor=self.assert_generated_command(_expected_command)).run()
Example #11
 def test_mr_job_command_generation(self):
     _expected_command = 'hadoop jar {0}/resources/mapreduce/hadoop-mapreduce-examples.jar ' \
                         'test.mr.Driver ' \
                         '-D mapreduce.job.name=simple_mr_job ' \
                         '-D value.delimiter.char=, ' \
                         '-D partition.to.process=20142010 ' \
                         '/input/dir ' \
                         '/output/dir'.format(os.path.dirname(os.path.realpath(__file__)))
     MapReduce.prepare_mapreduce_job(
         jar="{0}/resources/mapreduce/hadoop-mapreduce-examples.jar"
         .format(os.path.dirname(os.path.realpath(__file__))),
         main_class="test.mr.Driver",
         config=self._config,
         name='simple_mr_job',
         executor=self.assert_generated_command(_expected_command)
     ).run()
Example #12
 def test_streaming_job_generation(self):
     _job_name = 'streaming_test_job'
     _expected_command = 'hadoop jar ' \
                         '{0}/resources/mapreduce/hadoop-streaming.jar ' \
                         '-D mapreduce.job.name={1} ' \
                         '-mapper smapper.py ' \
                         '-reducer sreducer.py ' \
                         '-input /raw/20102014 ' \
                         '-output /core/20102014'\
          .format(os.path.dirname(os.path.realpath(__file__)), _job_name)
     MapReduce.prepare_streaming_job(
         jar='{0}/resources/mapreduce/hadoop-streaming.jar'.format(
             os.path.dirname(os.path.realpath(__file__))),
         config=self._config,
         name=_job_name,
         executor=self.assert_generated_command(_expected_command)).run()
Example #13
 def test_mr_job_command_generation_with_configurations(self):
     _job_name = "test_mr_job_%s" % uuid.uuid4()
     _expected_command = "hadoop jar " \
                         "{0}/resources/mapreduce/hadoop-mapreduce-examples.jar " \
                         "demo.mr.Driver " \
                         "-D mapreduce.job.name={1} " \
                         "-D job.input=/data/raw/24102014 " \
                         "-D mapreduce.job.reduces=10"\
          .format(os.path.dirname(os.path.realpath(__file__)), _job_name)
     MapReduce.prepare_mapreduce_job(
         jar="{0}/resources/mapreduce/hadoop-mapreduce-examples.jar"
         .format(os.path.dirname(os.path.realpath(__file__))),
         main_class="demo.mr.Driver",
         name=_job_name,
         executor=self.assert_generated_command(_expected_command)
      ).with_config_option(
          "job.input", "/data/raw/24102014"
      ).with_number_of_reducers(10).run()
Example #14
 def test_streaming_job_generation(self):
     _job_name = 'streaming_test_job'
     _expected_command = 'hadoop jar ' \
                         '{0}/resources/mapreduce/hadoop-streaming.jar ' \
                         '-D mapreduce.job.name={1} ' \
                         '-mapper smapper.py ' \
                         '-reducer sreducer.py ' \
                         '-input /raw/20102014 ' \
                         '-output /core/20102014'\
          .format(os.path.dirname(os.path.realpath(__file__)), _job_name)
     MapReduce.prepare_streaming_job(
         jar='{0}/resources/mapreduce/hadoop-streaming.jar'
         .format(os.path.dirname(os.path.realpath(__file__))),
         config=self._config,
         name=_job_name,
         executor=self.assert_generated_command(_expected_command)
     ).run()
Example #15
 def test_mr_job_command_generation_with_configurations(self):
     _job_name = "test_mr_job_%s" % uuid.uuid4()
     _expected_command = "hadoop jar " \
                         "{0}/resources/mapreduce/hadoop-mapreduce-examples.jar " \
                         "demo.mr.Driver " \
                         "-D mapreduce.job.name={1} " \
                         "-D job.input=/data/raw/24102014 " \
                         "-D mapreduce.job.reduces=10"\
          .format(os.path.dirname(os.path.realpath(__file__)), _job_name)
     MapReduce.prepare_mapreduce_job(
         jar="{0}/resources/mapreduce/hadoop-mapreduce-examples.jar".format(
             os.path.dirname(os.path.realpath(__file__))),
         main_class="demo.mr.Driver",
         name=_job_name,
          executor=self.assert_generated_command(_expected_command)
      ).with_config_option(
          "job.input", "/data/raw/24102014"
      ).with_number_of_reducers(10).run()
Example #16
 def test_streaming_job_without_reducers(self):
     _job_name = "test_streaming_job_%s" % uuid.uuid4()
     _expected_command = 'hadoop jar ' \
                         '{0}/resources/mapreduce/hadoop-streaming.jar ' \
                         '-D mapreduce.job.name={1} ' \
                         '-mapper mapper.py ' \
                         '-reducer NONE ' \
                         '-numReduceTasks 0 ' \
                         '-input data ' \
                         '-output output.txt'\
          .format(os.path.dirname(os.path.realpath(__file__)), _job_name)
     MapReduce.prepare_streaming_job(
         jar='{0}/resources/mapreduce/hadoop-streaming.jar'.format(
             os.path.dirname(os.path.realpath(__file__))),
         name=_job_name,
         executor=self.assert_generated_command(_expected_command)
     ).take("data").map_with(
         mapper="mapper.py").disable_reducers().save("output.txt").run()
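Note: disable_reducers() renders both -reducer NONE and -numReduceTasks 0 in the generated command, the same flags the map-only jobs in Examples #2 and #4 pick up from their .ini configuration.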
Example #17
 def test_load_streaming_job_with_config_injections(self):
     _job_name = 'streaming_test_job_with_custom_configurations'
     _expected_command = 'hadoop jar ' \
                         '{0}/resources/mapreduce/hadoop-streaming.jar ' \
                         '-D mapreduce.job.name={1} ' \
                         '-D value.delimiter.char=, ' \
                         '-D partition.to.process=20142010 ' \
                         '-mapper smapper.py ' \
                         '-reducer sreducer.py ' \
                         '-input /raw/20102014 ' \
                         '-output /core/20102014'\
          .format(os.path.dirname(os.path.realpath(__file__)), _job_name)
     MapReduce.prepare_streaming_job(
         jar='{0}/resources/mapreduce/hadoop-streaming.jar'.format(
             os.path.dirname(os.path.realpath(__file__))),
         config=self._config,
         name=_job_name,
         executor=self.assert_generated_command(_expected_command)).run()
Example #18
 def test_load_streaming_job_with_config_injections(self):
     _job_name = 'streaming_test_job_with_custom_configurations'
     _expected_command = 'hadoop jar ' \
                         '{0}/resources/mapreduce/hadoop-streaming.jar ' \
                         '-D mapreduce.job.name={1} ' \
                         '-D value.delimiter.char=, ' \
                         '-D partition.to.process=20142010 ' \
                         '-mapper smapper.py ' \
                         '-reducer sreducer.py ' \
                         '-input /raw/20102014 ' \
                         '-output /core/20102014'\
          .format(os.path.dirname(os.path.realpath(__file__)), _job_name)
     MapReduce.prepare_streaming_job(
         jar='{0}/resources/mapreduce/hadoop-streaming.jar'
         .format(os.path.dirname(os.path.realpath(__file__))),
         config=self._config,
         name=_job_name,
         executor=self.assert_generated_command(_expected_command)).run()
Example #19
 def test_streaming_job(self):
     _job_name = "test_streaming_job_%s" % uuid.uuid4()
     _expected_command = 'hadoop jar ' \
                         '{0}/resources/mapreduce/hadoop-streaming.jar ' \
                         '-D mapreduce.job.name={1} ' \
                         '-mapper mapper.py ' \
                         '-reducer reducer.py ' \
                         '-input data ' \
                         '-output output.txt'\
          .format(os.path.dirname(os.path.realpath(__file__)), _job_name)
     MapReduce.prepare_streaming_job(
         jar='{0}/resources/mapreduce/hadoop-streaming.jar'
         .format(os.path.dirname(os.path.realpath(__file__))),
         name=_job_name,
         executor=self.assert_generated_command(_expected_command)
     ).take(
         "data"
     ).process_with(
          mapper="mapper.py",
          reducer="reducer.py"
      ).save("output.txt").run()
Example #20
 def test_mr_job_command_generation_with_arguments(self):
     _job_name = "test_mr_job_%s" % uuid.uuid4()
      _expected_command = "hadoop jar " \
                          "{0}/resources/mapreduce/hadoop-mapreduce-examples.jar " \
                          "wordcount " \
                          "-D mapreduce.job.name={1} " \
                          "-D split.by='\\t' " \
                          "-D mapreduce.job.reduces=3 " \
                          "/user/vagrant/dmode.txt " \
                          "/tmp/test"\
          .format(os.path.dirname(os.path.realpath(__file__)), _job_name)
     MapReduce.prepare_mapreduce_job(
         jar="{0}/resources/mapreduce/hadoop-mapreduce-examples.jar"
         .format(os.path.dirname(os.path.realpath(__file__))),
         main_class="wordcount",
         name=_job_name,
         executor=self.assert_generated_command(_expected_command)
     ).with_config_option("split.by", "'\\t'") \
         .with_number_of_reducers(3) \
         .with_arguments() \
         .run("/user/vagrant/dmode.txt", "/tmp/test")
Example #21
def process_data(context):
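    # NOTE: BASE_DIR and MapReduceJobException are assumed to be defined
    # elsewhere in the enclosing module; they are not part of this snippet.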
    # Configure and run MapReduce job
    _mapreduce_job_ = MapReduce.prepare_mapreduce_job(
        jar=os.path.join(os.path.dirname(__file__), 'resources/WordsCount-1.0-SNAPSHOT.jar'),
        main_class="WordsCountJob",
        name="MAPREDUCE_Counting"
    ).with_config_option("input", "{0}/data_from_import".format(BASE_DIR)) \
        .with_config_option("output", "{0}/data_to_export".format(BASE_DIR))
    _mapreduce_job_.run()
    status = _mapreduce_job_.status()
    if not status.is_succeeded():
        raise MapReduceJobException("MapReduce job failed: {}".format(
            status.failure_reason() if status.is_failed() else 'NONE'))
Example #22
 def test_generate_cmd_with_input_output_format(self):
     _job_name = "test_streaming_job_%s" % uuid.uuid4()
     _expected_command = 'hadoop jar ' \
                         '{0}/resources/mapreduce/hadoop-streaming.jar ' \
                         '-D mapreduce.job.name={1} ' \
                         '-mapper mapper.py ' \
                         '-reducer reducer.py ' \
                         '-numReduceTasks 10 ' \
                         '-input data ' \
                         '-output output.txt ' \
                         '-inputformat org.apache.hadoop.mapred.KeyValueTextInputFormat ' \
                         '-outputformat org.apache.hadoop.mapred.SequenceFileOutputFormat'\
          .format(os.path.dirname(os.path.realpath(__file__)), _job_name)
     MapReduce.prepare_streaming_job(
         jar='{0}/resources/mapreduce/hadoop-streaming.jar'
         .format(os.path.dirname(os.path.realpath(__file__))),
         name=_job_name,
         executor=self.assert_generated_command(_expected_command)
      ).process_with(
          mapper='mapper.py', reducer='reducer.py', reducer_num=10
      ).use(
         inputformat='org.apache.hadoop.mapred.KeyValueTextInputFormat',
         outputformat='org.apache.hadoop.mapred.SequenceFileOutputFormat'
     ).take('data').save('output.txt').run()
Example #23
 def test_generate_cmd_with_input_output_format(self):
     _job_name = "test_streaming_job_%s" % uuid.uuid4()
     _expected_command = 'hadoop jar ' \
                         '{0}/resources/mapreduce/hadoop-streaming.jar ' \
                         '-D mapreduce.job.name={1} ' \
                         '-mapper mapper.py ' \
                         '-reducer reducer.py ' \
                         '-numReduceTasks 10 ' \
                         '-input data ' \
                         '-output output.txt ' \
                         '-inputformat org.apache.hadoop.mapred.KeyValueTextInputFormat ' \
                         '-outputformat org.apache.hadoop.mapred.SequenceFileOutputFormat'\
          .format(os.path.dirname(os.path.realpath(__file__)), _job_name)
     MapReduce.prepare_streaming_job(
         jar='{0}/resources/mapreduce/hadoop-streaming.jar'.format(
             os.path.dirname(os.path.realpath(__file__))),
         name=_job_name,
         executor=self.assert_generated_command(_expected_command)
     ).process_with(
         mapper='mapper.py', reducer='reducer.py', reducer_num=10
      ).use(
          inputformat='org.apache.hadoop.mapred.KeyValueTextInputFormat',
          outputformat='org.apache.hadoop.mapred.SequenceFileOutputFormat'
      ).take('data').save('output.txt').run()
Example #24
 def test_generate_job_cmd_with_config_injections(self):
     _job_name = "test_streaming_job_%s" % uuid.uuid4()
     _expected_command = 'hadoop jar ' \
                         '{0}/resources/mapreduce/hadoop-streaming.jar ' \
                         '-D mapreduce.job.name={1} ' \
                         '-D value.delimiter.char=, ' \
                         '-D partition.to.process=20142010 ' \
                         '-mapper mapper.py ' \
                         '-reducer reducer.py ' \
                         '-input data ' \
                         '-output output.txt'\
          .format(os.path.dirname(os.path.realpath(__file__)), _job_name)
     MapReduce.prepare_streaming_job(
         jar='{0}/resources/mapreduce/hadoop-streaming.jar'.format(
             os.path.dirname(os.path.realpath(__file__))),
         name=_job_name,
         executor=self.assert_generated_command(_expected_command)
      ).take('data').process_with(
          mapper='mapper.py',
          reducer='reducer.py'
      ).save('output.txt').with_config_option(
          key='value.delimiter.char',
          value=','
      ).with_config_option(
          key='partition.to.process',
          value='20142010'
      ).run()
Example #25
    def test_mr_job_command_generation_with_arguments(self):
        _job_name = "test_mr_job_%s" % uuid.uuid4()

        _base_dir = HDFS(os.path.join("/tmp", _job_name))
        _base_dir.create_directory()
        try:
            jar = os.path.join(os.path.dirname(__file__), 'resources',
                               'mapreduce', 'hadoop-mapreduce-examples.jar')
            # configure job inputs
            _job_input = HDFS(os.path.join(_base_dir.path, "input"))
            _job_input.create_directory()
            LocalFS(os.path.join(
                os.path.dirname(__file__),
                'resources',
                'mapreduce', 'raw-data.txt')
            ).copy_to_hdfs(
                _job_input.path
            )

            # configure job output
            _job_output = HDFS(os.path.join(_base_dir.path, "output"))
            if not os.path.exists(jar):
                self.skipTest("'%s' not found" % jar)

            job = MapReduce.prepare_mapreduce_job(jar=jar,
                                                  main_class="wordcount",
                                                  name=_job_name) \
                .with_config_option("split.by", "'\\t'") \
                .with_number_of_reducers(3) \
                .with_arguments(_job_input.path, _job_output.path)
            _command_submission_result = job.run()
            _command_submission_result.if_failed_raise(AssertionError("Cannot run MR job"))
            _job_status = job.status()
            self.assertTrue(_job_status is not None and _job_status.is_succeeded(), "MR job Failed")
            self.assertTrue(_job_output.exists(), "Error: empty job output")
            # check counters
            self.assertEqual(6, _job_status.counter(group='File System Counters',
                                                    counter='HDFS: Number of write operations'))
            self.assertEqual(1, _job_status.counter(group='Job Counters', counter='Launched map tasks'))
            self.assertEqual(3, _job_status.counter(group='Job Counters', counter='Launched reduce tasks'))
            self.assertEqual(2168, _job_status.counter(group='File Input Format Counters', counter='Bytes Read'))
        finally:
            _base_dir.delete_directory()
Example #26
    def _template_streaming_job_(self, base_dir="/tmp", map_only_job=False):
        if not os.path.exists(HADOOP_STREAMING_JAR):
            self.skipTest("Cannot allocate %s" % HADOOP_STREAMING_JAR)
        _hdfs_base_dir = HDFS(base_dir)
        if not _hdfs_base_dir.exists():
            _hdfs_base_dir.create_directory()
        _job_input = HDFS(os.path.join(_hdfs_base_dir.path, "input"))
        _job_input.create_directory()
        _job_output = HDFS(os.path.join(_hdfs_base_dir.path, "output"))
        home = os.path.dirname(__file__)
        _mapper = os.path.join(home, 'resources', 'mapreduce', 'mapper.py')
        _reducer = os.path.join(home, 'resources', 'mapreduce', 'reducer.py')

        LocalFS(
            os.path.join(os.path.dirname(__file__), 'resources', 'mapreduce', 'raw-data.txt')
        ).copy_to_hdfs(
            _job_input.path
        )

        return MapReduce.prepare_streaming_job(
            name="test-mr-streaming-job{}".format(str(uuid.uuid4())),
            jar=HADOOP_STREAMING_JAR) \
            .take(_job_input.path) \
            .process_with(mapper=_mapper, reducer=None if map_only_job else _reducer) \
            .save(_job_output.path)
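
A minimal usage sketch for the template above (a hypothetical test method; it reuses only API calls already shown in Examples #21 and #25):

    def test_streaming_job_runs(self):
        # Build the job via the template, submit it, and verify completion.
        _job = self._template_streaming_job_(base_dir="/tmp/streaming-test")
        _submission = _job.run()
        _submission.if_failed_raise(AssertionError("Cannot run MR streaming job"))
        _status = _job.status()
        self.assertTrue(_status is not None and _status.is_succeeded(),
                        "MR streaming job failed")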