Example 1
    def test_generate_cmd_with_partitioner(self):
        _job_name = "test_streaming_job_%s" % uuid.uuid4()
        _expected_command = 'hadoop jar ' \
                            '{0}/resources/mapreduce/hadoop-streaming.jar ' \
                            '-D mapreduce.job.name={1} ' \
                            '-D map.output.key.field.separator=| ' \
                            '-D mapreduce.partition.keypartitioner.options=-k1,2 ' \
                            '-mapper mapper.py ' \
                            '-reducer reducer.py ' \
                            '-numReduceTasks 0 ' \
                            '-input data ' \
                            '-output output.txt ' \
                            '-partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner'\
            .format(os.path.dirname(os.path.realpath(__file__)),
                    _job_name)

        MapReduce.prepare_streaming_job(
            jar='{0}/resources/mapreduce/hadoop-streaming.jar'.format(
                os.path.dirname(os.path.realpath(__file__))),
            name=_job_name,
            executor=self.assert_generated_command(_expected_command)
        ).take('data').process_with(
            mapper='mapper.py', reducer='reducer.py', reducer_num=0
        ).use(
            partitioner='org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner'
        ).save('output.txt').with_config_option(
            key='map.output.key.field.separator',
            value='|'
        ).with_config_option(
            key='mapreduce.partition.keypartitioner.options',
            value='-k1,2'
        ).run()
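
Every example injects executor=self.assert_generated_command(_expected_command), but that helper is not shown in any of the snippets. A minimal sketch of how it could work, assuming the framework hands the executor the generated command (possibly pre-split into parts) instead of shelling out:

    def assert_generated_command(self, expected):
        # Illustrative fake-executor factory; the real helper is not shown
        # in these examples. Instead of running the command, the returned
        # callable re-joins whatever parts it receives and asserts equality.
        def executor(*command_parts):
            actual = ' '.join(str(part) for part in command_parts)
            self.assertEqual(expected, actual)
        return executor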
Example 2
    def test_streaming_map_only_job_generation(self):
        _config_file = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), 'resources',
            'mapreduce', 'mapreduce_streaming_job.ini')
        metastore = IniFileMetaStore(file=_config_file)
        _config = Configuration.load(metastore=metastore)
        _job_name = 'streaming_test_job_map_only'
        _expected_command = 'hadoop jar ' \
                            '{0}/resources/mapreduce/hadoop-streaming.jar ' \
                            '-D mapreduce.job.name={1} ' \
                            '-D value.delimiter.char=, ' \
                            '-D partition.to.process=20142010 ' \
                            '-mapper smapper.py ' \
                            '-reducer NONE ' \
                            '-numReduceTasks 0 ' \
                            '-input /raw/20102014 ' \
                            '-output /core/20102014' \
            .format(os.path.dirname(os.path.realpath(__file__)),
                    _job_name)
        MapReduce.prepare_streaming_job(
            jar='{0}/resources/mapreduce/hadoop-streaming.jar'.format(
                os.path.dirname(os.path.realpath(__file__))),
            config=_config,
            name=_job_name,
            executor=self.assert_generated_command(_expected_command)).run()
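
Unlike the fluent examples, this test drives the entire job definition from mapreduce_streaming_job.ini through IniFileMetaStore. The file itself is not reproduced here; judging from the expected command, its section for streaming_test_job_map_only must carry the mapper, the input/output paths, and the two -D options. A hedged sketch with purely illustrative key names (the library's actual metastore schema may differ):

    [streaming_test_job_map_only]
    ; illustrative keys only -- not the library's documented schema
    mapper = smapper.py
    reducer = NONE
    input = /raw/20102014
    output = /core/20102014
    value.delimiter.char = ,
    partition.to.process = 20142010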
Example 3
    def test_generate_job_cmd_with_config_injections(self):
        _job_name = "test_streaming_job_%s" % uuid.uuid4()
        _expected_command = 'hadoop jar ' \
                            '{0}/resources/mapreduce/hadoop-streaming.jar ' \
                            '-D mapreduce.job.name={1} ' \
                            '-D value.delimiter.char=, ' \
                            '-D partition.to.process=20142010 ' \
                            '-mapper mapper.py ' \
                            '-reducer reducer.py ' \
                            '-input data ' \
                            '-output output.txt' \
            .format(os.path.dirname(os.path.realpath(__file__)),
                    _job_name)
        MapReduce.prepare_streaming_job(
            jar='{0}/resources/mapreduce/hadoop-streaming.jar'.format(
                os.path.dirname(os.path.realpath(__file__))),
            name=_job_name,
            executor=self.assert_generated_command(_expected_command)
        ).take(
            'data'
        ).process_with(
            mapper='mapper.py',
            reducer='reducer.py'
        ).save(
            'output.txt'
        ).with_config_option(
            key='value.delimiter.char',
            value=','
        ).with_config_option(
            key='partition.to.process',
            value='20142010'
        ).run()
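
The mapper.py and reducer.py scripts referenced throughout live under resources/mapreduce and are not reproduced here. Any Hadoop Streaming executable follows the same contract: read lines from stdin, emit key<TAB>value pairs on stdout. An illustrative word-count pair (an assumption, not the actual resource files):

    # mapper.py (illustrative)
    import sys

    for line in sys.stdin:
        for word in line.split():
            # Streaming treats everything before the first tab as the key.
            print('%s\t%s' % (word, 1))

    # reducer.py (illustrative) -- input arrives grouped and sorted by key
    import sys

    current, count = None, 0
    for line in sys.stdin:
        key, value = line.rstrip('\n').split('\t', 1)
        if key != current:
            if current is not None:
                print('%s\t%s' % (current, count))
            current, count = key, 0
        count += int(value)
    if current is not None:
        print('%s\t%s' % (current, count))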
Example 4
    def test_streaming_job_with_multiple_inputs(self):
        _job_name = 'streaming_test_job_with_multiple_inputs'
        _expected_command = 'hadoop jar ' \
                            '{0}/resources/mapreduce/hadoop-streaming.jar ' \
                            '-D mapreduce.job.name={1} ' \
                            '-files dim1.txt ' \
                            '-libjars mr_001.jar,mr_002.jar ' \
                            '-mapper smapper.py ' \
                            '-reducer sreducer.py ' \
                            '-numReduceTasks 100 ' \
                            '-input /raw/20102014 ' \
                            '-input /raw/21102014 ' \
                            '-input /raw/22102014 ' \
                            '-output /core/20102014 ' \
                            '-inputformat \'org.mr.CustomInputFormat\' ' \
                            '-outputformat \'org.mr.CustomOutputFormat\' ' \
                            '-cmdenv JAVA_HOME=/java ' \
                            '-cmdenv tmp.dir=/tmp/streaming_test_job_with_multiple_inputs' \
            .format(os.path.dirname(os.path.realpath(__file__)),
                    _job_name)
        MapReduce.prepare_streaming_job(
            jar='{0}/resources/mapreduce/hadoop-streaming.jar'.format(
                os.path.dirname(os.path.realpath(__file__))),
            config=self._config,
            name=_job_name,
            executor=self.assert_generated_command(_expected_command)).run()
Example 5
    def test_streaming_job_generation(self):
        _job_name = 'streaming_test_job'
        _expected_command = 'hadoop jar ' \
                            '{0}/resources/mapreduce/hadoop-streaming.jar ' \
                            '-D mapreduce.job.name={1} ' \
                            '-mapper smapper.py ' \
                            '-reducer sreducer.py ' \
                            '-input /raw/20102014 ' \
                            '-output /core/20102014' \
            .format(os.path.dirname(os.path.realpath(__file__)),
                    _job_name)
        MapReduce.prepare_streaming_job(
            jar='{0}/resources/mapreduce/hadoop-streaming.jar'.format(
                os.path.dirname(os.path.realpath(__file__))),
            config=self._config,
            name=_job_name,
            executor=self.assert_generated_command(_expected_command)).run()
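
The same command can be reproduced without the .ini file by spelling the job out with the fluent calls the surrounding examples demonstrate; an equivalent chain (same expected command, minus the config= argument):

    MapReduce.prepare_streaming_job(
        jar='{0}/resources/mapreduce/hadoop-streaming.jar'.format(
            os.path.dirname(os.path.realpath(__file__))),
        name=_job_name,
        executor=self.assert_generated_command(_expected_command)
    ).take('/raw/20102014').process_with(
        mapper='smapper.py', reducer='sreducer.py'
    ).save('/core/20102014').run()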
Example 6
    def test_streaming_job_without_reducers(self):
        _job_name = "test_streaming_job_%s" % uuid.uuid4()
        _expected_command = 'hadoop jar ' \
                            '{0}/resources/mapreduce/hadoop-streaming.jar ' \
                            '-D mapreduce.job.name={1} ' \
                            '-mapper mapper.py ' \
                            '-reducer NONE ' \
                            '-numReduceTasks 0 ' \
                            '-input data ' \
                            '-output output.txt' \
            .format(os.path.dirname(os.path.realpath(__file__)),
                    _job_name)
        MapReduce.prepare_streaming_job(
            jar='{0}/resources/mapreduce/hadoop-streaming.jar'.format(
                os.path.dirname(os.path.realpath(__file__))),
            name=_job_name,
            executor=self.assert_generated_command(_expected_command)
        ).take("data").map_with(
            mapper="mapper.py").disable_reducers().save("output.txt").run()
Example 7
    def test_load_streaming_job_with_config_injections(self):
        _job_name = 'streaming_test_job_with_custom_configurations'
        _expected_command = 'hadoop jar ' \
                            '{0}/resources/mapreduce/hadoop-streaming.jar ' \
                            '-D mapreduce.job.name={1} ' \
                            '-D value.delimiter.char=, ' \
                            '-D partition.to.process=20142010 ' \
                            '-mapper smapper.py ' \
                            '-reducer sreducer.py ' \
                            '-input /raw/20102014 ' \
                            '-output /core/20102014' \
            .format(os.path.dirname(os.path.realpath(__file__)),
                    _job_name)
        MapReduce.prepare_streaming_job(
            jar='{0}/resources/mapreduce/hadoop-streaming.jar'.format(
                os.path.dirname(os.path.realpath(__file__))),
            config=self._config,
            name=_job_name,
            executor=self.assert_generated_command(_expected_command)).run()
Example 8
    def test_streaming_job(self):
        _job_name = "test_streaming_job_%s" % uuid.uuid4()
        _expected_command = 'hadoop jar ' \
                            '{0}/resources/mapreduce/hadoop-streaming.jar ' \
                            '-D mapreduce.job.name={1} ' \
                            '-mapper mapper.py ' \
                            '-reducer reducer.py ' \
                            '-input data ' \
                            '-output output.txt' \
            .format(os.path.dirname(os.path.realpath(__file__)),
                    _job_name)
        MapReduce.prepare_streaming_job(
            jar='{0}/resources/mapreduce/hadoop-streaming.jar'.format(
                os.path.dirname(os.path.realpath(__file__))),
            name=_job_name,
            executor=self.assert_generated_command(_expected_command)
        ).take(
            "data"
        ).process_with(
            mapper="mapper.py",
            reducer="reducer.py").save("output.txt").run()
Example 9
    def test_generate_cmd_with_input_output_format(self):
        _job_name = "test_streaming_job_%s" % uuid.uuid4()
        _expected_command = 'hadoop jar ' \
                            '{0}/resources/mapreduce/hadoop-streaming.jar ' \
                            '-D mapreduce.job.name={1} ' \
                            '-mapper mapper.py ' \
                            '-reducer reducer.py ' \
                            '-numReduceTasks 10 ' \
                            '-input data ' \
                            '-output output.txt ' \
                            '-inputformat org.apache.hadoop.mapred.KeyValueTextInputFormat ' \
                            '-outputformat org.apache.hadoop.mapred.SequenceFileOutputFormat' \
            .format(os.path.dirname(os.path.realpath(__file__)),
                    _job_name)
        MapReduce.prepare_streaming_job(
            jar='{0}/resources/mapreduce/hadoop-streaming.jar'.format(
                os.path.dirname(os.path.realpath(__file__))),
            name=_job_name,
            executor=self.assert_generated_command(_expected_command)
        ).process_with(
            mapper='mapper.py', reducer='reducer.py', reducer_num=10
        ).use(
            inputformat='org.apache.hadoop.mapred.KeyValueTextInputFormat',
            outputformat='org.apache.hadoop.mapred.SequenceFileOutputFormat'
        ).take('data').save('output.txt').run()
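
This chain calls .use(...) before .take(...), yet -inputformat and -outputformat still land at the end of the command: the builder evidently renders options in a fixed order regardless of call order (Example 1 shows the same effect with -partitioner). Assuming that holds, a rearranged chain should assert the identical command:

    MapReduce.prepare_streaming_job(
        jar='{0}/resources/mapreduce/hadoop-streaming.jar'.format(
            os.path.dirname(os.path.realpath(__file__))),
        name=_job_name,
        executor=self.assert_generated_command(_expected_command)
    ).take('data').save('output.txt').process_with(
        mapper='mapper.py', reducer='reducer.py', reducer_num=10
    ).use(
        inputformat='org.apache.hadoop.mapred.KeyValueTextInputFormat',
        outputformat='org.apache.hadoop.mapred.SequenceFileOutputFormat'
    ).run()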
Example 10
    def _template_streaming_job_(self, base_dir="/tmp", map_only_job=False):
        if not os.path.exists(HADOOP_STREAMING_JAR):
            self.skipTest("Cannot allocate %s" % HADOOP_STREAMING_JAR)
        # Prepare the input/output directories on HDFS.
        _hdfs_basedir = HDFS(base_dir)
        if not _hdfs_basedir.exists():
            _hdfs_basedir.create_directory()
        _job_input = HDFS(os.path.join(_hdfs_basedir.path, "input"))
        _job_input.create_directory()
        _job_output = HDFS(os.path.join(_hdfs_basedir.path, "output"))
        home = os.path.dirname(__file__)
        _mapper = os.path.join(home, 'resources', 'mapreduce', 'mapper.py')
        _reducer = os.path.join(home, 'resources', 'mapreduce', 'reducer.py')

        # Upload the raw test data the streaming job will consume.
        LocalFS(
            os.path.join(home, 'resources', 'mapreduce', 'raw-data.txt')
        ).copy_to_hdfs(
            _job_input.path
        )

        # Return a fully configured, not-yet-run streaming job.
        return MapReduce.prepare_streaming_job(
            name="test-mr-streaming-job{}".format(str(uuid.uuid4())),
            jar=HADOOP_STREAMING_JAR) \
            .take(_job_input.path) \
            .process_with(mapper=_mapper,
                          reducer=None if map_only_job else _reducer) \
            .save(_job_output.path)
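
A hedged usage sketch for the template (run() and HDFS.exists() appear in the examples above; delete_directory() is an assumed cleanup call, and the test name is hypothetical):

    def test_mr_streaming_job(self):
        _base_dir = "/tmp/streaming_%s" % uuid.uuid4()
        try:
            # Execute the prepared job; the output directory should appear.
            self._template_streaming_job_(base_dir=_base_dir).run()
            self.assertTrue(
                HDFS(os.path.join(_base_dir, "output")).exists())
        finally:
            HDFS(_base_dir).delete_directory()  # assumed cleanup API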