def test_mr_job_command_generation(self):
    _job_name = "test_mr_job_%s" % uuid.uuid4()
    _expected_command = "hadoop jar {0}/resources/mapreduce/hadoop-mapreduce-examples.jar " \
                        "demo.mr.Driver -D mapreduce.job.name={1}" \
        .format(os.path.dirname(os.path.realpath(__file__)), _job_name)
    MapReduce.prepare_mapreduce_job(
        jar="{0}/resources/mapreduce/hadoop-mapreduce-examples.jar".format(
            os.path.dirname(os.path.realpath(__file__))),
        main_class="demo.mr.Driver",
        name=_job_name,
        executor=self.assert_generated_command(_expected_command)
    ).run()

def test_mr_job_command_generation(self):
    _expected_command = 'hadoop jar {0}/resources/mapreduce/hadoop-mapreduce-examples.jar ' \
                        'test.mr.Driver ' \
                        '-D mapreduce.job.name=simple_mr_job ' \
                        '-D value.delimiter.char=, ' \
                        '-D partition.to.process=20142010 ' \
                        '/input/dir ' \
                        '/output/dir'.format(os.path.dirname(os.path.realpath(__file__)))
    MapReduce.prepare_mapreduce_job(
        jar="{0}/resources/mapreduce/hadoop-mapreduce-examples.jar".format(
            os.path.dirname(os.path.realpath(__file__))),
        main_class="test.mr.Driver",
        config=self._config,
        name='simple_mr_job',
        executor=self.assert_generated_command(_expected_command)
    ).run()

def test_mr_job_command_generation_with_configurations(self):
    _job_name = "test_mr_job_%s" % uuid.uuid4()
    _expected_command = "hadoop jar " \
                        "{0}/resources/mapreduce/hadoop-mapreduce-examples.jar " \
                        "demo.mr.Driver " \
                        "-D mapreduce.job.name={1} " \
                        "-D job.input=/data/raw/24102014 " \
                        "-D mapreduce.job.reduces=10" \
        .format(os.path.dirname(os.path.realpath(__file__)), _job_name)
    MapReduce.prepare_mapreduce_job(
        jar="{0}/resources/mapreduce/hadoop-mapreduce-examples.jar".format(
            os.path.dirname(os.path.realpath(__file__))),
        main_class="demo.mr.Driver",
        name=_job_name,
        executor=self.assert_generated_command(_expected_command)
    ).with_config_option("job.input", "/data/raw/24102014") \
        .with_number_of_reducers(10) \
        .run()

def test_mr_job_command_generation_with_arguments(self):
    _job_name = "test_mr_job_%s" % uuid.uuid4()
    _expected_command = "hadoop jar " \
                        "{0}/resources/mapreduce/hadoop-mapreduce-examples.jar " \
                        "wordcount " \
                        "-D mapreduce.job.name={1} " \
                        "-D split.by='\\t' " \
                        "-D mapreduce.job.reduces=3 " \
                        "/user/vagrant/dmode.txt " \
                        "/tmp/test".format(os.path.dirname(os.path.realpath(__file__)), _job_name)
    MapReduce.prepare_mapreduce_job(
        jar="{0}/resources/mapreduce/hadoop-mapreduce-examples.jar".format(
            os.path.dirname(os.path.realpath(__file__))),
        main_class="wordcount",
        name=_job_name,
        executor=self.assert_generated_command(_expected_command)
    ).with_config_option("split.by", "'\\t'") \
        .with_number_of_reducers(3) \
        .with_arguments() \
        .run("/user/vagrant/dmode.txt", "/tmp/test")

def process_data(context):
    # Configure and run MapReduce job
    _mapreduce_job_ = MapReduce.prepare_mapreduce_job(
        jar=os.path.join(os.path.dirname(__file__), 'resources/WordsCount-1.0-SNAPSHOT.jar'),
        main_class="WordsCountJob",
        name="MAPREDUCE_Counting"
    ).with_config_option("input", "{0}/data_from_import".format(BASE_DIR)) \
        .with_config_option("output", "{0}/data_to_export".format(BASE_DIR))
    _mapreduce_job_.run()
    status = _mapreduce_job_.status()
    if not status.is_succeeded():
        raise MapReduceJobException("MapReduce job failed: {}".format(
            status.failure_reason() if status.is_failed() else 'NONE'))

def test_mr_job_command_generation_with_arguments(self):
    _job_name = "test_mr_job_%s" % uuid.uuid4()
    _base_dir = HDFS(os.path.join("/tmp", _job_name))
    _base_dir.create_directory()
    try:
        jar = os.path.join(os.path.dirname(__file__), 'resources', 'mapreduce',
                           'hadoop-mapreduce-examples.jar')
        if not os.path.exists(jar):
            self.skipTest("'%s' not found" % jar)

        # configure job inputs
        _job_input = HDFS(os.path.join(_base_dir.path, "input"))
        _job_input.create_directory()
        LocalFS(
            os.path.join(os.path.dirname(__file__), 'resources', 'mapreduce', 'raw-data.txt')
        ).copy_to_hdfs(_job_input.path)

        # configure job output
        _job_output = HDFS(os.path.join(_base_dir.path, "output"))

        job = MapReduce.prepare_mapreduce_job(jar=jar,
                                              main_class="wordcount",
                                              name=_job_name) \
            .with_config_option("split.by", "'\\t'") \
            .with_number_of_reducers(3) \
            .with_arguments(_job_input.path, _job_output.path)
        _command_submission_result = job.run()
        _command_submission_result.if_failed_raise(AssertionError("Cannot run MR job"))
        _job_status = job.status()
        self.assertTrue(_job_status is not None and _job_status.is_succeeded(), "MR job Failed")
        self.assertTrue(_job_output.exists(), "Error: empty job output")

        # check counters
        self.assertEqual(6, _job_status.counter(group='File System Counters',
                                                counter='HDFS: Number of write operations'))
        self.assertEqual(1, _job_status.counter(group='Job Counters',
                                                counter='Launched map tasks'))
        self.assertEqual(3, _job_status.counter(group='Job Counters',
                                                counter='Launched reduce tasks'))
        self.assertEqual(2168, _job_status.counter(group='File Input Format Counters',
                                                   counter='Bytes Read'))
    finally:
        _base_dir.delete_directory()