def test_mapper_init(self):
    """--emulate-map-input-file should expose the input path as a
    file:// URI in mapreduce.map.input.file."""
    input_path = self.makefile('two_lines', b'line\nother line\n')

    job = MRTestJobConf(
        ['-r', 'spark', '--emulate-map-input-file', input_path])

    with job.make_runner() as runner:
        runner.run()
        jobconf_to_value = dict(job.parse_output(runner.cat_output()))

    self.assertEqual(
        jobconf_to_value['mapreduce.map.input.file'],
        'file://' + input_path)
def test_empty_file(self):
    """An empty input file shouldn't crash --emulate-map-input-file.

    Bug fix: *no_lines_path* was created but never passed to the job,
    so the empty file this test is named for was never actually read.
    It's now included in the job's input paths.
    """
    two_lines_path = self.makefile('two_lines', b'line\nother line\n')
    no_lines_path = self.makefile('no_lines', b'')

    job = MRTestJobConf(
        ['-r', 'spark', '--emulate-map-input-file',
         two_lines_path, no_lines_path])

    with job.make_runner() as runner:
        runner.run()
        paths = [
            path for jobconf, path in job.parse_output(runner.cat_output())
            if jobconf == 'mapreduce.map.input.file'
        ]

    # ideally, no_lines_path would appear too, but what we care
    # about is that we don't get a crash from trying to read
    # the "first" line of the file
    self.assertEqual(paths, ['file://' + two_lines_path])
def test_jobconf_simulated_by_runner(self):
    """Verify every jobconf variable the runner simulates for a task."""
    # gzip the input so there's only one split (hence one task)
    gz_path = os.path.join(self.tmp_dir, 'input.gz')
    with gzip.GzipFile(gz_path, 'wb') as gz_file:
        gz_file.write(b'foo\n')
    gz_size = os.stat(gz_path)[stat.ST_SIZE]

    upload_path = os.path.join(self.tmp_dir, 'upload')
    with open(upload_path, 'wb') as payload_file:
        payload_file.write(b'PAYLOAD')

    # use --no-bootstrap-mrjob so we don't have to worry about
    # mrjob.tar.gz and the setup wrapper script
    self.add_mrjob_to_pythonpath()

    mr_job = MRTestJobConf(
        ['-r', self.RUNNER,
         '--no-bootstrap-mrjob',
         '-D=user.defined=something',
         '--files', upload_path,
         gz_path])
    mr_job.sandbox()

    observed = {}

    # between the single line of input and setting mapred.map.tasks to 1,
    # we should be restricted to only one task, which will give more
    # predictable results
    with mr_job.make_runner() as runner:
        script_path = runner._script_path
        runner.run()
        observed.update(dict(mr_job.parse_output(runner.cat_output())))

    working_dir = observed['mapreduce.job.local.dir']
    self.assertEqual(
        working_dir,
        os.path.join(runner._get_local_tmp_dir(),
                     'step', '000', 'mapper', '00000', 'wd'))

    self.assertEqual(observed['mapreduce.job.cache.archives'], '')

    # script + uploaded file, plus whatever the runner adds on its own
    expected_cache_files = (
        [script_path + '#mr_test_jobconf.py', upload_path + '#upload'] +
        ['%s#%s' % (path, name)
         for path, name in self._extra_expected_local_files(runner)])
    self.assertEqual(
        sorted(observed['mapreduce.job.cache.files'].split(',')),
        sorted(expected_cache_files))

    self.assertEqual(observed['mapreduce.job.cache.local.archives'], '')

    expected_local_files = (
        [os.path.join(working_dir, 'mr_test_jobconf.py'),
         os.path.join(working_dir, 'upload')] +
        [os.path.join(working_dir, name)
         for path, name in self._extra_expected_local_files(runner)])
    self.assertEqual(
        sorted(observed['mapreduce.job.cache.local.files'].split(',')),
        sorted(expected_local_files))

    self.assertEqual(observed['mapreduce.job.id'], runner._job_key)
    self.assertEqual(observed['mapreduce.map.input.file'], gz_path)
    self.assertEqual(observed['mapreduce.map.input.length'],
                     str(gz_size))
    self.assertEqual(observed['mapreduce.map.input.start'], '0')
    self.assertEqual(observed['mapreduce.task.attempt.id'],
                     'attempt_%s_mapper_00000_0' % runner._job_key)
    self.assertEqual(observed['mapreduce.task.id'],
                     'task_%s_mapper_00000' % runner._job_key)
    self.assertEqual(observed['mapreduce.task.ismap'], 'true')
    self.assertEqual(observed['mapreduce.task.output.dir'],
                     runner._output_dir)
    self.assertEqual(observed['mapreduce.task.partition'], '0')
    self.assertEqual(observed['user.defined'], 'something')
def test_jobconf_simulated_by_runner(self):
    """Verify the runner-simulated jobconf variables (legacy
    --jobconf / --file option spellings)."""
    # a .gz file can't be split, so there will be exactly one task
    input_gz_path = os.path.join(self.tmp_dir, 'input.gz')
    with gzip.GzipFile(input_gz_path, 'wb') as input_gz:
        input_gz.write(b'foo\n')
    input_gz_size = os.stat(input_gz_path)[stat.ST_SIZE]

    upload_path = os.path.join(self.tmp_dir, 'upload')
    with open(upload_path, 'wb') as upload_file:
        upload_file.write(b'PAYLOAD')

    # use --no-bootstrap-mrjob so we don't have to worry about
    # mrjob.tar.gz and the setup wrapper script
    self.add_mrjob_to_pythonpath()

    args = [
        '-r', self.RUNNER,
        '--no-bootstrap-mrjob',
        '--jobconf=user.defined=something',
        '--file', upload_path,
        input_gz_path,
    ]
    mr_job = MRTestJobConf(args)
    mr_job.sandbox()

    results = {}

    # between the single line of input and setting mapred.map.tasks to 1,
    # we should be restricted to only one task, which will give more
    # predictable results
    with mr_job.make_runner() as runner:
        script_path = runner._script_path
        runner.run()
        for key, value in mr_job.parse_output(runner.cat_output()):
            results[key] = value

    working_dir = results['mapreduce.job.local.dir']
    expected_wd = os.path.join(
        runner._get_local_tmp_dir(),
        'step', '000', 'mapper', '00000', 'wd')
    self.assertEqual(working_dir, expected_wd)

    self.assertEqual(results['mapreduce.job.cache.archives'], '')

    expected_cache_files = [
        script_path + '#mr_test_jobconf.py',
        upload_path + '#upload',
    ]
    for path, name in self._extra_expected_local_files(runner):
        expected_cache_files.append('%s#%s' % (path, name))
    self.assertEqual(
        sorted(results['mapreduce.job.cache.files'].split(',')),
        sorted(expected_cache_files))

    self.assertEqual(results['mapreduce.job.cache.local.archives'], '')

    expected_local_files = [
        os.path.join(working_dir, 'mr_test_jobconf.py'),
        os.path.join(working_dir, 'upload'),
    ]
    for path, name in self._extra_expected_local_files(runner):
        expected_local_files.append(os.path.join(working_dir, name))
    self.assertEqual(
        sorted(results['mapreduce.job.cache.local.files'].split(',')),
        sorted(expected_local_files))

    self.assertEqual(results['mapreduce.job.id'], runner._job_key)
    self.assertEqual(results['mapreduce.map.input.file'], input_gz_path)
    self.assertEqual(results['mapreduce.map.input.length'],
                     str(input_gz_size))
    self.assertEqual(results['mapreduce.map.input.start'], '0')
    self.assertEqual(results['mapreduce.task.attempt.id'],
                     'attempt_%s_mapper_00000_0' % runner._job_key)
    self.assertEqual(results['mapreduce.task.id'],
                     'task_%s_mapper_00000' % runner._job_key)
    self.assertEqual(results['mapreduce.task.ismap'], 'true')
    self.assertEqual(results['mapreduce.task.output.dir'],
                     runner._output_dir)
    self.assertEqual(results['mapreduce.task.partition'], '0')
    self.assertEqual(results['user.defined'], 'something')