def test_get_file_splits_test(self):
    # set up input paths
    input_path = os.path.join(self.tmp_dir, 'input')
    with open(input_path, 'w') as input_file:
        input_file.write('bar\nqux\nfoo\nbar\nqux\nfoo\n')

    input_path2 = os.path.join(self.tmp_dir, 'input2')
    with open(input_path2, 'wb') as input_file:
        input_file.write(b'foo\nbar\nbar\n')

    runner = LocalMRJobRunner(conf_paths=[])

    # split into 3 files
    file_splits = runner._get_file_splits([input_path, input_path2], 3)

    # make sure we get 3 files
    self.assertEqual(len(file_splits), 3)

    # make sure all the data is preserved
    content = []
    for file_name in file_splits:
        with open(file_name, 'rb') as f:
            content.extend(f.readlines())

    self.assertEqual(sorted(content),
                     [b'bar\n', b'bar\n', b'bar\n', b'bar\n',
                      b'foo\n', b'foo\n', b'foo\n',
                      b'qux\n', b'qux\n'])

def test_get_file_splits_sorted_test(self):
    # set up input paths
    input_path = os.path.join(self.tmp_dir, 'input')
    with open(input_path, 'wb') as input_file:
        input_file.write(
            b'1\tbar\n1\tbar\n1\tbar\n2\tfoo\n2\tfoo\n2\tfoo\n3\tqux\n'
            b'3\tqux\n3\tqux\n')

    runner = LocalMRJobRunner(conf_paths=[])

    file_splits = runner._get_file_splits([input_path], 3,
                                          keep_sorted=True)

    # make sure we get 3 files
    self.assertEqual(len(file_splits), 3)

    # make sure all the data is preserved in sorted order
    content = []
    for file_name in sorted(file_splits.keys()):
        with open(file_name, 'rb') as f:
            content.extend(f.readlines())

    self.assertEqual(content,
                     [b'1\tbar\n', b'1\tbar\n', b'1\tbar\n',
                      b'2\tfoo\n', b'2\tfoo\n', b'2\tfoo\n',
                      b'3\tqux\n', b'3\tqux\n', b'3\tqux\n'])

def test_empty_no_user(self):
    self.getuser_should_fail = True
    runner = LocalMRJobRunner(conf_path=False)
    match = JOB_NAME_RE.match(runner.get_job_name())

    assert_equal(match.group(1), 'no_script')
    assert_equal(match.group(2), 'no_user')

def test_owner_and_label_kwargs(self):
    runner = LocalMRJobRunner(conf_path=False,
                              owner='ads', label='ads_chain')
    match = JOB_NAME_RE.match(runner.get_job_name())

    assert_equal(match.group(1), 'ads_chain')
    assert_equal(match.group(2), 'ads')

def test_auto_owner(self):
    os.environ['USER'] = 'mcp'
    runner = LocalMRJobRunner(conf_path=False)
    match = JOB_NAME_RE.match(runner.get_job_name())

    assert_equal(match.group(1), 'no_script')
    assert_equal(match.group(2), 'mcp')

def _test_spark_executor_memory(self, conf_value, megs):
    runner = LocalMRJobRunner(
        jobconf={'spark.executor.memory': conf_value})

    self.assertEqual(
        runner._spark_master(),
        'local-cluster[%d,1,%d]' % (cpu_count(), megs))

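# Illustrative concrete callers for the helper above. This is a sketch,
# not from the original: the conf strings and expected meg counts assume
# Spark's usual memory-string syntax ('512m' = 512 MiB, '2g' = 2048 MiB).
def test_spark_executor_memory_in_megs(self):
    self._test_spark_executor_memory('512m', 512)

def test_spark_executor_memory_in_gigs(self):
    self._test_spark_executor_memory('2g', 2048)
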
def test_empty_jobconf_values(self):
    # value of None means to omit that jobconf
    jobconf = {'foo': '', 'bar': None}
    runner = LocalMRJobRunner(conf_paths=[], jobconf=jobconf)

    self.assertEqual(runner._hadoop_conf_args({}, 0, 1),
                     ['-D', 'foo='])

def test_stream_output(self):
    a_dir_path = os.path.join(self.tmp_dir, 'a')
    b_dir_path = os.path.join(self.tmp_dir, 'b')
    l_dir_path = os.path.join(self.tmp_dir, '_logs')
    os.mkdir(a_dir_path)
    os.mkdir(b_dir_path)
    os.mkdir(l_dir_path)

    a_file_path = os.path.join(a_dir_path, 'part-00000')
    b_file_path = os.path.join(b_dir_path, 'part-00001')
    c_file_path = os.path.join(self.tmp_dir, 'part-00002')
    x_file_path = os.path.join(l_dir_path, 'log.xml')
    y_file_path = os.path.join(self.tmp_dir, '_SUCCESS')

    with open(a_file_path, 'w') as f:
        f.write('A')
    with open(b_file_path, 'w') as f:
        f.write('B')
    with open(c_file_path, 'w') as f:
        f.write('C')
    with open(x_file_path, 'w') as f:
        f.write('<XML XML XML/>')
    with open(y_file_path, 'w') as f:
        f.write('I win')

    runner = LocalMRJobRunner()
    runner._output_dir = self.tmp_dir

    assert_equal(sorted(runner.stream_output()), ['A', 'B', 'C'])

def test_get_file_splits_sorted_test(self):
    # set up input paths
    input_path = os.path.join(self.tmp_dir, 'input')
    with open(input_path, 'w') as input_file:
        input_file.write(
            '1\tbar\n1\tbar\n1\tbar\n2\tfoo\n2\tfoo\n2\tfoo\n3\tqux\n'
            '3\tqux\n3\tqux\n')

    runner = LocalMRJobRunner(conf_paths=[])

    file_splits = runner._get_file_splits([input_path], 3,
                                          keep_sorted=True)

    # make sure we get 3 files
    self.assertEqual(len(file_splits), 3)

    # make sure all the data is preserved in sorted order
    content = []
    for file_name in sorted(file_splits.keys()):
        with open(file_name, 'r') as f:
            content.extend(f.readlines())

    self.assertEqual(content,
                     ['1\tbar\n', '1\tbar\n', '1\tbar\n',
                      '2\tfoo\n', '2\tfoo\n', '2\tfoo\n',
                      '3\tqux\n', '3\tqux\n', '3\tqux\n'])

def test_get_file_splits_test(self):
    # set up input paths
    input_path = os.path.join(self.tmp_dir, 'input')
    with open(input_path, 'w') as input_file:
        input_file.write('bar\nqux\nfoo\nbar\nqux\nfoo\n')

    input_path2 = os.path.join(self.tmp_dir, 'input2')
    with open(input_path2, 'w') as input_file:
        input_file.write('foo\nbar\nbar\n')

    runner = LocalMRJobRunner(conf_paths=[])

    # split into 3 files
    file_splits = runner._get_file_splits([input_path, input_path2], 3)

    # make sure we get 3 files
    self.assertEqual(len(file_splits), 3)

    # make sure all the data is preserved
    content = []
    for file_name in file_splits:
        with open(file_name) as f:
            content.extend(f.readlines())

    self.assertEqual(sorted(content),
                     ['bar\n', 'bar\n', 'bar\n', 'bar\n',
                      'foo\n', 'foo\n', 'foo\n',
                      'qux\n', 'qux\n'])

def test_jobconf_job_name_custom(self):
    jobconf = {'BAX': 'Arnold', 'mapred.job.name': 'Foo'}
    runner = LocalMRJobRunner(conf_paths=[], jobconf=jobconf,
                              hadoop_version='0.18')

    self.assertEqual(runner._hadoop_conf_args({}, 0, 1),
                     ['-jobconf', 'BAX=Arnold',
                      '-jobconf', 'mapred.job.name=Foo'])

def test_partitioner(self):
    partitioner = 'org.apache.hadoop.mapreduce.Partitioner'

    runner = LocalMRJobRunner(conf_paths=[], partitioner=partitioner)
    self.assertEqual(runner._hadoop_conf_args({}, 0, 1),
                     ['-D', 'mapred.job.name=None > None',
                      '-partitioner', partitioner])

def test_command_streaming_step_without_mr_job_script(self):
    # you don't need a script to run commands
    steps = MRCmdJob(['--mapper-cmd', 'cat'])._steps_desc()

    runner = LocalMRJobRunner(steps=steps, stdin=BytesIO(b'dog\n'))
    runner.run()
    runner.cleanup()

def test_cmdenv(self):
    cmdenv = {'FOO': 'bar', 'BAZ': 'qux', 'BAX': 'Arnold'}
    runner = LocalMRJobRunner(conf_paths=[], cmdenv=cmdenv)

    self.assertEqual(runner._hadoop_conf_args({}, 0, 1),
                     ['-cmdenv', 'BAX=Arnold',
                      '-cmdenv', 'BAZ=qux',
                      '-cmdenv', 'FOO=bar'])

def test_cmdenv(self):
    cmdenv = {'FOO': 'bar', 'BAZ': 'qux', 'BAX': 'Arnold'}
    runner = LocalMRJobRunner(conf_paths=[], cmdenv=cmdenv)

    self.assertEqual(runner._hadoop_conf_args(0, 1),
                     ['-cmdenv', 'BAX=Arnold',
                      '-cmdenv', 'BAZ=qux',
                      '-cmdenv', 'FOO=bar'])

def test_hadoop_output_format(self):
    format = 'org.apache.hadoop.mapred.SequenceFileOutputFormat'

    runner = LocalMRJobRunner(conf_paths=[], hadoop_output_format=format)
    self.assertEqual(runner._hadoop_conf_args({}, 0, 1),
                     ['-outputformat', format])
    # test multi-step job
    self.assertEqual(runner._hadoop_conf_args({}, 0, 2), [])
    self.assertEqual(runner._hadoop_conf_args({}, 1, 2),
                     ['-outputformat', format])

def test_jobconf_from_step(self):
    jobconf = {'FOO': 'bar', 'BAZ': 'qux'}
    runner = LocalMRJobRunner(conf_paths=[], jobconf=jobconf)
    step = {'jobconf': {'BAZ': 'quux', 'BAX': 'Arnold'}}

    self.assertEqual(runner._hadoop_conf_args(step, 0, 1),
                     ['-D', 'BAX=Arnold',
                      '-D', 'BAZ=quux',
                      '-D', 'FOO=bar'])

def test_environment_variables_018(self):
    runner = LocalMRJobRunner(hadoop_version='0.18', conf_paths=[])
    # clean up after we're done. On windows, job names are only to
    # the millisecond, so these two tests end up trying to create
    # the same temp dir
    with runner as runner:
        runner._setup_working_dir()
        self.assertIn(
            'mapred_cache_localArchives',
            runner._subprocess_env('M', 0, 0).keys())

def test_environment_variables_018(self):
    runner = LocalMRJobRunner(hadoop_version='0.18', conf_paths=[])
    # clean up after we're done. On windows, job names are only to
    # the millisecond, so these two tests end up trying to create
    # the same temp dir
    with runner as runner:
        runner._setup_working_dir()
        self.assertIn(
            'mapred_cache_localArchives',
            runner._subprocess_env('mapper', 0, 0).keys())

def test_hadoop_output_format(self):
    format = 'org.apache.hadoop.mapred.SequenceFileOutputFormat'

    runner = LocalMRJobRunner(conf_path=False, hadoop_output_format=format)
    assert_equal(runner._hadoop_conf_args(0, 1),
                 ['-outputformat', format])
    # test multi-step job
    assert_equal(runner._hadoop_conf_args(0, 2), [])
    assert_equal(runner._hadoop_conf_args(1, 2),
                 ['-outputformat', format])

def test_configuration_translation(self):
    jobconf = {'mapred.jobtracker.maxtasks.per.job': 1}
    with no_handlers_for_logger('mrjob.compat'):
        runner = LocalMRJobRunner(conf_paths=[], jobconf=jobconf,
                                  hadoop_version='0.21')
        self.assertEqual(
            runner._hadoop_conf_args({}, 0, 1),
            ['-D', 'mapred.jobtracker.maxtasks.per.job=1',
             '-D', 'mapreduce.jobtracker.maxtasks.perjob=1'])

def test_hadoop_input_format(self):
    format = 'org.apache.hadoop.mapred.SequenceFileInputFormat'

    runner = LocalMRJobRunner(conf_paths=[], hadoop_input_format=format)
    self.assertEqual(runner._hadoop_conf_args(0, 1),
                     ['-inputformat', format])
    # test multi-step job
    self.assertEqual(runner._hadoop_conf_args(0, 2),
                     ['-inputformat', format])
    self.assertEqual(runner._hadoop_conf_args(1, 2), [])

def test_job_name_prefix_is_now_label(self):
    old_way = LocalMRJobRunner(conf_path=False,
                               job_name_prefix='ads_chain')
    old_opts = old_way.get_opts()

    new_way = LocalMRJobRunner(conf_path=False, label='ads_chain')
    new_opts = new_way.get_opts()

    assert_equal(old_opts, new_opts)
    assert_equal(old_opts['label'], 'ads_chain')
    assert_not_in('job_name_prefix', old_opts)

def test_jobconf_from_step(self):
    jobconf = {'FOO': 'bar', 'BAZ': 'qux'}
    # Hack in steps rather than creating a new MRJob subclass
    runner = LocalMRJobRunner(jobconf=jobconf)
    runner._steps = [{'jobconf': {'BAZ': 'quux', 'BAX': 'Arnold'}}]

    self.assertEqual(runner._hadoop_args_for_step(0),
                     ['-D', 'BAX=Arnold',
                      '-D', 'BAZ=quux',
                      '-D', 'FOO=bar'])

def test_hadoop_output_format(self):
    format = 'org.apache.hadoop.mapred.SequenceFileOutputFormat'

    runner = LocalMRJobRunner(conf_paths=[], hadoop_output_format=format)
    self.assertEqual(runner._hadoop_conf_args({}, 0, 1),
                     ['-D', 'mapred.job.name=None > None',
                      '-outputformat', format])
    # test multi-step job
    self.assertEqual(runner._hadoop_conf_args({}, 0, 2),
                     ['-D', 'mapred.job.name=None > None (step 1 of 2)'])
    self.assertEqual(runner._hadoop_conf_args({}, 1, 2),
                     ['-D', 'mapred.job.name=None > None (step 2 of 2)',
                      '-outputformat', format])

def test_hadoop_extra_args_comes_first(self):
    runner = LocalMRJobRunner(
        cmdenv={'FOO': 'bar'},
        conf_paths=[],
        hadoop_extra_args=['-libjar', 'qux.jar'],
        hadoop_input_format='FooInputFormat',
        hadoop_output_format='BarOutputFormat',
        jobconf={'baz': 'quz'},
        partitioner='java.lang.Object',
    )
    # hadoop_extra_args should come first
    conf_args = runner._hadoop_conf_args({}, 0, 1)
    self.assertEqual(conf_args[:2], ['-libjar', 'qux.jar'])
    self.assertEqual(len(conf_args), 14)

def test_hadoop_extra_args_comes_first(self):
    runner = LocalMRJobRunner(
        cmdenv={'FOO': 'bar'},
        conf_paths=[],
        hadoop_extra_args=['-libjar', 'qux.jar'],
        hadoop_input_format='FooInputFormat',
        hadoop_output_format='BarOutputFormat',
        jobconf={'baz': 'quz'},
        partitioner='java.lang.Object',
    )
    # hadoop_extra_args should come first
    conf_args = runner._hadoop_conf_args({}, 0, 1)
    self.assertEqual(conf_args[:2], ['-libjar', 'qux.jar'])
    self.assertEqual(len(conf_args), 12)

class TestIronPythonEnvironment(unittest.TestCase):

    def setUp(self):
        self.runner = LocalMRJobRunner(conf_paths=[])
        self.runner._setup_working_dir()

    def test_env_ironpython(self):
        with patch.object(local, 'is_ironpython', True):
            environment = self.runner._subprocess_env('M', 0, 0)
            self.assertIn('IRONPYTHONPATH', environment)

    def test_env_no_ironpython(self):
        with patch.object(local, 'is_ironpython', False):
            environment = self.runner._subprocess_env('M', 0, 0)
            self.assertNotIn('IRONPYTHONPATH', environment)

def test_jobconf(self):
    jobconf = {'FOO': 'bar', 'BAZ': 'qux', 'BAX': 'Arnold'}
    runner = LocalMRJobRunner(conf_paths=[], jobconf=jobconf)
    self.assertEqual(runner._hadoop_conf_args(0, 1),
                     ['-D', 'BAX=Arnold',
                      '-D', 'BAZ=qux',
                      '-D', 'FOO=bar'])

    runner = LocalMRJobRunner(conf_paths=[], jobconf=jobconf,
                              hadoop_version='0.18')
    self.assertEqual(runner._hadoop_conf_args(0, 1),
                     ['-jobconf', 'BAX=Arnold',
                      '-jobconf', 'BAZ=qux',
                      '-jobconf', 'FOO=bar'])

def make_runner(self):
    """Make a runner based on command-line arguments, so we can
    launch this job on EMR, on Hadoop, or locally.

    :rtype: :py:class:`mrjob.runner.MRJobRunner`
    """
    if self.options.runner == 'emr':
        # avoid requiring dependencies (such as boto3) for other runners
        from mrjob.emr import EMRJobRunner
        return EMRJobRunner(**self.emr_job_runner_kwargs())

    elif self.options.runner == 'dataproc':
        from mrjob.dataproc import DataprocJobRunner
        return DataprocJobRunner(**self.dataproc_job_runner_kwargs())

    elif self.options.runner == 'hadoop':
        from mrjob.hadoop import HadoopJobRunner
        return HadoopJobRunner(**self.hadoop_job_runner_kwargs())

    elif self.options.runner == 'inline':
        raise ValueError("inline is not supported in the multi-lingual"
                         " launcher.")

    else:
        # run locally by default
        from mrjob.local import LocalMRJobRunner
        return LocalMRJobRunner(**self.local_job_runner_kwargs())

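# A minimal sketch of how make_runner() is typically driven, following
# mrjob's documented runner pattern. MRWordCount and the input path are
# illustrative assumptions, not part of the original:
#
#   mr_job = MRWordCount(args=['-r', 'local', 'input.txt'])
#   with mr_job.make_runner() as runner:
#       runner.run()
#       for line in runner.stream_output():
#           key, value = mr_job.parse_output_line(line)
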
def test_extra_kwargs_passed_in_directly_okay(self):
    with logger_disabled('mrjob.runner'):
        with LocalMRJobRunner(conf_path=False,
                              base_tmp_dir='/var/tmp',
                              foo='bar') as runner:
            self.assertEqual(runner._opts['base_tmp_dir'], '/var/tmp')
            self.assertNotIn('bar', runner._opts)

def make_runner(self):
    """Make a runner based on command-line arguments, so we can
    launch this job on EMR, on Hadoop, or locally.

    :rtype: :py:class:`mrjob.runner.MRJobRunner`
    """
    # have to import here so that we can still run the MRJob
    # without importing boto
    from mrjob.emr import EMRJobRunner
    from mrjob.hadoop import HadoopJobRunner
    from mrjob.local import LocalMRJobRunner

    if self.options.runner == 'emr':
        return EMRJobRunner(**self.emr_job_runner_kwargs())

    elif self.options.runner == 'hadoop':
        return HadoopJobRunner(**self.hadoop_job_runner_kwargs())

    elif self.options.runner == 'inline':
        raise ValueError("inline is not supported in the multi-lingual"
                         " launcher.")

    else:
        # run locally by default
        return LocalMRJobRunner(**self.local_job_runner_kwargs())

def test_cleanup_after_with_statement(self):
    local_tmp_dir = None

    with LocalMRJobRunner() as runner:
        local_tmp_dir = runner._get_local_tmp_dir()
        assert os.path.exists(local_tmp_dir)

    assert not os.path.exists(local_tmp_dir)

def _test_cleanup_after_with_statement(self, mode, should_exist):
    with LocalMRJobRunner(cleanup=mode) as runner:
        self.local_tmp_dir = runner._get_local_tmp_dir()
        assert os.path.exists(self.local_tmp_dir)

    assert_equal(os.path.exists(self.local_tmp_dir), should_exist)

    if not should_exist:
        self.local_tmp_dir = None

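# Plausible concrete cases for the helper above; a sketch, not from the
# original. The 'ALL' and 'NONE' cleanup modes are assumptions based on
# mrjob's documented cleanup choices: with 'ALL' the tmp dir should be
# gone once the with block exits, with 'NONE' it should remain.
def test_cleanup_all(self):
    self._test_cleanup_after_with_statement('ALL', False)

def test_cleanup_none(self):
    self._test_cleanup_after_with_statement('NONE', True)
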
def gz_test(self, dir_path_name):
    contents_gz = [b'bar\n', b'qux\n', b'foo\n', b'bar\n', b'qux\n',
                   b'foo\n']
    contents_normal = [b'foo\n', b'bar\n', b'bar\n']
    all_contents_sorted = sorted(contents_gz + contents_normal)

    input_gz_path = join(dir_path_name, 'input.gz')
    input_gz = gzip.GzipFile(input_gz_path, 'wb')
    input_gz.write(b''.join(contents_gz))
    input_gz.close()

    input_path2 = join(dir_path_name, 'input2')
    with open(input_path2, 'wb') as input_file:
        input_file.write(b''.join(contents_normal))

    runner = LocalMRJobRunner(conf_paths=[])

    # split into 3 files
    file_splits = runner._get_file_splits([input_gz_path, input_path2], 3)

    # Make sure that input.gz occurs in a single split that starts at
    # its beginning and ends at its end
    for split_info in file_splits.values():
        if split_info['orig_name'] == input_gz_path:
            self.assertEqual(split_info['start'], 0)
            self.assertEqual(split_info['length'],
                             os.stat(input_gz_path)[stat.ST_SIZE])

    # make sure we get 3 files
    self.assertEqual(len(file_splits), 3)

    # make sure all the data is preserved
    content = []
    for file_name in file_splits:
        with open(file_name, 'rb') as f:
            lines = list(to_lines(decompress(f, file_name)))
            # make sure the input_gz split got its entire contents
            if file_name == input_gz_path:
                self.assertEqual(lines, contents_gz)

        content.extend(lines)

    self.assertEqual(sorted(content), all_contents_sorted)

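# A hypothetical caller for the gz_test() helper above, exercising it
# against the test case's temp dir (assumes self.tmp_dir is set up, as in
# the other tests here):
def test_file_splits_with_gz_input(self):
    self.gz_test(self.tmp_dir)
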
def test_create_mrjob_tar_gz(self):
    with LocalMRJobRunner(conf_path=False) as runner:
        mrjob_tar_gz_path = runner._create_mrjob_tar_gz()
        mrjob_tar_gz = tarfile.open(mrjob_tar_gz_path)
        contents = mrjob_tar_gz.getnames()

        for path in contents:
            assert_equal(path[:6], 'mrjob/')

        assert_in('mrjob/job.py', contents)

def test_cat_uncompressed(self):
    input_path = os.path.join(self.tmp_dir, 'input')
    with open(input_path, 'w') as input_file:
        input_file.write('bar\nfoo\n')

    with LocalMRJobRunner() as runner:
        output = []
        for line in runner.cat(input_path):
            output.append(line)

    assert_equal(output, ['bar\n', 'foo\n'])

def test_cleanup_deprecated(self):
    stderr = StringIO()
    with no_handlers_for_logger():
        log_to_stream('mrjob', stderr)

        with LocalMRJobRunner(cleanup=CLEANUP_DEFAULT) as runner:
            self.local_tmp_dir = runner._get_local_tmp_dir()
            assert os.path.exists(self.local_tmp_dir)

        assert_equal(os.path.exists(self.local_tmp_dir), False)
        self.local_tmp_dir = None

        assert_in('deprecated', stderr.getvalue())

def test_cat_compressed(self):
    input_gz_path = os.path.join(self.tmp_dir, 'input.gz')
    input_gz = gzip.GzipFile(input_gz_path, 'w')
    input_gz.write('foo\nbar\n')
    input_gz.close()

    with LocalMRJobRunner() as runner:
        output = []
        for line in runner.cat(input_gz_path):
            output.append(line)

    assert_equal(output, ['foo\n', 'bar\n'])

    input_bz2_path = os.path.join(self.tmp_dir, 'input.bz2')
    input_bz2 = bz2.BZ2File(input_bz2_path, 'w')
    input_bz2.write('bar\nbar\nfoo\n')
    input_bz2.close()

    with LocalMRJobRunner() as runner:
        output = []
        for line in runner.cat(input_bz2_path):
            output.append(line)

    assert_equal(output, ['bar\n', 'bar\n', 'foo\n'])

def test_job_name_prefix_is_now_label(self):
    with logger_disabled('mrjob.runner'):
        old_way = LocalMRJobRunner(
            conf_path=False, job_name_prefix='ads_chain')
        old_opts = old_way.get_opts()

        new_way = LocalMRJobRunner(conf_path=False, label='ads_chain')
        new_opts = new_way.get_opts()

        assert_equal(old_opts, new_opts)
        assert_equal(old_opts['label'], 'ads_chain')
        assert_not_in('job_name_prefix', old_opts)

def test_jobconf(self):
    jobconf = {'FOO': 'bar', 'BAZ': 'qux', 'BAX': 'Arnold'}
    runner = LocalMRJobRunner(conf_paths=[], jobconf=jobconf)
    self.assertEqual(runner._hadoop_conf_args({}, 0, 1),
                     ['-D', 'BAX=Arnold',
                      '-D', 'BAZ=qux',
                      '-D', 'FOO=bar'])

    runner = LocalMRJobRunner(conf_paths=[], jobconf=jobconf,
                              hadoop_version='0.18')
    self.assertEqual(runner._hadoop_conf_args({}, 0, 1),
                     ['-jobconf', 'BAX=Arnold',
                      '-jobconf', 'BAZ=qux',
                      '-jobconf', 'FOO=bar'])

def test_partitioner(self):
    partitioner = 'org.apache.hadoop.mapreduce.Partitioner'

    runner = LocalMRJobRunner(conf_paths=[], partitioner=partitioner)
    self.assertEqual(runner._hadoop_conf_args({}, 0, 1),
                     ['-partitioner', partitioner])