def test_doesnt_actually_create_archive(self):
    """_dir_archive_path() only picks a path; it must not build the archive."""
    dir_to_archive = self.makedirs('archive')
    runner = InlineMRJobRunner()

    archive_path = runner._dir_archive_path(dir_to_archive)

    # nothing exists on disk until _create_dir_archive() is called
    self.assertFalse(os.path.exists(archive_path))
def test_auto_owner(self):
    """The job name should pick up its owner from the USER env var."""
    os.environ["USER"] = "******"
    runner = InlineMRJobRunner(conf_paths=[])

    name_match = JOB_NAME_RE.match(runner.get_job_name())

    self.assertEqual(name_match.group(1), "no_script")
    self.assertEqual(name_match.group(2), "mcp")
def test_empty_no_user(self):
    """Fall back to 'no_user' when the current user can't be determined."""
    self.getuser_should_fail = True
    runner = InlineMRJobRunner(conf_paths=[])

    name_match = JOB_NAME_RE.match(runner.get_job_name())

    self.assertEqual(name_match.group(1), "no_script")
    self.assertEqual(name_match.group(2), "no_user")
def test_owner_and_label_kwargs(self):
    """Explicit owner/label kwargs should show up in the job name."""
    runner = InlineMRJobRunner(
        conf_path=False, owner='ads', label='ads_chain')

    name_match = JOB_NAME_RE.match(runner.get_job_name())

    self.assertEqual(name_match.group(1), 'ads_chain')
    self.assertEqual(name_match.group(2), 'ads')
def test_getattr_forward(self):
    """The runner should expose its option store and forward defaults."""
    with no_handlers_for_logger():
        runner = InlineMRJobRunner(conf_path=False)

    opt_store = runner._opts
    self.assertIsInstance(opt_store, InlineRunnerOptionStore)

    defaults = runner.get_default_opts()
    self.assertEqual(defaults, opt_store.default_options())
def test_auto_owner(self):
    """With no explicit owner, the job name uses the USER env var."""
    os.environ['USER'] = '******'
    runner = InlineMRJobRunner(conf_path=False)

    name_match = JOB_NAME_RE.match(runner.get_job_name())

    self.assertEqual(name_match.group(1), 'no_script')
    self.assertEqual(name_match.group(2), 'mcp')
def test_trailing_slash(self):
    """A trailing path separator shouldn't change the archive's name."""
    dir_with_slash = self.makedirs('archive') + os.sep
    runner = InlineMRJobRunner()

    archive_path = runner._dir_archive_path(dir_with_slash)

    self.assertEqual(os.path.basename(archive_path), 'archive.tar.gz')
def test_auto_owner(self):
    """The job key should pick up its owner from the USER env var."""
    os.environ['USER'] = '******'
    runner = InlineMRJobRunner(conf_paths=[])

    key_match = _JOB_KEY_RE.match(runner.get_job_key())

    self.assertEqual(key_match.group(1), 'no_script')
    self.assertEqual(key_match.group(2), 'mcp')
def test_owner_and_label_kwargs(self):
    """Explicit owner/label kwargs should show up in the job key."""
    runner = InlineMRJobRunner(
        conf_paths=[], owner='ads', label='ads_chain')

    key_match = _JOB_KEY_RE.match(runner.get_job_key())

    self.assertEqual(key_match.group(1), 'ads_chain')
    self.assertEqual(key_match.group(2), 'ads')
def test_stream_output(self):
    """stream_output() reads part-* files and skips hidden files/dirs."""
    a_dir_path = os.path.join(self.tmp_dir, "a")
    b_dir_path = os.path.join(self.tmp_dir, "b")
    l_dir_path = os.path.join(self.tmp_dir, "_logs")
    for dir_path in (a_dir_path, b_dir_path, l_dir_path):
        os.mkdir(dir_path)

    # visible output files, plus hidden files that must be ignored
    files_to_write = [
        (os.path.join(a_dir_path, "part-00000"), "A"),
        (os.path.join(b_dir_path, "part-00001"), "B"),
        (os.path.join(self.tmp_dir, "part-00002"), "C"),
        (os.path.join(l_dir_path, "log.xml"), "<XML XML XML/>"),
        (os.path.join(self.tmp_dir, "_SUCCESS"), "I win"),
    ]
    for file_path, contents in files_to_write:
        with open(file_path, "w") as f:
            f.write(contents)

    runner = InlineMRJobRunner(conf_paths=[], output_dir=self.tmp_dir)

    self.assertEqual(sorted(runner.stream_output()), ["A", "B", "C"])
def test_empty_no_user(self):
    """The job key falls back to 'no_user' when getuser() fails."""
    self.getuser_should_fail = True
    runner = InlineMRJobRunner(conf_paths=[])

    key_match = _JOB_KEY_RE.match(runner.get_job_key())

    self.assertEqual(key_match.group(1), 'no_script')
    self.assertEqual(key_match.group(2), 'no_user')
def test_stream_output(self):
    """stream_output() yields bytes from part-* files, skipping hidden paths."""
    a_dir_path = os.path.join(self.tmp_dir, 'a')
    b_dir_path = os.path.join(self.tmp_dir, 'b')
    l_dir_path = os.path.join(self.tmp_dir, '_logs')
    for dir_path in (a_dir_path, b_dir_path, l_dir_path):
        os.mkdir(dir_path)

    # visible output files, plus hidden files that must be ignored
    files_to_write = [
        (os.path.join(a_dir_path, 'part-00000'), 'A'),
        (os.path.join(b_dir_path, 'part-00001'), 'B'),
        (os.path.join(self.tmp_dir, 'part-00002'), 'C'),
        (os.path.join(l_dir_path, 'log.xml'), '<XML XML XML/>'),
        (os.path.join(self.tmp_dir, '_SUCCESS'), 'I win'),
    ]
    for file_path, contents in files_to_write:
        with open(file_path, 'w') as f:
            f.write(contents)

    runner = InlineMRJobRunner(conf_paths=[], output_dir=self.tmp_dir)

    self.assertEqual(sorted(runner.stream_output()), [b'A', b'B', b'C'])
def test_same_dir_twice(self):
    """Asking for the same dir twice should yield the same archive path."""
    dir_to_archive = self.makedirs('archive')
    runner = InlineMRJobRunner()

    first_path = runner._dir_archive_path(dir_to_archive)
    second_path = runner._dir_archive_path(dir_to_archive)

    self.assertEqual(os.path.basename(first_path), 'archive.tar.gz')
    self.assertEqual(first_path, second_path)
class UpdateJobConfForHadoopVersionTestCase(TestCase):
    """Tests for _update_jobconf_for_hadoop_version(), which translates
    jobconf variable names to match the target Hadoop version and warns
    about mismatches."""

    # jobconf with strange mix of Hadoop 1 and Hadoop 2 variables
    JOBCONF = {
        'foo.bar': 'baz',  # unknown jobconf
        'mapred.jar': 'a.jar',  # Hadoop 1 jobconf
        'mapreduce.job.user.name': 'dave',  # Hadoop 2 jobconf
    }

    def setUp(self):
        self.runner = InlineMRJobRunner(conf_paths=[])

    def updated_and_warnings(self, jobconf, hadoop_version):
        """Run the translation on a copy of *jobconf*, returning the updated
        dict and any warnings logged to mrjob.runner (as a string)."""
        jobconf = jobconf.copy()
        with no_handlers_for_logger('mrjob.runner'):
            stderr = StringIO()
            log_to_stream('mrjob.runner', stderr)
            self.runner._update_jobconf_for_hadoop_version(
                jobconf, hadoop_version)
        return jobconf, stderr.getvalue()

    def test_no_version(self):
        # with no version to target, nothing is translated or warned about
        updated, warnings = self.updated_and_warnings(
            self.JOBCONF, None)
        self.assertEqual(updated, self.JOBCONF)
        self.assertEqual(warnings, '')

    def test_hadoop_1(self):
        # the Hadoop 2 variable gets a Hadoop 1 equivalent added
        updated, warnings = self.updated_and_warnings(
            self.JOBCONF, '1.0')
        self.assertEqual(updated,
                         combine_dicts(self.JOBCONF, {'user.name': 'dave'}))
        self.assertIn('do not match hadoop version', warnings)
        self.assertIn('mapreduce.job.user.name: user.name', warnings)

    def test_hadoop_2(self):
        # the Hadoop 1 variable gets a Hadoop 2 equivalent added
        updated, warnings = self.updated_and_warnings(
            self.JOBCONF, '2.0')
        self.assertEqual(updated,
                         combine_dicts(self.JOBCONF,
                                       {'mapreduce.job.jar': 'a.jar'}))
        self.assertIn('do not match hadoop version', warnings)
        self.assertIn('mapred.jar: mapreduce.job.jar', warnings)

    def test_dont_overwrite(self):
        # this jobconf contains two versions of the same variable
        jobconf = {'mapred.jar': 'a.jar', 'mapreduce.job.jar': 'b.jar'}
        # translation must not clobber the value that's already set
        updated, warnings = self.updated_and_warnings(jobconf, '1.0')
        self.assertEqual(updated, jobconf)
        self.assertEqual(warnings, '')
def test_mrjob_zip_compiles(self):
    """Everything in the generated mrjob zip should byte-compile cleanly."""
    runner = InlineMRJobRunner()

    with no_handlers_for_logger('mrjob.runner'):
        mrjob_zip = runner._create_mrjob_zip()

    # use a context manager so the zip's file handle is closed
    # (the original leaked it)
    with ZipFile(mrjob_zip) as zf:
        zf.extractall(self.tmp_dir)

    self.assertTrue(
        compileall.compile_dir(os.path.join(self.tmp_dir, 'mrjob'),
                               quiet=1))
def test_output_dir_not_considered_hidden(self):
    """Only files *inside* the output dir are filtered; the output dir
    itself may start with an underscore."""
    output_dir = os.path.join(self.tmp_dir, '_hidden', '_output_dir')
    self.makefile(os.path.join(output_dir, 'part-00000'), b'cats\n')

    runner = InlineMRJobRunner(conf_paths=[], output_dir=output_dir)

    self.assertEqual(sorted(to_lines(runner.stream_output())),
                     [b'cats\n'])
def test_dirs_with_same_name(self):
    """Two dirs with the same basename must get distinct archive paths."""
    foo_archive = self.makedirs(os.path.join('foo', 'archive'))
    bar_archive = self.makedirs(os.path.join('bar', 'archive'))
    runner = InlineMRJobRunner()

    foo_path = runner._dir_archive_path(foo_archive)
    bar_path = runner._dir_archive_path(bar_archive)

    self.assertEqual(os.path.basename(foo_path), 'archive.tar.gz')
    self.assertNotEqual(foo_path, bar_path)
def test_empty_dir(self):
    """Archiving an empty dir should produce an empty (but valid) tarball."""
    runner = InlineMRJobRunner()
    empty_dir = self.makedirs('empty')

    tar_gz_path = runner._dir_archive_path(empty_dir)
    self.assertEqual(os.path.basename(tar_gz_path), 'empty.tar.gz')

    runner._create_dir_archive(empty_dir)

    with tarfile.open(tar_gz_path, 'r:gz') as tar_gz:
        self.assertEqual(sorted(tar_gz.getnames()), [])
def test_archive(self):
    """The archive should contain exactly the dir's files, relative paths."""
    runner = InlineMRJobRunner()

    tar_gz_path = runner._dir_archive_path(self._to_archive)
    self.assertEqual(os.path.basename(tar_gz_path), 'archive.tar.gz')

    runner._create_dir_archive(self._to_archive)

    # use a with-block rather than try/finally, matching the other
    # archive tests and guaranteeing the tarball is closed
    with tarfile.open(tar_gz_path, 'r:gz') as tar_gz:
        self.assertEqual(sorted(tar_gz.getnames()),
                         [os.path.join('bar', 'baz'), 'foo'])
def test_deprecated_stream_output(self):
    """stream_output() still works but logs a deprecation warning."""
    self.makefile('part-00000', contents=b'1\n2')
    self.makefile('part-00001', contents=b'3\n4\n')

    runner = InlineMRJobRunner(conf_paths=[], output_dir=self.tmp_dir)
    log = self.start(patch('mrjob.runner.log'))

    # output is grouped into lines, but lines aren't joined across files
    self.assertEqual(sorted(runner.stream_output()),
                     [b'1\n', b'2', b'3\n', b'4\n'])

    # exactly one deprecation warning should have been issued
    self.assertEqual(log.warning.call_count, 1)
class UpdateJobConfForHadoopVersionTestCase(TestCase):
    """Tests for _update_jobconf_for_hadoop_version(), which translates
    jobconf variable names to match the target Hadoop version and warns
    about mismatches."""

    # jobconf with strange mix of Hadoop 1 and Hadoop 2 variables
    JOBCONF = {
        "foo.bar": "baz",  # unknown jobconf
        "mapred.jar": "a.jar",  # Hadoop 1 jobconf
        "mapreduce.job.user.name": "dave",  # Hadoop 2 jobconf
    }

    def setUp(self):
        self.runner = InlineMRJobRunner(conf_paths=[])

    def updated_and_warnings(self, jobconf, hadoop_version):
        """Run the translation on a copy of *jobconf*, returning the updated
        dict and any warnings logged to mrjob.runner (as a string)."""
        jobconf = jobconf.copy()
        with no_handlers_for_logger("mrjob.runner"):
            stderr = StringIO()
            log_to_stream("mrjob.runner", stderr)
            self.runner._update_jobconf_for_hadoop_version(jobconf,
                                                           hadoop_version)
        return jobconf, stderr.getvalue()

    def test_no_version(self):
        # with no version to target, nothing is translated or warned about
        updated, warnings = self.updated_and_warnings(self.JOBCONF, None)
        self.assertEqual(updated, self.JOBCONF)
        self.assertEqual(warnings, "")

    def test_hadoop_1(self):
        # the Hadoop 2 variable gets a Hadoop 1 equivalent added
        updated, warnings = self.updated_and_warnings(self.JOBCONF, "1.0")
        self.assertEqual(updated,
                         combine_dicts(self.JOBCONF, {"user.name": "dave"}))
        self.assertIn("do not match hadoop version", warnings)
        self.assertIn("mapreduce.job.user.name: user.name", warnings)

    def test_hadoop_2(self):
        # the Hadoop 1 variable gets a Hadoop 2 equivalent added
        updated, warnings = self.updated_and_warnings(self.JOBCONF, "2.0")
        self.assertEqual(updated,
                         combine_dicts(self.JOBCONF,
                                       {"mapreduce.job.jar": "a.jar"}))
        self.assertIn("do not match hadoop version", warnings)
        self.assertIn("mapred.jar: mapreduce.job.jar", warnings)

    def test_dont_overwrite(self):
        # this jobconf contains two versions of the same variable
        jobconf = {"mapred.jar": "a.jar", "mapreduce.job.jar": "b.jar"}
        # translation must not clobber the value that's already set
        updated, warnings = self.updated_and_warnings(jobconf, "1.0")
        self.assertEqual(updated, jobconf)
        self.assertEqual(warnings, "")
def setUp(self):
    """Create a dedicated output dir and a runner pointed at it."""
    super(TestCatOutput, self).setUp()

    self.output_dir = os.path.join(self.tmp_dir, 'job_output')
    os.mkdir(self.output_dir)

    self.runner = InlineMRJobRunner(conf_paths=[],
                                    output_dir=self.output_dir)
def test_no_script_and_no_steps(self):
    """A runner with neither a script nor steps can't run, but doesn't warn."""
    runner = InlineMRJobRunner()

    self.assertEqual(runner._script_path, None)
    self.assertEqual(runner._steps, [])

    # running with nothing to do raises rather than silently succeeding
    self.assertRaises(ValueError, runner.run)
    self.assertFalse(self.log.warning.called)
def _test_cleanup_after_with_statement(self, mode, should_exist):
    """Run a runner inside a with-block and check whether its local tmp
    dir survives the given cleanup *mode*."""
    tmp_path = None
    with InlineMRJobRunner(cleanup=mode, conf_paths=[]) as runner:
        tmp_path = runner._get_local_tmp_dir()
        self.assertTrue(os.path.exists(tmp_path))

    # leaving the with-block activates cleanup
    self.assertEqual(os.path.exists(tmp_path), should_exist)
def test_option_debug_printout(self):
    """Option names and values should appear in debug log output."""
    log = self.start(patch('mrjob.runner.log'))

    InlineMRJobRunner(owner='dave')

    # stitch all debug() calls' first positional args back together
    debug_output = ''.join(
        args[0] + '\n' for args, _ in log.debug.call_args_list)

    self.assertIn("'owner'", debug_output)
    self.assertIn("'dave'", debug_output)
def test_extra_kwargs_passed_in_directly_okay(self):
    """Unknown kwargs are tolerated; known ones still take effect."""
    runner = InlineMRJobRunner(
        foo='bar',
        local_tmp_dir='/var/tmp',
        conf_paths=[],
    )

    self.assertEqual(runner._opts['local_tmp_dir'], '/var/tmp')
    # the unknown option's value shouldn't leak into opts
    self.assertNotIn('bar', runner._opts)
def test_multiple_configs_via_runner_args(self):
    """When two configs are passed, later ones win on conflicts."""
    left_path = self.save_conf('left.conf', self.BASE_CONFIG_LEFT)
    right_path = self.save_conf('right.conf', self.BASE_CONFIG_RIGHT)

    runner = InlineMRJobRunner(conf_paths=[left_path, right_path])

    self.assertEqual(
        runner._opts['jobconf'],
        {'from_left': 'one', 'from_both': 'two', 'from_right': 'two'})
def test_create_mrjob_tar_gz(self):
    """The mrjob tarball should contain only mrjob/ files, incl. job.py."""
    with InlineMRJobRunner(conf_paths=[]) as runner:
        mrjob_tar_gz_path = runner._create_mrjob_tar_gz()

        # close the tarball's file handle (the original leaked it)
        with tarfile.open(mrjob_tar_gz_path) as mrjob_tar_gz:
            contents = mrjob_tar_gz.getnames()

        for path in contents:
            self.assertEqual(path[:6], 'mrjob/')

        self.assertIn('mrjob/job.py', contents)
def test_option_debug_printout(self):
    """Option names and values should appear in debug-level log output."""
    stderr = StringIO()
    with no_handlers_for_logger():
        log_to_stream('mrjob.runner', stderr, debug=True)
        InlineMRJobRunner(owner='dave')

    logged = stderr.getvalue()
    self.assertIn("'owner'", logged)
    self.assertIn("'dave'", logged)
def test_empty_runner_error(self):
    """A config with no inline-runner section should log a warning."""
    conf = {'runner': {'local': {'base_tmp_dir': '/tmp'}}}
    path = self.save_conf('basic', conf)

    stderr = StringIO()
    with no_handlers_for_logger():
        log_to_stream('mrjob.runner', stderr)
        InlineMRJobRunner(conf_paths=[path])

    self.assertEqual("No configs specified for inline runner\n",
                     stderr.getvalue())
def test_file_uris_only(self):
    """The runner's fs accepts local paths and file:// URIs only."""
    runner = InlineMRJobRunner()

    # sanity check: plain paths and file:// URIs work
    existing_path = self.makefile('foo')
    missing_path = join(self.tmp_dir, 'bar')

    self.assertTrue(runner.fs.exists(existing_path))
    self.assertFalse(runner.fs.exists('file://' + missing_path))

    # non-file:/// URI should raise IOError, not return False
    self.assertRaises(IOError, runner.fs.exists, 's3://walrus/fish')
def test_recurse(self):
    """A config file that includes itself should log a recursion warning."""
    conf_path = os.path.join(self.tmp_dir, 'LOL.conf')

    # a config whose only content is an include of itself
    with open(conf_path, 'w') as f:
        dump_mrjob_conf(dict(include=conf_path), f)

    stderr = StringIO()
    with no_handlers_for_logger():
        log_to_stream('mrjob.conf', stderr)
        InlineMRJobRunner(conf_path=conf_path)

    self.assertIn(
        '%s tries to recursively include %s!' % (conf_path, conf_path),
        stderr.getvalue())
def test_create_mrjob_zip(self):
    """The mrjob zip should contain only mrjob/ sources and no .pyc files."""
    with no_handlers_for_logger('mrjob.runner'):
        with InlineMRJobRunner(conf_paths=[]) as runner:
            mrjob_zip_path = runner._create_mrjob_zip()

            # close the zip's file handle (the original leaked it)
            with ZipFile(mrjob_zip_path) as mrjob_zip:
                contents = mrjob_zip.namelist()

            for path in contents:
                self.assertEqual(path[:6], 'mrjob/')

            self.assertIn('mrjob/job.py', contents)

            for filename in contents:
                self.assertFalse(filename.endswith('.pyc'),
                                 msg="%s ends with '.pyc'" % filename)
def test_no_uris(self):
    """The composite fs handles local paths but rejects remote URIs."""
    runner = InlineMRJobRunner()

    # sanity check: plain local paths work
    existing_path = self.makefile('foo')
    missing_path = os.path.join(self.tmp_dir, 'bar')

    self.assertTrue(runner.fs.exists(existing_path))
    self.assertFalse(runner.fs.exists(missing_path))

    # URI should raise IOError, not return False
    self.assertRaises(IOError, runner.fs.exists, 's3://walrus/fish')

    # and it's because we wrapped the local fs in CompositeFilesystem
    self.assertFalse(runner.fs.local.exists('s3://walrus/fish'))
def test_only_create_archive_once(self):
    """Re-archiving the same dir should be a no-op (mtime unchanged)."""
    runner = InlineMRJobRunner()
    tar_gz_path = runner._dir_archive_path(self._to_archive)

    runner._create_dir_archive(self._to_archive)
    first_mtime = os.stat(tar_gz_path).st_mtime

    # wait long enough that a rebuild would change the mtime
    sleep(1)

    runner._create_dir_archive(self._to_archive)
    second_mtime = os.stat(tar_gz_path).st_mtime

    self.assertEqual(first_mtime, second_mtime)
def test_passthrough(self):
    """fs methods accessed on the runner are forwarded, with a warning."""
    runner = InlineMRJobRunner()

    with no_handlers_for_logger('mrjob.runner'):
        stderr = StringIO()
        log_to_stream('mrjob.runner', stderr)

        self.assertEqual(runner.ls, runner.fs.ls)
        # no special rules for underscore methods
        self.assertEqual(runner._cat_file, runner.fs._cat_file)

        logged = stderr.getvalue()
        self.assertIn(
            'deprecated: call InlineMRJobRunner.fs.ls() directly',
            logged)
        self.assertIn(
            'deprecated: call InlineMRJobRunner.fs._cat_file() directly',
            logged)
def test_conf_contain_only_include_file(self):
    """If a config file only include other configuration files
    no warnings are thrown as long as the included files are not
    empty.
    """
    # dummy configuration for include file 1
    include_file_1 = self.save_conf('include_file_1', {
        'runners': {
            'inline': {
                'local_tmp_dir': "include_file1_local_tmp_dir"
            }
        }
    })

    # dummy configuration for include file 2
    include_file_2 = self.save_conf('include_file_2', {
        'runners': {
            'inline': {
                'local_tmp_dir': "include_file2_local_tmp_dir"
            }
        }
    })

    # test configuration: nothing but the two includes
    path = self.save_conf('twoincludefiles', {
        'include': [include_file_1, include_file_2]
    })

    stderr = StringIO()
    with no_handlers_for_logger():
        log_to_stream('mrjob.conf', stderr)
        InlineMRJobRunner(conf_paths=[path])

    self.assertEqual("", stderr.getvalue())
def make_runner(self):
    """Make a runner based on command-line arguments, so we can launch
    this job on EMR, on Hadoop, or locally.

    :rtype: :py:class:`mrjob.runner.MRJobRunner`
    """
    # these switches belong to the job subprocess; seeing one here means
    # make_runner() was called from __main__, which doesn't work
    for bad_word in ('--steps', '--mapper', '--reducer', '--combiner',
                     '--step-num'):
        if bad_word in sys.argv:
            raise UsageError("make_runner() was called with %s. This"
                             " probably means you tried to use it from"
                             " __main__, which doesn't work." % bad_word)

    # support inline runner when running from the MRJob itself
    from mrjob.inline import InlineMRJobRunner
    if self.options.runner == 'inline':
        return InlineMRJobRunner(
            mrjob_cls=self.__class__, **self.inline_job_runner_kwargs())

    return super(MRJob, self).make_runner()
def test_conf_contain_only_include_file(self):
    """If a config file only include other configuration files
    no warnings are thrown as long as the included files are not
    empty.
    """
    # dummy configuration for include file 1
    include_file_1 = self.save_conf('include_file_1', {
        'runners': {
            'inline': {
                'local_tmp_dir': "include_file1_local_tmp_dir"
            }
        }
    })

    # dummy configuration for include file 2
    include_file_2 = self.save_conf('include_file_2', {
        'runners': {
            'inline': {
                'local_tmp_dir': "include_file2_local_tmp_dir"
            }
        }
    })

    # test configuration: nothing but the two includes
    path = self.save_conf('twoincludefiles', {
        'include': [include_file_1, include_file_2]
    })

    InlineMRJobRunner(conf_paths=[path])

    # no log call means no warning was issued
    self.assertFalse(self.log.called)
def setUp(self):
    """Build a runner whose local filesystem we can exercise directly."""
    super(LocalFSTestCase, self).setUp()
    self.runner = InlineMRJobRunner()
def test_empty(self):
    """With no script or owner, the job key defaults to the current user."""
    runner = InlineMRJobRunner(conf_paths=[])

    key_match = _JOB_KEY_RE.match(runner.get_job_key())

    self.assertEqual(key_match.group(1), 'no_script')
    self.assertEqual(key_match.group(2), getpass.getuser())
def setUp(self):
    """Save the base config and capture the opts a runner loads from it."""
    super(ClearTagTestCase, self).setUp()

    self.base_conf_path = self.save_conf('base.conf', self.BASE_CONF)

    # baseline opts for tests to compare against
    base_runner = InlineMRJobRunner(conf_paths=[self.base_conf_path])
    self.base_opts = base_runner._opts
def test_file(self):
    """Trying to archive a regular file (not a dir) should raise OSError."""
    regular_file = self.makefile('qux')
    runner = InlineMRJobRunner()

    self.assertRaises(OSError, runner._create_dir_archive, regular_file)
def opts_for_conf(self, name, conf):
    """Save *conf* under *name* and return the opts a runner loads from it."""
    conf_path = self.save_conf(name, conf)
    return InlineMRJobRunner(conf_paths=[conf_path])._opts
def test_uri(self):
    # we don't check whether URIs exist or are directories
    runner = InlineMRJobRunner()

    uri_archive_path = runner._dir_archive_path('s3://bucket/stuff')

    self.assertEqual(os.path.basename(uri_archive_path), 'stuff.tar.gz')
class TestCatOutput(SandboxedTestCase):
    """Tests for InlineMRJobRunner.cat_output(): which files in the output
    dir are read, and how their chunks are delimited."""

    def setUp(self):
        super(TestCatOutput, self).setUp()

        self.output_dir = os.path.join(self.tmp_dir, 'job_output')
        os.mkdir(self.output_dir)

        self.runner = InlineMRJobRunner(
            conf_paths=[], output_dir=self.output_dir)

    def test_empty(self):
        # an empty output dir yields no chunks
        self.assertEqual(list(self.runner.cat_output()), [])

    def test_typical_output(self):
        # actual output
        self.makefile(os.path.join(self.output_dir, 'part-00000'),
                      b'line0\n')
        self.makefile(os.path.join(self.output_dir, 'part-00001'),
                      b'line1\n')

        # hidden .crc file
        self.makefile(os.path.join(self.output_dir, '.crc.part-00000'),
                      b'42\n')

        # hidden _SUCCESS file (ignore)
        self.makefile(os.path.join(self.output_dir, '_SUCCESS'),
                      b'such a relief!\n')

        # hidden _logs dir
        self.makefile(os.path.join(self.output_dir, '_logs', 'log.xml'),
                      b'pretty much the usual\n')

        # only the visible part-* files should be read
        self.assertEqual(sorted(to_lines(self.runner.cat_output())),
                         [b'line0\n', b'line1\n'])

    def test_output_in_subdirs(self):
        # test for output being placed in subdirs, for example with nicknack
        self.makefile(os.path.join(self.output_dir, 'a', 'part-00000'),
                      b'line-a0\n')
        self.makefile(os.path.join(self.output_dir, 'a', 'part-00001'),
                      b'line-a1\n')

        self.makefile(os.path.join(self.output_dir, 'b', 'part-00000'),
                      b'line-b0\n')
        self.makefile(os.path.join(self.output_dir, 'b', '.crc.part-00000'),
                      b'42\n')

        self.assertEqual(sorted(to_lines(self.runner.cat_output())),
                         [b'line-a0\n', b'line-a1\n', b'line-b0\n'])

    def test_read_all_non_hidden_files(self):
        # files don't have to be named part-*; anything non-hidden is read
        self.makefile(os.path.join(self.output_dir, 'baz'), b'qux\n')
        self.makefile(os.path.join(self.output_dir, 'foo', 'bar'),
                      b'baz\n')

        self.assertEqual(sorted(to_lines(self.runner.cat_output())),
                         [b'baz\n', b'qux\n'])

    def test_empty_string_between_files(self):
        self.makefile(os.path.join(self.output_dir, 'part-00000'), b'A')
        self.makefile(os.path.join(self.output_dir, 'part-00001'), b'\n')
        self.makefile(os.path.join(self.output_dir, 'part-00002'), b'C')

        # order isn't guaranteed, but there should be 3 chunks separated
        # by two empty strings
        chunks = list(self.runner.cat_output())
        self.assertEqual(len(chunks), 5)
        self.assertEqual(chunks[1], b'')
        self.assertEqual(chunks[3], b'')

    def test_output_dir_not_considered_hidden(self):
        # only files *inside* the output dir are filtered; the output dir
        # itself may start with an underscore
        output_dir = os.path.join(self.tmp_dir, '_hidden', '_output_dir')
        self.makefile(os.path.join(output_dir, 'part-00000'),
                      b'cats\n')

        runner = InlineMRJobRunner(conf_paths=[], output_dir=output_dir)

        self.assertEqual(sorted(to_lines(runner.cat_output())),
                         [b'cats\n'])
def test_file(self):
    """Asking for an archive path of a regular file should raise OSError."""
    regular_file = self.makefile('foo')
    runner = InlineMRJobRunner()

    self.assertRaises(OSError, runner._dir_archive_path, regular_file)
def test_missing_input(self):
    """Running with a nonexistent input path should blow up."""
    runner = InlineMRJobRunner(input_paths=['/some/bogus/file/path'])
    self.assertRaises(Exception, runner._run)
class TestCatOutput(SandboxedTestCase):
    """Tests for InlineMRJobRunner.cat_output() and the deprecated
    stream_output(): which files in the output dir are read, and how
    their chunks are delimited."""

    def setUp(self):
        super(TestCatOutput, self).setUp()

        self.output_dir = os.path.join(self.tmp_dir, 'job_output')
        os.mkdir(self.output_dir)

        self.runner = InlineMRJobRunner(
            conf_paths=[], output_dir=self.output_dir)

    def test_empty(self):
        # an empty output dir yields no chunks
        self.assertEqual(list(self.runner.cat_output()), [])

    def test_typical_output(self):
        # actual output
        self.makefile(os.path.join(self.output_dir, 'part-00000'),
                      b'line0\n')
        self.makefile(os.path.join(self.output_dir, 'part-00001'),
                      b'line1\n')

        # hidden .crc file
        self.makefile(os.path.join(self.output_dir, '.crc.part-00000'),
                      b'42\n')

        # hidden _SUCCESS file (ignore)
        self.makefile(os.path.join(self.output_dir, '_SUCCESS'),
                      b'such a relief!\n')

        # hidden _logs dir
        self.makefile(os.path.join(self.output_dir, '_logs', 'log.xml'),
                      b'pretty much the usual\n')

        # only the visible part-* files should be read
        self.assertEqual(sorted(to_lines(self.runner.cat_output())),
                         [b'line0\n', b'line1\n'])

    def test_output_in_subdirs(self):
        # test for output being placed in subdirs, for example with nicknack
        self.makefile(os.path.join(self.output_dir, 'a', 'part-00000'),
                      b'line-a0\n')
        self.makefile(os.path.join(self.output_dir, 'a', 'part-00001'),
                      b'line-a1\n')

        self.makefile(os.path.join(self.output_dir, 'b', 'part-00000'),
                      b'line-b0\n')
        self.makefile(os.path.join(self.output_dir, 'b', '.crc.part-00000'),
                      b'42\n')

        self.assertEqual(sorted(to_lines(self.runner.cat_output())),
                         [b'line-a0\n', b'line-a1\n', b'line-b0\n'])

    def test_read_all_non_hidden_files(self):
        # files don't have to be named part-*; anything non-hidden is read
        self.makefile(os.path.join(self.output_dir, 'baz'), b'qux\n')
        self.makefile(os.path.join(self.output_dir, 'foo', 'bar'),
                      b'baz\n')

        self.assertEqual(sorted(to_lines(self.runner.cat_output())),
                         [b'baz\n', b'qux\n'])

    def test_empty_string_between_files(self):
        self.makefile(os.path.join(self.output_dir, 'part-00000'), b'A')
        self.makefile(os.path.join(self.output_dir, 'part-00001'), b'\n')
        self.makefile(os.path.join(self.output_dir, 'part-00002'), b'C')

        # order isn't guaranteed, but there should be 3 chunks separated
        # by two empty strings
        chunks = list(self.runner.cat_output())
        self.assertEqual(len(chunks), 5)
        self.assertEqual(chunks[1], b'')
        self.assertEqual(chunks[3], b'')

    def test_output_dir_not_considered_hidden(self):
        # only files *inside* the output dir are filtered; the output dir
        # itself may start with an underscore
        output_dir = os.path.join(self.tmp_dir, '_hidden', '_output_dir')
        self.makefile(os.path.join(output_dir, 'part-00000'),
                      b'cats\n')

        runner = InlineMRJobRunner(conf_paths=[], output_dir=output_dir)

        self.assertEqual(sorted(to_lines(runner.stream_output())),
                         [b'cats\n'])

    def test_deprecated_stream_output(self):
        self.makefile(os.path.join(self.output_dir, 'part-00000'),
                      b'1\n2')
        self.makefile(os.path.join(self.output_dir, 'part-00001'),
                      b'3\n4\n')

        log = self.start(patch('mrjob.runner.log'))

        # should group output into lines, but not join across files
        self.assertEqual(sorted(self.runner.stream_output()),
                         [b'1\n', b'2', b'3\n', b'4\n'])

        # should issue deprecation warning
        self.assertEqual(log.warning.call_count, 1)
def test_extra_kwargs_in_mrjob_conf_okay(self):
    """Unknown options in mrjob.conf are ignored; known ones still load."""
    with logger_disabled('mrjob.runner'):
        runner = InlineMRJobRunner(conf_paths=[self.path])

        self.assertEqual(runner._opts['setup'], ['echo foo'])
        # the unknown option shouldn't appear in opts
        self.assertNotIn('qux', runner._opts)
def test_empty(self):
    """With no script or owner, the job name defaults to the current user."""
    runner = InlineMRJobRunner(conf_paths=[])

    name_match = JOB_NAME_RE.match(runner.get_job_name())

    self.assertEqual(name_match.group(1), 'no_script')
    self.assertEqual(name_match.group(2), getpass.getuser())
def setUp(self):
    """Each test gets a fresh runner that ignores any local mrjob.conf."""
    self.runner = InlineMRJobRunner(conf_paths=[])
def test_empty(self):
    """With no script or owner, the job name defaults to the current user."""
    runner = InlineMRJobRunner(conf_paths=[])

    name_match = JOB_NAME_RE.match(runner.get_job_name())

    self.assertEqual(name_match.group(1), "no_script")
    self.assertEqual(name_match.group(2), getpass.getuser())
def test_nonexistent_dir(self):
    """Archiving a dir that doesn't exist should raise OSError."""
    runner = InlineMRJobRunner()
    missing_dir = os.path.join(self.tmp_dir, 'nonexistent')

    self.assertRaises(OSError, runner._create_dir_archive, missing_dir)
def test_missing_dir(self):
    """Asking for an archive path of a missing dir should raise OSError."""
    missing_dir = os.path.join(self.tmp_dir, 'archive')
    runner = InlineMRJobRunner()

    self.assertRaises(OSError, runner._dir_archive_path, missing_dir)
def test_owner_and_label_kwargs(self):
    """Explicit owner/label kwargs should show up in the job name."""
    runner = InlineMRJobRunner(
        conf_paths=[], owner="ads", label="ads_chain")

    name_match = JOB_NAME_RE.match(runner.get_job_name())

    self.assertEqual(name_match.group(1), "ads_chain")
    self.assertEqual(name_match.group(2), "ads")