def test_can_turn_off_bootstrap_mrjob(self):
    # With bootstrap_mrjob set to False under the 'local' runner section
    # of the patched conf, the job's script must not import mrjob from
    # the runner's local tmp dir.
    no_bootstrap_conf = {'runners': {'local': {'bootstrap_mrjob': False}}}

    with mrjob_conf_patcher(no_bootstrap_conf):
        job = MRJobWhereAreYou(['-r', 'local'])
        job.sandbox()

        with job.make_runner() as runner:
            # sanity check: the conf value reached the runner's options
            self.assertEqual(runner.get_opts()['bootstrap_mrjob'], False)

            tmp_root = os.path.realpath(runner._get_local_tmp_dir())

            try:
                with no_handlers_for_logger():
                    runner.run()
            except Exception as exc:
                # if mrjob is not installed, script won't be able to run
                self.assertIn('ImportError', str(exc))
                return

            lines = list(runner.stream_output())
            self.assertEqual(len(lines), 1)

            # script should not load mrjob from the runner's local tmp dir
            _, reported_dir = job.parse_output_line(lines[0])
            self.assertFalse(reported_dir.startswith(tmp_root))
def test_command_line_can_blank_out_conf(self):
    # The conf sets local_tmp_dir to self.tmp_dir, but an empty
    # --local-tmp-dir switch is passed to the runner; the runner's local
    # tmp dir is then checked against the system temp dir.
    conf = {'runners': {'inline': {'local_tmp_dir': self.tmp_dir}}}
    self.start(mrjob_conf_patcher(conf))

    with self.make_runner('--local-tmp-dir', '') as runner:
        system_tmp = tempfile.gettempdir()
        self.assert_local_tmp_in(runner, system_tmp)
def test_mrjob_conf(self):
    # local_tmp_dir set under the 'inline' runner section of the patched
    # conf should be where the runner's local tmp dir is checked to live.
    conf = {'runners': {'inline': {'local_tmp_dir': self.tmp_dir}}}
    self.start(mrjob_conf_patcher(conf))

    with self.make_runner() as runner:
        self.assert_local_tmp_in(runner, self.tmp_dir)
def test_can_turn_off_bootstrap_mrjob(self):
    # With bootstrap_mrjob set to False under the 'local' runner section
    # of the patched conf, the job's script must not import mrjob from
    # the runner's local tmp dir.
    no_bootstrap_conf = {'runners': {'local': {'bootstrap_mrjob': False}}}

    with mrjob_conf_patcher(no_bootstrap_conf):
        job = MRJobWhereAreYou(['-r', 'local'])
        job.sandbox()

        with job.make_runner() as runner:
            # sanity check: the conf value reached the runner's options
            self.assertEqual(runner._opts['bootstrap_mrjob'], False)

            tmp_root = os.path.realpath(runner._get_local_tmp_dir())

            try:
                runner.run()
            except StepFailedException:
                # this is what happens when mrjob isn't installed elsewhere
                return

            # however, if mrjob is installed, we need to verify that
            # we're using the installed version and not a bootstrapped copy
            raw_output = runner.cat_output()
            pairs = list(job.parse_output(raw_output))
            self.assertEqual(len(pairs), 1)

            # script should not load mrjob from the runner's local tmp dir
            _, reported_dir = pairs[0]
            self.assertFalse(reported_dir.startswith(tmp_root))
def test_can_turn_off_bootstrap_mrjob(self):
    # With bootstrap_mrjob set to False under the "local" runner section
    # of the patched conf, the job's script must not import mrjob from
    # the runner's local tmp dir.
    no_bootstrap_conf = {"runners": {"local": {"bootstrap_mrjob": False}}}

    with mrjob_conf_patcher(no_bootstrap_conf):
        job = MRJobWhereAreYou(["-r", "local"])
        job.sandbox()

        with job.make_runner() as runner:
            # sanity check: the conf value reached the runner's options
            self.assertEqual(runner.get_opts()["bootstrap_mrjob"], False)

            tmp_root = os.path.realpath(runner._get_local_tmp_dir())

            try:
                with no_handlers_for_logger():
                    runner.run()
            except Exception as exc:
                # if mrjob is not installed, script won't be able to run
                self.assertIn("ImportError", str(exc))
                return

            lines = list(runner.stream_output())
            self.assertEqual(len(lines), 1)

            # script should not load mrjob from the runner's local tmp dir
            _, reported_dir = job.parse_output_line(lines[0])
            self.assertFalse(reported_dir.startswith(tmp_root))
def test_can_turn_off_bootstrap_mrjob(self):
    # With bootstrap_mrjob set to False under the 'local' runner section
    # of the patched conf, the job's script must not import mrjob from
    # the runner's local tmp dir.
    no_bootstrap_conf = {'runners': {'local': {'bootstrap_mrjob': False}}}

    with mrjob_conf_patcher(no_bootstrap_conf):
        job = MRJobWhereAreYou(['-r', 'local'])
        job.sandbox()

        with job.make_runner() as runner:
            # sanity check: the conf value reached the runner's options
            self.assertEqual(runner.get_opts()['bootstrap_mrjob'], False)

            tmp_root = os.path.realpath(runner._get_local_tmp_dir())

            try:
                with no_handlers_for_logger():
                    runner.run()
            except StepFailedException:
                # this is what happens when mrjob isn't installed elsewhere
                return

            # however, if mrjob is installed, we need to verify that
            # we're using the installed version and not a bootstrapped copy
            lines = list(runner.stream_output())
            self.assertEqual(len(lines), 1)

            # script should not load mrjob from the runner's local tmp dir
            _, reported_dir = job.parse_output_line(lines[0])
            self.assertFalse(reported_dir.startswith(tmp_root))
def test_loading_boostrapped_mrjob_library(self):
    # Run MRJobWhereAreYou on the local runner with an empty patched conf
    # and check that the script imported mrjob from under the runner's
    # local tmp dir rather than from the copy this test imported.

    # track the dir we're loading mrjob from rather than the full path
    # to deal with edge cases where we load from the .py file,
    # and the script loads from the .pyc compiled from that .py file.
    our_mrjob_dir = os.path.dirname(os.path.realpath(mrjob.__file__))

    with mrjob_conf_patcher():
        mr_job = MRJobWhereAreYou(['-r', 'local'])
        mr_job.sandbox()

        with mr_job.make_runner() as runner:
            # sanity check
            self.assertEqual(runner.get_opts()['bootstrap_mrjob'], True)
            local_tmp_dir = os.path.realpath(runner._get_local_tmp_dir())

            runner.run()

            output = list(runner.stream_output())
            self.assertEqual(len(output), 1)

            # script should load mrjob from its working dir
            _, script_mrjob_dir = mr_job.parse_output_line(output[0])

            self.assertNotEqual(our_mrjob_dir, script_mrjob_dir)
            # use a unittest assertion rather than a bare ``assert``: the
            # latter is compiled away under ``python -O``, which would
            # make this check silently pass
            self.assertTrue(script_mrjob_dir.startswith(local_tmp_dir))
def test_loading_boostrapped_mrjob_library(self):
    # Run MRJobWhereAreYou on the local runner with --bootstrap-mrjob and
    # check that the script imported mrjob from under the runner's local
    # tmp dir rather than from the copy this test imported.

    # track the dir we're loading mrjob from rather than the full path
    # to deal with edge cases where we load from the .py file,
    # and the script loads from the .pyc compiled from that .py file.
    mrjob_module_path = os.path.realpath(mrjob.__file__)
    our_mrjob_dir = os.path.dirname(mrjob_module_path)

    with mrjob_conf_patcher():
        job = MRJobWhereAreYou(['-r', 'local', '--bootstrap-mrjob'])
        job.sandbox()

        with job.make_runner() as runner:
            # sanity check: the runner intends to bootstrap mrjob
            self.assertEqual(runner._bootstrap_mrjob(), True)

            tmp_root = os.path.realpath(runner._get_local_tmp_dir())

            runner.run()

            lines = list(runner.stream_output())
            self.assertEqual(len(lines), 1)

            # script should load mrjob from its working dir
            _, reported_dir = job.parse_output_line(lines[0])

            self.assertNotEqual(our_mrjob_dir, reported_dir)
            self.assertTrue(reported_dir.startswith(tmp_root))
def test_blank_local_tmp_dir_means_default(self):
    # An empty local_tmp_dir in the patched conf is checked to leave the
    # runner's local tmp dir under the system temp dir.
    conf = {'runners': {'inline': {'local_tmp_dir': ''}}}
    self.start(mrjob_conf_patcher(conf))

    with self.make_runner() as runner:
        system_tmp = tempfile.gettempdir()
        self.assert_local_tmp_in(runner, system_tmp)
def test_can_disable_check_input_paths_in_config(self):
    # check_input_paths=False under the 'inline' runner section of the
    # patched conf should show up as falsy in the runner's options.
    job = MRWordCount()
    conf = {'runners': {'inline': {'check_input_paths': False}}}

    with mrjob_conf_patcher(conf), job.make_runner() as runner:
        self.assertFalse(runner._opts['check_input_paths'])
def test_command_line_beats_config(self):
    # The conf only sets a zone; the runner is constructed with a region.
    zone_only_conf = {'runners': {'dataproc': {'zone': 'us-west1-a'}}}

    with mrjob_conf_patcher(zone_only_conf):
        runner = DataprocJobRunner(region='europe-west1')
        opts = runner._opts

        # region takes precedence because it was set on the command line
        self.assertEqual(opts['region'], 'europe-west1')
        self.assertEqual(opts['zone'], None)

        # only a problem if you set region and zone
        # in the same config
        self.assertFalse(self.log.warning.called)
def test_emulate_map_input_file_in_conf(self):
    # With emulate_map_input_file=True under the 'spark' runner section
    # of the patched conf, the job's output is expected to key the line
    # count by the input file's file:// URI.
    conf = {'runners': {'spark': {'emulate_map_input_file': True}}}
    self.start(mrjob_conf_patcher(conf))

    input_path = self.makefile('two_lines', b'line\nother line\n')
    expected = {'file://' + input_path: 2}

    job = MRCountLinesByFile(['-r', 'spark', input_path])

    with job.make_runner() as runner:
        runner.run()

        counts = dict(job.parse_output(runner.cat_output()))
        self.assertEqual(counts, expected)
def test_max_output_files_is_cmd_line_only(self):
    # max_output_files=1 is set in the patched conf only; the test
    # expects the output file count not to be 1 and expects a warning
    # to have been logged through mrjob.runner.log.
    conf = {'runners': {'spark': {'max_output_files': 1}}}
    self.start(mrjob_conf_patcher(conf))

    runner_log = self.start(patch('mrjob.runner.log'))

    job = MRWordFreqCount(['-r', 'spark'])
    job.sandbox(stdin=BytesIO(b'one two one\n two three\n'))

    with job.make_runner() as runner:
        runner.run()

        # by default there should be at least 2 output files
        num_files = self._num_output_files(runner)
        self.assertNotEqual(num_files, 1)

    self.assertTrue(runner_log.warning.called)
def test_override_emulate_map_input_file_in_conf(self):
    # The patched conf turns emulate_map_input_file on, but the job is
    # run with --no-emulate-map-input-file on the command line.
    conf = {'runners': {'spark': {'emulate_map_input_file': True}}}
    self.start(mrjob_conf_patcher(conf))

    input_path = self.makefile('two_lines', b'line\nother line\n')

    job_args = ['-r', 'spark', '--no-emulate-map-input-file', input_path]
    job = MRCountLinesByFile(job_args)

    with job.make_runner() as runner:
        runner.run()

        counts = dict(job.parse_output(runner.cat_output()))

        # without emulate_map_input_file, there is no input file path
        self.assertEqual(counts, {None: 2})
def test_can_turn_off_bootstrap_mrjob(self):
    # With bootstrap_mrjob set to False under the 'local' runner section
    # of the patched conf, the job's script should import mrjob from the
    # same directory this test does.

    # track the dir we're loading mrjob from rather than the full path
    # to deal with edge cases where we load from the .py file,
    # and the script loads from the .pyc compiled from that .py file.
    mrjob_module_path = os.path.realpath(mrjob.__file__)
    our_mrjob_dir = os.path.dirname(mrjob_module_path)

    no_bootstrap_conf = {'runners': {'local': {'bootstrap_mrjob': False}}}

    with mrjob_conf_patcher(no_bootstrap_conf):
        job = MRJobWhereAreYou(['-r', 'local'])
        job.sandbox()

        with job.make_runner() as runner:
            # sanity check: the conf value reached the runner's options
            self.assertEqual(runner.get_opts()['bootstrap_mrjob'], False)

            runner.run()

            lines = list(runner.stream_output())
            self.assertEqual(len(lines), 1)

            # script should load mrjob from the same place our test does
            _, reported_dir = job.parse_output_line(lines[0])
            self.assertEqual(our_mrjob_dir, reported_dir)
def test_loose_mrjob_conf(self):
    # Under LOOSE_MRJOB_CONF, the runner's strict_protocols option
    # should equal False.
    job = MRJob()

    with mrjob_conf_patcher(self.LOOSE_MRJOB_CONF), \
            job.make_runner() as runner:
        self.assertEqual(runner._opts['strict_protocols'], False)
def set_in_mrjob_conf(self, **kwargs):
    """Start a conf patch with extra dataproc runner options.

    A deep copy of ``self.MRJOB_CONF_CONTENTS`` is made (so the shared
    class-level dict is not mutated), its ``['runners']['dataproc']``
    section is updated with *kwargs*, and the result is handed to
    ``mrjob_conf_patcher``. The patcher is started immediately and its
    ``stop`` is registered with ``addCleanup``.
    """
    conf = copy.deepcopy(self.MRJOB_CONF_CONTENTS)
    dataproc_section = conf['runners']['dataproc']
    dataproc_section.update(kwargs)

    conf_patch = mrjob_conf_patcher(conf)
    conf_patch.start()
    self.addCleanup(conf_patch.stop)
def test_unencodable_output_strict_in_mrjob_conf(self):
    # Under STRICT_MRJOB_CONF, and with --strict-protocols also passed
    # as a job argument, run the class's "raises exception on
    # unencodable output" check.
    strict_args = ['--strict-protocols']

    with mrjob_conf_patcher(self.STRICT_MRJOB_CONF):
        self.assertJobRaisesExceptionOnUnencodableOutput(
            job_args=strict_args)
def test_can_disable_check_input_paths_in_config(self):
    # check_input_paths=False under the 'inline' runner section of the
    # patched conf should show up as falsy in the runner's options.
    job = MRWordCount()
    inline_opts = {'check_input_paths': False}
    conf = {'runners': {'inline': inline_opts}}

    with mrjob_conf_patcher(conf):
        with job.make_runner() as runner:
            check_input_paths = runner._opts['check_input_paths']
            self.assertFalse(check_input_paths)
def test_strict_mrjob_conf(self):
    # Under STRICT_MRJOB_CONF, the runner's strict_protocols option
    # should equal True.
    job = MRJob()

    with mrjob_conf_patcher(self.STRICT_MRJOB_CONF), \
            job.make_runner() as runner:
        self.assertEqual(runner._opts['strict_protocols'], True)
def test_unencodable_output_no_strict_protocols(self):
    # Under STRICT_MRJOB_CONF, but with --no-strict-protocols passed as
    # a job argument, run the class's "handles unencodable output" check.
    loose_args = ['--no-strict-protocols']

    with mrjob_conf_patcher(self.STRICT_MRJOB_CONF):
        self.assertJobHandlesUnencodableOutput(job_args=loose_args)