Example #1
0
    def test_can_turn_off_bootstrap_mrjob(self):
        # conf handed to mrjob_conf_patcher: bootstrap_mrjob is switched
        # off for the local runner
        conf = {'runners': {'local': {'bootstrap_mrjob': False}}}

        with mrjob_conf_patcher(conf):
            job = MRJobWhereAreYou(['-r', 'local'])
            job.sandbox()

            with job.make_runner() as runner:
                # sanity check: the runner sees the option as False
                opts = runner.get_opts()
                self.assertEqual(opts['bootstrap_mrjob'], False)

                tmp_root = os.path.realpath(runner._get_local_tmp_dir())

                try:
                    with no_handlers_for_logger():
                        runner.run()
                except Exception as err:
                    # if mrjob is not installed, script won't be able to run
                    self.assertIn('ImportError', str(err))
                    return

                lines = list(runner.stream_output())
                self.assertEqual(len(lines), 1)

                # script should not load mrjob from the runner's tmp dir
                _, reported_dir = job.parse_output_line(lines[0])
                self.assertFalse(reported_dir.startswith(tmp_root))
Example #2
0
    def test_command_line_can_blank_out_conf(self):
        # conf sets local_tmp_dir to self.tmp_dir for the inline runner
        conf = {'runners': {'inline': {'local_tmp_dir': self.tmp_dir}}}
        self.start(mrjob_conf_patcher(conf))

        # an empty --local-tmp-dir is passed when making the runner; the
        # check is then made against the system temp dir, not self.tmp_dir
        with self.make_runner('--local-tmp-dir', '') as runner:
            system_tmp = tempfile.gettempdir()
            self.assert_local_tmp_in(runner, system_tmp)
Example #3
0
    def test_mrjob_conf(self):
        # conf sets local_tmp_dir to self.tmp_dir for the inline runner
        conf = {'runners': {'inline': {'local_tmp_dir': self.tmp_dir}}}
        self.start(mrjob_conf_patcher(conf))

        # with no command-line override, the check is made against the
        # directory named in the conf
        with self.make_runner() as runner:
            expected_parent = self.tmp_dir
            self.assert_local_tmp_in(runner, expected_parent)
Example #4
0
    def test_mrjob_conf(self):
        # conf sets local_tmp_dir to self.tmp_dir for the inline runner
        conf = {'runners': {'inline': {'local_tmp_dir': self.tmp_dir}}}
        self.start(mrjob_conf_patcher(conf))

        # with no command-line override, the check is made against the
        # directory named in the conf
        with self.make_runner() as runner:
            expected_parent = self.tmp_dir
            self.assert_local_tmp_in(runner, expected_parent)
Example #5
0
    def test_can_turn_off_bootstrap_mrjob(self):
        # conf handed to mrjob_conf_patcher: bootstrap_mrjob is switched
        # off for the local runner
        conf = {'runners': {'local': {'bootstrap_mrjob': False}}}

        with mrjob_conf_patcher(conf):
            job = MRJobWhereAreYou(['-r', 'local'])
            job.sandbox()

            with job.make_runner() as runner:
                # sanity check: the runner sees the option as False
                self.assertEqual(runner._opts['bootstrap_mrjob'], False)

                tmp_root = os.path.realpath(runner._get_local_tmp_dir())

                try:
                    runner.run()
                except StepFailedException:
                    # this is what happens when mrjob isn't installed elsewhere
                    return

                # however, if mrjob is installed, we need to verify that
                # we're using the installed version and not a bootstrapped copy
                records = list(job.parse_output(runner.cat_output()))
                self.assertEqual(len(records), 1)

                # script should not load mrjob from the runner's tmp dir
                _, reported_dir = records[0]
                self.assertFalse(reported_dir.startswith(tmp_root))
Example #6
0
    def test_can_turn_off_bootstrap_mrjob(self):
        # conf handed to mrjob_conf_patcher: bootstrap_mrjob is switched
        # off for the local runner
        conf = {"runners": {"local": {"bootstrap_mrjob": False}}}

        with mrjob_conf_patcher(conf):
            job = MRJobWhereAreYou(["-r", "local"])
            job.sandbox()

            with job.make_runner() as runner:
                # sanity check: the runner sees the option as False
                opts = runner.get_opts()
                self.assertEqual(opts["bootstrap_mrjob"], False)

                tmp_root = os.path.realpath(runner._get_local_tmp_dir())

                try:
                    with no_handlers_for_logger():
                        runner.run()
                except Exception as err:
                    # if mrjob is not installed, script won't be able to run
                    self.assertIn("ImportError", str(err))
                    return

                lines = list(runner.stream_output())
                self.assertEqual(len(lines), 1)

                # script should not load mrjob from the runner's tmp dir
                _, reported_dir = job.parse_output_line(lines[0])
                self.assertFalse(reported_dir.startswith(tmp_root))
Example #7
0
    def test_can_turn_off_bootstrap_mrjob(self):
        # conf handed to mrjob_conf_patcher: bootstrap_mrjob is switched
        # off for the local runner
        conf = {'runners': {'local': {'bootstrap_mrjob': False}}}

        with mrjob_conf_patcher(conf):
            job = MRJobWhereAreYou(['-r', 'local'])
            job.sandbox()

            with job.make_runner() as runner:
                # sanity check: the runner sees the option as False
                opts = runner.get_opts()
                self.assertEqual(opts['bootstrap_mrjob'], False)

                tmp_root = os.path.realpath(runner._get_local_tmp_dir())

                try:
                    with no_handlers_for_logger():
                        runner.run()
                except StepFailedException:
                    # this is what happens when mrjob isn't installed elsewhere
                    return

                # however, if mrjob is installed, we need to verify that
                # we're using the installed version and not a bootstrapped copy
                lines = list(runner.stream_output())
                self.assertEqual(len(lines), 1)

                # script should not load mrjob from the runner's tmp dir
                _, reported_dir = job.parse_output_line(lines[0])
                self.assertFalse(reported_dir.startswith(tmp_root))
Example #8
0
    def test_loading_boostrapped_mrjob_library(self):
        # track the dir we're loading mrjob from rather than the full path
        # to deal with edge cases where we load from the .py file,
        # and the script loads from the .pyc compiled from that .py file.
        our_mrjob_dir = os.path.dirname(os.path.realpath(mrjob.__file__))

        # mrjob_conf_patcher() is called without arguments, so this test
        # supplies no conf contents of its own
        with mrjob_conf_patcher():
            mr_job = MRJobWhereAreYou(['-r', 'local'])
            mr_job.sandbox()

            with mr_job.make_runner() as runner:
                # sanity check
                self.assertEqual(runner.get_opts()['bootstrap_mrjob'], True)
                local_tmp_dir = os.path.realpath(runner._get_local_tmp_dir())

                runner.run()

                output = list(runner.stream_output())
                self.assertEqual(len(output), 1)

                # script should load mrjob from its working dir
                _, script_mrjob_dir = mr_job.parse_output_line(output[0])

                self.assertNotEqual(our_mrjob_dir, script_mrjob_dir)
                # use a unittest assertion rather than a bare ``assert``:
                # ``assert`` statements are removed under ``python -O``,
                # which would silently skip this check
                self.assertTrue(script_mrjob_dir.startswith(local_tmp_dir))
Example #9
0
    def test_loading_boostrapped_mrjob_library(self):
        # compare directories rather than full file paths: we may load
        # mrjob from a .py file while the script loads the .pyc compiled
        # from that same .py file
        test_mrjob_dir = os.path.dirname(os.path.realpath(mrjob.__file__))

        with mrjob_conf_patcher():
            job = MRJobWhereAreYou(['-r', 'local', '--bootstrap-mrjob'])
            job.sandbox()

            with job.make_runner() as runner:
                # sanity check: bootstrapping is reported as enabled
                self.assertEqual(runner._bootstrap_mrjob(), True)

                tmp_root = os.path.realpath(runner._get_local_tmp_dir())

                runner.run()

                lines = list(runner.stream_output())
                self.assertEqual(len(lines), 1)

                # script should load mrjob from its working dir, i.e. a
                # different dir than ours, located under the runner's tmp dir
                _, reported_dir = job.parse_output_line(lines[0])

                self.assertNotEqual(test_mrjob_dir, reported_dir)
                self.assertTrue(reported_dir.startswith(tmp_root))
Example #10
0
    def test_command_line_can_blank_out_conf(self):
        # conf sets local_tmp_dir to self.tmp_dir for the inline runner
        conf = {'runners': {'inline': {'local_tmp_dir': self.tmp_dir}}}
        self.start(mrjob_conf_patcher(conf))

        # an empty --local-tmp-dir is passed when making the runner; the
        # check is then made against the system temp dir, not self.tmp_dir
        with self.make_runner('--local-tmp-dir', '') as runner:
            system_tmp = tempfile.gettempdir()
            self.assert_local_tmp_in(runner, system_tmp)
Example #11
0
    def test_blank_local_tmp_dir_means_default(self):
        # conf sets local_tmp_dir to the empty string for the inline runner
        conf = {'runners': {'inline': {'local_tmp_dir': ''}}}
        self.start(mrjob_conf_patcher(conf))

        # the check is made against the system temp dir
        with self.make_runner() as runner:
            system_tmp = tempfile.gettempdir()
            self.assert_local_tmp_in(runner, system_tmp)
Example #12
0
    def test_blank_local_tmp_dir_means_default(self):
        # conf sets local_tmp_dir to the empty string for the inline runner
        conf = {'runners': {'inline': {'local_tmp_dir': ''}}}
        self.start(mrjob_conf_patcher(conf))

        # the check is made against the system temp dir
        with self.make_runner() as runner:
            system_tmp = tempfile.gettempdir()
            self.assert_local_tmp_in(runner, system_tmp)
Example #13
0
 def test_can_disable_check_input_paths_in_config(self):
     # the job is built before the conf is patched, as in the original
     word_count = MRWordCount()

     # conf turns check_input_paths off for the inline runner
     conf = {'runners': {'inline': {'check_input_paths': False}}}

     with mrjob_conf_patcher(conf):
         with word_count.make_runner() as runner:
             check_paths = runner._opts['check_input_paths']
             self.assertFalse(check_paths)
Example #14
0
    def test_command_line_beats_config(self):
        # conf supplies only a zone for the dataproc runner
        zone_conf = {'runners': {'dataproc': {'zone': 'us-west1-a'}}}

        with mrjob_conf_patcher(zone_conf):
            # region is passed directly to the runner's constructor
            runner = DataprocJobRunner(region='europe-west1')
            opts = runner._opts

            # region takes precedence because it was set on the command line
            self.assertEqual(opts['region'], 'europe-west1')
            self.assertEqual(opts['zone'], None)

            # only a problem if you set region and zone
            # in the same config
            warned = self.log.warning.called
            self.assertFalse(warned)
Example #15
0
    def test_emulate_map_input_file_in_conf(self):
        # conf turns emulate_map_input_file on for the spark runner
        conf = {'runners': {'spark': {'emulate_map_input_file': True}}}
        self.start(mrjob_conf_patcher(conf))

        # input file containing two lines
        input_path = self.makefile('two_lines', b'line\nother line\n')

        job = MRCountLinesByFile(['-r', 'spark', input_path])

        with job.make_runner() as runner:
            runner.run()

            counts = dict(job.parse_output(runner.cat_output()))

            # expect a single entry keyed by the input file's file:// URI
            expected = {'file://' + input_path: 2}
            self.assertEqual(counts, expected)
Example #16
0
    def test_max_output_files_is_cmd_line_only(self):
        # conf tries to set max_output_files=1 for the spark runner
        conf = {'runners': {'spark': {'max_output_files': 1}}}
        self.start(mrjob_conf_patcher(conf))

        # replace mrjob.runner.log so warning calls can be inspected
        runner_log = self.start(patch('mrjob.runner.log'))

        job = MRWordFreqCount(['-r', 'spark'])
        job.sandbox(stdin=BytesIO(b'one two one\n two three\n'))

        with job.make_runner() as runner:
            runner.run()

            # by default there should be at least 2 output files
            num_files = self._num_output_files(runner)
            self.assertNotEqual(num_files, 1)

        # a warning is expected to have been logged
        self.assertTrue(runner_log.warning.called)
Example #17
0
    def test_max_output_files_is_cmd_line_only(self):
        # conf tries to set max_output_files=1 for the spark runner
        conf = {'runners': {'spark': {'max_output_files': 1}}}
        self.start(mrjob_conf_patcher(conf))

        # replace mrjob.runner.log so warning calls can be inspected
        runner_log = self.start(patch('mrjob.runner.log'))

        job = MRWordFreqCount(['-r', 'spark'])
        job.sandbox(stdin=BytesIO(b'one two one\n two three\n'))

        with job.make_runner() as runner:
            runner.run()

            # by default there should be at least 2 output files
            num_files = self._num_output_files(runner)
            self.assertNotEqual(num_files, 1)

        # a warning is expected to have been logged
        self.assertTrue(runner_log.warning.called)
Example #18
0
    def test_override_emulate_map_input_file_in_conf(self):
        # conf turns emulate_map_input_file on for the spark runner ...
        conf = {'runners': {'spark': {'emulate_map_input_file': True}}}
        self.start(mrjob_conf_patcher(conf))

        # input file containing two lines
        input_path = self.makefile('two_lines', b'line\nother line\n')

        # ... while the command line passes --no-emulate-map-input-file
        job_args = ['-r', 'spark', '--no-emulate-map-input-file', input_path]
        job = MRCountLinesByFile(job_args)

        with job.make_runner() as runner:
            runner.run()

            counts = dict(job.parse_output(runner.cat_output()))

            # without emulate_map_input_file, there is no input file path
            expected = {None: 2}
            self.assertEqual(counts, expected)
Example #19
0
    def test_can_turn_off_bootstrap_mrjob(self):
        # compare directories rather than full file paths: we may load
        # mrjob from a .py file while the script loads the .pyc compiled
        # from that same .py file
        test_mrjob_dir = os.path.dirname(os.path.realpath(mrjob.__file__))

        # conf switches bootstrap_mrjob off for the local runner
        conf = {'runners': {'local': {'bootstrap_mrjob': False}}}

        with mrjob_conf_patcher(conf):
            job = MRJobWhereAreYou(['-r', 'local'])
            job.sandbox()

            with job.make_runner() as runner:
                # sanity check: the runner sees the option as False
                opts = runner.get_opts()
                self.assertEqual(opts['bootstrap_mrjob'], False)

                runner.run()

                lines = list(runner.stream_output())
                self.assertEqual(len(lines), 1)

                # script should load mrjob from the same place our test does
                _, reported_dir = job.parse_output_line(lines[0])
                self.assertEqual(test_mrjob_dir, reported_dir)
Example #20
0
 def test_loose_mrjob_conf(self):
     # the job is built before the conf is patched, as in the original
     mr_job = MRJob()

     with mrjob_conf_patcher(self.LOOSE_MRJOB_CONF):
         with mr_job.make_runner() as runner:
             # under LOOSE_MRJOB_CONF the runner option must be False
             strict = runner._opts['strict_protocols']
             self.assertEqual(strict, False)
Example #21
0
 def set_in_mrjob_conf(self, **kwargs):
     """Patch mrjob.conf with extra dataproc runner options.

     *kwargs* are merged into the ``runners.dataproc`` section of a deep
     copy of ``self.MRJOB_CONF_CONTENTS``; the patch is started here and
     stopped through ``addCleanup``.
     """
     # deep copy so self.MRJOB_CONF_CONTENTS itself is left untouched
     conf = copy.deepcopy(self.MRJOB_CONF_CONTENTS)
     conf['runners']['dataproc'].update(kwargs)

     conf_patch = mrjob_conf_patcher(conf)
     conf_patch.start()
     self.addCleanup(conf_patch.stop)
Example #22
0
 def test_unencodable_output_strict_in_mrjob_conf(self):
     # STRICT_MRJOB_CONF is patched in and --strict-protocols is also
     # given as a job argument
     strict_conf = self.STRICT_MRJOB_CONF
     cli_args = ['--strict-protocols']

     with mrjob_conf_patcher(strict_conf):
         self.assertJobRaisesExceptionOnUnencodableOutput(job_args=cli_args)
Example #23
0
 def set_in_mrjob_conf(self, **kwargs):
     """Patch mrjob.conf with extra dataproc runner options.

     *kwargs* are merged into the ``runners.dataproc`` section of a deep
     copy of ``self.MRJOB_CONF_CONTENTS``; the patch is started here and
     stopped through ``addCleanup``.
     """
     # deep copy so self.MRJOB_CONF_CONTENTS itself is left untouched
     conf = copy.deepcopy(self.MRJOB_CONF_CONTENTS)
     conf['runners']['dataproc'].update(kwargs)

     conf_patch = mrjob_conf_patcher(conf)
     conf_patch.start()
     self.addCleanup(conf_patch.stop)
Example #24
0
 def test_can_disable_check_input_paths_in_config(self):
     # the job is built before the conf is patched, as in the original
     word_count = MRWordCount()

     # conf turns check_input_paths off for the inline runner
     conf = {'runners': {'inline': {'check_input_paths': False}}}

     with mrjob_conf_patcher(conf):
         with word_count.make_runner() as runner:
             check_paths = runner._opts['check_input_paths']
             self.assertFalse(check_paths)
Example #25
0
 def test_loose_mrjob_conf(self):
     # the job is built before the conf is patched, as in the original
     mr_job = MRJob()

     with mrjob_conf_patcher(self.LOOSE_MRJOB_CONF):
         with mr_job.make_runner() as runner:
             # under LOOSE_MRJOB_CONF the runner option must be False
             strict = runner._opts['strict_protocols']
             self.assertEqual(strict, False)
Example #26
0
 def test_strict_mrjob_conf(self):
     # the job is built before the conf is patched, as in the original
     mr_job = MRJob()

     with mrjob_conf_patcher(self.STRICT_MRJOB_CONF):
         with mr_job.make_runner() as runner:
             # under STRICT_MRJOB_CONF the runner option must be True
             strict = runner._opts['strict_protocols']
             self.assertEqual(strict, True)
Example #27
0
 def test_unencodable_output_no_strict_protocols(self):
     # STRICT_MRJOB_CONF is patched in, but --no-strict-protocols is
     # given as a job argument
     strict_conf = self.STRICT_MRJOB_CONF
     cli_args = ['--no-strict-protocols']

     with mrjob_conf_patcher(strict_conf):
         self.assertJobHandlesUnencodableOutput(job_args=cli_args)
Example #28
0
 def test_unencodable_output_strict_in_mrjob_conf(self):
     # STRICT_MRJOB_CONF is patched in and --strict-protocols is also
     # given as a job argument
     strict_conf = self.STRICT_MRJOB_CONF
     cli_args = ['--strict-protocols']

     with mrjob_conf_patcher(strict_conf):
         self.assertJobRaisesExceptionOnUnencodableOutput(job_args=cli_args)
Example #29
0
 def test_unencodable_output_no_strict_protocols(self):
     # STRICT_MRJOB_CONF is patched in, but --no-strict-protocols is
     # given as a job argument
     strict_conf = self.STRICT_MRJOB_CONF
     cli_args = ['--no-strict-protocols']

     with mrjob_conf_patcher(strict_conf):
         self.assertJobHandlesUnencodableOutput(job_args=cli_args)
Example #30
0
 def test_strict_mrjob_conf(self):
     # the job is built before the conf is patched, as in the original
     mr_job = MRJob()

     with mrjob_conf_patcher(self.STRICT_MRJOB_CONF):
         with mr_job.make_runner() as runner:
             # under STRICT_MRJOB_CONF the runner option must be True
             strict = runner._opts['strict_protocols']
             self.assertEqual(strict, True)