Example #1
0
    def test_precedence_deprecated(self):
        """find_mrjob_conf() returns the highest-precedence existing path.

        Candidate paths are added from lowest to highest precedence; each
        addition should change the result, and every deprecated location
        should emit a deprecation warning on the 'mrjob.conf' logger.
        """
        os.environ["HOME"] = "/home/foo"
        os.environ["PYTHONPATH"] = "/py1:/py2"
        # path existence is mocked via this set (checked by the patched fs)
        self._existing_paths = set()

        # nothing exists yet -> no config found
        self.assertEqual(find_mrjob_conf(), None)

        self._existing_paths.add("/etc/mrjob.conf")
        self.assertEqual(find_mrjob_conf(), "/etc/mrjob.conf")

        # PYTHONPATH entries beat /etc, but are deprecated locations
        self._existing_paths.add("/py2/mrjob.conf")
        with no_handlers_for_logger():
            buf = log_to_buffer("mrjob.conf")
            self.assertEqual(find_mrjob_conf(), "/py2/mrjob.conf")
            self.assertIn("This config path is deprecated", buf.getvalue())

        # earlier PYTHONPATH entry (/py1) beats later one (/py2)
        self._existing_paths.add("/py1/mrjob.conf")
        with no_handlers_for_logger():
            buf = log_to_buffer("mrjob.conf")
            self.assertEqual(find_mrjob_conf(), "/py1/mrjob.conf")
            self.assertIn("This config path is deprecated", buf.getvalue())

        # ~/.mrjob beats PYTHONPATH, also deprecated
        self._existing_paths.add("/home/foo/.mrjob")
        with no_handlers_for_logger():
            buf = log_to_buffer("mrjob.conf")
            self.assertEqual(find_mrjob_conf(), "/home/foo/.mrjob")
            self.assertIn("This config path is deprecated", buf.getvalue())

        # $MRJOB_CONF wins over everything
        mrjob_conf_path = os.path.join(self.tmp_dir, "mrjob.conf")
        open(mrjob_conf_path, "w").close()
        os.environ["MRJOB_CONF"] = mrjob_conf_path
        self._existing_paths.add(mrjob_conf_path)
        self.assertEqual(find_mrjob_conf(), mrjob_conf_path)
Example #2
0
    def test_precedence_deprecated(self):
        """find_mrjob_conf() returns the highest-precedence existing path.

        Candidate paths are added from lowest to highest precedence; each
        addition should change the result, and every deprecated location
        should emit a deprecation warning (captured via _log_to_buffer()).
        """
        os.environ['HOME'] = '/home/foo'
        os.environ['PYTHONPATH'] = '/py1:/py2'
        # path existence is mocked via this set (checked by the patched fs)
        self._existing_paths = set()

        # nothing exists yet -> no config found
        assert_equal(find_mrjob_conf(), None)

        self._existing_paths.add('/etc/mrjob.conf')
        assert_equal(find_mrjob_conf(), '/etc/mrjob.conf')

        # PYTHONPATH entries beat /etc, but are deprecated locations
        self._existing_paths.add('/py2/mrjob.conf')
        with no_handlers_for_logger():
            buf = self._log_to_buffer()
            assert_equal(find_mrjob_conf(), '/py2/mrjob.conf')
            assert_in('This config path is deprecated', buf.getvalue())

        # earlier PYTHONPATH entry (/py1) beats later one (/py2)
        self._existing_paths.add('/py1/mrjob.conf')
        with no_handlers_for_logger():
            buf = self._log_to_buffer()
            assert_equal(find_mrjob_conf(), '/py1/mrjob.conf')
            assert_in('This config path is deprecated', buf.getvalue())

        # ~/.mrjob beats PYTHONPATH, also deprecated
        self._existing_paths.add('/home/foo/.mrjob')
        with no_handlers_for_logger():
            buf = self._log_to_buffer()
            assert_equal(find_mrjob_conf(), '/home/foo/.mrjob')
            assert_in('This config path is deprecated', buf.getvalue())

        # $MRJOB_CONF wins over everything
        mrjob_conf_path = os.path.join(self.tmp_dir, 'mrjob.conf')
        open(mrjob_conf_path, 'w').close()
        os.environ['MRJOB_CONF'] = mrjob_conf_path
        self._existing_paths.add(mrjob_conf_path)
        assert_equal(find_mrjob_conf(), mrjob_conf_path)
Example #3
0
    def test_round_trip(self):
        """Dumping a config and loading it back yields an equal dict.

        Fix: the original passed an unclosed file object straight to
        dump_mrjob_conf(); use a context manager so the handle is
        flushed and closed before load_mrjob_conf() reads the file.
        """
        conf = {"runners": {"foo": {"qux": "quux"}}}
        conf_path = os.path.join(self.tmp_dir, "mrjob.conf")

        # close the file before reading it back (avoids a leaked handle
        # and possibly-unflushed data); matches _test_round_trip elsewhere
        with open(conf_path, "w") as f:
            dump_mrjob_conf(conf, f)
        with no_handlers_for_logger("mrjob.conf"):
            self.assertEqual(conf, load_mrjob_conf(conf_path=conf_path))
Example #4
0
    def test_non_log_lines(self):
        """Non-log lines are skipped (with a warning each) before the first
        real log line; trailing unparseable lines fold into the previous
        message, since they could be a continuation of it.
        """
        lines = StringIO(
            "foo\n"
            "bar\n"
            "15/12/11 13:26:08 ERROR streaming.StreamJob:"
            " Error Launching job :"
            " Output directory already exists\n"
            "Streaming Command Failed!"
        )

        with no_handlers_for_logger("mrjob.logs.parse"):
            # capture the parser's warnings so we can count them below
            stderr = StringIO()
            log_to_stream("mrjob.logs.parse", stderr)

            self.assertEqual(
                list(_parse_hadoop_log_lines(lines)),
                [
                    # ignore leading non-log lines
                    dict(
                        timestamp="15/12/11 13:26:08",
                        level="ERROR",
                        logger="streaming.StreamJob",
                        thread=None,
                        # no way to know that Streaming Command Failed! wasn't part
                        # of a multi-line message
                        message=(
                            "Error Launching job :" " Output directory already exists\n" "Streaming Command Failed!"
                        ),
                    )
                ],
            )

            # should be one warning for each leading non-log line
            log_lines = stderr.getvalue().splitlines()
            self.assertEqual(len(log_lines), 2)
Example #5
0
 def test_kill_persistent_cluster(self):
     """Cleaning up a runner with a cluster_id set should call the
     Dataproc cluster-delete API."""
     with no_handlers_for_logger("mrjob.dataproc"):
         r = self._quick_runner()
         with patch.object(mrjob.dataproc.DataprocJobRunner, "_api_cluster_delete") as m:
             r._opts["cluster_id"] = "j-MOCKCLUSTER0"
             r._cleanup_cluster()
             self.assertTrue(m.called)
Example #6
0
    def test_failed_job(self):
        """A job that advances to ERROR raises StepFailedException, logs the
        state transition, and leaves the cluster in DELETING state."""
        mr_job = MRTwoStepJob(['-r', 'dataproc', '-v'])
        mr_job.sandbox()

        with no_handlers_for_logger('mrjob.dataproc'):
            stderr = StringIO()
            log_to_stream('mrjob.dataproc', stderr)

            # make the mock job walk through these states, ending in ERROR
            self._dataproc_client.job_get_advances_states = (
                collections.deque(['SETUP_DONE', 'RUNNING', 'ERROR']))

            with mr_job.make_runner() as runner:
                self.assertIsInstance(runner, DataprocJobRunner)

                self.assertRaises(StepFailedException, runner.run)

                self.assertIn(' => ERROR\n', stderr.getvalue())

                cluster_id = runner.get_cluster_id()

        # job should get terminated
        cluster = (
            self._dataproc_client._cache_clusters[_TEST_PROJECT][cluster_id])
        cluster_state = self._dataproc_client.get_state(cluster)
        self.assertEqual(cluster_state, 'DELETING')
Example #7
0
    def test_non_log_lines(self):
        """Non-log lines are skipped (with a warning each) before the first
        real log line; trailing unparseable lines fold into the previous
        message, since they could be a continuation of it.
        """
        lines = StringIO('foo\n'
                         'bar\n'
                         '15/12/11 13:26:08 ERROR streaming.StreamJob:'
                         ' Error Launching job :'
                         ' Output directory already exists\n'
                         'Streaming Command Failed!')

        with no_handlers_for_logger('mrjob.logs.parse'):
            # capture the parser's warnings so we can count them below
            stderr = StringIO()
            log_to_stream('mrjob.logs.parse', stderr)

            self.assertEqual(
            list(_parse_hadoop_log_lines(lines)), [
                # ignore leading non-log lines
                dict(
                    timestamp='15/12/11 13:26:08',
                    level='ERROR',
                    logger='streaming.StreamJob',
                    thread=None,
                    # no way to know that Streaming Command Failed! wasn't part
                    # of a multi-line message
                    message=('Error Launching job :'
                             ' Output directory already exists\n'
                             'Streaming Command Failed!'))
            ])

            # should be one warning for each leading non-log line
            log_lines = stderr.getvalue().splitlines()
            self.assertEqual(len(log_lines), 2)
Example #8
0
 def assert_hadoop_version(self, JobClass, version_string):
     """Helper: assert that *JobClass* reports *version_string* for
     hadoop_version in its jobconf, and that a "should be a string"
     warning is logged on mrjob.job."""
     mr_job = JobClass()
     mock_log = StringIO()
     with no_handlers_for_logger("mrjob.job"):
         log_to_stream("mrjob.job", mock_log)
         self.assertEqual(mr_job.jobconf()["hadoop_version"], version_string)
         self.assertIn("should be a string", mock_log.getvalue())
Example #9
0
 def test_getattr_forward(self):
     """Runner attribute access forwards to its option store: defaults
     reported by the runner match the store's defaults."""
     with no_handlers_for_logger():
         r = InlineMRJobRunner(conf_path=False)
     store = r._opts
     self.assertIsInstance(store, InlineRunnerOptionStore)
     a = r.get_default_opts()
     self.assertEqual(a, store.default_options())
Example #10
0
 def test_messy_error(self):
     """An unparseable COUNTERS field yields (None, None) and logs a
     warning rather than raising."""
     counter_string = 'Job JOBID="_001" FAILED_REDUCES="0" COUNTERS="THIS IS NOT ACTUALLY A COUNTER"'
     with no_handlers_for_logger(''):
         stderr = StringIO()
         log_to_stream('mrjob.parse', stderr, level=logging.WARN)
         assert_equal((None, None), parse_hadoop_counters_from_line(counter_string))
         assert_in('Cannot parse Hadoop counter line', stderr.getvalue())
Example #11
0
    def test_cleanup_options(self):
        """Deprecated cleanup/cleanup_on_failure option names are translated
        to their new names, with a deprecation warning logged for each."""
        stderr = StringIO()
        with no_handlers_for_logger('mrjob.runner'):
            log_to_stream('mrjob.runner', stderr)
            opts = RunnerOptionStore(
                'inline',
                dict(cleanup=['LOCAL_SCRATCH', 'REMOTE_SCRATCH'],
                     cleanup_on_failure=['JOB_FLOW', 'SCRATCH']),
                [])

            self.assertEqual(opts['cleanup'], ['LOCAL_TMP', 'CLOUD_TMP'])
            self.assertIn(
                'Deprecated cleanup option LOCAL_SCRATCH has been renamed'
                ' to LOCAL_TMP', stderr.getvalue())
            self.assertIn(
                'Deprecated cleanup option REMOTE_SCRATCH has been renamed'
                ' to CLOUD_TMP', stderr.getvalue())

            self.assertEqual(opts['cleanup_on_failure'], ['CLUSTER', 'TMP'])
            self.assertIn(
                'Deprecated cleanup_on_failure option JOB_FLOW has been'
                ' renamed to CLUSTER', stderr.getvalue())
            self.assertIn(
                'Deprecated cleanup_on_failure option SCRATCH has been renamed'
                ' to TMP', stderr.getvalue())
Example #12
0
 def test_hadoop_runner(self):
     """The launcher builds a HadoopJobRunner when -r hadoop is given."""
     # you can't instantiate a HadoopJobRunner without Hadoop installed
     launcher = MRJobLauncher(args=["--no-conf", "-r", "hadoop", "", "--hadoop-streaming-jar", "HUNNY"])
     with no_handlers_for_logger("mrjob.runner"):
         # fake a Hadoop install so the runner constructor succeeds
         with patch.dict(os.environ, {"HADOOP_HOME": "100-Acre Wood"}):
             with launcher.make_runner() as runner:
                 self.assertIsInstance(runner, HadoopJobRunner)
Example #13
0
    def _test_round_trip(self, conf):
        """Helper: dump *conf* to a temp file and assert it loads back equal."""
        conf_path = os.path.join(self.tmp_dir, 'mrjob.conf')

        with open(conf_path, 'w') as f:
            dump_mrjob_conf(conf, f)
        with no_handlers_for_logger('mrjob.conf'):
            self.assertEqual(conf, load_mrjob_conf(conf_path=conf_path))
Example #14
0
    def test_round_trip(self):
        """Dumping a config and loading it back yields an equal dict.

        Fix: the original passed an unclosed file object straight to
        dump_mrjob_conf(); use a context manager so the handle is
        flushed and closed before load_mrjob_conf() reads the file.
        """
        conf = {'runners': {'foo': {'qux': 'quux'}}}
        conf_path = os.path.join(self.tmp_dir, 'mrjob.conf')

        # close the file before reading it back (avoids a leaked handle
        # and possibly-unflushed data); matches _test_round_trip elsewhere
        with open(conf_path, 'w') as f:
            dump_mrjob_conf(conf, f)
        with no_handlers_for_logger('mrjob.conf'):
            self.assertEqual(conf, load_mrjob_conf(conf_path=conf_path))
Example #15
0
    def test_fallback(self):
        """With no hint available, get_hadoop_bin() falls back to plain
        ['hadoop'] after a single which() lookup."""
        self.assertFalse(self.which.called)

        with no_handlers_for_logger('mrjob.fs.hadoop'):
            self.assertEqual(self.fs.get_hadoop_bin(), ['hadoop'])

        self.which.assert_called_once_with('hadoop', path=None)
Example #16
0
 def test_kill_persistent_cluster(self):
     """Cleaning up a runner with a cluster_id set should call the
     Dataproc cluster-delete API."""
     with no_handlers_for_logger('mrjob.dataproc'):
         r = self._quick_runner()
         with patch.object(mrjob.dataproc.DataprocJobRunner, '_api_cluster_delete') as m:
             r._opts['cluster_id'] = 'j-MOCKCLUSTER0'
             r._cleanup_cluster()
             self.assertTrue(m.called)
Example #17
0
    def _test_environment_variable(self, envvar, *dirnames):
        """Check if we can find the hadoop binary from *envvar*."""
        # okay to add after HadoopFilesystem() created; it hasn't looked yet
        hadoop_bin = self._add_hadoop_bin_for_envvar(envvar, *dirnames)

        # suppress fs.hadoop log output while resolving the binary
        with no_handlers_for_logger('mrjob.fs.hadoop'):
            self.assertEqual(self.fs.get_hadoop_bin(), [hadoop_bin])
Example #18
0
    def test_deprecated_mapper_final_positional_arg(self):
        """Passing mapper_final positionally to MRJob.mr() still works but
        logs a deprecation warning; passing it both positionally and as a
        keyword is a TypeError."""
        def mapper(k, v):
            pass

        def reducer(k, v):
            pass

        def mapper_final():
            pass

        stderr = StringIO()
        with no_handlers_for_logger():
            log_to_stream('mrjob.job', stderr)
            step = MRJob.mr(mapper, reducer, mapper_final)

        # should be allowed to specify mapper_final as a positional arg,
        # but we log a warning
        self.assertEqual(
            step,
            MRJob.mr(
                mapper=mapper, reducer=reducer, mapper_final=mapper_final))
        self.assertIn('mapper_final should be specified', stderr.getvalue())

        # can't specify mapper_final as a positional and keyword arg
        self.assertRaises(
            TypeError,
            MRJob.mr,
            mapper,
            reducer,
            mapper_final,
            mapper_final=mapper_final)
Example #19
0
    def test_can_turn_off_bootstrap_mrjob(self):
        """With bootstrap_mrjob disabled, the task should import mrjob from
        its installed location, not from the runner's temp dir."""
        with mrjob_conf_patcher({"runners": {"local": {"bootstrap_mrjob": False}}}):

            mr_job = MRJobWhereAreYou(["-r", "local"])
            mr_job.sandbox()

            with mr_job.make_runner() as runner:
                # sanity check
                self.assertEqual(runner.get_opts()["bootstrap_mrjob"], False)
                local_tmp_dir = os.path.realpath(runner._get_local_tmp_dir())
                try:
                    with no_handlers_for_logger():
                        runner.run()
                except Exception as e:
                    # if mrjob is not installed, script won't be able to run
                    self.assertIn("ImportError", str(e))
                    return

                output = list(runner.stream_output())

                self.assertEqual(len(output), 1)

                # script should not load mrjob from local_tmp_dir
                _, script_mrjob_dir = mr_job.parse_output_line(output[0])
                self.assertFalse(script_mrjob_dir.startswith(local_tmp_dir))
Example #20
0
    def test_bad_sort(self):
        """A failing sort binary makes _invoke_sort raise
        CalledProcessError."""
        self.use_bad_sort()

        runner = MRJobRunner(conf_paths=[])
        with no_handlers_for_logger():
            self.assertRaises(CalledProcessError,
                              runner._invoke_sort, [self.a, self.b], self.out)
Example #21
0
    def test_large_amounts_of_stderr(self):
        """A very chatty failing job must not hang: run_job() should exit
        via SystemExit with the expected (truncated) output captured in
        the sandboxed stderr."""
        mr_job = MRVerboseJob(['--no-conf', '-r', 'local', '-v'])
        mr_job.sandbox()

        try:
            with no_handlers_for_logger():
                mr_job.run_job()
        except TimeoutException:
            raise
        except SystemExit:
            # we expect the job to throw a StepFailedException,
            # which causes run_job to call sys.exit()

            # look for expected output from MRVerboseJob
            stderr = mr_job.stderr.getvalue()
            self.assertIn(
                b"Counters: 1\n\tFoo\n\t\tBar=10000", stderr)
            self.assertIn(b'Status: 0\n', stderr)
            self.assertIn(b'Status: 99\n', stderr)
            self.assertNotIn(b'Status: 100\n', stderr)
            self.assertIn(b'STDERR: Qux\n', stderr)
            # exception should appear in exception message
            self.assertIn(b'BOOM', stderr)
        else:
            # no SystemExit means the job unexpectedly succeeded
            raise AssertionError()
Example #22
0
    def test_can_turn_off_bootstrap_mrjob(self):
        """With bootstrap_mrjob disabled, the task should import mrjob from
        its installed location, not from the runner's temp dir."""
        with mrjob_conf_patcher(
                {'runners': {'local': {'bootstrap_mrjob': False}}}):

            mr_job = MRJobWhereAreYou(['-r', 'local'])
            mr_job.sandbox()

            with mr_job.make_runner() as runner:
                # sanity check
                self.assertEqual(runner._opts['bootstrap_mrjob'], False)
                local_tmp_dir = os.path.realpath(runner._get_local_tmp_dir())
                try:
                    with no_handlers_for_logger():
                        runner.run()
                except StepFailedException:
                    # this is what happens when mrjob isn't installed elsewhere
                    return

                # however, if mrjob is installed, we need to verify that
                # we're using the installed version and not a bootstrapped copy
                output = list(mr_job.parse_output(runner.cat_output()))

                self.assertEqual(len(output), 1)

                # script should not load mrjob from local_tmp_dir
                _, script_mrjob_dir = output[0]
                self.assertFalse(script_mrjob_dir.startswith(local_tmp_dir))
Example #23
0
    def test_python_dash_v_as_python_bin(self):
        """A multi-token --python-bin ('python -v') works: the -v import
        trace lands in task stderr and job output is unaffected."""
        python_cmd = cmd_line([sys.executable or 'python', '-v'])
        mr_job = MRTwoStepJob(['--python-bin', python_cmd, '--no-conf',
                               '-r', 'local'])
        mr_job.sandbox(stdin=[b'bar\n'])

        with no_handlers_for_logger():
            with mr_job.make_runner() as runner:
                runner.run()

                # expect python -v crud in stderr

                with open(runner._task_stderr_path('mapper', 0, 0)) as lines:
                    self.assertTrue(any(
                        'import mrjob' in line or  # Python 2
                        "import 'mrjob'" in line
                        for line in lines))

                with open(runner._task_stderr_path('mapper', 0, 0)) as lines:
                    self.assertTrue(any(
                        '#' in line for line in lines))

                # should still get expected results
                self.assertEqual(
                    sorted(to_lines(runner.cat_output())),
                    sorted([b'1\tnull\n', b'1\t"bar"\n']))
Example #24
0
 def test_mixed_behavior_2(self):
     """A job mixing old- and new-style protocol declarations keeps its
     custom input_protocol() behavior and logs a warning about it."""
     stderr = StringIO()
     with no_handlers_for_logger():
         log_to_stream('mrjob.job', stderr)
         mr_job = self.MRInconsistentJob2()
         self.assertEqual(mr_job.options.input_protocol, None)
         self.assertEqual(mr_job.input_protocol().__class__, ReprProtocol)
         self.assertIn('custom behavior', stderr.getvalue())
Example #25
0
 def test_hadoop_runner(self):
     """The launcher builds a HadoopJobRunner when -r hadoop is given."""
     # you can't instantiate a HadoopJobRunner without Hadoop installed
     launcher = MRJobLauncher(args=['--no-conf', '-r', 'hadoop', '',
                                    '--hadoop-streaming-jar', 'HUNNY'])
     with no_handlers_for_logger('mrjob.runner'):
         # fake a Hadoop install so the runner constructor succeeds
         with patch.dict(os.environ, {'HADOOP_HOME': '100-Acre Wood'}):
             with launcher.make_runner() as runner:
                 self.assertIsInstance(runner, HadoopJobRunner)
Example #26
0
 def test_verbose(self):
     """set_up_logging(verbose=True) sends both INFO and DEBUG records to
     stderr."""
     with no_handlers_for_logger('__main__'):
         with patch.object(sys, 'stderr', StringIO()) as stderr:
             MRJob.set_up_logging(verbose=True)
             log = logging.getLogger('__main__')
             log.info('INFO')
             log.debug('DEBUG')
             self.assertEqual(stderr.getvalue(), 'INFO\nDEBUG\n')
Example #27
0
    def test_path_exists(self):
        """Deprecated path_exists() delegates to exists()."""
        fs = Filesystem()

        with patch.object(fs, 'exists'):
            # suppress the deprecation warning this call logs
            with no_handlers_for_logger('mrjob.fs.base'):
                fs.path_exists('foo')

            fs.exists.assert_called_once_with('foo')
Example #28
0
 def test_default_options(self):
     """By default set_up_logging() sends INFO but not DEBUG to stderr."""
     with no_handlers_for_logger('__main__'):
         with patch.object(sys, 'stderr', cStringIO.StringIO()) as stderr:
             MRJob.set_up_logging()
             log = logging.getLogger('__main__')
             log.info('INFO')
             log.debug('DEBUG')
             self.assertEqual(stderr.getvalue(), 'INFO\n')
Example #29
0
    def test_path_join(self):
        """Deprecated path_join() delegates to join()."""
        fs = Filesystem()

        with patch.object(fs, 'join'):
            # suppress the deprecation warning this call logs
            with no_handlers_for_logger('mrjob.fs.base'):
                fs.path_join('foo', 'bar')

            fs.join.assert_called_once_with('foo', 'bar')
Example #30
0
    def test_path_join(self):
        """Deprecated path_join() delegates to join()."""
        fs = Filesystem()

        with patch.object(fs, "join"):
            # suppress the deprecation warning this call logs
            with no_handlers_for_logger("mrjob.fs.base"):
                fs.path_join("foo", "bar")

            fs.join.assert_called_once_with("foo", "bar")
Example #31
0
    def test_prefer_own_methods(self):
        """Filesystem methods proxied from the runner warn as deprecated,
        but methods the runner defines itself (get_hadoop_version) are
        used directly with no warning."""
        # TODO: currently can't initialize HadoopRunner without setting these
        runner = HadoopJobRunner(hadoop_bin='hadoop',
                                 hadoop_home='kansas',
                                 hadoop_streaming_jar='streaming.jar')

        with no_handlers_for_logger('mrjob.runner'):
            stderr = StringIO()
            log_to_stream('mrjob.runner', stderr)

            self.assertEqual(runner.ls, runner.fs.ls)

            # Hadoop Runner has its own version
            self.assertNotEqual(runner.get_hadoop_version,
                                runner.fs.get_hadoop_version)

            self.assertIn('deprecated: call HadoopJobRunner.fs.ls() directly',
                          stderr.getvalue())
            self.assertNotIn('get_hadoop_version', stderr.getvalue())
Example #32
0
    def test_wrapper_script_only_writes_to_stderr(self):
        """Output printed by a --setup command must go to the task's
        stderr, never into the job's output stream."""
        job = MROSWalkJob([
            '-r', 'local',
            '--setup', 'echo stray output',
        ])
        job.sandbox()

        with no_handlers_for_logger('mrjob.local'):
            stderr = StringIO()
            log_to_stream('mrjob.local', stderr)

            with job.make_runner() as r:
                r.run()

                output = ''.join(r.stream_output())

                # stray output should be in stderr, not the job's output
                self.assertIn('stray output', stderr.getvalue())
                self.assertNotIn('stray output', output)
Example #33
0
    def test_with_header(self):
        """_parse_indented_counters() handles a 'Counters: N' header line
        without logging any warnings."""
        lines = [
            'Counters: 1',
            '  File System Counters',
            '    FILE: Number of bytes read=86',
        ]

        with no_handlers_for_logger('mrjob.logs.step'):
            # capture warnings so we can assert there are none
            stderr = StringIO()
            log_to_stream('mrjob.logs.step', stderr)

            self.assertEqual(_parse_indented_counters(lines), {
                'File System Counters': {
                    'FILE: Number of bytes read': 86,
                },
            })

            # header shouldn't freak it out
            self.assertEqual(stderr.getvalue(), '')
Example #34
0
    def test_emr_runner_option_store(self):
        """EMRRunnerOptionStore renames each deprecated option to its new
        name, drops the old key, and logs a deprecation warning per
        rename."""
        stderr = StringIO()
        with no_handlers_for_logger('mrjob.conf'):
            log_to_stream('mrjob.conf', stderr)

            opts = EMRRunnerOptionStore(
                'emr',
                dict(base_tmp_dir='/scratch',
                     emr_job_flow_id='j-CLUSTERID',
                     emr_job_flow_pool_name='liver',
                     pool_emr_job_flows=True,
                     s3_scratch_uri='s3://bucket/walrus'), [])

            self.assertEqual(opts['cluster_id'], 'j-CLUSTERID')
            self.assertNotIn('emr_job_flow_id', opts)
            self.assertIn(
                'Deprecated option emr_job_flow_id has been renamed'
                ' to cluster_id', stderr.getvalue())

            self.assertEqual(opts['local_tmp_dir'], '/scratch')
            self.assertNotIn('base_tmp_dir', opts)
            self.assertIn(
                'Deprecated option base_tmp_dir has been renamed'
                ' to local_tmp_dir', stderr.getvalue())

            self.assertEqual(opts['pool_clusters'], True)
            self.assertNotIn('pool_emr_job_flows', opts)
            self.assertIn(
                'Deprecated option pool_emr_job_flows has been'
                ' renamed to pool_clusters', stderr.getvalue())

            self.assertEqual(opts['pool_name'], 'liver')
            self.assertNotIn('emr_job_flow_pool_name', opts)
            self.assertIn(
                'Deprecated option emr_job_flow_pool_name has been'
                ' renamed to pool_name', stderr.getvalue())

            self.assertEqual(opts['cloud_tmp_dir'], 's3://bucket/walrus')
            self.assertNotIn('s3_scratch_uri', opts)
            self.assertIn(
                'Deprecated option s3_scratch_uri has been renamed'
                ' to cloud_tmp_dir', stderr.getvalue())
Example #35
0
    def test_load_mrjob_conf_and_load_opts(self):
        """load_mrjob_conf() parses a JSON-style conf file, and
        load_opts_from_mrjob_conf() extracts a runner's opts (empty dict
        for a runner with no section)."""
        conf_path = os.path.join(self.tmp_dir, 'mrjob.conf.2')
        with open(conf_path, 'w') as f:
            f.write('{"runners": {"foo": {"qux": "quux"}}}')

        with no_handlers_for_logger('mrjob.conf'):
            self.assertEqual(load_mrjob_conf(conf_path=conf_path),
                             {'runners': {
                                 'foo': {
                                     'qux': 'quux'
                                 }
                             }})
        self.assertEqual(
            load_opts_from_mrjob_conf('foo', conf_path=conf_path)[0][1],
            {'qux': 'quux'})
        # test missing options
        with logger_disabled('mrjob.conf'):
            self.assertEqual(
                load_opts_from_mrjob_conf('bar', conf_path=conf_path)[0][1],
                {})
Example #36
0
    def test_python_dash_v_as_python_bin(self):
        """A multi-token --python-bin ('python -v') works: the -v import
        trace lands in stderr and job output is unaffected."""
        python_cmd = cmd_line([sys.executable or 'python', '-v'])
        mr_job = MRTwoStepJob(
            ['--python-bin', python_cmd, '--no-conf', '-r', 'local'])
        mr_job.sandbox(stdin=[b'bar\n'])

        with no_handlers_for_logger():
            mr_job.run_job()

        # expect debugging messages in stderr.
        stderr = mr_job.stderr.getvalue()

        # stderr is huge, so don't use assertIn()
        self.assertTrue(b'import mrjob' in stderr or  # Python 2
                        b"import 'mrjob'" in stderr)  # Python 3
        self.assertTrue(b'#' in stderr)

        # should still get expected results
        self.assertEqual(sorted(mr_job.stdout.getvalue().splitlines()),
                         sorted([b'1\tnull', b'1\t"bar"']))
Example #37
0
    def test_hadoop_runner_option_store(self):
        """HadoopRunnerOptionStore renames deprecated options, drops the
        old keys, and logs a deprecation warning per rename."""
        stderr = StringIO()
        with no_handlers_for_logger('mrjob.conf'):
            log_to_stream('mrjob.conf', stderr)

            opts = HadoopRunnerOptionStore(
                'hadoop',
                dict(base_tmp_dir='/scratch',
                     hdfs_scratch_dir='hdfs:///scratch'),
                [])

            self.assertEqual(opts['local_tmp_dir'], '/scratch')
            self.assertNotIn('base_tmp_dir', opts)
            self.assertIn('Deprecated option base_tmp_dir has been renamed'
                          ' to local_tmp_dir', stderr.getvalue())

            self.assertEqual(opts['hadoop_tmp_dir'], 'hdfs:///scratch')
            self.assertNotIn('hdfs_scratch_dir', opts)
            self.assertIn('Deprecated option hdfs_scratch_dir has been renamed'
                          ' to hadoop_tmp_dir', stderr.getvalue())
Example #38
0
    def test_conf_contain_only_include_file(self):
        """If a config file only includes other configuration files,
        no warnings are thrown as long as the included files are
        not empty.
        """

        # dummy configuration for include file 1
        conf = {
            'runners': {
                'inline': {
                    'local_tmp_dir': "include_file1_local_tmp_dir"
                }
            }
        }

        include_file_1 = self.save_conf('include_file_1', conf)

        # dummy configuration for include file 2
        conf = {
            'runners': {
                'inline': {
                    'local_tmp_dir': "include_file2_local_tmp_dir"
                }
            }
        }

        include_file_2 = self.save_conf('include_file_2', conf)

        # test configuration
        conf = {
            'include': [include_file_1, include_file_2]
        }
        path = self.save_conf('twoincludefiles', conf)

        # no warnings should be logged while loading this config
        stderr = StringIO()
        with no_handlers_for_logger():
            log_to_stream('mrjob.conf', stderr)
            InlineMRJobRunner(conf_paths=[path])
            self.assertEqual(
                "",
                stderr.getvalue())
Example #39
0
    def test_large_amounts_of_stderr(self):
        """A very chatty failing job must not hang: run_job() should fail
        with the expected (truncated) output captured in the sandboxed
        stderr and the exception message.

        Fix: replaced Python-2-only `except Exception, e:` with
        `except Exception as e:`, which is equivalent and also valid on
        Python 2.6+ and Python 3.
        """
        mr_job = MRVerboseJob(['--no-conf'])
        mr_job.sandbox()

        try:
            with no_handlers_for_logger():
                mr_job.run_job()
        except TimeoutException:
            raise
        except Exception as e:
            # we expect the job to throw an exception

            # look for expected output from MRVerboseJob
            stderr = mr_job.stderr.getvalue()
            assert_in("Counters from step 1:\n  Foo:\n    Bar: 10000", stderr)
            assert_in('status: 0\n', stderr)
            assert_in('status: 99\n', stderr)
            assert_not_in('status: 100\n', stderr)
            assert_in('STDERR: Qux\n', stderr)
            # exception should appear in exception message
            assert_in('BOOM', repr(e))
Example #40
0
    def test_archive_upload(self):
        """--archive unpacks a tarball under its default name and under a
        '#'-suffixed custom name, with file contents intact."""
        job = MROSWalkJob([
            '-r',
            'local',
            '--archive',
            self.foo_tar_gz,
            '--archive',
            self.foo_tar_gz + '#foo',
        ])
        job.sandbox()

        with job.make_runner() as r:
            with no_handlers_for_logger('mrjob.local'):
                r.run()

            path_to_size = dict(
                job.parse_output_line(line) for line in r.stream_output())

        # default name: archive basename; custom name: the part after '#'
        self.assertEqual(path_to_size.get('./foo.tar.gz/foo.py'),
                         self.foo_py_size)
        self.assertEqual(path_to_size.get('./foo/foo.py'), self.foo_py_size)
Example #41
0
    def test_emr_runner_option_store(self):
        """EMRRunnerOptionStore renames deprecated options, drops the old
        keys, and logs a deprecation warning per rename."""
        stderr = StringIO()
        with no_handlers_for_logger('mrjob.conf'):
            log_to_stream('mrjob.conf', stderr)

            opts = EMRRunnerOptionStore(
                'emr',
                dict(base_tmp_dir='/scratch',
                     s3_scratch_uri='s3://bucket/walrus'), [])

            self.assertEqual(opts['local_tmp_dir'], '/scratch')
            self.assertNotIn('base_tmp_dir', opts)
            self.assertIn(
                'Deprecated option base_tmp_dir has been renamed'
                ' to local_tmp_dir', stderr.getvalue())

            self.assertEqual(opts['s3_tmp_dir'], 's3://bucket/walrus')
            self.assertNotIn('s3_scratch_uri', opts)
            self.assertIn(
                'Deprecated option s3_scratch_uri has been renamed'
                ' to s3_tmp_dir', stderr.getvalue())
Example #42
0
    def test_cleanup_options(self):
        """Deprecated cleanup option names are translated to their new
        names (with a warning each); a string cleanup_on_failure value is
        quietly converted to a list."""
        stderr = StringIO()
        with no_handlers_for_logger('mrjob.runner'):
            log_to_stream('mrjob.runner', stderr)
            opts = RunnerOptionStore(
                'inline',
                dict(cleanup=['LOCAL_SCRATCH', 'REMOTE_SCRATCH'],
                     cleanup_on_failure=['SCRATCH']), [])

            self.assertEqual(opts['cleanup'], ['LOCAL_TMP', 'REMOTE_TMP'])
            self.assertIn(
                'Deprecated cleanup option LOCAL_SCRATCH has been renamed'
                ' to LOCAL_TMP', stderr.getvalue())
            self.assertIn(
                'Deprecated cleanup option REMOTE_SCRATCH has been renamed'
                ' to REMOTE_TMP', stderr.getvalue())

            # should quietly convert string to list
            self.assertEqual(opts['cleanup_on_failure'], ['TMP'])
            self.assertIn(
                'Deprecated cleanup_on_failure option SCRATCH has been renamed'
                ' to TMP', stderr.getvalue())
Example #43
0
    def test_failed_job(self):
        """A failing Dataproc job raises StepFailedException, logs the
        ERROR transition, and leaves the cluster in DELETING state."""
        mr_job = MRTwoStepJob(['-r', 'dataproc', '-v'])
        mr_job.sandbox()

        with no_handlers_for_logger('mrjob.dataproc'):
            stderr = StringIO()
            log_to_stream('mrjob.dataproc', stderr)

            # make the mocked API report job failure
            self.mock_jobs_succeed = False

            with mr_job.make_runner() as runner:
                self.assertIsInstance(runner, DataprocJobRunner)

                self.assertRaises(StepFailedException, runner.run)

                self.assertIn(' => ERROR\n', stderr.getvalue())

                cluster_id = runner.get_cluster_id()

        # job should get terminated
        cluster = runner._get_cluster(cluster_id)
        self.assertEqual(_cluster_state_name(cluster.status.state), 'DELETING')
Example #44
0
    def test_python_dash_v_as_python_bin(self):
        """A multi-token --python-bin ('python -v') works: the -v import
        trace lands in task stderr and job output is unaffected."""
        python_cmd = cmd_line([sys.executable or 'python', '-v'])
        mr_job = MRTwoStepJob(
            ['--python-bin', python_cmd, '--no-conf', '-r', 'local'])
        mr_job.sandbox(stdin=[b'bar\n'])

        with no_handlers_for_logger():
            with mr_job.make_runner() as runner:
                runner.run()

                # expect python -v crud in stderr

                with open(runner._task_stderr_path('mapper', 0, 0)) as lines:
                    self.assertTrue(
                        any('import mrjob' in line or  # Python 2
                            "import 'mrjob'" in line for line in lines))

                with open(runner._task_stderr_path('mapper', 0, 0)) as lines:
                    self.assertTrue(any('#' in line for line in lines))

                # should still get expected results
                self.assertEqual(sorted(to_lines(runner.cat_output())),
                                 sorted([b'1\tnull\n', b'1\t"bar"\n']))
Example #45
0
    def test_cleanup_options(self):
        """Deprecated cleanup option names should be translated to their
        new names, with a deprecation warning logged for each."""
        log_buf = StringIO()
        with no_handlers_for_logger('mrjob.runner'):
            log_to_stream('mrjob.runner', log_buf)
            opts = RunnerOptionStore(
                'inline',
                dict(cleanup=['LOCAL_SCRATCH', 'REMOTE_SCRATCH'],
                     cleanup_on_failure=['JOB_FLOW', 'SCRATCH']), [])

            # old names should have been mapped to new ones
            self.assertEqual(opts['cleanup'], ['LOCAL_TMP', 'CLOUD_TMP'])
            self.assertEqual(opts['cleanup_on_failure'], ['CLUSTER', 'TMP'])

            # each rename should have produced a warning
            logged = log_buf.getvalue()
            expected_warnings = (
                'Deprecated cleanup option LOCAL_SCRATCH has been renamed'
                ' to LOCAL_TMP',
                'Deprecated cleanup option REMOTE_SCRATCH has been renamed'
                ' to CLOUD_TMP',
                'Deprecated cleanup_on_failure option JOB_FLOW has been'
                ' renamed to CLUSTER',
                'Deprecated cleanup_on_failure option SCRATCH has been renamed'
                ' to TMP',
            )
            for message in expected_warnings:
                self.assertIn(message, logged)
Example #46
0
    def test_large_amounts_of_stderr(self):
        """A job that spews lots of stderr should fail with its counters,
        statuses, and stderr lines captured (truncated past status 99)."""
        mr_job = MRVerboseJob(['--no-conf', '-r', 'local'])
        mr_job.sandbox()

        try:
            with no_handlers_for_logger():
                mr_job.run_job()
        except TimeoutException:
            raise
        except Exception as exc:
            # the job is expected to blow up; look for MRVerboseJob's
            # output in the sandboxed stderr
            job_stderr = mr_job.stderr.getvalue()

            self.assertIn(b"Counters from step 1:\n\tFoo\n\t\tBar=10000",
                          job_stderr)
            self.assertIn(b'status: 0\n', job_stderr)
            self.assertIn(b'status: 99\n', job_stderr)
            # statuses past 99 should have been cut off
            self.assertNotIn(b'status: 100\n', job_stderr)
            self.assertIn(b'STDERR: Qux\n', job_stderr)
            # the failure should surface in the exception message
            self.assertIn('BOOM', repr(exc))
        else:
            raise AssertionError()
Example #47
0
    def test_large_amounts_of_stderr(self):
        """In verbose mode, a failing noisy job should exit via sys.exit()
        with counters, statuses, and stderr captured (truncated at 99)."""
        mr_job = MRVerboseJob(['--no-conf', '-r', 'local', '-v'])
        mr_job.sandbox()

        try:
            with no_handlers_for_logger():
                mr_job.run_job()
        except TimeoutException:
            raise
        except SystemExit:
            # the StepFailedException makes run_job() call sys.exit();
            # look for MRVerboseJob's output in the sandboxed stderr
            job_stderr = mr_job.stderr.getvalue()

            self.assertIn(b"Counters: 1\n\tFoo\n\t\tBar=10000", job_stderr)
            self.assertIn(b'Status: 0\n', job_stderr)
            self.assertIn(b'Status: 99\n', job_stderr)
            # statuses past 99 should have been cut off
            self.assertNotIn(b'Status: 100\n', job_stderr)
            self.assertIn(b'STDERR: Qux\n', job_stderr)
            # the failure should surface in stderr too
            self.assertIn(b'BOOM', job_stderr)
        else:
            raise AssertionError()
Example #48
0
    def test_other_environment_variable(self):
        """An unrecognized *_MRJOB_DIR-style envvar should be ignored,
        falling back to the bare 'hadoop' binary."""
        self._add_hadoop_bin_for_envvar('HADOOP_YARN_MRJOB_DIR', 'bin')

        with no_handlers_for_logger('mrjob.fs.hadoop'):
            hadoop_bin = self.fs.get_hadoop_bin()
        self.assertEqual(hadoop_bin, ['hadoop'])
Example #49
0
 def test_local_runner(self):
     """-r local should produce a LocalMRJobRunner."""
     launcher = MRJobLauncher(args=['--no-conf', '-r', 'local', ''])
     with no_handlers_for_logger('mrjob.runner'), \
             launcher.make_runner() as runner:
         self.assertIsInstance(runner, LocalMRJobRunner)
Example #50
0
 def test_emr_runner(self):
     """-r emr should produce an EMRJobRunner (with S3 mocked out)."""
     launcher = MRJobLauncher(args=['--no-conf', '-r', 'emr', ''])
     with no_handlers_for_logger('mrjob'), patch_fs_s3(), \
             launcher.make_runner() as runner:
         self.assertIsInstance(runner, EMRJobRunner)