Example #1
    def test_bootstrap_python_comes_before_bootstrap(self):
        mr_job = MRTwoStepJob(['-r', 'dataproc', '--bootstrap', 'true'])

        with mr_job.make_runner() as runner:
            self.assertEqual(
                runner._bootstrap,
                self.EXPECTED_BOOTSTRAP + [['true']])
Example #2
    def test_dont_take_down_cluster_on_failure(self):
        runner = DataprocJobRunner(conf_paths=[])

        cluster_body = runner.api_client.cluster_create()
        cluster_id = cluster_body['clusterName']

        mr_job = MRTwoStepJob(['-r', 'dataproc', '-v',
                               '--cluster-id', cluster_id])
        mr_job.sandbox()

        self._dataproc_client.job_get_advances_states = (
            collections.deque(['SETUP_DONE', 'RUNNING', 'ERROR']))

        with mr_job.make_runner() as runner:
            self.assertIsInstance(runner, DataprocJobRunner)

            with logger_disabled('mrjob.dataproc'):
                self.assertRaises(StepFailedException, runner.run)

            cluster = self.get_cluster_from_runner(runner, cluster_id)
            cluster_state = self._dataproc_client.get_state(cluster)
            self.assertEqual(cluster_state, 'RUNNING')

        # job shouldn't get terminated by cleanup
        cluster = self._dataproc_client._cache_clusters[_TEST_PROJECT][cluster_id]
        cluster_state = self._dataproc_client.get_state(cluster)
        self.assertEqual(cluster_state, 'RUNNING')
Example #3
    def test_owner_and_label_switches(self):
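        # job name should embed --label (group 1) and --owner (group 2)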
        runner_opts = ['--no-conf', '--owner=ads', '--label=ads_chain']
        runner = MRTwoStepJob(runner_opts).make_runner()
        match = JOB_NAME_RE.match(runner.get_job_name())

        assert_equal(match.group(1), 'ads_chain')
        assert_equal(match.group(2), 'ads')
Example #4
    def test_attach_to_existing_job_flow(self):
        emr_conn = EMRJobRunner(conf_path=False).make_emr_conn()
        # set log_uri to None, so that when we describe the job flow, it
        # won't have the loguri attribute, to test Issue #112
        emr_job_flow_id = emr_conn.run_jobflow(
            name='Development Job Flow', log_uri=None)

        stdin = StringIO('foo\nbar\n')
        self.mock_emr_output = {(emr_job_flow_id, 1): [
            '1\t"bar"\n1\t"foo"\n2\tnull\n']}

        mr_job = MRTwoStepJob(['-r', 'emr', '-v',
                               '-c', self.mrjob_conf_path,
                               '--emr-job-flow-id', emr_job_flow_id])
        mr_job.sandbox(stdin=stdin)

        results = []
        with mr_job.make_runner() as runner:
            runner.run()

            # Issue 182: don't create the bootstrap script when
            # attaching to another job flow
            assert_equal(runner._master_bootstrap_script, None)

            for line in runner.stream_output():
                key, value = mr_job.parse_output_line(line)
                results.append((key, value))

        assert_equal(sorted(results),
            [(1, 'bar'), (1, 'foo'), (2, None)])
Example #5
    def test_end_to_end(self):
        # read from STDIN, a regular file, and a .gz
        stdin = StringIO("foo\nbar\n")

        input_path = os.path.join(self.tmp_dir, "input")
        with open(input_path, "w") as input_file:
            input_file.write("bar\nqux\n")

        input_gz_path = os.path.join(self.tmp_dir, "input.gz")
        input_gz = gzip.GzipFile(input_gz_path, "w")
        input_gz.write("foo\n")
        input_gz.close()

        mr_job = MRTwoStepJob(["-c", self.mrjob_conf_path, "-", input_path, input_gz_path])
        mr_job.sandbox(stdin=stdin)

        local_tmp_dir = None
        results = []

        with mr_job.make_runner() as runner:
            assert isinstance(runner, LocalMRJobRunner)
            runner.run()

            for line in runner.stream_output():
                key, value = mr_job.parse_output_line(line)
                results.append((key, value))

            local_tmp_dir = runner._get_local_tmp_dir()
            assert os.path.exists(local_tmp_dir)

        # make sure cleanup happens
        assert not os.path.exists(local_tmp_dir)

        assert_equal(sorted(results), [(1, "qux"), (2, "bar"), (2, "foo"), (5, None)])
Example #6
    def test_python_dash_v_as_python_bin(self):
        python_cmd = cmd_line([sys.executable or 'python', '-v'])
        mr_job = MRTwoStepJob(['--python-bin', python_cmd, '--no-conf',
                               '-r', 'local'])
        mr_job.sandbox(stdin=[b'bar\n'])

        with mr_job.make_runner() as runner:
            runner.run()

            # expect python -v crud in stderr

            with open(runner._task_stderr_path('mapper', 0, 0)) as lines:
                self.assertTrue(any(
                    'import mrjob' in line or  # Python 2
                    "import 'mrjob'" in line
                    for line in lines))

            with open(runner._task_stderr_path('mapper', 0, 0)) as lines:
                self.assertTrue(any(
                    '#' in line for line in lines))

            # should still get expected results
            self.assertEqual(
                sorted(to_lines(runner.cat_output())),
                sorted([b'1\tnull\n', b'1\t"bar"\n']))
Example #7
    def test_owner_and_label_switches(self):
        runner_opts = ['--no-conf', '--owner=ads', '--label=ads_chain']
        runner = MRTwoStepJob(runner_opts).make_runner()
        match = _JOB_KEY_RE.match(runner.get_job_key())

        self.assertEqual(match.group(1), 'ads_chain')
        self.assertEqual(match.group(2), 'ads')
Example #8
    def test_failed_job(self):
        mr_job = MRTwoStepJob(['-r', 'emr', '-v',
                               '-c', self.mrjob_conf_path])
        mr_job.sandbox()

        self.add_mock_s3_data({'walrus': {}})
        self.mock_emr_failures = {('j-MOCKJOBFLOW0', 0): None}

        with mr_job.make_runner() as runner:
            assert isinstance(runner, EMRJobRunner)

            with logger_disabled('mrjob.emr'):
                assert_raises(Exception, runner.run)

            emr_conn = botoemr.EmrConnection()
            job_flow_id = runner.get_emr_job_flow_id()
            for i in range(10):
                emr_conn.simulate_progress(job_flow_id)

            job_flow = emr_conn.describe_jobflow(job_flow_id)
            assert_equal(job_flow.state, 'FAILED')

        # job should get terminated on cleanup
        emr_conn = runner.make_emr_conn()
        job_flow_id = runner.get_emr_job_flow_id()
        for i in range(10):
            emr_conn.simulate_progress(job_flow_id)

        job_flow = emr_conn.describe_jobflow(job_flow_id)
        assert_equal(job_flow.state, 'TERMINATED')
Example #9
    def test_missing_input(self):
        mr_job = MRTwoStepJob(['-r', 'inline', '/some/bogus/file/path'])
        mr_job.sandbox()

        with mr_job.make_runner() as runner:
            assert isinstance(runner, InlineMRJobRunner)
            self.assertRaises(IOError, runner.run)
Example #10
    def test_end_to_end(self):
        # read from STDIN, a regular file, and a .gz
        stdin = BytesIO(b'foo\nbar\n')

        input_path = join(self.tmp_dir, 'input')
        with open(input_path, 'w') as input_file:
            input_file.write('bar\nqux\n')

        input_gz_path = join(self.tmp_dir, 'input.gz')
        input_gz = gzip.GzipFile(input_gz_path, 'wb')
        input_gz.write(b'foo\n')
        input_gz.close()

        mr_job = MRTwoStepJob(
            ['--runner', 'inline', '-', input_path, input_gz_path])
        mr_job.sandbox(stdin=stdin)

        local_tmp_dir = None
        results = []

        with mr_job.make_runner() as runner:
            assert isinstance(runner, InlineMRJobRunner)
            runner.run()

            results.extend(mr_job.parse_output(runner.cat_output()))

            local_tmp_dir = runner._get_local_tmp_dir()
            assert exists(local_tmp_dir)

        # make sure cleanup happens
        assert not exists(local_tmp_dir)

        self.assertEqual(sorted(results),
                         [(1, 'qux'), (2, 'bar'), (2, 'foo'), (5, None)])
Example #11
    def test_two_step_job(self):
        # good all-around test. MRTwoStepJob's first step logs counters, but
        # its second step does not
        job = MRTwoStepJob(['-r', 'spark'])
        job.sandbox(stdin=BytesIO(b'foo\nbar\n'))

        with job.make_runner() as runner:
            runner.run()

        counters = runner.counters()

        # should have two steps worth of counters, even though it runs as a
        # single Spark job
        self.assertEqual(len(counters), 2)

        # first step counters should be {'count': {'combiners': <int>}}
        self.assertEqual(sorted(counters[0]), ['count'])
        self.assertEqual(sorted(counters[0]['count']), ['combiners'])
        self.assertIsInstance(counters[0]['count']['combiners'], int)

        # second step counters should be empty
        self.assertEqual(counters[1], {})

        log_output = '\n'.join(c[0][0] for c in self.log.info.call_args_list)
        log_lines = log_output.split('\n')

        # should log first step counters but not second step
        self.assertIn('Counters for step 1: 1', log_lines)
        self.assertIn('\tcount', log_output)
        self.assertNotIn('Counters for step 2', log_output)
Example #12
    def test_bootstrap_python_switch(self):
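        # --bootstrap-python should add the default Python bootstrap commands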
        mr_job = MRTwoStepJob(["-r", "dataproc", "--bootstrap-python"])

        with mr_job.make_runner() as runner:
            self.assertEqual(runner._opts["bootstrap_python"], True)
            self.assertEqual(runner._bootstrap_python(), self.EXPECTED_BOOTSTRAP)
            self.assertEqual(runner._bootstrap, self.EXPECTED_BOOTSTRAP)
Example #13
    def test_failed_job(self):
        mr_job = MRTwoStepJob(['-r', 'dataproc', '-v'])
        mr_job.sandbox()

        with no_handlers_for_logger('mrjob.dataproc'):
            stderr = StringIO()
            log_to_stream('mrjob.dataproc', stderr)

            self._dataproc_client.job_get_advances_states = (
                collections.deque(['SETUP_DONE', 'RUNNING', 'ERROR']))

            with mr_job.make_runner() as runner:
                self.assertIsInstance(runner, DataprocJobRunner)

                self.assertRaises(StepFailedException, runner.run)

                self.assertIn(' => ERROR\n', stderr.getvalue())

                cluster_id = runner.get_cluster_id()

        # job should get terminated
        cluster = (
            self._dataproc_client._cache_clusters[_TEST_PROJECT][cluster_id])
        cluster_state = self._dataproc_client.get_state(cluster)
        self.assertEqual(cluster_state, 'DELETING')
Example #14
    def test_attach_to_existing_cluster(self):
        runner = DataprocJobRunner(conf_paths=[])

        cluster_body = runner.api_client.cluster_create()
        cluster_id = cluster_body['clusterName']

        stdin = BytesIO(b'foo\nbar\n')

        mr_job = MRTwoStepJob(['-r', 'dataproc', '-v',
                               '--cluster-id', cluster_id])
        mr_job.sandbox(stdin=stdin)

        results = []

        with mr_job.make_runner() as runner:
            runner.run()

            # Generate fake output
            self.put_job_output_parts(runner, [
                b'1\t"bar"\n1\t"foo"\n2\tnull\n'
            ])

            # Issue 182: don't create the bootstrap script when
            # attaching to another cluster
            self.assertIsNone(runner._master_bootstrap_script_path)

            for line in runner.stream_output():
                key, value = mr_job.parse_output_line(line)
                results.append((key, value))

        self.assertEqual(sorted(results),
                         [(1, 'bar'), (1, 'foo'), (2, None)])
Example #15
    def test_owner_and_label_switches(self):
        runner_opts = ["--no-conf", "--owner=ads", "--label=ads_chain"]
        runner = MRTwoStepJob(runner_opts).make_runner()
        match = JOB_NAME_RE.match(runner.get_job_name())

        self.assertEqual(match.group(1), "ads_chain")
        self.assertEqual(match.group(2), "ads")
Example #16
    def test_streaming_step_not_okay(self):
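        # a streaming step has no Spark script args, so this should fail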
        job = MRTwoStepJob()
        job.sandbox()

        with job.make_runner() as runner:
            self.assertRaises(
                TypeError,
                runner._spark_script_args, 0)
Example #17
    def test_default(self):
        mr_job = MRTwoStepJob(['-r', 'dataproc'])
        with mr_job.make_runner() as runner:
            self.assertEqual(runner._opts['bootstrap_python'], True)
            self.assertEqual(runner._bootstrap_python(),
                             self.EXPECTED_BOOTSTRAP)
            self.assertEqual(runner._bootstrap,
                             self.EXPECTED_BOOTSTRAP)
Example #18
    def test_end_to_end(self):
        # read from STDIN, a local file, and a remote file
        stdin = StringIO('foo\nbar\n')

        local_input_path = os.path.join(self.tmp_dir, 'input')
        with open(local_input_path, 'w') as local_input_file:
            local_input_file.write('bar\nqux\n')

        input_to_upload = os.path.join(self.tmp_dir, 'remote_input')
        with open(input_to_upload, 'w') as input_to_upload_file:
            input_to_upload_file.write('foo\n')
        remote_input_path = 'hdfs:///data/foo'
        check_call([self.hadoop_bin,
                    'fs', '-put', input_to_upload, remote_input_path])

        # doesn't matter what the intermediate output is; just has to exist.
        add_mock_hadoop_output([''])
        add_mock_hadoop_output(['1\t"qux"\n2\t"bar"\n', '2\t"foo"\n5\tnull\n'])

        mr_job = MRTwoStepJob(['-r', 'hadoop', '-v',
                               '--no-conf', '--hadoop-arg', '-libjar',
                               '--hadoop-arg', 'containsJars.jar',
                               '-', local_input_path, remote_input_path])
        mr_job.sandbox(stdin=stdin)

        local_tmp_dir = None
        results = []

        with mr_job.make_runner() as runner:
            assert isinstance(runner, HadoopJobRunner)
            runner.run()

            for line in runner.stream_output():
                key, value = mr_job.parse_output_line(line)
                results.append((key, value))

            local_tmp_dir = runner._get_local_tmp_dir()
            # make sure cleanup hasn't happened yet
            assert os.path.exists(local_tmp_dir)
            assert any(runner.ls(runner.get_output_dir()))

            # make sure we're writing to the correct path in HDFS
            hdfs_root = os.environ['MOCK_HDFS_ROOT']
            assert_equal(sorted(os.listdir(hdfs_root)), ['data', 'user'])
            home_dir = os.path.join(hdfs_root, 'user', getpass.getuser())
            assert_equal(os.listdir(home_dir), ['tmp'])
            assert_equal(os.listdir(os.path.join(home_dir, 'tmp')), ['mrjob'])
            assert_equal(runner._opts['hadoop_extra_args'],
                         ['-libjar', 'containsJars.jar'])

        assert_equal(sorted(results),
                     [(1, 'qux'), (2, 'bar'), (2, 'foo'), (5, None)])

        # make sure cleanup happens
        assert not os.path.exists(local_tmp_dir)
        assert not any(runner.ls(runner.get_output_dir()))
Example #19
    def test_debugging_works(self):
        mr_job = MRTwoStepJob(['-r', 'emr', '-v',
                           '-c', self.mrjob_conf_path,
                           '--enable-emr-debugging'])
        mr_job.sandbox()

        with mr_job.make_runner() as runner:
            runner.run()
            flow = runner.make_emr_conn().describe_jobflow(runner._emr_job_flow_id)
            assert_equal(flow.steps[0].name, 'Setup Hadoop Debugging')
Example #20
    def test_default(self):
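        # with no --num-cores, default to one mapper/reducer per CPU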
        mr_job = MRTwoStepJob(['-r', 'local'])
        mr_job.sandbox(stdin=BytesIO(b'foo\nbar\n'))

        with mr_job.make_runner() as runner:
            self.assertEqual(runner._num_mappers(0), cpu_count())
            self.assertEqual(runner._num_reducers(0), cpu_count())

            runner.run()

            self.pool.assert_called_with(processes=None)
Example #21
    def test_echo_as_steps_python_bin(self):
        mr_job = MRTwoStepJob(["--steps", "--steps-python-bin", "echo", "--no-conf", "-r", "local"])
        mr_job.sandbox()

        with mr_job.make_runner() as runner:
            assert isinstance(runner, LocalMRJobRunner)
            # MRTwoStepJob populates _steps in the runner, so un-populate
            # it here so that the runner actually tries to get the steps
            # via subprocess
            runner._steps = None
            self.assertRaises(ValueError, runner._get_steps)
Example #22
    def test_three_cores(self):
        mr_job = MRTwoStepJob(['-r', 'local', '--num-cores', '3'])
        mr_job.sandbox(stdin=BytesIO(b'foo\nbar\n'))

        with mr_job.make_runner() as runner:
            self.assertEqual(runner._num_mappers(0), 3)
            self.assertEqual(runner._num_reducers(0), 3)

            runner.run()

            self.pool.assert_called_with(processes=3)
Example #23
    def test_blank_out_counters_if_not_output(self):
        self.start(patch('mrjob.bin.MRJobBinRunner._run_spark_submit',
                         return_value=2))

        job = MRTwoStepJob(['-r', 'spark'])
        job.sandbox(stdin=BytesIO(b'foo\nbar\n'))

        with job.make_runner() as runner:
            self.assertRaises(StepFailedException, runner.run)

        # should blank out counters from failed step
        self.assertEqual(runner.counters(), [{}, {}])
Example #24
    def test_setup_wrapper_script_uses_local_line_endings(self):
        job = MRTwoStepJob(["-r", "local", "--setup", "true"])
        job.sandbox(stdin=BytesIO())

        # tests #1071. Unfortunately, we mostly run these tests on machines
        # that use unix line endings anyway. So monitor open() instead
        with patch("mrjob.runner.open", create=True, side_effect=open) as m_open:
            with logger_disabled("mrjob.local"):
                with job.make_runner() as runner:
                    runner.run()

                    self.assertIn(call(runner._setup_wrapper_script_path, "w"), m_open.mock_calls)
Example #25
    def make_runner(self, *args):
        """create a dummy job, and call make_runner() on it.
        Use this in a with block:

        with self.make_runner() as runner:
            ...
        """
        stdin = BytesIO(b'foo\nbar\n')
        mr_job = MRTwoStepJob(['-r', 'emr'] + list(args))
        mr_job.sandbox(stdin=stdin)

        return mr_job.make_runner()
Example #26
    def test_default_hadoop_version(self):
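        # with no version specified, the job flow defaults to Hadoop 0.18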
        stdin = StringIO('foo\nbar\n')
        mr_job = MRTwoStepJob(['-r', 'emr', '-v',
                               '-c', self.mrjob_conf_path])
        mr_job.sandbox(stdin=stdin)

        with mr_job.make_runner() as runner:
            runner.run()

            emr_conn = runner.make_emr_conn()
            job_flow = emr_conn.describe_jobflow(runner.get_emr_job_flow_id())

            assert_equal(job_flow.hadoopversion, '0.18')
Example #27
    def test_unexpected_opt_from_command_line(self):
        # regression test for #1898. local runner doesn't support *zone*
        job = MRTwoStepJob(['-r', 'local', '--no-conf', '--zone', 'DANGER'])
        job.sandbox()

        with job.make_runner():
            self.assertTrue(self.log.warning.called)
            warnings = '\n'.join(
                arg[0][0] for arg in self.log.warning.call_args_list)

            self.assertIn('Unexpected option', warnings)
            self.assertIn('zone', warnings)
            self.assertIn('command line', warnings)
Example #28
    def test_setup_wrapper_script_uses_local_line_endings(self):
        job = MRTwoStepJob(['-r', 'local', '--setup', 'true'])
        job.sandbox(stdin=BytesIO())

        # tests #1071. Unfortunately, we mostly run these tests on machines
        # that use unix line endings anyway. So monitor open() instead
        with patch(
                'mrjob.sim.open', create=True, side_effect=open) as m_open:
            with job.make_runner() as runner:
                runner.run()

                self.assertIn(
                    call(runner._setup_wrapper_script_path, 'w'),
                    m_open.mock_calls)
Example #29
    def test_auto_everything(self):
        test_start = datetime.datetime.utcnow()

        os.environ["USER"] = "******"
        runner = MRTwoStepJob(["--no-conf"]).make_runner()
        match = JOB_NAME_RE.match(runner.get_job_name())

        self.assertEqual(match.group(1), "mr_two_step_job")
        self.assertEqual(match.group(2), "mcp")

        job_start = datetime.datetime.strptime(match.group(3) + match.group(4), "%Y%m%d%H%M%S")
        job_start = job_start.replace(microsecond=int(match.group(5)))
        self.assertGreaterEqual(job_start, test_start)
        self.assertLessEqual(job_start - test_start, datetime.timedelta(seconds=5))
Example #30
    def test_end_to_end(self):
        # read from STDIN, a local file, and a remote file
        stdin = StringIO("foo\nbar\n")

        local_input_path = os.path.join(self.tmp_dir, "input")
        with open(local_input_path, "w") as local_input_file:
            local_input_file.write("bar\nqux\n")

        input_to_upload = os.path.join(self.tmp_dir, "remote_input")
        with open(input_to_upload, "w") as input_to_upload_file:
            input_to_upload_file.write("foo\n")
        remote_input_path = "hdfs:///data/foo"
        check_call([self.hadoop_bin, "fs", "-put", input_to_upload, remote_input_path])

        # doesn't matter what the intermediate output is; just has to exist.
        add_mock_hadoop_output([""])
        add_mock_hadoop_output(['1\t"qux"\n2\t"bar"\n', '2\t"foo"\n5\tnull\n'])

        mr_job = MRTwoStepJob(["-r", "hadoop", "-v", "--no-conf", "-", local_input_path, remote_input_path])
        mr_job.sandbox(stdin=stdin)

        local_tmp_dir = None
        results = []

        with mr_job.make_runner() as runner:
            assert isinstance(runner, HadoopJobRunner)
            runner.run()

            for line in runner.stream_output():
                key, value = mr_job.parse_output_line(line)
                results.append((key, value))

            local_tmp_dir = runner._get_local_tmp_dir()
            # make sure cleanup hasn't happened yet
            assert os.path.exists(local_tmp_dir)
            assert any(runner.ls(runner.get_output_dir()))

            # make sure we're writing to the correct path in HDFS
            hdfs_root = os.environ["MOCK_HDFS_ROOT"]
            assert_equal(sorted(os.listdir(hdfs_root)), ["data", "user"])
            home_dir = os.path.join(hdfs_root, "user", getpass.getuser())
            assert_equal(os.listdir(home_dir), ["tmp"])
            assert_equal(os.listdir(os.path.join(home_dir, "tmp")), ["mrjob"])

        assert_equal(sorted(results), [(1, "qux"), (2, "bar"), (2, "foo"), (5, None)])

        # make sure cleanup happens
        assert not os.path.exists(local_tmp_dir)
        assert not any(runner.ls(runner.get_output_dir()))
Example #31
    def run_job(self, args=()):
        args = ([sys.executable, MRTwoStepJob.mr_job_script()] + list(args) +
                ['--no-conf'])
        # add . to PYTHONPATH (in case mrjob isn't actually installed)
        env = combine_envs(os.environ, {'PYTHONPATH': os.path.abspath('.')})
        proc = Popen(args, stdin=PIPE, stdout=PIPE, stderr=PIPE, env=env)
        stdout, stderr = proc.communicate(input='foo\nbar\nbar\n')
        return stdout, stderr, proc.returncode
Example #32
    def test_hadoop_output_format(self):
        output_format = 'org.apache.hadoop.mapred.SequenceFileOutputFormat'

        # one-step job
        job1 = MRWordCount()
        # no cmd-line argument for this because it's part of job semantics
        job1.HADOOP_OUTPUT_FORMAT = output_format
        with job1.make_runner() as runner1:
            self.assertEqual(runner1._hadoop_args_for_step(0),
                             ['-outputformat', output_format])

        # multi-step job: only use -outputformat on the last step
        job2 = MRTwoStepJob()
        job2.HADOOP_OUTPUT_FORMAT = output_format
        with job2.make_runner() as runner2:
            self.assertEqual(runner2._hadoop_args_for_step(0), [])
            self.assertEqual(runner2._hadoop_args_for_step(1),
                             ['-outputformat', output_format])
Example #33
    def test_echo_as_python_bin(self):
        # "echo" is a pretty poor substitute for Python, but it
        # should be available on most systems
        mr_job = MRTwoStepJob(
            ['--python-bin', 'echo', '--steps-python-bin', sys.executable,
             '--no-conf', '-r', 'local'])
        mr_job.sandbox()

        with mr_job.make_runner() as runner:
            assert isinstance(runner, LocalMRJobRunner)
            runner.run()
            output = b''.join(runner.cat_output())

        # the output should basically be the command we used to
        # run the last step, which in this case is a mapper
        self.assertIn(b'mr_two_step_job.py', output)
        self.assertIn(b'--step-num=1', output)
        self.assertIn(b'--mapper', output)
Example #34
    def test_unexpected_opt_from_mrjob_conf(self):
        conf_path = self.makefile('mrjob.custom.conf')

        with open(conf_path, 'w') as f:
            dump_mrjob_conf(
                dict(runners=dict(local=dict(land='useless_swamp'))), f)

        job = MRTwoStepJob(['-r', 'local', '-c', conf_path])
        job.sandbox()

        with job.make_runner():
            self.assertTrue(self.log.warning.called)
            warnings = '\n'.join(
                arg[0][0] for arg in self.log.warning.call_args_list)

            self.assertIn('Unexpected option', warnings)
            self.assertIn('land', warnings)
            self.assertIn(conf_path, warnings)
Example #35
    def _test_cloud_tmp_cleanup(self, mode, tmp_len):
        stdin = BytesIO(b'foo\nbar\n')

        mr_job = MRTwoStepJob(['-r', 'dataproc', '-v', '-', '--cleanup', mode])
        mr_job.sandbox(stdin=stdin)

        with mr_job.make_runner() as runner:
            tmp_bucket, _ = parse_gcs_uri(runner._cloud_tmp_dir)

            runner.run()

            # this is set and unset before we can get at it unless we do this
            list(runner.cat_output())

            fs = runner.fs

        # with statement finishes, cleanup runs

        self.assertEqual(len(list(fs.client.bucket(tmp_bucket).list_blobs())),
                         tmp_len)
Example #36
    def test_echo_as_steps_python_bin(self):
        mr_job = MRTwoStepJob([
            '--steps', '--steps-python-bin', 'echo', '--no-conf', '-r', 'local'
        ])
        mr_job.sandbox()

        with mr_job.make_runner() as runner:
            assert isinstance(runner, LocalMRJobRunner)
            try:
                # make_runner() populates _steps in the runner, so un-populate
                # it here so that the runner actually tries to get the steps
                # via subprocess
                runner._steps = None
                runner._get_steps()
                assert False, 'Should throw exception'
            except ValueError as ex:
                output = str(ex)
                # the output should basically be the command used to
                # run the steps command
                self.assertIn('mr_two_step_job.py', output)
                self.assertIn('--steps', output)
Example #37
    def test_attach_to_existing_cluster(self):
        runner = DataprocJobRunner(conf_paths=[])

        cluster_body = runner.api_client.cluster_create()
        cluster_id = cluster_body['clusterName']

        stdin = BytesIO(b'foo\nbar\n')

        mr_job = MRTwoStepJob(
            ['-r', 'dataproc', '-v', '--cluster-id', cluster_id])
        mr_job.sandbox(stdin=stdin)

        results = []

        with mr_job.make_runner() as runner:
            runner.run()

            # Generate fake output
            self.put_job_output_parts(runner,
                                      [b'1\t"bar"\n1\t"foo"\n2\tnull\n'])

            # Issue 182: don't create the bootstrap script when
            # attaching to another cluster
            self.assertIsNone(runner._master_bootstrap_script_path)

            results.extend(mr_job.parse_output(runner.cat_output()))

        self.assertEqual(sorted(results), [(1, 'bar'), (1, 'foo'), (2, None)])
Example #38
    def test_end_to_end(self):
        # read from STDIN, a regular file, and a .gz
        stdin = BytesIO(b'foo\nbar\n')

        input_path = join(self.tmp_dir, 'input')
        with open(input_path, 'w') as input_file:
            input_file.write('bar\nqux\n')

        input_gz_path = join(self.tmp_dir, 'input.gz')
        input_gz = gzip.GzipFile(input_gz_path, 'wb')
        input_gz.write(b'foo\n')
        input_gz.close()

        mr_job = MRTwoStepJob(
            ['--runner', 'inline', '-', input_path, input_gz_path])
        mr_job.sandbox(stdin=stdin)

        local_tmp_dir = None
        results = []

        with mr_job.make_runner() as runner:
            assert isinstance(runner, InlineMRJobRunner)
            runner.run()

            results.extend(mr_job.parse_output(runner.cat_output()))

            local_tmp_dir = runner._get_local_tmp_dir()
            assert exists(local_tmp_dir)

        # make sure cleanup happens
        assert not exists(local_tmp_dir)

        self.assertEqual(sorted(results), [(1, 'qux'), (2, 'bar'), (2, 'foo'),
                                           (5, None)])
Example #39
    def test_python_dash_v_as_python_bin(self):
        python_cmd = cmd_line([sys.executable or 'python', '-v'])
        mr_job = MRTwoStepJob(
            ['--python-bin', python_cmd, '--no-conf', '-r', 'local'])
        mr_job.sandbox(stdin=[b'bar\n'])

        with mr_job.make_runner() as runner:
            runner.run()

            # expect python -v crud in stderr

            with open(runner._task_stderr_path('mapper', 0, 0)) as lines:
                self.assertTrue(
                    any('import mrjob' in line or  # Python 2
                        "import 'mrjob'" in line for line in lines))

            with open(runner._task_stderr_path('mapper', 0, 0)) as lines:
                self.assertTrue(any('#' in line for line in lines))

            # should still get expected results
            self.assertEqual(sorted(to_lines(runner.cat_output())),
                             sorted([b'1\tnull\n', b'1\t"bar"\n']))
Example #40
    def test_failed_job(self):
        mr_job = MRTwoStepJob(['-r', 'dataproc', '-v'])
        mr_job.sandbox()

        with no_handlers_for_logger('mrjob.dataproc'):
            stderr = StringIO()
            log_to_stream('mrjob.dataproc', stderr)

            self.mock_jobs_succeed = False

            with mr_job.make_runner() as runner:
                self.assertIsInstance(runner, DataprocJobRunner)

                self.assertRaises(StepFailedException, runner.run)

                self.assertIn(' => ERROR\n', stderr.getvalue())

                cluster_id = runner.get_cluster_id()

        # job should get terminated
        cluster = runner._get_cluster(cluster_id)
        self.assertEqual(_cluster_state_name(cluster.status.state), 'DELETING')
Example #41
    def test_two_step_job(self):
        input1_path = self.makefile('input1')
        input2_path = self.makefile('input2')

        job = MRTwoStepJob([
            '-r', 'hadoop',
            '--hadoop-bin', 'false',  # shouldn't run; just in case
            input1_path, input2_path])
        job.sandbox()

        with job.make_runner() as runner:
            runner._add_job_files_for_upload()

            input_uris_0 = runner._step_input_uris(0)
            self.assertEqual([os.path.basename(uri) for uri in input_uris_0],
                             ['input1', 'input2'])

            output_uri_0 = runner._step_output_uri(0)
            input_uris_1 = runner._step_input_uris(1)

            self.assertEqual(input_uris_1, [output_uri_0])

            output_uri_1 = runner._step_output_uri(1)
            self.assertEqual(output_uri_1, runner._output_dir)
Example #42
    def test_python_dash_v_as_python_bin(self):
        python_cmd = cmd_line([sys.executable or 'python', '-v'])
        mr_job = MRTwoStepJob(['--python-bin', python_cmd, '--no-conf'])
        mr_job.sandbox(stdin=['bar\n'])

        with no_handlers_for_logger():
            mr_job.run_job()

        # expect debugging messages in stderr
        assert_in('import mrjob', mr_job.stderr.getvalue())
        assert_in('#', mr_job.stderr.getvalue())

        # should still get expected results
        assert_equal(sorted(mr_job.parse_output()), [(1, None), (1, 'bar')])
Example #43
    def test_python_dash_v_as_python_bin(self):
        python_cmd = cmd_line([sys.executable or 'python', '-v'])
        mr_job = MRTwoStepJob(
            ['--python-bin', python_cmd, '--no-conf', '-r', 'local'])
        mr_job.sandbox(stdin=['bar\n'])

        with no_handlers_for_logger():
            mr_job.run_job()

        # expect debugging messages in stderr
        self.assertIn('import mrjob', mr_job.stderr.getvalue())
        self.assertIn('#', mr_job.stderr.getvalue())

        # should still get expected results
        self.assertItemsEqual(mr_job.stdout.getvalue().splitlines(),
                              ['1\tnull', '1\t"bar"'])
Example #44
    def test_python_dash_v_as_python_bin(self):
        python_cmd = cmd_line([sys.executable or 'python', '-v'])
        mr_job = MRTwoStepJob(
            ['--python-bin', python_cmd, '--no-conf', '-r', 'local'])
        mr_job.sandbox(stdin=[b'bar\n'])

        with no_handlers_for_logger():
            mr_job.run_job()

        # expect debugging messages in stderr.
        stderr = mr_job.stderr.getvalue()

        # stderr is huge, so don't use assertIn()
        self.assertTrue(b'import mrjob' in stderr or  # Python 2
                        b"import 'mrjob'" in stderr)  # Python 3
        self.assertTrue(b'#' in stderr)

        # should still get expected results
        self.assertEqual(sorted(mr_job.stdout.getvalue().splitlines()),
                         sorted([b'1\tnull', b'1\t"bar"']))
Example #45
    def test_show_steps(self):
        mr_boring_job = MRBoringJob(['--steps'])
        mr_boring_job.sandbox()
        mr_boring_job.show_steps()
        self.assertEqual(mr_boring_job.stdout.getvalue(), 'MR\n')

        # final mappers don't show up in the step description
        mr_final_boring_job = MRFinalBoringJob(['--steps'])
        mr_final_boring_job.sandbox()
        mr_final_boring_job.show_steps()
        self.assertEqual(mr_final_boring_job.stdout.getvalue(), 'MR\n')

        mr_two_step_job = MRTwoStepJob(['--steps'])
        mr_two_step_job.sandbox()
        mr_two_step_job.show_steps()
        self.assertEqual(mr_two_step_job.stdout.getvalue(), 'MCR M\n')

        mr_no_mapper = MRNoMapper(['--steps'])
        mr_no_mapper.sandbox()
        mr_no_mapper.show_steps()
        self.assertEqual(mr_no_mapper.stdout.getvalue(), 'MR R\n')
Example #46
    def test_end_to_end(self):
        # read from STDIN, a regular file, and a .gz
        stdin = StringIO('foo\nbar\n')

        input_path = os.path.join(self.tmp_dir, 'input')
        with open(input_path, 'w') as input_file:
            input_file.write('bar\nqux\n')

        input_gz_path = os.path.join(self.tmp_dir, 'input.gz')
        input_gz_glob = os.path.join(self.tmp_dir, '*.gz')
        input_gz = gzip.GzipFile(input_gz_path, 'w')
        input_gz.write('foo\n')
        input_gz.close()

        mr_job = MRTwoStepJob([
            '-c', self.mrjob_conf_path, '-r', 'local', '-', input_path,
            input_gz_glob
        ])
        mr_job.sandbox(stdin=stdin)

        local_tmp_dir = None
        results = []

        with mr_job.make_runner() as runner:
            assert isinstance(runner, LocalMRJobRunner)
            runner.run()

            for line in runner.stream_output():
                key, value = mr_job.parse_output_line(line)
                results.append((key, value))

            local_tmp_dir = runner._get_local_tmp_dir()
            assert os.path.exists(local_tmp_dir)
            self.assertEqual(runner.counters()[0]['count']['combiners'], 8)

        # make sure cleanup happens
        assert not os.path.exists(local_tmp_dir)

        self.assertEqual(sorted(results), [(1, 'qux'), (2, 'bar'), (2, 'foo'),
                                           (5, None)])
Example #47
    def test_end_to_end_multiple_tasks(self):
        # read from STDIN, a regular file, and a .gz
        stdin = BytesIO(b'foo\nbar\n')

        input_path = os.path.join(self.tmp_dir, 'input')
        with open(input_path, 'wb') as input_file:
            input_file.write(b'bar\nqux\n')

        input_gz_path = os.path.join(self.tmp_dir, 'input.gz')
        input_gz = gzip.GzipFile(input_gz_path, 'wb')
        input_gz.write(b'foo\n')
        input_gz.close()

        mr_job = MRTwoStepJob([
            '-r', 'local', '--jobconf=mapred.map.tasks=2',
            '--jobconf=mapred.reduce.tasks=2', '-', input_path, input_gz_path
        ])
        mr_job.sandbox(stdin=stdin)

        local_tmp_dir = None
        results = []

        with mr_job.make_runner() as runner:
            assert isinstance(runner, LocalMRJobRunner)
            runner.run()

            for line in runner.stream_output():
                key, value = mr_job.parse_output_line(line)
                results.append((key, value))

            local_tmp_dir = runner._get_local_tmp_dir()
            assert os.path.exists(local_tmp_dir)

        # make sure cleanup happens
        assert not os.path.exists(local_tmp_dir)

        self.assertEqual(sorted(results), [(1, 'qux'), (2, 'bar'), (2, 'foo'),
                                           (5, None)])
Example #48
    def test_nonexistent_steps(self):
        mr_job = MRTwoStepJob()
        mr_job.sandbox()
        self.assertRaises(ValueError, mr_job.run_reducer, 1)
        self.assertRaises(ValueError, mr_job.run_mapper, 2)
        self.assertRaises(ValueError, mr_job.run_reducer, -1)
Example #49
    def test_auto_label(self):
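        # label defaults to the module name, owner to the current user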
        runner = MRTwoStepJob(['--no-conf']).make_runner()
        match = JOB_NAME_RE.match(runner.get_job_name())

        self.assertEqual(match.group(1), 'mr_two_step_job')
        self.assertEqual(match.group(2), getpass.getuser())
Example #50
    def test_streaming_step_not_okay(self):
        job = MRTwoStepJob()
        job.sandbox()

        with job.make_runner() as runner:
            self.assertRaises(TypeError, runner._spark_script_args, 0)
Example #51
    def _test_end_to_end(self, args=()):
        # read from STDIN, a local file, and a remote file
        stdin = StringIO('foo\nbar\n')

        local_input_path = os.path.join(self.tmp_dir, 'input')
        with open(local_input_path, 'w') as local_input_file:
            local_input_file.write('bar\nqux\n')

        input_to_upload = os.path.join(self.tmp_dir, 'remote_input')
        with open(input_to_upload, 'w') as input_to_upload_file:
            input_to_upload_file.write('foo\n')
        remote_input_path = 'hdfs:///data/foo'
        check_call([
            self.hadoop_bin, 'fs', '-put', input_to_upload, remote_input_path
        ])

        # doesn't matter what the intermediate output is; just has to exist.
        add_mock_hadoop_output([''])
        add_mock_hadoop_output(['1\t"qux"\n2\t"bar"\n', '2\t"foo"\n5\tnull\n'])

        mr_job = MRTwoStepJob([
            '-r', 'hadoop', '-v', '--no-conf', '--hadoop-arg', '-libjar',
            '--hadoop-arg', 'containsJars.jar'
        ] + list(args) + ['-', local_input_path, remote_input_path] +
                              ['--hadoop-input-format', 'FooFormat'] +
                              ['--hadoop-output-format', 'BarFormat'] +
                              ['--jobconf', 'x=y'])
        mr_job.sandbox(stdin=stdin)

        local_tmp_dir = None
        results = []

        # don't care that --hadoop-*-format is deprecated
        with logger_disabled('mrjob.job'):
            runner = mr_job.make_runner()

        with runner as runner:  # i.e. call cleanup when we're done
            assert isinstance(runner, HadoopJobRunner)
            runner.run()

            for line in runner.stream_output():
                key, value = mr_job.parse_output_line(line)
                results.append((key, value))

            local_tmp_dir = runner._get_local_tmp_dir()
            # make sure cleanup hasn't happened yet
            assert os.path.exists(local_tmp_dir)
            assert any(runner.ls(runner.get_output_dir()))

            # make sure we're writing to the correct path in HDFS
            hdfs_root = os.environ['MOCK_HDFS_ROOT']
            assert_equal(sorted(os.listdir(hdfs_root)), ['data', 'user'])
            home_dir = os.path.join(hdfs_root, 'user', getpass.getuser())
            assert_equal(os.listdir(home_dir), ['tmp'])
            assert_equal(os.listdir(os.path.join(home_dir, 'tmp')), ['mrjob'])
            assert_equal(runner._opts['hadoop_extra_args'],
                         ['-libjar', 'containsJars.jar'])

            # make sure mrjob.tar.gz is uploaded and in PYTHONPATH
            assert runner._mrjob_tar_gz_path
            mrjob_tar_gz_file_dicts = [
                file_dict for file_dict in runner._files
                if file_dict['path'] == runner._mrjob_tar_gz_path
            ]
            assert_equal(len(mrjob_tar_gz_file_dicts), 1)

            mrjob_tar_gz_file_dict = mrjob_tar_gz_file_dicts[0]
            assert mrjob_tar_gz_file_dict['name']

            pythonpath = runner._get_cmdenv()['PYTHONPATH']
            assert_in(mrjob_tar_gz_file_dict['name'], pythonpath.split(':'))

        assert_equal(sorted(results), [(1, 'qux'), (2, 'bar'), (2, 'foo'),
                                       (5, None)])

        # make sure we called hadoop the way we expected
        with open(os.environ['MOCK_HADOOP_LOG']) as mock_log:
            hadoop_cmd_args = [shlex.split(line) for line in mock_log]

        jar_cmd_args = [
            args for args in hadoop_cmd_args if args[:1] == ['jar']
        ]
        assert_equal(len(jar_cmd_args), 2)
        step_0_args, step_1_args = jar_cmd_args

        # check input/output format
        assert_in('-inputformat', step_0_args)
        assert_not_in('-outputformat', step_0_args)
        assert_not_in('-inputformat', step_1_args)
        assert_in('-outputformat', step_1_args)

        # make sure -libjar extra arg comes before -mapper
        for args in (step_0_args, step_1_args):
            assert_in('-libjar', args)
            assert_in('-mapper', args)
            assert_lt(args.index('-libjar'), args.index('-mapper'))

        # make sure -jobconf made it through
        assert_in('-D', step_0_args)

        # make sure cleanup happens
        assert not os.path.exists(local_tmp_dir)
        assert not any(runner.ls(runner.get_output_dir()))
Example #52
    def test_end_to_end(self):
        # read from STDIN, a local file, and a remote file
        stdin = StringIO('foo\nbar\n')

        local_input_path = os.path.join(self.tmp_dir, 'input')
        with open(local_input_path, 'w') as local_input_file:
            local_input_file.write('bar\nqux\n')

        remote_input_path = 's3://walrus/data/foo'
        self.add_mock_s3_data({'walrus': {'data/foo': 'foo\n'}})

        # setup fake output
        self.mock_emr_output = {
            ('j-MOCKJOBFLOW0', 1):
            ['1\t"qux"\n2\t"bar"\n', '2\t"foo"\n5\tnull\n']
        }

        mr_job = MRTwoStepJob([
            '-r', 'emr', '-v', '-c', self.mrjob_conf_path, '-',
            local_input_path, remote_input_path, '--hadoop-input-format',
            'FooFormat', '--hadoop-output-format', 'BarFormat'
        ])
        mr_job.sandbox(stdin=stdin)

        local_tmp_dir = None
        results = []

        mock_s3_fs_snapshot = copy.deepcopy(self.mock_s3_fs)

        with mr_job.make_runner() as runner:
            assert isinstance(runner, EMRJobRunner)

            # make sure that initializing the runner doesn't affect S3
            # (Issue #50)
            assert_equal(mock_s3_fs_snapshot, self.mock_s3_fs)

            runner.run()

            for line in runner.stream_output():
                key, value = mr_job.parse_output_line(line)
                results.append((key, value))

            local_tmp_dir = runner._get_local_tmp_dir()
            # make sure cleanup hasn't happened yet
            assert os.path.exists(local_tmp_dir)
            assert any(runner.ls(runner.get_output_dir()))

            emr_conn = runner.make_emr_conn()
            job_flow = emr_conn.describe_jobflow(runner.get_emr_job_flow_id())
            assert_equal(job_flow.state, 'COMPLETED')
            name_match = JOB_NAME_RE.match(job_flow.name)
            assert_equal(name_match.group(1), 'mr_two_step_job')
            assert_equal(name_match.group(2), getpass.getuser())

            # make sure our input and output formats are attached to
            # the correct steps
            assert_in('-inputformat', job_flow.steps[0].args)
            assert_not_in('-outputformat', job_flow.steps[0].args)
            assert_not_in('-inputformat', job_flow.steps[1].args)
            assert_in('-outputformat', job_flow.steps[1].args)

            # make sure mrjob.tar.gz is created and uploaded as
            # a bootstrap file
            assert runner._mrjob_tar_gz_path
            mrjob_tar_gz_file_dicts = [
                file_dict for file_dict in runner._files
                if file_dict['path'] == runner._mrjob_tar_gz_path
            ]

            assert_equal(len(mrjob_tar_gz_file_dicts), 1)

            mrjob_tar_gz_file_dict = mrjob_tar_gz_file_dicts[0]
            assert mrjob_tar_gz_file_dict['name']
            assert_equal(mrjob_tar_gz_file_dict.get('bootstrap'), 'file')

            # shouldn't be in PYTHONPATH (we dump it directly in site-packages)
            pythonpath = runner._get_cmdenv().get('PYTHONPATH') or ''
            assert_not_in(mrjob_tar_gz_file_dict['name'],
                          pythonpath.split(':'))

        assert_equal(sorted(results), [(1, 'qux'), (2, 'bar'), (2, 'foo'),
                                       (5, None)])

        # make sure cleanup happens
        assert not os.path.exists(local_tmp_dir)
        assert not any(runner.ls(runner.get_output_dir()))

        # job should get terminated
        emr_conn = runner.make_emr_conn()
        job_flow_id = runner.get_emr_job_flow_id()
        for i in range(10):
            emr_conn.simulate_progress(job_flow_id)

        job_flow = emr_conn.describe_jobflow(job_flow_id)
        assert_equal(job_flow.state, 'TERMINATED')
Example #53
    def test_base_classes_cant_have_steps(self):
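        # the base MRJobRunner class can't be instantiated with steps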
        steps = MRTwoStepJob([])._steps_desc()

        self.assertRaises(NotImplementedError, MRJobRunner, steps=steps)
Example #54
    def test_no_warning_by_default(self):
        job = MRTwoStepJob(['-r', 'local', '--no-conf'])
        job.sandbox()

        with job.make_runner():
            self.assertFalse(self.log.warning.called)
Example #55
    def test_bootstrap_python_comes_before_bootstrap(self):
        mr_job = MRTwoStepJob(['-r', 'dataproc', '--bootstrap', 'true'])

        with mr_job.make_runner() as runner:
            self.assertEqual(runner._bootstrap,
                             self.EXPECTED_BOOTSTRAP + [['true']])
Example #56
    def test_no_bootstrap_python_switch(self):
        mr_job = MRTwoStepJob(['-r', 'dataproc', '--no-bootstrap-python'])
        with mr_job.make_runner() as runner:
            self.assertEqual(runner._opts['bootstrap_python'], False)
            self.assertEqual(runner._bootstrap_python(), [])
            self.assertEqual(runner._bootstrap, [])
Example #57
    def test_end_to_end(self):
        # read from STDIN, a local file, and a remote file
        stdin = StringIO('foo\nbar\n')

        local_input_path = os.path.join(self.tmp_dir, 'input')
        with open(local_input_path, 'w') as local_input_file:
            local_input_file.write('bar\nqux\n')

        input_to_upload = os.path.join(self.tmp_dir, 'remote_input')
        with open(input_to_upload, 'w') as input_to_upload_file:
            input_to_upload_file.write('foo\n')
        remote_input_path = 'hdfs:///data/foo'
        check_call([
            self.hadoop_bin, 'fs', '-put', input_to_upload, remote_input_path
        ])

        # doesn't matter what the intermediate output is; just has to exist.
        add_mock_hadoop_output([''])
        add_mock_hadoop_output(['1\t"qux"\n2\t"bar"\n', '2\t"foo"\n5\tnull\n'])

        mr_job = MRTwoStepJob([
            '-r', 'hadoop', '-v', '--no-conf', '--hadoop-arg', '-libjar',
            '--hadoop-arg', 'containsJars.jar', '-', local_input_path,
            remote_input_path
        ])
        mr_job.sandbox(stdin=stdin)

        local_tmp_dir = None
        results = []

        with mr_job.make_runner() as runner:
            assert isinstance(runner, HadoopJobRunner)
            runner.run()

            for line in runner.stream_output():
                key, value = mr_job.parse_output_line(line)
                results.append((key, value))

            local_tmp_dir = runner._get_local_tmp_dir()
            # make sure cleanup hasn't happened yet
            assert os.path.exists(local_tmp_dir)
            assert any(runner.ls(runner.get_output_dir()))

            # make sure we're writing to the correct path in HDFS
            hdfs_root = os.environ['MOCK_HDFS_ROOT']
            assert_equal(sorted(os.listdir(hdfs_root)), ['data', 'user'])
            home_dir = os.path.join(hdfs_root, 'user', getpass.getuser())
            assert_equal(os.listdir(home_dir), ['tmp'])
            assert_equal(os.listdir(os.path.join(home_dir, 'tmp')), ['mrjob'])
            assert_equal(runner._opts['hadoop_extra_args'],
                         ['-libjar', 'containsJars.jar'])

            # make sure mrjob.tar.gz is uploaded and in PYTHONPATH
            assert runner._mrjob_tar_gz_path
            mrjob_tar_gz_file_dicts = [
                file_dict for file_dict in runner._files
                if file_dict['path'] == runner._mrjob_tar_gz_path
            ]
            assert_equal(len(mrjob_tar_gz_file_dicts), 1)

            mrjob_tar_gz_file_dict = mrjob_tar_gz_file_dicts[0]
            assert mrjob_tar_gz_file_dict['name']

            pythonpath = runner._get_cmdenv()['PYTHONPATH']
            assert_in(mrjob_tar_gz_file_dict['name'], pythonpath.split(':'))

        assert_equal(sorted(results), [(1, 'qux'), (2, 'bar'), (2, 'foo'),
                                       (5, None)])

        # make sure cleanup happens
        assert not os.path.exists(local_tmp_dir)
        assert not any(runner.ls(runner.get_output_dir()))