Example #1
    def test_dont_take_down_cluster_on_failure(self):
        runner1 = DataprocJobRunner(conf_paths=[])

        runner1._launch_cluster()
        cluster_id = runner1._cluster_id

        mr_job = MRTwoStepJob(['-r', 'dataproc', '-v',
                               '--cluster-id', cluster_id])
        mr_job.sandbox()

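        # make the mocked Dataproc jobs fail, so runner2.run() raises below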
        self.mock_jobs_succeed = False

        with mr_job.make_runner() as runner2:
            self.assertIsInstance(runner2, DataprocJobRunner)

            with logger_disabled('mrjob.dataproc'):
                self.assertRaises(StepFailedException, runner2.run)

            cluster2 = runner2._get_cluster(runner2._cluster_id)
            self.assertEqual(_cluster_state_name(cluster2.status.state),
                             'RUNNING')

        # cluster shouldn't get terminated by runner2's cleanup
        cluster1 = runner1._get_cluster(runner1._cluster_id)
        self.assertEqual(_cluster_state_name(cluster1.status.state),
                         'RUNNING')
Example #2
    def _simulate_progress(self, project_id, region, cluster_name):
        cluster_key = (project_id, region, cluster_name)
        cluster = self.mock_clusters[cluster_key]

        state_name = _cluster_state_name(cluster.status.state)

        if state_name == 'DELETING':
            del self.mock_clusters[cluster_key]
        else:
            # otherwise, advance the cluster to RUNNING
            cluster.status.state = _cluster_state_value('RUNNING')
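Each call to _simulate_progress() above advances a mocked cluster by one step: a cluster already in DELETING is dropped from the registry, and anything else jumps straight to RUNNING. A hypothetical polling loop over such a mock (the backend object and its mock_clusters registry are assumptions, as is _cluster_state_name from the sketch above) might look like:

# Hypothetical polling loop against a mock backend that implements
# _simulate_progress() as in Example #2; not part of mrjob's public API.
def wait_for_running(backend, project_id, region, cluster_name, max_polls=10):
    key = (project_id, region, cluster_name)
    for _ in range(max_polls):
        cluster = backend.mock_clusters.get(key)
        if cluster is None:
            raise RuntimeError('cluster was deleted while we were waiting')
        if _cluster_state_name(cluster.status.state) == 'RUNNING':
            return cluster
        # each call nudges the mocked cluster one step toward RUNNING
        backend._simulate_progress(project_id, region, cluster_name)
    raise RuntimeError('cluster never reached RUNNING')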
Example #3
    def test_failed_job(self):
        mr_job = MRTwoStepJob(['-r', 'dataproc', '-v'])
        mr_job.sandbox()

        with no_handlers_for_logger('mrjob.dataproc'):
            stderr = StringIO()
            log_to_stream('mrjob.dataproc', stderr)

            self.mock_jobs_succeed = False

            with mr_job.make_runner() as runner:
                self.assertIsInstance(runner, DataprocJobRunner)

                self.assertRaises(StepFailedException, runner.run)

                self.assertIn(' => ERROR\n', stderr.getvalue())

                cluster_id = runner.get_cluster_id()

        # cluster should get terminated by cleanup
        cluster = runner._get_cluster(cluster_id)
        self.assertEqual(_cluster_state_name(cluster.status.state), 'DELETING')
Example #4
    def test_end_to_end(self):
        # read from STDIN, a local file, and a remote file
        stdin = BytesIO(b'foo\nbar\n')

        local_input_path = os.path.join(self.tmp_dir, 'input')
        with open(local_input_path, 'wb') as local_input_file:
            local_input_file.write(b'bar\nqux\n')

        remote_input_path = 'gs://walrus/data/foo'
        self.put_gcs_multi({
            remote_input_path: b'foo\n'
        })

        mr_job = MRHadoopFormatJob(['-r', 'dataproc', '-v',
                                    '-', local_input_path, remote_input_path,
                                    '--jobconf', 'x=y'])
        mr_job.sandbox(stdin=stdin)

        results = []

        mock_gcs_fs_snapshot = deepcopy(self.mock_gcs_fs)

        fake_gcs_output = [
            b'1\t"qux"\n2\t"bar"\n',
            b'2\t"foo"\n5\tnull\n'
        ]

        with mr_job.make_runner() as runner:
            self.assertIsInstance(runner, DataprocJobRunner)

            # make sure that initializing the runner doesn't affect GCS
            # (Issue #50)
            self.assertEqual(self.mock_gcs_fs, mock_gcs_fs_snapshot)

            runner.run()

            # set up fake output
            self.put_job_output_parts(runner, fake_gcs_output)

            results.extend(mr_job.parse_output(runner.cat_output()))

            local_tmp_dir = runner._get_local_tmp_dir()
            # make sure cleanup hasn't happened yet
            self.assertTrue(os.path.exists(local_tmp_dir))
            self.assertTrue(any(runner.fs.ls(runner.get_output_dir())))

            name_match = _JOB_KEY_RE.match(runner._job_key)
            self.assertEqual(name_match.group(1), 'mr_hadoop_format_job')
            self.assertEqual(name_match.group(2), getpass.getuser())

            # make sure our input and output formats are attached to
            # the correct steps
            jobs = list(runner._list_jobs())
            self.assertEqual(len(jobs), 2)

            # put earliest job first
            jobs.sort(key=lambda j: j.reference.job_id)

            step_0_args = jobs[0].hadoop_job.args
            step_1_args = jobs[1].hadoop_job.args

            self.assertIn('-inputformat', step_0_args)
            self.assertNotIn('-outputformat', step_0_args)
            self.assertNotIn('-inputformat', step_1_args)
            self.assertIn('-outputformat', step_1_args)

            # make sure jobconf got through
            self.assertIn('-D', step_0_args)
            self.assertIn('x=y', step_0_args)
            self.assertIn('-D', step_1_args)
            # job overrides jobconf in step 1
            self.assertIn('x=z', step_1_args)

            # make sure mrjob.zip is created and uploaded as a bootstrap file
            self.assertTrue(os.path.exists(runner._mrjob_zip_path))
            self.assertIn(runner._mrjob_zip_path,
                          runner._upload_mgr.path_to_uri())
            self.assertIn(runner._mrjob_zip_path,
                          runner._bootstrap_dir_mgr.paths())

            cluster_id = runner.get_cluster_id()

        self.assertEqual(sorted(results),
                         [(1, 'qux'), (2, 'bar'), (2, 'foo'), (5, None)])

        # make sure cleanup happens
        self.assertFalse(os.path.exists(local_tmp_dir))

        # we don't clean up the output dir; we rely on lifecycle management
        # to remove it eventually
        output_dirs = list(runner.fs.ls(runner.get_output_dir()))
        self.assertEqual(len(fake_gcs_output), len(output_dirs))

        # cluster should get terminated by cleanup
        cluster = runner._get_cluster(cluster_id)
        self.assertEqual(_cluster_state_name(cluster.status.state), 'DELETING')