Example #1
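Checks that MRHadoopFormatJob's HADOOP_INPUT_FORMAT and HADOOP_OUTPUT_FORMAT class attributes are passed through to the runner keyword arguments.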
    def test_hadoop_format_attributes(self):
        mr_job = MRHadoopFormatJob()

        self.assertEqual(mr_job._runner_kwargs()['hadoop_input_format'],
                         'mapred.FooInputFormat')
        self.assertEqual(mr_job._runner_kwargs()['hadoop_output_format'],
                         'mapred.BarOutputFormat')
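All of the examples below exercise the same helper job. As a rough sketch only (the real MRHadoopFormatJob helper in mrjob's test suite is a two-step job and differs in detail), a class satisfying Example #1's assertions might look like this:

from mrjob.job import MRJob


class MRHadoopFormatJob(MRJob):

    # picked up by the runner as the hadoop_input_format / hadoop_output_format
    # keyword arguments asserted in Example #1
    HADOOP_INPUT_FORMAT = 'mapred.FooInputFormat'
    HADOOP_OUTPUT_FORMAT = 'mapred.BarOutputFormat'

    # NOTE: the real helper is a two-step job that also sets jobconf x=z on
    # its second step, which is what the end-to-end tests below rely on
    def mapper(self, _, word):
        yield word, 1

    def reducer(self, word, counts):
        yield sum(counts), word


if __name__ == '__main__':
    MRHadoopFormatJob.run()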
Example #2
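Checks that the deprecated --hadoop-input-format and --hadoop-output-format command-line options override the class attributes; the deprecation warning is silenced with logger_disabled.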
    def test_deprecated_command_line_options_override_attrs(self):
        mr_job = MRHadoopFormatJob([
            '--hadoop-input-format',
            'org.apache.hadoop.mapred.lib.NLineInputFormat',
            '--hadoop-output-format',
            'org.apache.hadoop.mapred.FileOutputFormat',
            ])

        with logger_disabled('mrjob.job'):
            assert_equal(mr_job.job_runner_kwargs()['hadoop_input_format'],
                         'org.apache.hadoop.mapred.lib.NLineInputFormat')
            assert_equal(mr_job.job_runner_kwargs()['hadoop_output_format'],
                         'org.apache.hadoop.mapred.FileOutputFormat')
Example #3
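An end-to-end run of MRHadoopFormatJob on the (mocked) Dataproc runner: input comes from stdin, a local file, and a gs:// path; the test then verifies that the input/output formats are attached to the correct steps, that the --jobconf value gets through (and is overridden by the job in step 1), that mrjob.zip is uploaded as a bootstrap file, that output is parsed correctly, and that temp-dir cleanup and cluster teardown happen.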
    def test_end_to_end(self):
        # read from STDIN, a local file, and a remote file
        stdin = BytesIO(b'foo\nbar\n')

        local_input_path = os.path.join(self.tmp_dir, 'input')
        with open(local_input_path, 'wb') as local_input_file:
            local_input_file.write(b'bar\nqux\n')

        remote_input_path = 'gs://walrus/data/foo'
        self.put_gcs_multi({remote_input_path: b'foo\n'})

        mr_job = MRHadoopFormatJob([
            '-r', 'dataproc', '-v', '-', local_input_path, remote_input_path,
            '--jobconf', 'x=y'
        ])
        mr_job.sandbox(stdin=stdin)

        results = []

        gcs_buckets_snapshot = copy.deepcopy(self._gcs_client._cache_buckets)
        gcs_objects_snapshot = copy.deepcopy(self._gcs_client._cache_objects)

        fake_gcs_output = [b'1\t"qux"\n2\t"bar"\n', b'2\t"foo"\n5\tnull\n']

        with mr_job.make_runner() as runner:
            self.assertIsInstance(runner, DataprocJobRunner)

            # make sure that initializing the runner doesn't affect GCS
            # (Issue #50)
            self.assertEqual(gcs_buckets_snapshot,
                             self._gcs_client._cache_buckets)
            self.assertEqual(gcs_objects_snapshot,
                             self._gcs_client._cache_objects)

            runner.run()

            # setup fake output
            self.put_job_output_parts(runner, fake_gcs_output)

            results.extend(mr_job.parse_output(runner.cat_output()))

            local_tmp_dir = runner._get_local_tmp_dir()
            # make sure cleanup hasn't happened yet
            self.assertTrue(os.path.exists(local_tmp_dir))
            self.assertTrue(any(runner.fs.ls(runner.get_output_dir())))

            name_match = _JOB_KEY_RE.match(runner._job_key)
            self.assertEqual(name_match.group(1), 'mr_hadoop_format_job')
            self.assertEqual(name_match.group(2), getpass.getuser())

            # make sure our input and output formats are attached to
            # the correct steps
            jobs_list = runner.api_client.jobs().list(
                projectId=runner._gcp_project,
                region=_DATAPROC_API_REGION).execute()
            jobs = jobs_list['items']

            step_0_args = jobs[0]['hadoopJob']['args']
            step_1_args = jobs[1]['hadoopJob']['args']

            self.assertIn('-inputformat', step_0_args)
            self.assertNotIn('-outputformat', step_0_args)
            self.assertNotIn('-inputformat', step_1_args)
            self.assertIn('-outputformat', step_1_args)

            # make sure jobconf got through
            self.assertIn('-D', step_0_args)
            self.assertIn('x=y', step_0_args)
            self.assertIn('-D', step_1_args)
            # job overrides jobconf in step 1
            self.assertIn('x=z', step_1_args)

            # make sure mrjob.zip is created and uploaded as a bootstrap file
            self.assertTrue(os.path.exists(runner._mrjob_zip_path))
            self.assertIn(runner._mrjob_zip_path,
                          runner._upload_mgr.path_to_uri())
            self.assertIn(runner._mrjob_zip_path,
                          runner._bootstrap_dir_mgr.paths())

            cluster_id = runner.get_cluster_id()

        self.assertEqual(sorted(results), [(1, 'qux'), (2, 'bar'), (2, 'foo'),
                                           (5, None)])

        # make sure cleanup happens
        self.assertFalse(os.path.exists(local_tmp_dir))

        # we don't clean up the output dir as we're relying on lifecycle
        # management
        output_dirs = list(runner.fs.ls(runner.get_output_dir()))
        self.assertEqual(len(fake_gcs_output), len(output_dirs))

        # job should get terminated
        cluster = (
            self._dataproc_client._cache_clusters[_TEST_PROJECT][cluster_id])
        cluster_state = self._dataproc_client.get_state(cluster)
        self.assertEqual(cluster_state, 'DELETING')
Example #4
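An earlier revision of the same end-to-end test, written against the older stream_output()/parse_output_line() API and a mrjob.tar.gz (rather than mrjob.zip) bootstrap archive.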
    def test_end_to_end(self):
        # read from STDIN, a local file, and a remote file
        stdin = BytesIO(b'foo\nbar\n')

        local_input_path = os.path.join(self.tmp_dir, 'input')
        with open(local_input_path, 'wb') as local_input_file:
            local_input_file.write(b'bar\nqux\n')

        remote_input_path = 'gs://walrus/data/foo'
        self.put_gcs_multi({
            remote_input_path: b'foo\n'
        })

        mr_job = MRHadoopFormatJob(['-r', 'dataproc', '-v',
                                    '-', local_input_path, remote_input_path,
                                    '--jobconf', 'x=y'])
        mr_job.sandbox(stdin=stdin)

        results = []

        gcs_buckets_snapshot = copy.deepcopy(self._gcs_client._cache_buckets)
        gcs_objects_snapshot = copy.deepcopy(self._gcs_client._cache_objects)

        fake_gcs_output = [
            b'1\t"qux"\n2\t"bar"\n',
            b'2\t"foo"\n5\tnull\n'
        ]

        with mr_job.make_runner() as runner:
            self.assertIsInstance(runner, DataprocJobRunner)

            # make sure that initializing the runner doesn't affect GCS
            # (Issue #50)
            self.assertEqual(gcs_buckets_snapshot,
                             self._gcs_client._cache_buckets)
            self.assertEqual(gcs_objects_snapshot,
                             self._gcs_client._cache_objects)

            runner.run()

            # setup fake output
            self.put_job_output_parts(runner, fake_gcs_output)

            for line in runner.stream_output():
                key, value = mr_job.parse_output_line(line)
                results.append((key, value))

            local_tmp_dir = runner._get_local_tmp_dir()
            # make sure cleanup hasn't happened yet
            self.assertTrue(os.path.exists(local_tmp_dir))
            self.assertTrue(any(runner.fs.ls(runner.get_output_dir())))

            name_match = _JOB_KEY_RE.match(runner._job_key)
            self.assertEqual(name_match.group(1), 'mr_hadoop_format_job')
            self.assertEqual(name_match.group(2), getpass.getuser())

            # make sure our input and output formats are attached to
            # the correct steps
            jobs_list = runner.api_client.jobs().list(
                projectId=runner._gcp_project,
                region=_DATAPROC_API_REGION).execute()
            jobs = jobs_list['items']

            step_0_args = jobs[0]['hadoopJob']['args']
            step_1_args = jobs[1]['hadoopJob']['args']

            self.assertIn('-inputformat', step_0_args)
            self.assertNotIn('-outputformat', step_0_args)
            self.assertNotIn('-inputformat', step_1_args)
            self.assertIn('-outputformat', step_1_args)

            # make sure jobconf got through
            self.assertIn('-D', step_0_args)
            self.assertIn('x=y', step_0_args)
            self.assertIn('-D', step_1_args)
            # job overrides jobconf in step 1
            self.assertIn('x=z', step_1_args)

            # make sure mrjob.tar.gz is created and uploaded as
            # a bootstrap file
            self.assertTrue(os.path.exists(runner._mrjob_tar_gz_path))
            self.assertIn(runner._mrjob_tar_gz_path,
                          runner._upload_mgr.path_to_uri())
            self.assertIn(runner._mrjob_tar_gz_path,
                          runner._bootstrap_dir_mgr.paths())

            cluster_id = runner.get_cluster_id()

        self.assertEqual(sorted(results),
                         [(1, 'qux'), (2, 'bar'), (2, 'foo'), (5, None)])

        # make sure cleanup happens
        self.assertFalse(os.path.exists(local_tmp_dir))

        # we don't clean up the output dir as we're relying on
        # lifecycle management
        output_dirs = list(runner.fs.ls(runner.get_output_dir()))
        self.assertEqual(len(fake_gcs_output), len(output_dirs))

        # job should get terminated
        cluster = (
            self._dataproc_client._cache_clusters[_TEST_PROJECT][cluster_id])
        cluster_state = self._dataproc_client.get_state(cluster)
        self.assertEqual(cluster_state, 'DELETING')