def test_end_to_end(self):
    # read from STDIN, a local file, and a remote file
    stdin = BytesIO(b'foo\nbar\n')

    local_input_path = os.path.join(self.tmp_dir, 'input')
    with open(local_input_path, 'wb') as local_input_file:
        local_input_file.write(b'bar\nqux\n')

    remote_input_path = 'gs://walrus/data/foo'
    self.put_gcs_multi({remote_input_path: b'foo\n'})

    mr_job = MRHadoopFormatJob([
        '-r', 'dataproc', '-v',
        '-', local_input_path, remote_input_path,
        '--jobconf', 'x=y'
    ])
    mr_job.sandbox(stdin=stdin)

    results = []

    gcs_buckets_snapshot = copy.deepcopy(self._gcs_client._cache_buckets)
    gcs_objects_snapshot = copy.deepcopy(self._gcs_client._cache_objects)

    fake_gcs_output = [
        b'1\t"qux"\n2\t"bar"\n',
        b'2\t"foo"\n5\tnull\n'
    ]

    with mr_job.make_runner() as runner:
        self.assertIsInstance(runner, DataprocJobRunner)

        # make sure that initializing the runner doesn't affect GCS
        # (Issue #50)
        self.assertEqual(gcs_buckets_snapshot,
                         self._gcs_client._cache_buckets)
        self.assertEqual(gcs_objects_snapshot,
                         self._gcs_client._cache_objects)

        runner.run()

        # set up fake output
        self.put_job_output_parts(runner, fake_gcs_output)

        results.extend(mr_job.parse_output(runner.cat_output()))

        local_tmp_dir = runner._get_local_tmp_dir()
        # make sure cleanup hasn't happened yet
        self.assertTrue(os.path.exists(local_tmp_dir))
        self.assertTrue(any(runner.fs.ls(runner.get_output_dir())))

        name_match = _JOB_KEY_RE.match(runner._job_key)
        self.assertEqual(name_match.group(1), 'mr_hadoop_format_job')
        self.assertEqual(name_match.group(2), getpass.getuser())

        # make sure our input and output formats are attached to
        # the correct steps
        jobs_list = runner.api_client.jobs().list(
            projectId=runner._gcp_project,
            region=_DATAPROC_API_REGION).execute()
        jobs = jobs_list['items']

        step_0_args = jobs[0]['hadoopJob']['args']
        step_1_args = jobs[1]['hadoopJob']['args']

        self.assertIn('-inputformat', step_0_args)
        self.assertNotIn('-outputformat', step_0_args)
        self.assertNotIn('-inputformat', step_1_args)
        self.assertIn('-outputformat', step_1_args)

        # make sure jobconf got through
        self.assertIn('-D', step_0_args)
        self.assertIn('x=y', step_0_args)
        self.assertIn('-D', step_1_args)
        # job overrides jobconf in step 1
        self.assertIn('x=z', step_1_args)

        # make sure mrjob.zip is created and uploaded as a bootstrap file
        self.assertTrue(os.path.exists(runner._mrjob_zip_path))
        self.assertIn(runner._mrjob_zip_path,
                      runner._upload_mgr.path_to_uri())
        self.assertIn(runner._mrjob_zip_path,
                      runner._bootstrap_dir_mgr.paths())

        cluster_id = runner.get_cluster_id()

    self.assertEqual(sorted(results),
                     [(1, 'qux'), (2, 'bar'), (2, 'foo'), (5, None)])

    # make sure cleanup happens
    self.assertFalse(os.path.exists(local_tmp_dir))
    # we don't clean up the output dir as we're relying on lifecycle
    # management
    output_dirs = list(runner.fs.ls(runner.get_output_dir()))
    self.assertEqual(len(fake_gcs_output), len(output_dirs))

    # job should get terminated
    cluster = (
        self._dataproc_client._cache_clusters[_TEST_PROJECT][cluster_id])
    cluster_state = self._dataproc_client.get_state(cluster)
    self.assertEqual(cluster_state, 'DELETING')
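
# Illustrative sketch (not part of the original test suite): the fake GCS
# output in test_end_to_end is tab-separated JSON, which matches mrjob's
# default output protocol and is why b'5\tnull' comes back as (5, None) in
# the assertions above. JSONProtocol and its read() method are existing
# mrjob APIs; the test name below is hypothetical.
def test_fake_output_decodes_with_json_protocol(self):
    from mrjob.protocol import JSONProtocol

    protocol = JSONProtocol()

    # each output line splits on the first tab into a JSON key and a
    # JSON value
    self.assertEqual(protocol.read(b'1\t"qux"'), (1, 'qux'))
    self.assertEqual(protocol.read(b'5\tnull'), (5, None))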