def test_end_to_end(self):
    # Feed the job three kinds of input -- STDIN, a regular file, and a
    # .gz file -- through the inline runner, then check both the parsed
    # output and that the runner's local tmp dir is removed on exit.

    # input 1 of 3: fake STDIN
    fake_stdin = BytesIO(b'foo\nbar\n')

    # input 2 of 3: a regular text file
    input_path = join(self.tmp_dir, 'input')
    with open(input_path, 'w') as plain_file:
        plain_file.write('bar\nqux\n')

    # input 3 of 3: a gzipped file
    input_gz_path = join(self.tmp_dir, 'input.gz')
    gz_file = gzip.GzipFile(input_gz_path, 'wb')
    gz_file.write(b'foo\n')
    gz_file.close()

    # '-' tells the job to read from (sandboxed) STDIN
    job_args = ['--runner', 'inline', '-', input_path, input_gz_path]
    mr_job = MRTwoStepJob(job_args)
    mr_job.sandbox(stdin=fake_stdin)

    with mr_job.make_runner() as runner:
        assert isinstance(runner, InlineMRJobRunner)
        runner.run()

        results = list(mr_job.parse_output(runner.cat_output()))

        # the tmp dir must still be there while the runner is open
        local_tmp_dir = runner._get_local_tmp_dir()
        assert exists(local_tmp_dir)

    # make sure cleanup happens
    assert not exists(local_tmp_dir)

    expected = [(1, 'qux'), (2, 'bar'), (2, 'foo'), (5, None)]
    self.assertEqual(sorted(results), expected)
def test_attach_to_existing_cluster(self):
    # Create a cluster through a standalone runner's API client, then run
    # the job against it via --cluster-id and check the parsed output.
    #
    # NOTE(review): another method with this same name follows this one in
    # the visible source; if both live in the same class, only the later
    # definition actually runs -- confirm and dedupe.
    setup_runner = DataprocJobRunner(conf_paths=[])
    cluster_id = setup_runner.api_client.cluster_create()['clusterName']

    job_args = ['-r', 'dataproc', '-v', '--cluster-id', cluster_id]
    mr_job = MRTwoStepJob(job_args)
    mr_job.sandbox(stdin=BytesIO(b'foo\nbar\n'))

    with mr_job.make_runner() as runner:
        runner.run()

        # Generate fake output
        fake_part = b'1\t"bar"\n1\t"foo"\n2\tnull\n'
        self.put_job_output_parts(runner, [fake_part])

        # Issue 182: don't create the bootstrap script when
        # attaching to another cluster
        self.assertIsNone(runner._master_bootstrap_script_path)

        results = list(mr_job.parse_output(runner.cat_output()))

    expected = [(1, 'bar'), (1, 'foo'), (2, None)]
    self.assertEqual(sorted(results), expected)
def test_attach_to_existing_cluster(self):
    # A job given --cluster-id should run on the already-created cluster
    # and must not build a master bootstrap script for it.

    # make a cluster to attach to
    cluster_maker = DataprocJobRunner(conf_paths=[])
    new_cluster = cluster_maker.api_client.cluster_create()
    existing_cluster_id = new_cluster['clusterName']

    fake_stdin = BytesIO(b'foo\nbar\n')

    mr_job = MRTwoStepJob(
        ['-r', 'dataproc', '-v', '--cluster-id', existing_cluster_id])
    mr_job.sandbox(stdin=fake_stdin)

    parsed_output = []

    with mr_job.make_runner() as runner:
        runner.run()

        # Generate fake output
        output_parts = [b'1\t"bar"\n1\t"foo"\n2\tnull\n']
        self.put_job_output_parts(runner, output_parts)

        # Issue 182: don't create the bootstrap script when
        # attaching to another cluster
        self.assertIsNone(runner._master_bootstrap_script_path)

        for pair in mr_job.parse_output(runner.cat_output()):
            parsed_output.append(pair)

    self.assertEqual(sorted(parsed_output),
                     [(1, 'bar'), (1, 'foo'), (2, None)])
def test_python_dash_v_as_python_bin(self):
    # Run the job with "<python> -v" as --python-bin: the extra flag's
    # debugging output should land in stderr without changing the results.
    interpreter = sys.executable or 'python'
    verbose_python = cmd_line([interpreter, '-v'])

    mr_job = MRTwoStepJob(['--python-bin', verbose_python, '--no-conf'])
    mr_job.sandbox(stdin=['bar\n'])

    with no_handlers_for_logger():
        mr_job.run_job()

    # expect debugging messages in stderr
    stderr_text = mr_job.stderr.getvalue()
    for marker in ('import mrjob', '#'):
        assert_in(marker, stderr_text)

    # should still get expected results
    output = sorted(mr_job.parse_output())
    assert_equal(output, [(1, None), (1, 'bar')])