def test_default(self):
    """With no idle option set, the timeout script gets 360 secs (6 min)."""
    job = MRWordCount(["-r", "dataproc"])
    job.sandbox()

    with job.make_runner() as runner:
        runner.run()

        self.assertRanIdleTimeoutScriptWith(
            runner, {"mrjob-max-secs-idle": "360"})
def test_persistent_cluster(self):
    """--max-hours-idle 0.01 should become 36 secs in the timeout script."""
    job = MRWordCount(["-r", "dataproc", "--max-hours-idle", "0.01"])
    job.sandbox()

    with job.make_runner() as runner:
        runner.run()

        self.assertRanIdleTimeoutScriptWith(
            runner, {"mrjob-max-secs-idle": "36"})
def test_input_files(self):
    """Counts and combiner counters should cover plain and gzipped input."""
    plain_path = os.path.join(self.tmp_dir, 'input')
    with open(plain_path, 'wb') as plain_file:
        plain_file.write(b'bar\nqux\nfoo\n')

    gz_path = os.path.join(self.tmp_dir, 'input.gz')
    with gzip.GzipFile(gz_path, 'wb') as gz_file:
        gz_file.write(b'foo\n')

    mr_job = MRWordCount(['-r', self.RUNNER, plain_path, gz_path])
    mr_job.sandbox()

    with mr_job.make_runner() as runner:
        runner.run()

        results = list(mr_job.parse_output(runner.cat_output()))

        self.assertGreater(runner.counters()[0]['count']['combiners'], 2)

    self.assertEqual(
        sorted(results),
        [('file://' + plain_path, 3), ('file://' + gz_path, 1)])
def test_input_files_and_setting_number_of_tasks(self):
    """Setting mapred.map/reduce.tasks should drive the combiner count.

    Uses one plain and one gzipped input file and checks both the
    combiner counter and the per-file line counts.
    """
    input_path = os.path.join(self.tmp_dir, 'input')
    with open(input_path, 'wb') as input_file:
        input_file.write(b'bar\nqux\nfoo\n')

    input_gz_path = os.path.join(self.tmp_dir, 'input.gz')
    # with-block ensures the gzip handle is closed even on error
    # (the old code opened/closed it by hand and leaked on failure)
    with gzip.GzipFile(input_gz_path, 'wb') as input_gz:
        input_gz.write(b'foo\n')

    mr_job = MRWordCount(['-r', self.RUNNER,
                          '--jobconf=mapred.map.tasks=3',
                          '--jobconf=mapred.reduce.tasks=3',
                          input_path, input_gz_path])
    mr_job.sandbox()

    results = []

    with mr_job.make_runner() as runner:
        runner.run()

        # parse_output()/cat_output() replace the deprecated
        # stream_output()/parse_output_line() pair, matching the
        # style of the other input-file tests in this file
        results.extend(mr_job.parse_output(runner.cat_output()))

        self.assertEqual(runner.counters()[0]['count']['combiners'], 3)

    self.assertEqual(sorted(results),
                     [(input_path, 3), (input_gz_path, 1)])
def test_default(self):
    """No --ssh-tunnel means no gcloud subprocess should be spawned."""
    job = MRWordCount(['-r', 'dataproc'])
    job.sandbox()

    with job.make_runner() as runner:
        runner.run()

    self.assertFalse(self.mock_Popen.called)
def test_default(self):
    """Default idle timeout is 6 minutes (360 seconds)."""
    word_count_job = MRWordCount(['-r', 'dataproc'])
    word_count_job.sandbox()

    with word_count_job.make_runner() as runner:
        runner.run()

        expected_metadata = {'mrjob-max-secs-idle': '360'}
        self.assertRanIdleTimeoutScriptWith(runner, expected_metadata)
def test_empty(self):
    """Without --libjar, the streaming step args omit -libjars."""
    job = MRWordCount(['-r', 'hadoop'])
    job.sandbox()

    with job.make_runner() as runner:
        runner._add_job_files_for_upload()

        streaming_args = runner._args_for_streaming_step(0)
        self.assertNotIn('-libjars', streaming_args)
def test_persistent_cluster(self):
    """0.01 idle hours should round to 36 idle seconds."""
    word_count_job = MRWordCount(
        ['-r', 'dataproc', '--max-hours-idle', '0.01'])
    word_count_job.sandbox()

    with word_count_job.make_runner() as runner:
        runner.run()

        expected_metadata = {'mrjob-max-secs-idle': '36'}
        self.assertRanIdleTimeoutScriptWith(runner, expected_metadata)
def test_job_passes_in_steps(self):
    """When the job hands the runner its steps, no warning is logged."""
    job = MRWordCount([])
    job.sandbox()

    with job.make_runner() as runner:
        self.assertTrue(runner._steps)

        runner.run()

        self.assertFalse(self.log.warning.called)
def test_job_passes_in_steps(self):
    """When the job hands the runner its steps, no warning is logged.

    Pass an explicit empty arg list: MRWordCount() with no args makes
    MRJob parse the real sys.argv (the test runner's own command line),
    which can break the test. Sibling tests in this file pass [] too.
    """
    job = MRWordCount([])
    job.sandbox()

    with job.make_runner() as runner:
        self.assertTrue(runner._steps)
        runner.run()
        self.assertFalse(self.log.warning.called)
def test_load_steps(self):
    """If _steps is cleared, the runner re-derives them and warns.

    Pass an explicit empty arg list: MRWordCount() with no args makes
    MRJob parse the real sys.argv (the test runner's own command line),
    which can break the test. Sibling tests in this file pass [] too.
    """
    job = MRWordCount([])
    job.sandbox()

    with job.make_runner() as runner:
        # force the runner to load steps itself rather than
        # receiving them from the job
        runner._steps = None
        runner.run()

        self.assertTrue(runner._steps)
        self.assertTrue(self.log.warning.called)
def test_missing_gcloud_bin(self):
    """A missing gcloud binary should make us give up after one attempt."""
    self.mock_Popen.side_effect = OSError(2, 'No such file or directory')

    job = MRWordCount(['-r', 'dataproc', '--ssh-tunnel'])
    job.sandbox()

    with job.make_runner() as runner:
        runner.run()

        # exactly one launch attempt, then no retries
        self.assertEqual(self.mock_Popen.call_count, 1)
        self.assertTrue(runner._give_up_on_ssh_tunnel)
def test_error_from_gcloud_bin(self):
    """A failing gcloud (nonzero exit) should be retried, not abandoned."""
    self.mock_Popen.return_value.returncode = 255

    job = MRWordCount(['-r', 'dataproc', '--ssh-tunnel'])
    job.sandbox()

    with job.make_runner() as runner:
        runner.run()

        # more than one launch attempt, and we keep trying
        self.assertGreater(self.mock_Popen.call_count, 1)
        self.assertFalse(runner._give_up_on_ssh_tunnel)
def test_one_jar(self):
    """--libjar should put -libjars and the jar path into streaming args."""
    job = MRWordCount(
        ['-r', 'hadoop', '--libjar', '/path/to/a.jar'])
    job.sandbox()

    with job.make_runner() as runner:
        runner._add_job_files_for_upload()

        streaming_args = runner._args_for_streaming_step(0)
        self.assertIn('-libjars', streaming_args)
        self.assertIn('/path/to/a.jar', streaming_args)
def test_custom_gcloud_bin(self):
    """--gcloud-bin may carry extra switches; they prefix 'compute ssh'."""
    job = MRWordCount(['-r', 'dataproc', '--ssh-tunnel',
                       '--gcloud-bin', '/path/to/gcloud -v'])
    job.sandbox()

    with job.make_runner() as runner:
        runner.run()

        self.assertEqual(self.mock_Popen.call_count, 1)

        ssh_args = self.mock_Popen.call_args[0][0]
        self.assertEqual(
            ssh_args[:4], ['/path/to/gcloud', '-v', 'compute', 'ssh'])
def test_open_ssh_tunnel(self):
    """--ssh-tunnel-is-open should add the open-to-the-world ssh flags."""
    job = MRWordCount(
        ['-r', 'dataproc', '--ssh-tunnel', '--ssh-tunnel-is-open'])
    job.sandbox()

    with job.make_runner() as runner:
        runner.run()

        self.assertEqual(self.mock_Popen.call_count, 1)

        ssh_args = self.mock_Popen.call_args[0][0]
        # -g and -4 only appear when the tunnel is open to other hosts
        for switch in ('-L', '-N', '-n', '-q', '-g', '-4'):
            self.assertIn(switch, ssh_args)
def test_log_messages(self):
    """Progress and counter lines from the driver get re-logged at INFO."""
    self.get_lines.return_value = [
        '18/04/17 22:06:15 INFO mapreduce.Job: map 100% reduce 0%\n',
        '18/04/17 22:07:34 INFO mapreduce.Job: Counters: 1\n',
        '\tFile System Counters\n',
        '\t\tFILE: Number of bytes read=819\n',
    ]

    job = MRWordCount(['-r', 'dataproc'])
    job.sandbox()

    with job.make_runner() as runner:
        runner.run()

        info_calls = self.log.info.call_args_list

        self.assertIn(call(' map 100% reduce 0%'), info_calls)
        self.assertIn(
            call('Counters: 1\n'
                 '\tFile System Counters\n'
                 '\t\tFILE: Number of bytes read=819'),
            info_calls)
def test_input_files(self):
    """Counts and combiner counters should cover plain and gzipped input."""
    plain_path = os.path.join(self.tmp_dir, 'input')
    with open(plain_path, 'wb') as plain_file:
        plain_file.write(b'bar\nqux\nfoo\n')

    gz_path = os.path.join(self.tmp_dir, 'input.gz')
    with gzip.GzipFile(gz_path, 'wb') as gz_file:
        gz_file.write(b'foo\n')

    mr_job = MRWordCount(['-r', self.RUNNER, plain_path, gz_path])
    mr_job.sandbox()

    with mr_job.make_runner() as runner:
        runner.run()

        results = list(mr_job.parse_output(runner.cat_output()))

        self.assertGreater(runner.counters()[0]['count']['combiners'], 2)

    self.assertEqual(sorted(results),
                     [(plain_path, 3), (gz_path, 1)])
def test_default_ssh_tunnel(self):
    """--ssh-tunnel alone should open a localhost-only tunnel.

    Checks that gcloud is launched once with the expected command
    line and pipes, without the open-tunnel flags (-g/-4), and that
    two newlines are written to its stdin (to get past interactive
    prompts).
    """
    job = MRWordCount(['-r', 'dataproc', '--ssh-tunnel'])
    job.sandbox()

    with job.make_runner() as runner:
        runner.run()

        self.assertEqual(self.mock_Popen.call_count, 1)
        args_tuple, kwargs = self.mock_Popen.call_args
        args = args_tuple[0]

        self.assertEqual(
            kwargs, dict(stdin=PIPE, stdout=PIPE, stderr=PIPE))

        self.assertEqual(args[:3], ['gcloud', 'compute', 'ssh'])
        self.assertIn('-L', args)
        self.assertIn('-N', args)
        self.assertIn('-n', args)
        self.assertIn('-q', args)
        self.assertNotIn('-g', args)
        self.assertNotIn('-4', args)

        # was: self.mock_Popen.stdin.called_once_with(b'\n\n'), which
        # asserted nothing: called_once_with() is not a Mock assertion
        # method (it silently auto-creates a child mock), and
        # mock_Popen.stdin is not the spawned process's stdin. Assert
        # on the process (return_value) instead.
        self.mock_Popen.return_value.stdin.write.assert_called_once_with(
            b'\n\n')
def test_dash_for_stdin(self):
    """A lone '-' argument should mean 'read input from stdin'."""
    job = MRWordCount(['-'])
    job.sandbox()

    with job.make_runner() as runner:
        runner.run()
def make_runner(self, *args):
    """Build a sandboxed MRWordCount from *args* and return its runner."""
    job = MRWordCount(args)
    job.sandbox()
    return job.make_runner()
def test_stdin_is_fine(self):
    """With no input args, the sandboxed job reads (empty) stdin cleanly."""
    job = MRWordCount([])
    job.sandbox()

    with job.make_runner() as runner:
        runner.run()
def test_stdin_is_fine(self):
    """With no input args, the sandboxed job reads (empty) stdin cleanly.

    Pass an explicit empty arg list: MRWordCount() with no args makes
    MRJob parse the real sys.argv (the test runner's own command line),
    which can break the test. Sibling tests in this file pass [] too.
    """
    job = MRWordCount([])
    job.sandbox()

    with job.make_runner() as runner:
        runner.run()