Esempio n. 1
0
    def test_default(self):
        # Run MRWordCount on the dataproc runner with no extra options and
        # check the idle-timeout script arguments.
        expected_script_args = {"mrjob-max-secs-idle": "360"}

        job = MRWordCount(["-r", "dataproc"])
        job.sandbox()

        with job.make_runner() as dataproc_runner:
            dataproc_runner.run()
            self.assertRanIdleTimeoutScriptWith(
                dataproc_runner, expected_script_args)
Esempio n. 2
0
    def test_persistent_cluster(self):
        # Pass --max-hours-idle 0.01 and check that the idle-timeout script
        # arguments carry "36" for mrjob-max-secs-idle.
        cli_args = ["-r", "dataproc", "--max-hours-idle", "0.01"]
        expected_script_args = {"mrjob-max-secs-idle": "36"}

        job = MRWordCount(cli_args)
        job.sandbox()

        with job.make_runner() as dataproc_runner:
            dataproc_runner.run()
            self.assertRanIdleTimeoutScriptWith(
                dataproc_runner, expected_script_args)
Esempio n. 3
0
    def test_input_files(self):
        # Feed one plain file (three lines) and one gzipped file (one line)
        # to MRWordCount; the parsed output is expected to pair each
        # file:// URI with 3 and 1 respectively.
        plain_path = os.path.join(self.tmp_dir, 'input')
        with open(plain_path, 'wb') as plain_file:
            plain_file.write(b'bar\nqux\nfoo\n')

        gz_path = os.path.join(self.tmp_dir, 'input.gz')
        with gzip.GzipFile(gz_path, 'wb') as gz_file:
            gz_file.write(b'foo\n')

        job = MRWordCount(['-r', self.RUNNER, plain_path, gz_path])
        job.sandbox()

        with job.make_runner() as job_runner:
            job_runner.run()

            output = list(job.parse_output(job_runner.cat_output()))

            combiner_count = job_runner.counters()[0]['count']['combiners']
            self.assertGreater(combiner_count, 2)

        expected = [
            ('file://' + plain_path, 3),
            ('file://' + gz_path, 1),
        ]
        self.assertEqual(sorted(output), expected)
Esempio n. 4
0
    def test_input_files_and_setting_number_of_tasks(self):
        # Run MRWordCount over one plain and one gzipped input with the map
        # and reduce task counts set to 3 through --jobconf, then check both
        # the parsed output and the 'combiners' counter (expected to be
        # exactly 3).
        input_path = os.path.join(self.tmp_dir, 'input')
        with open(input_path, 'wb') as input_file:
            input_file.write(b'bar\nqux\nfoo\n')

        input_gz_path = os.path.join(self.tmp_dir, 'input.gz')
        # context manager guarantees the gzip stream is closed even if the
        # write raises
        with gzip.GzipFile(input_gz_path, 'wb') as input_gz:
            input_gz.write(b'foo\n')

        mr_job = MRWordCount(['-r', self.RUNNER,
                              '--jobconf=mapred.map.tasks=3',
                              '--jobconf=mapred.reduce.tasks=3',
                              input_path, input_gz_path])
        mr_job.sandbox()

        results = []

        with mr_job.make_runner() as runner:
            runner.run()

            # parse the runner's output one line at a time into
            # (key, value) pairs
            for line in runner.stream_output():
                key, value = mr_job.parse_output_line(line)
                results.append((key, value))

            self.assertEqual(runner.counters()[0]['count']['combiners'], 3)

        self.assertEqual(sorted(results),
                         [(input_path, 3), (input_gz_path, 1)])
Esempio n. 5
0
    def test_default(self):
        # Without --ssh-tunnel on the command line, running the job must
        # not call the mocked Popen at all.
        wc_job = MRWordCount(['-r', 'dataproc'])
        wc_job.sandbox()

        with wc_job.make_runner() as dataproc_runner:
            dataproc_runner.run()

        popen_was_called = self.mock_Popen.called
        self.assertFalse(popen_was_called)
Esempio n. 6
0
    def test_default(self):
        # Default dataproc run: the idle-timeout script arguments should
        # contain '360' for mrjob-max-secs-idle.
        expected_script_args = {'mrjob-max-secs-idle': '360'}

        job = MRWordCount(['-r', 'dataproc'])
        job.sandbox()

        with job.make_runner() as dataproc_runner:
            dataproc_runner.run()
            self.assertRanIdleTimeoutScriptWith(
                dataproc_runner, expected_script_args)
Esempio n. 7
0
    def test_empty(self):
        # No --libjar given: the args built for streaming step 0 must not
        # contain '-libjars'.
        wc_job = MRWordCount(['-r', 'hadoop'])
        wc_job.sandbox()

        with wc_job.make_runner() as hadoop_runner:
            hadoop_runner._add_job_files_for_upload()
            step_args = hadoop_runner._args_for_streaming_step(0)

            self.assertNotIn('-libjars', step_args)
Esempio n. 8
0
    def test_empty(self):
        # With only '-r hadoop' on the command line, '-libjars' should be
        # absent from the streaming step's argument list.
        hadoop_job = MRWordCount(['-r', 'hadoop'])
        hadoop_job.sandbox()

        with hadoop_job.make_runner() as job_runner:
            job_runner._add_job_files_for_upload()
            streaming_args = job_runner._args_for_streaming_step(0)

            self.assertNotIn('-libjars', streaming_args)
Esempio n. 9
0
    def test_default(self):
        # Run on dataproc with default options and verify the idle-timeout
        # script arguments ('mrjob-max-secs-idle' -> '360').
        wc_job = MRWordCount(['-r', 'dataproc'])
        wc_job.sandbox()

        script_args = {'mrjob-max-secs-idle': '360'}

        with wc_job.make_runner() as job_runner:
            job_runner.run()
            self.assertRanIdleTimeoutScriptWith(job_runner, script_args)
Esempio n. 10
0
    def test_persistent_cluster(self):
        # --max-hours-idle 0.01 should surface as '36' for
        # mrjob-max-secs-idle in the idle-timeout script arguments.
        cli_args = ['-r', 'dataproc', '--max-hours-idle', '0.01']
        expected_script_args = {'mrjob-max-secs-idle': '36'}

        job = MRWordCount(cli_args)
        job.sandbox()

        with job.make_runner() as dataproc_runner:
            dataproc_runner.run()
            self.assertRanIdleTimeoutScriptWith(
                dataproc_runner, expected_script_args)
Esempio n. 11
0
    def test_persistent_cluster(self):
        # Run with an explicit --max-hours-idle of 0.01 and check the
        # idle-timeout script arguments ('mrjob-max-secs-idle' -> '36').
        wc_job = MRWordCount(
            ['-r', 'dataproc', '--max-hours-idle', '0.01'])
        wc_job.sandbox()

        script_args = {'mrjob-max-secs-idle': '36'}

        with wc_job.make_runner() as job_runner:
            job_runner.run()
            self.assertRanIdleTimeoutScriptWith(job_runner, script_args)
Esempio n. 12
0
    def test_job_passes_in_steps(self):
        # A runner made by the job should already have truthy _steps before
        # it runs, and running it should not call log.warning.
        wc_job = MRWordCount([])
        wc_job.sandbox()

        with wc_job.make_runner() as job_runner:
            steps_before_run = job_runner._steps
            self.assertTrue(steps_before_run)

            job_runner.run()

            warning_logged = self.log.warning.called
            self.assertFalse(warning_logged)
Esempio n. 13
0
    def test_job_passes_in_steps(self):
        # The runner's _steps should be truthy before run(), and run()
        # should finish without log.warning being called.
        wc_job = MRWordCount()
        wc_job.sandbox()

        with wc_job.make_runner() as job_runner:
            steps_before_run = job_runner._steps
            self.assertTrue(steps_before_run)

            job_runner.run()

            warning_logged = self.log.warning.called
            self.assertFalse(warning_logged)
Esempio n. 14
0
    def test_load_steps(self):
        # Clear the runner's _steps before running; afterwards _steps must
        # be truthy again and log.warning must have been called.
        wc_job = MRWordCount()
        wc_job.sandbox()

        with wc_job.make_runner() as job_runner:
            job_runner._steps = None

            job_runner.run()

            steps_after_run = job_runner._steps
            self.assertTrue(steps_after_run)

            warning_logged = self.log.warning.called
            self.assertTrue(warning_logged)
Esempio n. 15
0
    def test_missing_gcloud_bin(self):
        # Make the mocked Popen raise OSError(2, ...); the test expects a
        # single Popen call and _give_up_on_ssh_tunnel to end up truthy.
        missing_binary = OSError(2, 'No such file or directory')
        self.mock_Popen.side_effect = missing_binary

        wc_job = MRWordCount(['-r', 'dataproc', '--ssh-tunnel'])
        wc_job.sandbox()

        with wc_job.make_runner() as dataproc_runner:
            dataproc_runner.run()

        popen_calls = self.mock_Popen.call_count
        self.assertEqual(popen_calls, 1)
        self.assertTrue(dataproc_runner._give_up_on_ssh_tunnel)
Esempio n. 16
0
    def test_load_steps(self):
        # With _steps reset to None before run(), the runner is expected
        # to repopulate _steps and to call log.warning.
        word_count = MRWordCount()
        word_count.sandbox()

        with word_count.make_runner() as active_runner:
            active_runner._steps = None

            active_runner.run()

            reloaded_steps = active_runner._steps
            self.assertTrue(reloaded_steps)

            warned = self.log.warning.called
            self.assertTrue(warned)
Esempio n. 17
0
    def test_error_from_gcloud_bin(self):
        # The mocked process reports returncode 255; the test expects more
        # than one Popen call and _give_up_on_ssh_tunnel to stay falsy.
        mock_process = self.mock_Popen.return_value
        mock_process.returncode = 255

        wc_job = MRWordCount(['-r', 'dataproc', '--ssh-tunnel'])
        wc_job.sandbox()

        with wc_job.make_runner() as dataproc_runner:
            dataproc_runner.run()

        popen_calls = self.mock_Popen.call_count
        self.assertGreater(popen_calls, 1)
        self.assertFalse(dataproc_runner._give_up_on_ssh_tunnel)
Esempio n. 18
0
    def test_one_jar(self):
        # One --libjar option: the args for streaming step 0 should contain
        # both '-libjars' and the jar path itself.
        cli_args = ['-r', 'hadoop', '--libjar', '/path/to/a.jar']

        wc_job = MRWordCount(cli_args)
        wc_job.sandbox()

        with wc_job.make_runner() as hadoop_runner:
            hadoop_runner._add_job_files_for_upload()
            step_args = hadoop_runner._args_for_streaming_step(0)

            for expected_arg in ('-libjars', '/path/to/a.jar'):
                self.assertIn(expected_arg, step_args)
Esempio n. 19
0
    def test_one_jar(self):
        # Pass a single jar through --libjar and check that the streaming
        # step's argument list mentions '-libjars' and the jar path.
        jar_path_option = ['--libjar', '/path/to/a.jar']

        hadoop_job = MRWordCount(['-r', 'hadoop'] + jar_path_option)
        hadoop_job.sandbox()

        with hadoop_job.make_runner() as job_runner:
            job_runner._add_job_files_for_upload()
            streaming_args = job_runner._args_for_streaming_step(0)

            self.assertIn('-libjars', streaming_args)
            self.assertIn('/path/to/a.jar', streaming_args)
Esempio n. 20
0
    def test_custom_gcloud_bin(self):
        # A --gcloud-bin value of '/path/to/gcloud -v' is expected to appear
        # as two separate leading arguments of the single Popen call,
        # followed by 'compute' and 'ssh'.
        cli_args = [
            '-r', 'dataproc',
            '--ssh-tunnel',
            '--gcloud-bin', '/path/to/gcloud -v',
        ]
        wc_job = MRWordCount(cli_args)
        wc_job.sandbox()

        with wc_job.make_runner() as dataproc_runner:
            dataproc_runner.run()

        self.assertEqual(self.mock_Popen.call_count, 1)

        # first positional argument of the Popen call is the command list
        popen_args = self.mock_Popen.call_args[0][0]
        expected_prefix = ['/path/to/gcloud', '-v', 'compute', 'ssh']

        self.assertEqual(popen_args[:4], expected_prefix)
Esempio n. 21
0
    def test_open_ssh_tunnel(self):
        # With --ssh-tunnel-is-open, the single Popen call should include
        # -L, -N, -n and -q as well as -g and -4.
        cli_args = ['-r', 'dataproc', '--ssh-tunnel', '--ssh-tunnel-is-open']

        wc_job = MRWordCount(cli_args)
        wc_job.sandbox()

        with wc_job.make_runner() as dataproc_runner:
            dataproc_runner.run()

        self.assertEqual(self.mock_Popen.call_count, 1)

        # first positional argument of the Popen call is the command list
        popen_args = self.mock_Popen.call_args[0][0]

        for flag in ('-L', '-N', '-n', '-q', '-g', '-4'):
            self.assertIn(flag, popen_args)
Esempio n. 22
0
    def test_log_messages(self):
        # Have the mocked get_lines return a progress line followed by a
        # counters block, then check that log.info was called with the
        # progress text and with the counters block as a single message.
        self.get_lines.return_value = [
            '18/04/17 22:06:15 INFO mapreduce.Job:  map 100% reduce 0%\n',
            '18/04/17 22:07:34 INFO mapreduce.Job: Counters: 1\n',
            '\tFile System Counters\n',
            '\t\tFILE: Number of bytes read=819\n',
        ]

        wc_job = MRWordCount(['-r', 'dataproc'])
        wc_job.sandbox()

        with wc_job.make_runner() as dataproc_runner:
            dataproc_runner.run()

        progress_call = call('  map 100% reduce 0%')
        counters_call = call(
            'Counters: 1\n\tFile System Counters\n\t\tFILE:'
            ' Number of bytes read=819')

        info_calls = self.log.info.call_args_list
        self.assertIn(progress_call, info_calls)
        self.assertIn(counters_call, info_calls)
Esempio n. 23
0
    def test_input_files(self):
        # Feed one plain file (three lines) and one gzipped file (one line)
        # to MRWordCount; the parsed output is expected to pair each input
        # path with 3 and 1 respectively.
        plain_path = os.path.join(self.tmp_dir, 'input')
        with open(plain_path, 'wb') as plain_file:
            plain_file.write(b'bar\nqux\nfoo\n')

        gz_path = os.path.join(self.tmp_dir, 'input.gz')
        with gzip.GzipFile(gz_path, 'wb') as gz_file:
            gz_file.write(b'foo\n')

        job = MRWordCount(['-r', self.RUNNER, plain_path, gz_path])
        job.sandbox()

        with job.make_runner() as job_runner:
            job_runner.run()

            output = list(job.parse_output(job_runner.cat_output()))

            combiner_count = job_runner.counters()[0]['count']['combiners']
            self.assertGreater(combiner_count, 2)

        expected = [(plain_path, 3), (gz_path, 1)]
        self.assertEqual(sorted(output), expected)
Esempio n. 24
0
    def test_default_ssh_tunnel(self):
        # With --ssh-tunnel and no other tunnel options, exactly one Popen
        # call is expected: a 'gcloud compute ssh' command with piped
        # stdin/stdout/stderr, carrying -L/-N/-n/-q but neither -g nor -4.
        job = MRWordCount(['-r', 'dataproc', '--ssh-tunnel'])
        job.sandbox()

        with job.make_runner() as runner:
            runner.run()

        self.assertEqual(self.mock_Popen.call_count, 1)
        args_tuple, kwargs = self.mock_Popen.call_args
        # first positional argument of the Popen call is the command list
        args = args_tuple[0]

        self.assertEqual(kwargs, dict(stdin=PIPE, stdout=PIPE, stderr=PIPE))

        self.assertEqual(args[:3], ['gcloud', 'compute', 'ssh'])

        self.assertIn('-L', args)
        self.assertIn('-N', args)
        self.assertIn('-n', args)
        self.assertIn('-q', args)

        self.assertNotIn('-g', args)
        self.assertNotIn('-4', args)

        # ``called_once_with`` is not a Mock assertion: it checks nothing
        # and raises AttributeError on Python 3.12+. Use the real assertion,
        # on the process object that Popen returned.
        # NOTE(review): assumes the runner sends the bytes through
        # ``proc.stdin.write()`` -- confirm against the runner code.
        self.mock_Popen.return_value.stdin.write.assert_called_once_with(
            b'\n\n')
Esempio n. 25
0
    def test_dash_for_stdin(self):
        # Passing '-' as the only argument should let the job run to
        # completion; the test passes as long as nothing raises.
        stdin_job = MRWordCount(['-'])
        stdin_job.sandbox()

        with stdin_job.make_runner() as job_runner:
            job_runner.run()
Esempio n. 26
0
 def make_runner(self, *args):
     # Build an MRWordCount from the given arguments, sandbox it, and
     # hand back the runner it creates.
     sandboxed_job = MRWordCount(args)
     sandboxed_job.sandbox()
     job_runner = sandboxed_job.make_runner()
     return job_runner
Esempio n. 27
0
 def make_runner(self, *args):
     # Helper: create a sandboxed MRWordCount from *args* and return the
     # result of its make_runner() call.
     word_count = MRWordCount(args)
     word_count.sandbox()
     new_runner = word_count.make_runner()
     return new_runner
Esempio n. 28
0
    def test_dash_for_stdin(self):
        # Smoke test: a job given only '-' as its argument should run
        # without raising.
        word_count = MRWordCount(['-'])
        word_count.sandbox()

        with word_count.make_runner() as active_runner:
            active_runner.run()
Esempio n. 29
0
    def test_stdin_is_fine(self):
        # A job built from an empty argument list should run without
        # raising; there is nothing else to assert.
        no_args_job = MRWordCount([])
        no_args_job.sandbox()

        with no_args_job.make_runner() as job_runner:
            job_runner.run()
Esempio n. 30
0
    def test_stdin_is_fine(self):
        # Smoke test: a job constructed with no arguments should run
        # without raising.
        word_count = MRWordCount()
        word_count.sandbox()

        with word_count.make_runner() as active_runner:
            active_runner.run()