    def test_set_runner_class(self):
        spark_submit_main(['-r', 'emr', 'foo.py', 'arg1'])

        self.assertEqual(self.runner_class.alias, 'emr')

        self.assertTrue(self.runner_class.called)
        self.assertTrue(self.runner.run.called)
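
    # Most tests in this excerpt assert against a mocked runner
    # (self.runner_class, self.runner, get_runner_kwargs()) set up by a
    # fixture the excerpt doesn't show. Below is a minimal sketch of such a
    # fixture; the patch target '_runner_class_for_alias' is hypothetical,
    # standing in for whatever lookup spark_submit_main() actually uses to
    # map '-r <alias>' to a runner class. (The end-to-end tests run real
    # local or stubbed-EMR jobs and use a different, unmocked setup.)
    def setUp(self):
        from unittest.mock import Mock, patch

        super().setUp()

        self.runner_class = None  # filled in when the lookup happens
        self.runner = None

        def _fake_lookup(alias):
            # hand back a mock runner class so no real job is launched;
            # run() and cleanup() can then be asserted against the mock
            self.runner_class = Mock()
            self.runner_class.alias = alias
            self.runner = self.runner_class.return_value
            return self.runner_class

        patcher = patch(
            'mrjob.tools.spark_submit._runner_class_for_alias',  # hypothetical
            side_effect=_fake_lookup)
        self.addCleanup(patcher.stop)
        patcher.start()

    def get_runner_kwargs(self):
        # keyword args from the single call that constructed the runner
        return self.runner_class.call_args[1]
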
    def test_end_to_end(self):
        script_path = spark_wordcount_script.__file__
        if script_path.endswith('.pyc'):
            script_path = script_path[:-1]

        input_path = self.makefile(
            'input', b'one fish\ntwo fish\nred fish\nblue fish\n')

        # don't create this path, let Spark do it
        output_path = join(self.tmp_dir, 'output')
        self.assertFalse(exists(output_path))

        spark_submit_main(
            ['-r', 'local', script_path, input_path, output_path])

        self.assertTrue(exists(output_path))

        word_counts = {}

        for path in glob(join(output_path, 'part-*')):
            with open(path) as f:
                for line in f:
                    word, count = safeeval(line)
                    word_counts[word] = count

        self.assertEqual(word_counts, dict(blue=1, fish=4, one=1, red=1,
                                           two=1))
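
    # (each part-* file written by the wordcount script holds one repr()'d
    # (word, count) pair per line, e.g. "('fish', 4)"; that is the format
    # safeeval() parses back above)
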
    def test_pass_through_to_step_spark_args(self):
        spark_submit_main(['--class', 'Backpack',
                           '--name', 'Backpack',
                           '--num-executors', '3',
                           '--conf', 'foo=BAR',
                           '--name', 'Mochila',
                           'dora.jar', 'arg1'])

        # --class becomes part of step
        # --conf is an alias for a mrjob opt, goes to runner
        # other args end up in spark-args as-is
        kwargs = self.get_runner_kwargs()

        self.assertEqual(kwargs['steps'], [
            dict(
                args=['arg1'],
                jar='dora.jar',
                jobconf={},
                main_class='Backpack',
                spark_args=[
                    '--name', 'Backpack',
                    '--num-executors', '3',
                    '--name', 'Mochila',
                ],
                type='spark_jar',
            )
        ])

        self.assertEqual(kwargs['jobconf'], dict(foo='BAR'))

    def test_hard_coded_kwargs(self):
        spark_submit_main(['foo.py', 'arg1'])

        kwargs = self.get_runner_kwargs()

        self.assertEqual(kwargs['check_input_paths'], False)
        self.assertEqual(kwargs['input_paths'], [os.devnull])
        self.assertEqual(kwargs['output_dir'], None)

    def test_filters_runner_kwargs(self):
        # may want to change this behavior; see #1898
        spark_submit_main(['-r', 'emr', 'foo.py', 'arg1'])

        kwargs = self.get_runner_kwargs()

        self.assertIn('region', kwargs)
        self.assertNotIn('hadoop_bin', kwargs)
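
    # (an EMR runner has no use for hadoop_bin, so spark_submit_main()
    # evidently drops runner kwargs that don't apply to the chosen runner)
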
    def test_allow_py3_extension(self):
        spark_submit_main(['foo.py3', 'arg1', 'arg2'])

        kwargs = self.get_runner_kwargs()

        self.assertEqual(kwargs['steps'], [
            dict(
                args=['arg1', 'arg2'],
                jobconf={},
                script='foo.py3',
                spark_args=[],
                type='spark_script',
            )
        ])

    def test_no_script_args_okay(self):
        spark_submit_main(['foo.py'])

        kwargs = self.get_runner_kwargs()

        self.assertEqual(kwargs['steps'], [
            dict(
                args=[],
                jobconf={},
                script='foo.py',
                spark_args=[],
                type='spark_script',
            )
        ])

    def test_jar_main_class(self):
        spark_submit_main(
            ['--class', 'Backpack', 'dora.jar', 'arg1', 'arg2', 'arg3'])

        kwargs = self.get_runner_kwargs()

        self.assertEqual(kwargs['steps'], [
            dict(
                args=['arg1', 'arg2', 'arg3'],
                jar='dora.jar',
                jobconf={},
                main_class='Backpack',
                spark_args=[],
                type='spark_jar',
            )
        ])

    def test_runner_kwargs(self):
        spark_submit_main(['--hadoop-bin', 'super-hadoop',
                           '--master', 'local',
                           '--py-files', 'bar.py,baz.py',
                           'foo.py', 'arg1'])

        kwargs = self.get_runner_kwargs()

        # regular old runner arg
        self.assertEqual(kwargs['hadoop_bin'], 'super-hadoop')

        # spark alias for mrjob opt
        self.assertEqual(kwargs['spark_master'], 'local')

        # arg with custom parser
        self.assertEqual(kwargs['py_files'], ['bar.py', 'baz.py'])
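
    # the "custom parser" for --py-files is presumably just a comma-split,
    # i.e. something like the sketch below (not mrjob's actual option
    # machinery):
    #
    #     def _parse_py_files(value):
    #         # 'bar.py,baz.py' -> ['bar.py', 'baz.py']
    #         return value.split(',')
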
    def test_basic(self):
        spark_submit_main(['foo.py', 'arg1', 'arg2'])

        self.assertEqual(self.runner_class.alias, 'spark')

        self.assertTrue(self.runner_class.called)
        self.assertTrue(self.runner.run.called)

        kwargs = self.get_runner_kwargs()

        self.assertEqual(kwargs['steps'], [
            dict(
                args=['arg1', 'arg2'],
                jobconf={},
                script='foo.py',
                spark_args=[],
                type='spark_script',
            )
        ])
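
    # (with no '-r' switch, spark_submit_main() evidently defaults to the
    # 'spark' runner, as the alias assertion above shows)
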
    def test_jar_step(self):
        spark_submit_main(['dora.jar', 'arg1', 'arg2', 'arg3'])

        self.assertEqual(self.runner_class.alias, 'spark')

        self.assertTrue(self.runner_class.called)
        self.assertTrue(self.runner.run.called)

        kwargs = self.get_runner_kwargs()

        self.assertEqual(kwargs['steps'], [
            dict(
                args=['arg1', 'arg2', 'arg3'],
                jar='dora.jar',
                jobconf={},
                main_class=None,
                spark_args=[],
                type='spark_jar',
            )
        ])

    def test_switches_to_spark_script(self):
        # regression test for #2070: switches that appear after the script
        # path are passed through to the script, not parsed as spark-submit
        # switches
        spark_submit_main(['foo.py', '--bar', 'baz'])

        self.assertEqual(self.runner_class.alias, 'spark')

        self.assertTrue(self.runner_class.called)
        self.assertTrue(self.runner.run.called)

        kwargs = self.get_runner_kwargs()

        self.assertEqual(kwargs['steps'], [
            dict(
                args=['--bar', 'baz'],
                jobconf={},
                script='foo.py',
                spark_args=[],
                type='spark_script',
            )
        ])

    def test_end_to_end(self):
        script_path = self.makefile('foo.py')

        spark_submit_main(['-r', 'emr', script_path, 'arg1'])

        emr_client = self.client('emr')

        cluster_ids = [c['Id'] for c in
                       emr_client.list_clusters()['Clusters']]
        self.assertEqual(len(cluster_ids), 1)
        cluster_id = cluster_ids[0]

        steps = emr_client.list_steps(ClusterId=cluster_id)['Steps']
        self.assertEqual(len(steps), 1)
        step = steps[0]

        self.assertEqual(step['Status']['State'], 'COMPLETED')
        step_args = step['Config']['Args']

        self.assertEqual(step_args[0], 'spark-submit')
        self.assertEqual(step_args[-1], 'arg1')
        self.assertTrue(step_args[-2].endswith('/foo.py'))
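
    # self.client('emr') above is assumed to come from a test base class
    # that returns a stubbed boto3 EMR client; no real AWS calls are made,
    # which is why the step can be read back synchronously as COMPLETED.
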
    def test_cleanup_called(self):
        spark_submit_main(['-r', 'emr', 'foo.py', 'arg1'])

        self.assertTrue(self.runner.cleanup.called)