Code example #1
    def test_get_file_splits_test(self):
        # set up input paths
        input_path = os.path.join(self.tmp_dir, 'input')
        with open(input_path, 'w') as input_file:
            input_file.write('bar\nqux\nfoo\nbar\nqux\nfoo\n')

        input_path2 = os.path.join(self.tmp_dir, 'input2')
        with open(input_path2, 'wb') as input_file:
            input_file.write(b'foo\nbar\nbar\n')

        runner = LocalMRJobRunner(conf_paths=[])

        # split into 3 files
        file_splits = runner._get_file_splits([input_path, input_path2], 3)

        # make sure we get 3 files
        self.assertEqual(len(file_splits), 3)

        # make sure all the data is preserved
        content = []
        for file_name in file_splits:
            with open(file_name, 'rb') as f:
                content.extend(f.readlines())

        self.assertEqual(sorted(content), [
            b'bar\n', b'bar\n', b'bar\n', b'bar\n', b'foo\n', b'foo\n',
            b'foo\n', b'qux\n', b'qux\n'
        ])
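These tests refer to a self.tmp_dir created by the test fixture. A minimal sketch of the setUp/tearDown they assume (the class name here is hypothetical; mrjob's actual test base class may differ):

    import shutil
    import tempfile
    import unittest

    class LocalRunnerTestCase(unittest.TestCase):

        def setUp(self):
            # fresh scratch directory for each test's input files
            self.tmp_dir = tempfile.mkdtemp()

        def tearDown(self):
            shutil.rmtree(self.tmp_dir, ignore_errors=True)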
Code example #2
File: test_runner.py Project: irskep/mrjob
    def test_jobconf_from_step(self):
        jobconf = {"FOO": "bar", "BAZ": "qux"}
        # Hack in steps rather than creating a new MRJob subclass
        runner = LocalMRJobRunner(jobconf=jobconf)
        runner._steps = [{"jobconf": {"BAZ": "quux", "BAX": "Arnold"}}]

        self.assertEqual(runner._hadoop_args_for_step(0), ["-D", "BAX=Arnold", "-D", "BAZ=quux", "-D", "FOO=bar"])
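The expected list encodes the merge rule this test exercises: step-level jobconf overrides runner-level jobconf, and the merged keys are emitted in sorted order. A standalone sketch of that rule (not mrjob's implementation):

    # step jobconf wins over runner jobconf; keys come out sorted
    runner_jobconf = {"FOO": "bar", "BAZ": "qux"}
    step_jobconf = {"BAZ": "quux", "BAX": "Arnold"}
    merged = dict(runner_jobconf, **step_jobconf)
    args = []
    for key in sorted(merged):
        args.extend(["-D", "%s=%s" % (key, merged[key])])
    assert args == ["-D", "BAX=Arnold", "-D", "BAZ=quux", "-D", "FOO=bar"]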
Code example #3
    def test_get_file_splits_sorted_test(self):
        # set up input paths
        input_path = os.path.join(self.tmp_dir, 'input')
        with open(input_path, 'wb') as input_file:
            input_file.write(
                b'1\tbar\n1\tbar\n1\tbar\n2\tfoo\n2\tfoo\n2\tfoo\n3\tqux\n'
                b'3\tqux\n3\tqux\n')

        runner = LocalMRJobRunner(conf_paths=[])

        file_splits = runner._get_file_splits([input_path],
                                              3,
                                              keep_sorted=True)

        # make sure we get 3 files
        self.assertEqual(len(file_splits), 3)

        # make sure all the data is preserved in sorted order
        content = []
        for file_name in sorted(file_splits.keys()):
            with open(file_name, 'rb') as f:
                content.extend(f.readlines())

        self.assertEqual(content, [
            b'1\tbar\n', b'1\tbar\n', b'1\tbar\n', b'2\tfoo\n', b'2\tfoo\n',
            b'2\tfoo\n', b'3\tqux\n', b'3\tqux\n', b'3\tqux\n'
        ])
Code example #4
File: runner_test.py Project: chomp/mrjob
    def test_empty_no_user(self):
        self.getuser_should_fail = True
        runner = LocalMRJobRunner(conf_path=False)
        match = JOB_NAME_RE.match(runner.get_job_name())

        assert_equal(match.group(1), 'no_script')
        assert_equal(match.group(2), 'no_user')
Code example #5
File: test_local.py Project: alanhdu/mrjob
    def test_get_file_splits_sorted_test(self):
        # set up input paths
        input_path = os.path.join(self.tmp_dir, "input")
        with open(input_path, "wb") as input_file:
            input_file.write(b"1\tbar\n1\tbar\n1\tbar\n2\tfoo\n2\tfoo\n2\tfoo\n3\tqux\n" b"3\tqux\n3\tqux\n")

        runner = LocalMRJobRunner(conf_paths=[])

        file_splits = runner._get_file_splits([input_path], 3, keep_sorted=True)

        # make sure we get 3 files
        self.assertEqual(len(file_splits), 3)

        # make sure all the data is preserved in sorted order
        content = []
        for file_name in sorted(file_splits.keys()):
            with open(file_name, "rb") as f:
                content.extend(f.readlines())

        self.assertEqual(
            content,
            [
                b"1\tbar\n",
                b"1\tbar\n",
                b"1\tbar\n",
                b"2\tfoo\n",
                b"2\tfoo\n",
                b"2\tfoo\n",
                b"3\tqux\n",
                b"3\tqux\n",
                b"3\tqux\n",
            ],
        )
Code example #6
    def test_owner_and_label_kwargs(self):
        runner = LocalMRJobRunner(conf_path=False,
                                  owner='ads', label='ads_chain')
        match = JOB_NAME_RE.match(runner.get_job_name())

        assert_equal(match.group(1), 'ads_chain')
        assert_equal(match.group(2), 'ads')
Code example #7
    def test_auto_owner(self):
        os.environ['USER'] = 'mcp'
        runner = LocalMRJobRunner(conf_path=False)
        match = JOB_NAME_RE.match(runner.get_job_name())

        assert_equal(match.group(1), 'no_script')
        assert_equal(match.group(2), 'mcp')
Code example #8
File: runner_test.py Project: chomp/mrjob
    def test_owner_and_label_kwargs(self):
        runner = LocalMRJobRunner(conf_path=False,
                                  owner='ads', label='ads_chain')
        match = JOB_NAME_RE.match(runner.get_job_name())

        assert_equal(match.group(1), 'ads_chain')
        assert_equal(match.group(2), 'ads')
Code example #9
File: test_local.py Project: Affirm/mrjob
    def _test_spark_executor_memory(self, conf_value, megs):
        runner = LocalMRJobRunner(
            jobconf={'spark.executor.memory': conf_value})

        self.assertEqual(runner._spark_master(),
                         'local-cluster[%d,1,%d]' % (
                             cpu_count(), megs))
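The helper is meant to be called with a Spark memory string and the expected parsed value in megabytes. Hypothetical invocations (the exact string-to-MB parsing is mrjob's; these values are illustrative):

    # e.g., from a test method (sketch; values are assumptions)
    self._test_spark_executor_memory('512m', 512)
    self._test_spark_executor_memory('2g', 2048)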
Code example #10
    def test_empty_no_user(self):
        self.getuser_should_fail = True
        runner = LocalMRJobRunner(conf_path=False)
        match = JOB_NAME_RE.match(runner.get_job_name())

        assert_equal(match.group(1), 'no_script')
        assert_equal(match.group(2), 'no_user')
Code example #11
File: runner_test.py Project: chomp/mrjob
    def test_auto_owner(self):
        os.environ['USER'] = 'mcp'
        runner = LocalMRJobRunner(conf_path=False)
        match = JOB_NAME_RE.match(runner.get_job_name())

        assert_equal(match.group(1), 'no_script')
        assert_equal(match.group(2), 'mcp')
Code example #12
    def _test_spark_executor_memory(self, conf_value, megs):
        runner = LocalMRJobRunner(
            jobconf={'spark.executor.memory': conf_value})

        self.assertEqual(runner._spark_master(),
                         'local-cluster[%d,1,%d]' % (
                             cpu_count(), megs))
Code example #13
File: test_runner.py Project: bryankim220/mrjob
    def test_empty_jobconf_values(self):
        # value of None means to omit that jobconf
        jobconf = {'foo': '', 'bar': None}
        runner = LocalMRJobRunner(conf_paths=[], jobconf=jobconf)

        self.assertEqual(runner._hadoop_conf_args({}, 0, 1),
                         ['-D', 'foo='])
Code example #14
    def test_stream_output(self):
        a_dir_path = os.path.join(self.tmp_dir, 'a')
        b_dir_path = os.path.join(self.tmp_dir, 'b')
        l_dir_path = os.path.join(self.tmp_dir, '_logs')
        os.mkdir(a_dir_path)
        os.mkdir(b_dir_path)
        os.mkdir(l_dir_path)

        a_file_path = os.path.join(a_dir_path, 'part-00000')
        b_file_path = os.path.join(b_dir_path, 'part-00001')
        c_file_path = os.path.join(self.tmp_dir, 'part-00002')
        x_file_path = os.path.join(l_dir_path, 'log.xml')
        y_file_path = os.path.join(self.tmp_dir, '_SUCCESS')

        with open(a_file_path, 'w') as f:
            f.write('A')

        with open(b_file_path, 'w') as f:
            f.write('B')

        with open(c_file_path, 'w') as f:
            f.write('C')

        with open(x_file_path, 'w') as f:
            f.write('<XML XML XML/>')

        with open(y_file_path, 'w') as f:
            f.write('I win')

        runner = LocalMRJobRunner()
        runner._output_dir = self.tmp_dir
        assert_equal(sorted(runner.stream_output()),
                     ['A', 'B', 'C'])
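The directory layout exercises stream_output()'s filtering convention: part-* files are collected wherever they appear, while names beginning with an underscore (_logs/, _SUCCESS) are ignored. A sketch of that filter under those assumptions (not mrjob's actual code):

    import os

    def keep_as_output(path):
        # skip Hadoop bookkeeping entries such as _SUCCESS and _logs/
        return not any(part.startswith(('_', '.'))
                       for part in path.split(os.sep))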
Code example #15
File: test_local.py Project: eklitzke/mrjob
    def test_get_file_splits_sorted_test(self):
        # set up input paths
        input_path = os.path.join(self.tmp_dir, 'input')
        with open(input_path, 'w') as input_file:
            input_file.write(
                '1\tbar\n1\tbar\n1\tbar\n2\tfoo\n2\tfoo\n2\tfoo\n3\tqux\n'
                '3\tqux\n3\tqux\n')

        runner = LocalMRJobRunner(conf_paths=[])

        file_splits = runner._get_file_splits([input_path], 3,
                                              keep_sorted=True)

        # make sure we get 3 files
        self.assertEqual(len(file_splits), 3)

        # make sure all the data is preserved in sorted order
        content = []
        for file_name in sorted(file_splits.keys()):
            with open(file_name, 'r') as f:
                content.extend(f.readlines())

        self.assertEqual(content,
                         ['1\tbar\n', '1\tbar\n', '1\tbar\n',
                          '2\tfoo\n', '2\tfoo\n', '2\tfoo\n',
                          '3\tqux\n', '3\tqux\n', '3\tqux\n'])
Code example #16
File: test_local.py Project: alanhdu/mrjob
    def test_get_file_splits_test(self):
        # set up input paths
        input_path = os.path.join(self.tmp_dir, "input")
        with open(input_path, "w") as input_file:
            input_file.write("bar\nqux\nfoo\nbar\nqux\nfoo\n")

        input_path2 = os.path.join(self.tmp_dir, "input2")
        with open(input_path2, "wb") as input_file:
            input_file.write(b"foo\nbar\nbar\n")

        runner = LocalMRJobRunner(conf_paths=[])

        # split into 3 files
        file_splits = runner._get_file_splits([input_path, input_path2], 3)

        # make sure we get 3 files
        self.assertEqual(len(file_splits), 3)

        # make sure all the data is preserved
        content = []
        for file_name in file_splits:
            with open(file_name, "rb") as f:
                content.extend(f.readlines())

        self.assertEqual(
            sorted(content), [b"bar\n", b"bar\n", b"bar\n", b"bar\n", b"foo\n", b"foo\n", b"foo\n", b"qux\n", b"qux\n"]
        )
Code example #17
File: test_local.py Project: eklitzke/mrjob
    def test_get_file_splits_test(self):
        # set up input paths
        input_path = os.path.join(self.tmp_dir, 'input')
        with open(input_path, 'w') as input_file:
            input_file.write('bar\nqux\nfoo\nbar\nqux\nfoo\n')

        input_path2 = os.path.join(self.tmp_dir, 'input2')
        with open(input_path2, 'w') as input_file:
            input_file.write('foo\nbar\nbar\n')

        runner = LocalMRJobRunner(conf_paths=[])

        # split into 3 files
        file_splits = runner._get_file_splits([input_path, input_path2], 3)

        # make sure we get 3 files
        self.assertEqual(len(file_splits), 3)

        # make sure all the data is preserved
        content = []
        for file_name in file_splits:
            with open(file_name) as f:
                content.extend(f.readlines())

        self.assertEqual(sorted(content),
                         ['bar\n', 'bar\n', 'bar\n', 'bar\n', 'foo\n',
                          'foo\n', 'foo\n', 'qux\n', 'qux\n'])
Code example #18
File: test_runner.py Project: pyzen/mrjob
    def test_hadoop_output_format(self):
        format = "org.apache.hadoop.mapred.SequenceFileOutputFormat"
        runner = LocalMRJobRunner(conf_paths=[], hadoop_output_format=format)
        self.assertEqual(runner._hadoop_conf_args({}, 0, 1), ["-outputformat", format])
        # test multi-step job
        self.assertEqual(runner._hadoop_conf_args({}, 0, 2), [])
        self.assertEqual(runner._hadoop_conf_args({}, 1, 2), ["-outputformat", format])
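The three assertions encode the rule that -outputformat is passed only on a job's final step. A sketch of that condition (illustrative):

    def should_emit_output_format(step_num, num_steps):
        # -outputformat applies only to the last step of the job
        return step_num == num_steps - 1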
Code example #19
File: test_runner.py Project: duedil-ltd/mrjob
    def test_jobconf_job_name_custom(self):
        jobconf = {'BAX': 'Arnold', 'mapred.job.name': 'Foo'}
        runner = LocalMRJobRunner(conf_paths=[], jobconf=jobconf,
                                  hadoop_version='0.18')
        self.assertEqual(runner._hadoop_conf_args({}, 0, 1),
                         ['-jobconf', 'BAX=Arnold',
                          '-jobconf', 'mapred.job.name=Foo'
                          ])
Code example #20
File: test_runner.py Project: duedil-ltd/mrjob
    def test_partitioner(self):
        partitioner = 'org.apache.hadoop.mapreduce.Partitioner'

        runner = LocalMRJobRunner(conf_paths=[], partitioner=partitioner)
        self.assertEqual(runner._hadoop_conf_args({}, 0, 1),
                         ['-D', 'mapred.job.name=None > None',
                          '-partitioner', partitioner,
                          ])
Code example #21
File: test_runner.py Project: Affirm/mrjob
    def test_command_streaming_step_without_mr_job_script(self):
        # you don't need a script to run commands
        steps = MRCmdJob(['--mapper-cmd', 'cat'])._steps_desc()

        runner = LocalMRJobRunner(steps=steps, stdin=BytesIO(b'dog\n'))

        runner.run()
        runner.cleanup()
Code example #22
    def test_cmdenv(self):
        cmdenv = {'FOO': 'bar', 'BAZ': 'qux', 'BAX': 'Arnold'}
        runner = LocalMRJobRunner(conf_paths=[], cmdenv=cmdenv)
        self.assertEqual(runner._hadoop_conf_args({}, 0, 1),
                         ['-cmdenv', 'BAX=Arnold',
                          '-cmdenv', 'BAZ=qux',
                          '-cmdenv', 'FOO=bar',
                          ])
Code example #23
File: test_local.py Project: eklitzke/mrjob
    def test_cmdenv(self):
        cmdenv = {'FOO': 'bar', 'BAZ': 'qux', 'BAX': 'Arnold'}
        runner = LocalMRJobRunner(conf_paths=[], cmdenv=cmdenv)
        self.assertEqual(runner._hadoop_conf_args(0, 1),
                         ['-cmdenv', 'BAX=Arnold',
                          '-cmdenv', 'BAZ=qux',
                          '-cmdenv', 'FOO=bar',
                          ])
Code example #24
File: test_runner.py Project: pyzen/mrjob
    def test_jobconf(self):
        jobconf = {"FOO": "bar", "BAZ": "qux", "BAX": "Arnold"}
        runner = LocalMRJobRunner(conf_paths=[], jobconf=jobconf)
        self.assertEqual(runner._hadoop_conf_args({}, 0, 1), ["-D", "BAX=Arnold", "-D", "BAZ=qux", "-D", "FOO=bar"])
        runner = LocalMRJobRunner(conf_paths=[], jobconf=jobconf, hadoop_version="0.18")
        self.assertEqual(
            runner._hadoop_conf_args({}, 0, 1), ["-jobconf", "BAX=Arnold", "-jobconf", "BAZ=qux", "-jobconf", "FOO=bar"]
        )
Code example #25
    def test_hadoop_output_format(self):
        format = 'org.apache.hadoop.mapred.SequenceFileOutputFormat'
        runner = LocalMRJobRunner(conf_paths=[], hadoop_output_format=format)
        self.assertEqual(runner._hadoop_conf_args({}, 0, 1),
                         ['-outputformat', format])
        # test multi-step job
        self.assertEqual(runner._hadoop_conf_args({}, 0, 2), [])
        self.assertEqual(runner._hadoop_conf_args({}, 1, 2),
                         ['-outputformat', format])
Code example #26
File: test_runner.py Project: Anihc/mrjob
    def test_jobconf_from_step(self):
        jobconf = {'FOO': 'bar', 'BAZ': 'qux'}
        runner = LocalMRJobRunner(conf_paths=[], jobconf=jobconf)
        step = {'jobconf': {'BAZ': 'quux', 'BAX': 'Arnold'}}
        self.assertEqual(runner._hadoop_conf_args(step, 0, 1),
                         ['-D', 'BAX=Arnold',
                          '-D', 'BAZ=quux',
                          '-D', 'FOO=bar',
                          ])
Code example #27
File: test_local.py Project: AnthonyNystrom/mrjob
    def test_environment_variables_018(self):
        runner = LocalMRJobRunner(hadoop_version='0.18', conf_paths=[])
        # clean up after we're done. On windows, job names are only to
        # the millisecond, so these two tests end up trying to create
        # the same temp dir
        with runner as runner:
            runner._setup_working_dir()
            self.assertIn('mapred_cache_localArchives',
                          runner._subprocess_env('M', 0, 0).keys())
Code example #28
    def test_jobconf_from_step(self):
        jobconf = {'FOO': 'bar', 'BAZ': 'qux'}
        runner = LocalMRJobRunner(conf_paths=[], jobconf=jobconf)
        step = {'jobconf': {'BAZ': 'quux', 'BAX': 'Arnold'}}
        self.assertEqual(runner._hadoop_conf_args(step, 0, 1),
                         ['-D', 'BAX=Arnold',
                          '-D', 'BAZ=quux',
                          '-D', 'FOO=bar',
                          ])
Code example #29
File: test_local.py Project: eklitzke/mrjob
    def test_environment_variables_018(self):
        runner = LocalMRJobRunner(hadoop_version='0.18', conf_paths=[])
        # clean up after we're done. On windows, job names are only to
        # the millisecond, so these two tests end up trying to create
        # the same temp dir
        with runner as runner:
            runner._setup_working_dir()
            self.assertIn('mapred_cache_localArchives',
                          runner._subprocess_env('mapper', 0, 0).keys())
Code example #30
    def test_hadoop_output_format(self):
        format = 'org.apache.hadoop.mapred.SequenceFileOutputFormat'
        runner = LocalMRJobRunner(conf_path=False, hadoop_output_format=format)
        assert_equal(runner._hadoop_conf_args(0, 1),
                     ['-outputformat', format])
        # test multi-step job
        assert_equal(runner._hadoop_conf_args(0, 2), [])
        assert_equal(runner._hadoop_conf_args(1, 2),
                     ['-outputformat', format])
Code example #31
File: test_runner.py Project: yuanda/mrjob
    def test_configuration_translation(self):
        jobconf = {'mapred.jobtracker.maxtasks.per.job': 1}
        with no_handlers_for_logger('mrjob.compat'):
            runner = LocalMRJobRunner(conf_paths=[], jobconf=jobconf,
                                      hadoop_version='0.21')
        self.assertEqual(runner._hadoop_conf_args({}, 0, 1),
                         ['-D', 'mapred.jobtracker.maxtasks.per.job=1',
                          '-D', 'mapreduce.jobtracker.maxtasks.perjob=1'
                          ])
Code example #32
File: test_local.py Project: eklitzke/mrjob
    def test_hadoop_input_format(self):
        format = 'org.apache.hadoop.mapred.SequenceFileInputFormat'
        runner = LocalMRJobRunner(conf_paths=[], hadoop_input_format=format)
        self.assertEqual(runner._hadoop_conf_args(0, 1),
                         ['-inputformat', format])
        # test multi-step job
        self.assertEqual(runner._hadoop_conf_args(0, 2),
                         ['-inputformat', format])
        self.assertEqual(runner._hadoop_conf_args(1, 2), [])
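This is the mirror of the output-format rule above: -inputformat is passed only on a job's first step. A sketch (illustrative):

    def should_emit_input_format(step_num):
        # -inputformat applies only to the first step of the job
        return step_num == 0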
Code example #33
File: runner_test.py Project: chomp/mrjob
    def test_job_name_prefix_is_now_label(self):
        old_way = LocalMRJobRunner(conf_path=False, job_name_prefix='ads_chain')
        old_opts = old_way.get_opts()

        new_way = LocalMRJobRunner(conf_path=False, label='ads_chain')
        new_opts = new_way.get_opts()

        assert_equal(old_opts, new_opts)
        assert_equal(old_opts['label'], 'ads_chain')
        assert_not_in('job_name_prefix', old_opts)
Code example #34
File: test_runner.py Project: anirudhreddy92/mrjob
    def test_jobconf_from_step(self):
        jobconf = {'FOO': 'bar', 'BAZ': 'qux'}
        # Hack in steps rather than creating a new MRJob subclass
        runner = LocalMRJobRunner(jobconf=jobconf)
        runner._steps = [{'jobconf': {'BAZ': 'quux', 'BAX': 'Arnold'}}]

        self.assertEqual(runner._hadoop_args_for_step(0),
                         ['-D', 'BAX=Arnold',
                          '-D', 'BAZ=quux',
                          '-D', 'FOO=bar',
                          ])
Code example #35
File: test_runner.py Project: Milkigit/mrjob
    def test_jobconf_from_step(self):
        jobconf = {'FOO': 'bar', 'BAZ': 'qux'}
        # Hack in steps rather than creating a new MRJob subclass
        runner = LocalMRJobRunner(jobconf=jobconf)
        runner._steps = [{'jobconf': {'BAZ': 'quux', 'BAX': 'Arnold'}}]

        self.assertEqual(runner._hadoop_args_for_step(0),
                         ['-D', 'BAX=Arnold',
                          '-D', 'BAZ=quux',
                          '-D', 'FOO=bar',
                          ])
Code example #36
File: test_runner.py Project: duedil-ltd/mrjob
    def test_hadoop_output_format(self):
        format = 'org.apache.hadoop.mapred.SequenceFileOutputFormat'
        runner = LocalMRJobRunner(conf_paths=[], hadoop_output_format=format)
        self.assertEqual(runner._hadoop_conf_args({}, 0, 1),
                         ['-D', 'mapred.job.name=None > None',
                          '-outputformat', format])
        # test multi-step job
        self.assertEqual(runner._hadoop_conf_args({}, 0, 2),
                         ['-D', 'mapred.job.name=None > None (step 1 of 2)'])
        self.assertEqual(runner._hadoop_conf_args({}, 1, 2),
                         ['-D', 'mapred.job.name=None > None (step 2 of 2)',
                          '-outputformat', format
                          ])
Code example #37
File: test_runner.py Project: duedil-ltd/mrjob
    def test_hadoop_extra_args_comes_first(self):
        runner = LocalMRJobRunner(
            cmdenv={'FOO': 'bar'},
            conf_paths=[],
            hadoop_extra_args=['-libjar', 'qux.jar'],
            hadoop_input_format='FooInputFormat',
            hadoop_output_format='BarOutputFormat',
            jobconf={'baz': 'quz'},
            partitioner='java.lang.Object',
        )
        # hadoop_extra_args should come first
        conf_args = runner._hadoop_conf_args({}, 0, 1)
        self.assertEqual(conf_args[:2], ['-libjar', 'qux.jar'])
        self.assertEqual(len(conf_args), 14)
Code example #38
File: test_runner.py Project: pyzen/mrjob
    def test_hadoop_extra_args_comes_first(self):
        runner = LocalMRJobRunner(
            cmdenv={"FOO": "bar"},
            conf_paths=[],
            hadoop_extra_args=["-libjar", "qux.jar"],
            hadoop_input_format="FooInputFormat",
            hadoop_output_format="BarOutputFormat",
            jobconf={"baz": "quz"},
            partitioner="java.lang.Object",
        )
        # hadoop_extra_args should come first
        conf_args = runner._hadoop_conf_args({}, 0, 1)
        self.assertEqual(conf_args[:2], ["-libjar", "qux.jar"])
        self.assertEqual(len(conf_args), 12)
Code example #39
    def test_hadoop_extra_args_comes_first(self):
        runner = LocalMRJobRunner(
            cmdenv={'FOO': 'bar'},
            conf_paths=[],
            hadoop_extra_args=['-libjar', 'qux.jar'],
            hadoop_input_format='FooInputFormat',
            hadoop_output_format='BarOutputFormat',
            jobconf={'baz': 'quz'},
            partitioner='java.lang.Object',
        )
        # hadoop_extra_args should come first
        conf_args = runner._hadoop_conf_args({}, 0, 1)
        self.assertEqual(conf_args[:2], ['-libjar', 'qux.jar'])
        self.assertEqual(len(conf_args), 12)
Code example #40
File: test_local.py Project: AnthonyNystrom/mrjob
class TestIronPythonEnvironment(unittest.TestCase):
    def setUp(self):
        self.runner = LocalMRJobRunner(conf_paths=[])
        self.runner._setup_working_dir()

    def test_env_ironpython(self):
        with patch.object(local, 'is_ironpython', True):
            environment = self.runner._subprocess_env('M', 0, 0)
            self.assertIn('IRONPYTHONPATH', environment)

    def test_env_no_ironpython(self):
        with patch.object(local, 'is_ironpython', False):
            environment = self.runner._subprocess_env('M', 0, 0)
            self.assertNotIn('IRONPYTHONPATH', environment)
Code example #41
File: test_local.py Project: DrMavenRebe/mrjob
class TestIronPythonEnvironment(unittest.TestCase):

    def setUp(self):
        self.runner = LocalMRJobRunner(conf_paths=[])
        self.runner._setup_working_dir()

    def test_env_ironpython(self):
        with patch.object(local, 'is_ironpython', True):
            environment = self.runner._subprocess_env('M', 0, 0)
            self.assertIn('IRONPYTHONPATH', environment)

    def test_env_no_ironpython(self):
        with patch.object(local, 'is_ironpython', False):
            environment = self.runner._subprocess_env('M', 0, 0)
            self.assertNotIn('IRONPYTHONPATH', environment)
Code example #42
File: test_local.py Project: eklitzke/mrjob
    def test_jobconf(self):
        jobconf = {'FOO': 'bar', 'BAZ': 'qux', 'BAX': 'Arnold'}
        runner = LocalMRJobRunner(conf_paths=[], jobconf=jobconf)
        self.assertEqual(runner._hadoop_conf_args(0, 1),
                         ['-D', 'BAX=Arnold',
                          '-D', 'BAZ=qux',
                          '-D', 'FOO=bar',
                          ])
        runner = LocalMRJobRunner(conf_paths=[], jobconf=jobconf,
                                  hadoop_version='0.18')
        self.assertEqual(runner._hadoop_conf_args(0, 1),
                         ['-jobconf', 'BAX=Arnold',
                          '-jobconf', 'BAZ=qux',
                          '-jobconf', 'FOO=bar',
                          ])
Code example #43
    def make_runner(self):
        """Make a runner based on command-line arguments, so we can
        launch this job on EMR, on Hadoop, or locally.

        :rtype: :py:class:`mrjob.runner.MRJobRunner`
        """
        if self.options.runner == 'emr':
            # avoid requiring dependencies (such as boto3) for other runners
            from mrjob.emr import EMRJobRunner
            return EMRJobRunner(**self.emr_job_runner_kwargs())

        elif self.options.runner == 'dataproc':
            from mrjob.dataproc import DataprocJobRunner
            return DataprocJobRunner(**self.dataproc_job_runner_kwargs())

        elif self.options.runner == 'hadoop':
            from mrjob.hadoop import HadoopJobRunner
            return HadoopJobRunner(**self.hadoop_job_runner_kwargs())

        elif self.options.runner == 'inline':
            raise ValueError("inline is not supported in the multi-lingual"
                             " launcher.")

        else:
            # run locally by default
            from mrjob.local import LocalMRJobRunner
            return LocalMRJobRunner(**self.local_job_runner_kwargs())
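For context, a hedged sketch of how make_runner() is typically driven from a job script, following mrjob's documented pattern (MRWordCount is a placeholder for your MRJob subclass):

    # MRWordCount is a hypothetical MRJob subclass used for illustration
    mr_job = MRWordCount(args=['-r', 'local', 'input.txt'])
    with mr_job.make_runner() as runner:
        runner.run()
        for key, value in mr_job.parse_output(runner.cat_output()):
            print(key, value)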
Code example #44
    def test_extra_kwargs_passed_in_directly_okay(self):
        with logger_disabled('mrjob.runner'):
            with LocalMRJobRunner(conf_path=False,
                                  base_tmp_dir='/var/tmp',
                                  foo='bar') as runner:
                self.assertEqual(runner._opts['base_tmp_dir'], '/var/tmp')
                self.assertNotIn('bar', runner._opts)
Code example #45
    def make_runner(self):
        """Make a runner based on command-line arguments, so we can
        launch this job on EMR, on Hadoop, or locally.

        :rtype: :py:class:`mrjob.runner.MRJobRunner`
        """
        # have to import here so that we can still run the MRJob
        # without importing boto
        from mrjob.emr import EMRJobRunner
        from mrjob.hadoop import HadoopJobRunner
        from mrjob.local import LocalMRJobRunner

        if self.options.runner == 'emr':
            return EMRJobRunner(**self.emr_job_runner_kwargs())

        elif self.options.runner == 'hadoop':
            return HadoopJobRunner(**self.hadoop_job_runner_kwargs())

        elif self.options.runner == 'inline':
            raise ValueError("inline is not supported in the multi-lingual"
                             " launcher.")

        else:
            # run locally by default
            return LocalMRJobRunner(**self.local_job_runner_kwargs())
Code example #46
    def test_cleanup_after_with_statement(self):
        local_tmp_dir = None

        with LocalMRJobRunner() as runner:
            local_tmp_dir = runner._get_local_tmp_dir()
            assert os.path.exists(local_tmp_dir)

        assert not os.path.exists(local_tmp_dir)
Code example #47
    def _test_cleanup_after_with_statement(self, mode, should_exist):
        with LocalMRJobRunner(cleanup=mode) as runner:
            self.local_tmp_dir = runner._get_local_tmp_dir()
            assert os.path.exists(self.local_tmp_dir)

        assert_equal(os.path.exists(self.local_tmp_dir), should_exist)
        if not should_exist:
            self.local_tmp_dir = None
Code example #48
File: test_local.py Project: Affirm/mrjob
    def gz_test(self, dir_path_name):
        contents_gz = [b'bar\n', b'qux\n', b'foo\n', b'bar\n',
                       b'qux\n', b'foo\n']
        contents_normal = [b'foo\n', b'bar\n', b'bar\n']
        all_contents_sorted = sorted(contents_gz + contents_normal)

        input_gz_path = join(dir_path_name, 'input.gz')
        input_gz = gzip.GzipFile(input_gz_path, 'wb')
        input_gz.write(b''.join(contents_gz))
        input_gz.close()
        input_path2 = join(dir_path_name, 'input2')
        with open(input_path2, 'wb') as input_file:
            input_file.write(b''.join(contents_normal))

        runner = LocalMRJobRunner(conf_paths=[])

        # split into 3 files
        file_splits = runner._get_file_splits([input_gz_path, input_path2], 3)

        # Make sure that input.gz occurs in a single split that starts at
        # its beginning and ends at its end
        for split_info in file_splits.values():
            if split_info['orig_name'] == input_gz_path:
                self.assertEqual(split_info['start'], 0)
                self.assertEqual(split_info['length'],
                                 os.stat(input_gz_path)[stat.ST_SIZE])

        # make sure we get 3 files
        self.assertEqual(len(file_splits), 3)

        # make sure all the data is preserved
        content = []
        for file_name in file_splits:
            with open(file_name, 'rb') as f:
                lines = list(to_lines(decompress(f, file_name)))

            # make sure the input_gz split got its entire contents
            if file_name == input_gz_path:
                self.assertEqual(lines, contents_gz)

            content.extend(lines)

        self.assertEqual(sorted(content),
                         all_contents_sorted)
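The reason input.gz must land in a single split: gzip streams cannot be read from an arbitrary offset, so the runner keeps each compressed file whole while plain files may be cut anywhere. A sketch of that decision (illustrative, not mrjob's code):

    def can_split(path):
        # compressed inputs have to be processed as one piece
        return not path.endswith(('.gz', '.bz2'))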
Code example #49
    def gz_test(self, dir_path_name):
        contents_gz = [
            b'bar\n', b'qux\n', b'foo\n', b'bar\n', b'qux\n', b'foo\n'
        ]
        contents_normal = [b'foo\n', b'bar\n', b'bar\n']
        all_contents_sorted = sorted(contents_gz + contents_normal)

        input_gz_path = join(dir_path_name, 'input.gz')
        input_gz = gzip.GzipFile(input_gz_path, 'wb')
        input_gz.write(b''.join(contents_gz))
        input_gz.close()
        input_path2 = join(dir_path_name, 'input2')
        with open(input_path2, 'wb') as input_file:
            input_file.write(b''.join(contents_normal))

        runner = LocalMRJobRunner(conf_paths=[])

        # split into 3 files
        file_splits = runner._get_file_splits([input_gz_path, input_path2], 3)

        # Make sure that input.gz occurs in a single split that starts at
        # its beginning and ends at its end
        for split_info in file_splits.values():
            if split_info['orig_name'] == input_gz_path:
                self.assertEqual(split_info['start'], 0)
                self.assertEqual(split_info['length'],
                                 os.stat(input_gz_path)[stat.ST_SIZE])

        # make sure we get 3 files
        self.assertEqual(len(file_splits), 3)

        # make sure all the data is preserved
        content = []
        for file_name in file_splits:
            with open(file_name, 'rb') as f:
                lines = list(to_lines(decompress(f, file_name)))

            # make sure the input_gz split got its entire contents
            if file_name == input_gz_path:
                self.assertEqual(lines, contents_gz)

            content.extend(lines)

        self.assertEqual(sorted(content), all_contents_sorted)
Code example #50
    def test_create_mrjob_tar_gz(self):
        with LocalMRJobRunner(conf_path=False) as runner:
            mrjob_tar_gz_path = runner._create_mrjob_tar_gz()
            mrjob_tar_gz = tarfile.open(mrjob_tar_gz_path)
            contents = mrjob_tar_gz.getnames()

            for path in contents:
                assert_equal(path[:6], 'mrjob/')

            assert_in('mrjob/job.py', contents)
Code example #51
    def test_cat_uncompressed(self):
        input_path = os.path.join(self.tmp_dir, 'input')
        with open(input_path, 'w') as input_file:
            input_file.write('bar\nfoo\n')

        with LocalMRJobRunner() as runner:
            output = []
            for line in runner.cat(input_path):
                output.append(line)

        assert_equal(output, ['bar\n', 'foo\n'])
Code example #52
    def test_cleanup_deprecated(self):
        stderr = StringIO()
        with no_handlers_for_logger():
            log_to_stream('mrjob', stderr)
            with LocalMRJobRunner(cleanup=CLEANUP_DEFAULT) as runner:
                self.local_tmp_dir = runner._get_local_tmp_dir()
                assert os.path.exists(self.local_tmp_dir)

            assert_equal(os.path.exists(self.local_tmp_dir), False)
            self.local_tmp_dir = None
            assert_in('deprecated', stderr.getvalue())
Code example #53
File: test_runner.py Project: yzhanggithub/mrjob
    def test_command_streaming_step_without_mr_job_script(self):
        # you don't need a script to run commands
        steps = MRCmdJob(['--mapper-cmd', 'cat'])._steps_desc()

        runner = LocalMRJobRunner(steps=steps, stdin=BytesIO(b'dog\n'))

        runner.run()
        runner.cleanup()
Code example #54
    def test_cat_compressed(self):
        input_gz_path = os.path.join(self.tmp_dir, 'input.gz')
        input_gz = gzip.GzipFile(input_gz_path, 'w')
        input_gz.write('foo\nbar\n')
        input_gz.close()

        with LocalMRJobRunner() as runner:
            output = []
            for line in runner.cat(input_gz_path):
                output.append(line)

        assert_equal(output, ['foo\n', 'bar\n'])

        input_bz2_path = os.path.join(self.tmp_dir, 'input.bz2')
        input_bz2 = bz2.BZ2File(input_bz2_path, 'w')
        input_bz2.write('bar\nbar\nfoo\n')
        input_bz2.close()

        with LocalMRJobRunner() as runner:
            output = []
            for line in runner.cat(input_bz2_path):
                output.append(line)

        assert_equal(output, ['bar\n', 'bar\n', 'foo\n'])
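Note that this snippet predates Python 3, where GzipFile and BZ2File opened for writing accept only bytes. A Python 3 version of the compressed-file setup, reusing the paths defined above, might look like (sketch):

    import bz2
    import gzip

    with gzip.open(input_gz_path, 'wb') as f:
        f.write(b'foo\nbar\n')
    with bz2.open(input_bz2_path, 'wb') as f:
        f.write(b'bar\nbar\nfoo\n')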
Code example #55
    def test_job_name_prefix_is_now_label(self):
        with logger_disabled('mrjob.runner'):
            old_way = LocalMRJobRunner(
                conf_path=False, job_name_prefix='ads_chain')
        old_opts = old_way.get_opts()

        new_way = LocalMRJobRunner(conf_path=False, label='ads_chain')
        new_opts = new_way.get_opts()

        assert_equal(old_opts, new_opts)
        assert_equal(old_opts['label'], 'ads_chain')
        assert_not_in('job_name_prefix', old_opts)
Code example #56
    def test_jobconf(self):
        jobconf = {'FOO': 'bar', 'BAZ': 'qux', 'BAX': 'Arnold'}
        runner = LocalMRJobRunner(conf_paths=[], jobconf=jobconf)
        self.assertEqual(runner._hadoop_conf_args({}, 0, 1),
                         ['-D', 'BAX=Arnold',
                          '-D', 'BAZ=qux',
                          '-D', 'FOO=bar',
                          ])
        runner = LocalMRJobRunner(conf_paths=[], jobconf=jobconf,
                                  hadoop_version='0.18')
        self.assertEqual(runner._hadoop_conf_args({}, 0, 1),
                         ['-jobconf', 'BAX=Arnold',
                          '-jobconf', 'BAZ=qux',
                          '-jobconf', 'FOO=bar',
                          ])
Code example #57
    def test_partitioner(self):
        partitioner = 'org.apache.hadoop.mapreduce.Partitioner'

        runner = LocalMRJobRunner(conf_paths=[], partitioner=partitioner)
        self.assertEqual(runner._hadoop_conf_args({}, 0, 1),
                         ['-partitioner', partitioner])