Beispiel #1
0
    def test_jobconf_simulated_by_runner(self):
        """Check the jobconf values the runner simulates for a one-task job.

        Runs MRTestJobConf over a one-line input file with one uploaded
        file, collects the jobconf name/value pairs the job emits, and
        compares them with values derived from the runner's own paths and
        job name.
        """
        # Both files are opened in binary mode, so they must be written with
        # bytes literals; writing str to a "wb" handle raises TypeError on
        # Python 3.
        input_path = os.path.join(self.tmp_dir, "input")
        with open(input_path, "wb") as input_file:
            input_file.write(b"foo\n")

        upload_path = os.path.join(self.tmp_dir, "upload")
        with open(upload_path, "wb") as upload_file:
            upload_file.write(b"PAYLOAD")

        mr_job = MRTestJobConf(
            [
                "-r",
                self.RUNNER,
                "--jobconf=user.defined=something",
                "--jobconf=mapred.map.tasks=1",
                "--file",
                upload_path,
                input_path,
            ]
        )
        mr_job.sandbox()

        results = {}

        # between the single line of input and setting mapred.map.tasks to 1,
        # we should be restricted to only one task, which will give more
        # predictable results

        with mr_job.make_runner() as runner:
            # remember the script path; it is part of the expected
            # mapreduce.job.cache.files value checked below
            script_path = runner._script_path

            runner.run()

            # each output line parses to a (jobconf name, value) pair
            for line in runner.stream_output():
                key, value = mr_job.parse_output_line(line)
                results[key] = value

        working_dir = results["mapreduce.job.local.dir"]
        self.assertEqual(working_dir, os.path.join(runner._get_local_tmp_dir(), "job_local_dir", "0", "mapper", "0"))

        self.assertEqual(results["mapreduce.job.cache.archives"], "")
        self.assertEqual(
            results["mapreduce.job.cache.files"], script_path + "#mr_test_jobconf.py" + "," + upload_path + "#upload"
        )
        self.assertEqual(results["mapreduce.job.cache.local.archives"], "")
        self.assertEqual(
            results["mapreduce.job.cache.local.files"],
            os.path.join(working_dir, "mr_test_jobconf.py") + "," + os.path.join(working_dir, "upload"),
        )
        self.assertEqual(results["mapreduce.job.id"], runner._job_name)

        self.assertEqual(results["mapreduce.map.input.file"], input_path)
        # "foo\n" is four bytes long
        self.assertEqual(results["mapreduce.map.input.length"], "4")
        self.assertEqual(results["mapreduce.map.input.start"], "0")
        self.assertEqual(results["mapreduce.task.attempt.id"], "attempt_%s_mapper_000000_0" % runner._job_name)
        self.assertEqual(results["mapreduce.task.id"], "task_%s_mapper_000000" % runner._job_name)
        self.assertEqual(results["mapreduce.task.ismap"], "true")
        self.assertEqual(results["mapreduce.task.output.dir"], runner._output_dir)
        self.assertEqual(results["mapreduce.task.partition"], "0")
        self.assertEqual(results["user.defined"], "something")
Beispiel #2
0
    def test_others(self):
        """Verify jobconf values reported by a job run on the inline runner.

        Feeds a one-line input file to MRTestJobConf together with a
        user-defined jobconf setting, then compares the reported values
        with the runner's own attributes.
        """
        input_path = os.path.join(self.tmp_dir, 'input')
        with open(input_path, 'w') as f:
            f.write('foo\n')

        args = [
            '-r', 'inline',
            '--jobconf=user.defined=something',
            input_path,
        ]
        mr_job = MRTestJobConf(args)
        mr_job.sandbox()

        with mr_job.make_runner() as runner:
            runner.run()

            # each output line parses to a (jobconf name, value) pair
            results = dict(
                mr_job.parse_output_line(line)
                for line in runner.stream_output())

        job_name = runner._job_name

        self.assertEqual(results['mapreduce.job.id'], job_name)
        self.assertEqual(
            results['mapreduce.job.local.dir'], runner._working_dir)
        self.assertEqual(results['mapreduce.map.input.file'], input_path)
        # 'foo\n' is four characters long
        self.assertEqual(results['mapreduce.map.input.length'], '4')
        self.assertEqual(results['mapreduce.map.input.start'], '0')

        expected_attempt_id = 'attempt_%s_mapper_000000_0' % job_name
        self.assertEqual(
            results['mapreduce.task.attempt.id'], expected_attempt_id)

        expected_task_id = 'task_%s_mapper_000000' % job_name
        self.assertEqual(results['mapreduce.task.id'], expected_task_id)

        self.assertEqual(results['mapreduce.task.ismap'], 'true')
        self.assertEqual(
            results['mapreduce.task.output.dir'], runner._output_dir)
        self.assertEqual(results['mapreduce.task.partition'], '0')
        self.assertEqual(results['user.defined'], 'something')
Beispiel #3
0
    def test_jobconf_simulated_by_runner(self):
        """Check the jobconf values the runner simulates for a one-task job.

        Runs MRTestJobConf over a one-line input file with one uploaded
        file, collects the jobconf name/value pairs the job emits, and
        compares them with values derived from the runner's own paths and
        job name.
        """
        # Both files are opened in binary mode, so they must be written with
        # bytes literals; writing str to a 'wb' handle raises TypeError on
        # Python 3.
        input_path = os.path.join(self.tmp_dir, 'input')
        with open(input_path, 'wb') as input_file:
            input_file.write(b'foo\n')

        upload_path = os.path.join(self.tmp_dir, 'upload')
        with open(upload_path, 'wb') as upload_file:
            upload_file.write(b'PAYLOAD')

        mr_job = MRTestJobConf(['-r', self.RUNNER,
                                '--jobconf=user.defined=something',
                                '--jobconf=mapred.map.tasks=1',
                                '--file', upload_path,
                                input_path])
        mr_job.sandbox()

        results = {}

        # between the single line of input and setting mapred.map.tasks to 1,
        # we should be restricted to only one task, which will give more
        # predictable results

        with mr_job.make_runner() as runner:
            # remember the script path; it is part of the expected
            # mapreduce.job.cache.files value checked below
            script_path = runner._script_path

            runner.run()

            # each output line parses to a (jobconf name, value) pair
            for line in runner.stream_output():
                key, value = mr_job.parse_output_line(line)
                results[key] = value

        working_dir = results['mapreduce.job.local.dir']
        self.assertEqual(working_dir,
                         os.path.join(runner._get_local_tmp_dir(),
                                      'job_local_dir', '0', 'mapper', '0'))

        self.assertEqual(results['mapreduce.job.cache.archives'], '')
        self.assertEqual(results['mapreduce.job.cache.files'],
                         script_path + '#mr_test_jobconf.py' + ',' +
                         upload_path + '#upload')
        self.assertEqual(results['mapreduce.job.cache.local.archives'], '')
        self.assertEqual(
            results['mapreduce.job.cache.local.files'],
            os.path.join(working_dir, 'mr_test_jobconf.py') + ',' +
            os.path.join(working_dir, 'upload'))
        self.assertEqual(results['mapreduce.job.id'], runner._job_name)

        self.assertEqual(results['mapreduce.map.input.file'], input_path)
        # 'foo\n' is four bytes long
        self.assertEqual(results['mapreduce.map.input.length'], '4')
        self.assertEqual(results['mapreduce.map.input.start'], '0')
        self.assertEqual(results['mapreduce.task.attempt.id'],
                         'attempt_%s_mapper_000000_0' % runner._job_name)
        self.assertEqual(results['mapreduce.task.id'],
                         'task_%s_mapper_000000' % runner._job_name)
        self.assertEqual(results['mapreduce.task.ismap'], 'true')
        self.assertEqual(results['mapreduce.task.output.dir'],
                         runner._output_dir)
        self.assertEqual(results['mapreduce.task.partition'], '0')
        self.assertEqual(results['user.defined'], 'something')
Beispiel #4
0
    def test_mapper_init(self):
        """Check the emulated map input file on the spark runner.

        Runs MRTestJobConf with --emulate-map-input-file on a two-line
        input and expects mapreduce.map.input.file in the job's output to
        be the input path prefixed with file://.
        """
        input_path = self.makefile('two_lines', b'line\nother line\n')

        args = ['-r', 'spark', '--emulate-map-input-file', input_path]
        job = MRTestJobConf(args)

        with job.make_runner() as runner:
            runner.run()

            # the job's output is a sequence of (jobconf name, value) pairs
            reported = dict(job.parse_output(runner.cat_output()))
            expected_uri = 'file://' + input_path

            self.assertEqual(
                reported['mapreduce.map.input.file'], expected_uri)
Beispiel #5
0
    def test_empty_file(self):
        """An empty input file must not crash --emulate-map-input-file.

        Runs MRTestJobConf on the spark runner over a two-line file and an
        empty file, then checks which mapreduce.map.input.file values the
        job reports.
        """
        two_lines_path = self.makefile('two_lines', b'line\nother line\n')
        no_lines_path = self.makefile('no_lines', b'')

        # pass the empty file as an input as well; it used to be created
        # but never handed to the job, so the empty-file case this test is
        # named for was not actually exercised
        job = MRTestJobConf([
            '-r', 'spark', '--emulate-map-input-file',
            two_lines_path, no_lines_path,
        ])

        with job.make_runner() as runner:
            runner.run()

            paths = [
                path for jobconf, path in job.parse_output(runner.cat_output())
                if jobconf == 'mapreduce.map.input.file'
            ]

            # ideally, no_lines_path would appear too, but what we care
            # about is that we don't get a crash from trying to read
            # the "first" line of the file
            self.assertEqual(paths, ['file://' + two_lines_path])
Beispiel #6
0
    def test_others(self):
        """Verify jobconf values reported by a job run on the inline runner.

        Feeds a one-line input file to MRTestJobConf together with a
        user-defined jobconf setting, then compares the reported values
        with the runner's own attributes.
        """
        input_path = os.path.join(self.tmp_dir, 'input')
        with open(input_path, 'w') as f:
            f.write('foo\n')

        args = [
            '-r', 'inline',
            '--jobconf=user.defined=something',
            input_path,
        ]
        mr_job = MRTestJobConf(args)
        mr_job.sandbox()

        with mr_job.make_runner() as runner:
            runner.run()

            # each output line parses to a (jobconf name, value) pair
            results = dict(
                mr_job.parse_output_line(line)
                for line in runner.stream_output())

        job_name = runner._job_name

        self.assertEqual(results['mapreduce.job.id'], job_name)
        self.assertEqual(
            results['mapreduce.job.local.dir'], runner._working_dir)
        self.assertEqual(results['mapreduce.map.input.file'], input_path)
        # 'foo\n' is four characters long
        self.assertEqual(results['mapreduce.map.input.length'], '4')
        self.assertEqual(results['mapreduce.map.input.start'], '0')

        expected_attempt_id = 'attempt_%s_mapper_000000_0' % job_name
        self.assertEqual(
            results['mapreduce.task.attempt.id'], expected_attempt_id)

        expected_task_id = 'task_%s_mapper_000000' % job_name
        self.assertEqual(results['mapreduce.task.id'], expected_task_id)

        self.assertEqual(results['mapreduce.task.ismap'], 'true')
        self.assertEqual(
            results['mapreduce.task.output.dir'], runner._output_dir)
        self.assertEqual(results['mapreduce.task.partition'], '0')
        self.assertEqual(results['user.defined'], 'something')
Beispiel #7
0
    def test_jobconf_simulated_by_runner(self):
        """Check the jobconf values the runner simulates for a one-task job.

        Runs MRTestJobConf over a one-line input file with one uploaded
        file, collects the jobconf name/value pairs the job emits, and
        compares them with values derived from the runner's own paths and
        job name.
        """
        # Both files are opened in binary mode, so they must be written with
        # bytes literals; writing str to a 'wb' handle raises TypeError on
        # Python 3.
        input_path = os.path.join(self.tmp_dir, 'input')
        with open(input_path, 'wb') as input_file:
            input_file.write(b'foo\n')

        upload_path = os.path.join(self.tmp_dir, 'upload')
        with open(upload_path, 'wb') as upload_file:
            upload_file.write(b'PAYLOAD')

        mr_job = MRTestJobConf([
            '-r', self.RUNNER, '--jobconf=user.defined=something',
            '--jobconf=mapred.map.tasks=1', '--file', upload_path, input_path
        ])
        mr_job.sandbox()

        results = {}

        # between the single line of input and setting mapred.map.tasks to 1,
        # we should be restricted to only one task, which will give more
        # predictable results

        with mr_job.make_runner() as runner:
            # remember the script path; it is part of the expected
            # mapreduce.job.cache.files value checked below
            script_path = runner._script_path

            runner.run()

            # each output line parses to a (jobconf name, value) pair
            for line in runner.stream_output():
                key, value = mr_job.parse_output_line(line)
                results[key] = value

        working_dir = results['mapreduce.job.local.dir']
        self.assertEqual(
            working_dir,
            os.path.join(runner._get_local_tmp_dir(), 'job_local_dir', '0',
                         'mapper', '0'))

        self.assertEqual(results['mapreduce.job.cache.archives'], '')
        self.assertEqual(
            results['mapreduce.job.cache.files'], script_path +
            '#mr_test_jobconf.py' + ',' + upload_path + '#upload')
        self.assertEqual(results['mapreduce.job.cache.local.archives'], '')
        self.assertEqual(
            results['mapreduce.job.cache.local.files'],
            os.path.join(working_dir, 'mr_test_jobconf.py') + ',' +
            os.path.join(working_dir, 'upload'))
        self.assertEqual(results['mapreduce.job.id'], runner._job_name)

        self.assertEqual(results['mapreduce.map.input.file'], input_path)
        # 'foo\n' is four bytes long
        self.assertEqual(results['mapreduce.map.input.length'], '4')
        self.assertEqual(results['mapreduce.map.input.start'], '0')
        self.assertEqual(results['mapreduce.task.attempt.id'],
                         'attempt_%s_mapper_000000_0' % runner._job_name)
        self.assertEqual(results['mapreduce.task.id'],
                         'task_%s_mapper_000000' % runner._job_name)
        self.assertEqual(results['mapreduce.task.ismap'], 'true')
        self.assertEqual(results['mapreduce.task.output.dir'],
                         runner._output_dir)
        self.assertEqual(results['mapreduce.task.partition'], '0')
        self.assertEqual(results['user.defined'], 'something')
Beispiel #8
0
    def test_jobconf_simulated_by_runner(self):
        """Compare the jobconf values a job reports with the runner's state.

        Runs MRTestJobConf over a one-line input file with one uploaded
        file and checks every reported jobconf value against what the
        runner's paths and job key imply.
        """
        input_path = os.path.join(self.tmp_dir, 'input')
        with open(input_path, 'wb') as f:
            f.write(b'foo\n')

        upload_path = os.path.join(self.tmp_dir, 'upload')
        with open(upload_path, 'wb') as f:
            f.write(b'PAYLOAD')

        # --no-bootstrap-mrjob spares us from accounting for mrjob.tar.gz
        # and the setup wrapper script
        self.add_mrjob_to_pythonpath()
        args = [
            '-r', self.RUNNER,
            '--no-bootstrap-mrjob',
            '--jobconf=user.defined=something',
            '--jobconf=mapred.map.tasks=1',
            '--file', upload_path,
            input_path,
        ]
        mr_job = MRTestJobConf(args)
        mr_job.sandbox()

        # one line of input plus mapred.map.tasks=1 should limit the job to
        # a single task, which makes the reported values predictable
        with mr_job.make_runner() as runner:
            script_path = runner._script_path

            runner.run()

            # each output line parses to a (jobconf name, value) pair
            results = dict(
                mr_job.parse_output_line(line)
                for line in runner.stream_output())

        working_dir = results['mapreduce.job.local.dir']
        expected_working_dir = os.path.join(
            runner._get_local_tmp_dir(), 'job_local_dir', '0', 'mapper', '0')
        self.assertEqual(working_dir, expected_working_dir)

        self.assertEqual(results['mapreduce.job.cache.archives'], '')

        # both sides are sorted before comparing, so the order of the
        # comma-separated entries does not matter
        expected_cache_files = [
            script_path + '#mr_test_jobconf.py',
            upload_path + '#upload',
        ]
        expected_cache_files.extend(
            '%s#%s' % (path, name)
            for path, name in self._extra_expected_local_files(runner))
        actual_cache_files = results['mapreduce.job.cache.files'].split(',')
        self.assertEqual(
            sorted(actual_cache_files), sorted(expected_cache_files))

        self.assertEqual(results['mapreduce.job.cache.local.archives'], '')

        expected_local_files = [
            os.path.join(working_dir, 'mr_test_jobconf.py'),
            os.path.join(working_dir, 'upload'),
        ]
        expected_local_files.extend(
            os.path.join(working_dir, name)
            for _, name in self._extra_expected_local_files(runner))
        actual_local_files = (
            results['mapreduce.job.cache.local.files'].split(','))
        self.assertEqual(
            sorted(actual_local_files), sorted(expected_local_files))

        job_key = runner._job_key
        self.assertEqual(results['mapreduce.job.id'], job_key)

        self.assertEqual(results['mapreduce.map.input.file'], input_path)
        # b'foo\n' is four bytes long
        self.assertEqual(results['mapreduce.map.input.length'], '4')
        self.assertEqual(results['mapreduce.map.input.start'], '0')
        self.assertEqual(
            results['mapreduce.task.attempt.id'],
            'attempt_%s_mapper_00000_0' % job_key)
        self.assertEqual(
            results['mapreduce.task.id'], 'task_%s_mapper_00000' % job_key)
        self.assertEqual(results['mapreduce.task.ismap'], 'true')
        self.assertEqual(
            results['mapreduce.task.output.dir'], runner._output_dir)
        self.assertEqual(results['mapreduce.task.partition'], '0')
        self.assertEqual(results['user.defined'], 'something')
Beispiel #9
0
    def test_jobconf_simulated_by_runner(self):
        """Compare the jobconf values a job reports with the runner's state.

        Runs MRTestJobConf over a gzipped one-line input with one uploaded
        file and checks every reported jobconf value against what the
        runner's paths and job key imply.
        """
        # a .gz input is used so that there is only one split
        input_gz_path = os.path.join(self.tmp_dir, 'input.gz')
        with gzip.GzipFile(input_gz_path, 'wb') as gz_file:
            gz_file.write(b'foo\n')
        # size of the compressed file on disk, not of the uncompressed data
        input_gz_size = os.stat(input_gz_path).st_size

        upload_path = os.path.join(self.tmp_dir, 'upload')
        with open(upload_path, 'wb') as f:
            f.write(b'PAYLOAD')

        # --no-bootstrap-mrjob spares us from accounting for mrjob.tar.gz
        # and the setup wrapper script
        self.add_mrjob_to_pythonpath()
        args = [
            '-r', self.RUNNER,
            '--no-bootstrap-mrjob',
            '--jobconf=user.defined=something',
            '--file', upload_path,
            input_gz_path,
        ]
        mr_job = MRTestJobConf(args)
        mr_job.sandbox()

        # with a single split we expect a single mapper task, which keeps
        # the reported values predictable
        with mr_job.make_runner() as runner:
            script_path = runner._script_path

            runner.run()

            # the job's output is a sequence of (jobconf name, value) pairs
            results = dict(mr_job.parse_output(runner.cat_output()))

        working_dir = results['mapreduce.job.local.dir']
        expected_working_dir = os.path.join(
            runner._get_local_tmp_dir(),
            'step', '000', 'mapper', '00000', 'wd')
        self.assertEqual(working_dir, expected_working_dir)

        self.assertEqual(results['mapreduce.job.cache.archives'], '')

        # both sides are sorted before comparing, so the order of the
        # comma-separated entries does not matter
        expected_cache_files = [
            script_path + '#mr_test_jobconf.py',
            upload_path + '#upload',
        ]
        expected_cache_files.extend(
            '%s#%s' % (path, name)
            for path, name in self._extra_expected_local_files(runner))
        actual_cache_files = results['mapreduce.job.cache.files'].split(',')
        self.assertEqual(
            sorted(actual_cache_files), sorted(expected_cache_files))

        self.assertEqual(results['mapreduce.job.cache.local.archives'], '')

        expected_local_files = [
            os.path.join(working_dir, 'mr_test_jobconf.py'),
            os.path.join(working_dir, 'upload'),
        ]
        expected_local_files.extend(
            os.path.join(working_dir, name)
            for _, name in self._extra_expected_local_files(runner))
        actual_local_files = (
            results['mapreduce.job.cache.local.files'].split(','))
        self.assertEqual(
            sorted(actual_local_files), sorted(expected_local_files))

        job_key = runner._job_key
        self.assertEqual(results['mapreduce.job.id'], job_key)

        self.assertEqual(results['mapreduce.map.input.file'], input_gz_path)
        self.assertEqual(
            results['mapreduce.map.input.length'], str(input_gz_size))
        self.assertEqual(results['mapreduce.map.input.start'], '0')
        self.assertEqual(
            results['mapreduce.task.attempt.id'],
            'attempt_%s_mapper_00000_0' % job_key)
        self.assertEqual(
            results['mapreduce.task.id'], 'task_%s_mapper_00000' % job_key)
        self.assertEqual(results['mapreduce.task.ismap'], 'true')
        self.assertEqual(
            results['mapreduce.task.output.dir'], runner._output_dir)
        self.assertEqual(results['mapreduce.task.partition'], '0')
        self.assertEqual(results['user.defined'], 'something')
Beispiel #10
0
    def test_jobconf_simulated_by_runner(self):
        """Compare the jobconf values a job reports with the runner's state.

        Runs MRTestJobConf over a gzipped one-line input with one uploaded
        file and checks every reported jobconf value against what the
        runner's paths and job key imply. The input file is expected to be
        reported as a file:// URI.
        """
        # a .gz input is used so that there is only one split
        input_gz_path = join(self.tmp_dir, 'input.gz')
        with gzip.GzipFile(input_gz_path, 'wb') as gz_file:
            gz_file.write(b'foo\n')
        # size of the compressed file on disk, not of the uncompressed data
        input_gz_size = os.stat(input_gz_path).st_size

        upload_path = join(self.tmp_dir, 'upload')
        with open(upload_path, 'wb') as f:
            f.write(b'PAYLOAD')

        # --no-bootstrap-mrjob spares us from accounting for mrjob.tar.gz
        # and the setup wrapper script
        self.add_mrjob_to_pythonpath()
        args = [
            '-r', self.RUNNER,
            '--no-bootstrap-mrjob',
            '-D=user.defined=something',
            '--files', upload_path,
            input_gz_path,
        ]
        mr_job = MRTestJobConf(args)
        mr_job.sandbox()

        # with a single split we expect a single mapper task, which keeps
        # the reported values predictable
        with mr_job.make_runner() as runner:
            script_path = runner._script_path

            runner.run()

            # the job's output is a sequence of (jobconf name, value) pairs
            results = dict(mr_job.parse_output(runner.cat_output()))

        working_dir = results['mapreduce.job.local.dir']
        expected_working_dir = join(
            runner._get_local_tmp_dir(),
            'step', '000', 'mapper', '00000', 'wd')
        self.assertEqual(working_dir, expected_working_dir)

        self.assertEqual(results['mapreduce.job.cache.archives'], '')

        # both sides are sorted before comparing, so the order of the
        # comma-separated entries does not matter
        expected_cache_files = [
            script_path + '#mr_test_jobconf.py',
            upload_path + '#upload',
        ]
        expected_cache_files.extend(
            '%s#%s' % (path, name)
            for path, name in self._extra_expected_local_files(runner))
        actual_cache_files = results['mapreduce.job.cache.files'].split(',')
        self.assertEqual(
            sorted(actual_cache_files), sorted(expected_cache_files))

        self.assertEqual(results['mapreduce.job.cache.local.archives'], '')

        expected_local_files = [
            join(working_dir, 'mr_test_jobconf.py'),
            join(working_dir, 'upload'),
        ]
        expected_local_files.extend(
            join(working_dir, name)
            for _, name in self._extra_expected_local_files(runner))
        actual_local_files = (
            results['mapreduce.job.cache.local.files'].split(','))
        self.assertEqual(
            sorted(actual_local_files), sorted(expected_local_files))

        job_key = runner._job_key
        self.assertEqual(results['mapreduce.job.id'], job_key)

        expected_input_uri = 'file://' + input_gz_path
        self.assertEqual(
            results['mapreduce.map.input.file'], expected_input_uri)
        self.assertEqual(
            results['mapreduce.map.input.length'], str(input_gz_size))
        self.assertEqual(results['mapreduce.map.input.start'], '0')
        self.assertEqual(
            results['mapreduce.task.attempt.id'],
            'attempt_%s_mapper_00000_0' % job_key)
        self.assertEqual(
            results['mapreduce.task.id'], 'task_%s_mapper_00000' % job_key)
        self.assertEqual(results['mapreduce.task.ismap'], 'true')
        self.assertEqual(
            results['mapreduce.task.output.dir'], runner._output_dir)
        self.assertEqual(results['mapreduce.task.partition'], '0')
        self.assertEqual(results['user.defined'], 'something')