Example 1
0
    def test_mapper_init(self):
        """The emulated mapreduce.map.input.file jobconf should point at
        the input file (as a file:// URI)."""
        input_path = self.makefile('two_lines', b'line\nother line\n')

        job = MRTestJobConf(
            ['-r', 'spark', '--emulate-map-input-file', input_path])

        with job.make_runner() as runner:
            runner.run()

            jobconf_to_value = dict(job.parse_output(runner.cat_output()))

            self.assertEqual(
                jobconf_to_value['mapreduce.map.input.file'],
                'file://' + input_path)
Example 2
0
    def test_empty_file(self):
        """Emulating map.input.file shouldn't crash on a zero-byte
        input file (no "first line" to read)."""
        nonempty_path = self.makefile('two_lines', b'line\nother line\n')
        empty_path = self.makefile('no_lines', b'')

        job = MRTestJobConf(
            ['-r', 'spark', '--emulate-map-input-file', nonempty_path])

        with job.make_runner() as runner:
            runner.run()

            input_files = []
            for jobconf, path in job.parse_output(runner.cat_output()):
                if jobconf == 'mapreduce.map.input.file':
                    input_files.append(path)

            # ideally, the empty file's path would appear too, but what we
            # care about is that we don't get a crash from trying to read
            # the "first" line of the file
            self.assertEqual(input_files, ['file://' + nonempty_path])
Example 3
0
    def test_jobconf_simulated_by_runner(self):
        """End-to-end check that the runner simulates the standard Hadoop
        jobconf variables (input file/length, task/attempt IDs, cache
        files, working dir, user-defined properties)."""
        # use a .gz file so there's only one split
        gz_path = os.path.join(self.tmp_dir, 'input.gz')
        with gzip.GzipFile(gz_path, 'wb') as gz_file:
            gz_file.write(b'foo\n')
        gz_size = os.stat(gz_path)[stat.ST_SIZE]

        payload_path = os.path.join(self.tmp_dir, 'upload')
        with open(payload_path, 'wb') as payload_file:
            payload_file.write(b'PAYLOAD')

        # use --no-bootstrap-mrjob so we don't have to worry about
        # mrjob.tar.gz and the setup wrapper script
        self.add_mrjob_to_pythonpath()
        job = MRTestJobConf(['-r', self.RUNNER,
                             '--no-bootstrap-mrjob',
                             '-D=user.defined=something',
                             '--files', payload_path,
                             gz_path])

        job.sandbox()

        jobconf = {}

        # between the single line of input and setting mapred.map.tasks to 1,
        # we should be restricted to only one task, which will give more
        # predictable results

        with job.make_runner() as runner:
            script_path = runner._script_path

            runner.run()

            jobconf.update(dict(job.parse_output(runner.cat_output())))

        wd = jobconf['mapreduce.job.local.dir']
        self.assertEqual(
            wd,
            os.path.join(runner._get_local_tmp_dir(),
                         'step', '000', 'mapper', '00000', 'wd'))

        self.assertEqual(jobconf['mapreduce.job.cache.archives'], '')

        # script and uploaded file, plus whatever extras the runner adds
        expected_cache = [
            script_path + '#mr_test_jobconf.py',
            payload_path + '#upload',
        ]
        expected_cache.extend(
            '%s#%s' % (path, name)
            for path, name in self._extra_expected_local_files(runner))
        self.assertEqual(
            sorted(jobconf['mapreduce.job.cache.files'].split(',')),
            sorted(expected_cache))

        self.assertEqual(jobconf['mapreduce.job.cache.local.archives'], '')

        # same files, as localized into the task's working dir
        expected_local = [
            os.path.join(wd, 'mr_test_jobconf.py'),
            os.path.join(wd, 'upload'),
        ]
        expected_local.extend(
            os.path.join(wd, name)
            for path, name in self._extra_expected_local_files(runner))
        self.assertEqual(
            sorted(jobconf['mapreduce.job.cache.local.files'].split(',')),
            sorted(expected_local))

        self.assertEqual(jobconf['mapreduce.job.id'], runner._job_key)

        self.assertEqual(jobconf['mapreduce.map.input.file'], gz_path)
        self.assertEqual(jobconf['mapreduce.map.input.length'], str(gz_size))
        self.assertEqual(jobconf['mapreduce.map.input.start'], '0')
        self.assertEqual(jobconf['mapreduce.task.attempt.id'],
                         'attempt_%s_mapper_00000_0' % runner._job_key)
        self.assertEqual(jobconf['mapreduce.task.id'],
                         'task_%s_mapper_00000' % runner._job_key)
        self.assertEqual(jobconf['mapreduce.task.ismap'], 'true')
        self.assertEqual(jobconf['mapreduce.task.output.dir'],
                         runner._output_dir)
        self.assertEqual(jobconf['mapreduce.task.partition'], '0')
        self.assertEqual(jobconf['user.defined'], 'something')
Example 4
0
    def test_jobconf_simulated_by_runner(self):
        """End-to-end check of the Hadoop jobconf variables the runner
        simulates (input file/length, task/attempt IDs, cache files,
        working dir, user-defined properties)."""
        # use a .gz file so there's only one split
        compressed_input = os.path.join(self.tmp_dir, 'input.gz')
        with gzip.GzipFile(compressed_input, 'wb') as f:
            f.write(b'foo\n')
        compressed_size = os.stat(compressed_input)[stat.ST_SIZE]

        extra_file = os.path.join(self.tmp_dir, 'upload')
        with open(extra_file, 'wb') as f:
            f.write(b'PAYLOAD')

        # use --no-bootstrap-mrjob so we don't have to worry about
        # mrjob.tar.gz and the setup wrapper script
        self.add_mrjob_to_pythonpath()
        test_job = MRTestJobConf(
            ['-r', self.RUNNER,
             '--no-bootstrap-mrjob',
             '--jobconf=user.defined=something',
             '--file', extra_file,
             compressed_input])

        test_job.sandbox()

        simulated = {}

        # between the single line of input and setting mapred.map.tasks to 1,
        # we should be restricted to only one task, which will give more
        # predictable results

        with test_job.make_runner() as runner:
            script_path = runner._script_path

            runner.run()

            simulated.update(
                dict(test_job.parse_output(runner.cat_output())))

        working_dir = simulated['mapreduce.job.local.dir']
        self.assertEqual(
            working_dir,
            os.path.join(runner._get_local_tmp_dir(),
                         'step', '000', 'mapper', '00000', 'wd'))

        self.assertEqual(simulated['mapreduce.job.cache.archives'], '')

        # script + uploaded file, plus any runner-specific extras
        cache_files = [script_path + '#mr_test_jobconf.py',
                       extra_file + '#upload']
        cache_files += [
            '%s#%s' % (path, name)
            for path, name in self._extra_expected_local_files(runner)]
        self.assertEqual(
            sorted(simulated['mapreduce.job.cache.files'].split(',')),
            sorted(cache_files))

        self.assertEqual(simulated['mapreduce.job.cache.local.archives'], '')

        # same files, localized into the task's working dir
        local_files = [os.path.join(working_dir, 'mr_test_jobconf.py'),
                       os.path.join(working_dir, 'upload')]
        local_files += [
            os.path.join(working_dir, name)
            for path, name in self._extra_expected_local_files(runner)]
        self.assertEqual(
            sorted(simulated['mapreduce.job.cache.local.files'].split(',')),
            sorted(local_files))

        # remaining vars are simple key -> expected-value checks; keep
        # the original assertion order
        for key, expected in [
            ('mapreduce.job.id', runner._job_key),
            ('mapreduce.map.input.file', compressed_input),
            ('mapreduce.map.input.length', str(compressed_size)),
            ('mapreduce.map.input.start', '0'),
            ('mapreduce.task.attempt.id',
             'attempt_%s_mapper_00000_0' % runner._job_key),
            ('mapreduce.task.id',
             'task_%s_mapper_00000' % runner._job_key),
            ('mapreduce.task.ismap', 'true'),
            ('mapreduce.task.output.dir', runner._output_dir),
            ('mapreduce.task.partition', '0'),
            ('user.defined', 'something'),
        ]:
            self.assertEqual(simulated[key], expected)