Example #1
    def test_uses_generic_jobconf(self):
        self.assertEqual(uses_generic_jobconf('0.18'), False)
        self.assertEqual(uses_generic_jobconf('0.20'), True)
        self.assertEqual(uses_generic_jobconf('0.21'), True)

        # default to True
        self.assertEqual(uses_generic_jobconf(None), True)
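
These assertions pin down the cutoff (generic -D options arrived with Hadoop 0.20) but not the implementation. A minimal sketch that satisfies them, assuming plain 'major.minor' version strings (the real helper may compare versions differently):

    def uses_generic_jobconf(version):
        """Sketch only: True if this Hadoop version takes generic -D
        options instead of Hadoop Streaming's old -jobconf flag."""
        if version is None:
            return True  # unknown version: default to modern behavior
        major, minor = (int(p) for p in version.split('.')[:2])
        return (major, minor) >= (0, 20)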
Example #2
    def _hadoop_conf_args(self, step, step_num, num_steps):
        """Build a list of extra arguments to the hadoop binary.

        This handles *cmdenv*, *hadoop_extra_args*, *hadoop_input_format*,
        *hadoop_output_format*, *jobconf*, and *partitioner*.

        This doesn't handle input, output, mappers, reducers, or uploading
        files.
        """
        assert 0 <= step_num < num_steps

        args = []

        jobconf = combine_dicts(self._opts['jobconf'], step.get('jobconf'))

        # hadoop_extra_args
        args.extend(self._opts['hadoop_extra_args'])

        # new-style jobconf
        version = self.get_hadoop_version()
        if uses_generic_jobconf(version):
            for key, value in sorted(jobconf.iteritems()):
                args.extend(['-D', '%s=%s' % (key, value)])

        # partitioner
        if self._partitioner:
            args.extend(['-partitioner', self._partitioner])

        # cmdenv
        for key, value in sorted(self._get_cmdenv().iteritems()):
            args.append('-cmdenv')
            args.append('%s=%s' % (key, value))

        # hadoop_input_format
        if (step_num == 0 and self._hadoop_input_format):
            args.extend(['-inputformat', self._hadoop_input_format])

        # hadoop_output_format
        if (step_num == num_steps - 1 and self._hadoop_output_format):
            args.extend(['-outputformat', self._hadoop_output_format])

        # old-style jobconf
        if not uses_generic_jobconf(version):
            for key, value in sorted(jobconf.iteritems()):
                args.extend(['-jobconf', '%s=%s' % (key, value)])

        return args
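
To make the flag ordering concrete: new-style -D pairs come right after hadoop_extra_args, followed by -partitioner, -cmdenv, and the format flags. On Hadoop 0.20+, with a hypothetical merged jobconf of {'mapred.reduce.tasks': '2'}, a cmdenv of {'TZ': 'UTC'}, and nothing else set, this returns:

    ['-D', 'mapred.reduce.tasks=2',  # new-style jobconf comes first
     '-cmdenv', 'TZ=UTC']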
Example #3
    def _hadoop_conf_args(self, step_num, num_steps):
        """Build a list of extra arguments to the hadoop binary.

        This handles *cmdenv*, *hadoop_extra_args*, *hadoop_input_format*,
        *hadoop_output_format*, *jobconf*, and *partitioner*.

        This doesn't handle input, output, mappers, reducers, or uploading
        files.
        """
        assert 0 <= step_num < num_steps

        args = []

        # hadoop_extra_args
        args.extend(self._opts['hadoop_extra_args'])

        # new-style jobconf
        version = self.get_hadoop_version()
        if compat.uses_generic_jobconf(version):
            for key, value in sorted(self._opts['jobconf'].iteritems()):
                args.extend(['-D', '%s=%s' % (key, value)])

        # partitioner
        if self._partitioner:
            args.extend(['-partitioner', self._partitioner])

        # cmdenv
        for key, value in sorted(self._get_cmdenv().iteritems()):
            args.append('-cmdenv')
            args.append('%s=%s' % (key, value))

        # hadoop_input_format
        if (step_num == 0 and self._hadoop_input_format):
            args.extend(['-inputformat', self._hadoop_input_format])

        # hadoop_output_format
        if (step_num == num_steps - 1 and self._hadoop_output_format):
            args.extend(['-outputformat', self._hadoop_output_format])

        # old-style jobconf
        if not compat.uses_generic_jobconf(version):
            for key, value in sorted(self._opts['jobconf'].iteritems()):
                args.extend(['-jobconf', '%s=%s' % (key, value)])

        return args
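
On a pre-0.20 Hadoop, compat.uses_generic_jobconf(version) is False, so the same jobconf comes out as old-style flags, and they land at the very end of the list, after the format options. With the inputs from the previous sketch plus a hypothetical input format class on the first step:

    ['-cmdenv', 'TZ=UTC',
     '-inputformat', 'org.example.MyInputFormat',  # hypothetical class name
     '-jobconf', 'mapred.reduce.tasks=2']          # old-style flags come last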
Example #4
    def _hadoop_conf_args(self, step, step_num, num_steps):
        """Build a list of extra arguments to the hadoop binary.

        This handles *cmdenv*, *hadoop_extra_args*, *hadoop_input_format*,
        *hadoop_output_format*, *jobconf*, and *partitioner*.

        This doesn't handle input, output, mappers, reducers, or uploading
        files.
        """
        assert 0 <= step_num < num_steps

        args = []

        jobconf = combine_dicts(self._opts['jobconf'], step.get('jobconf'))

        # hadoop_extra_args
        args.extend(self._opts['hadoop_extra_args'])

        # new-style jobconf
        version = self.get_hadoop_version()

        # translate the jobconf configuration names to match
        # the hadoop version
        jobconf = add_translated_jobconf_for_hadoop_version(jobconf, version)
        if uses_generic_jobconf(version):
            for key, value in sorted(jobconf.iteritems()):
                if value is not None:
                    args.extend(['-D', '%s=%s' % (key, value)])
        # old-style jobconf
        else:
            for key, value in sorted(jobconf.iteritems()):
                if value is not None:
                    args.extend(['-jobconf', '%s=%s' % (key, value)])

        # partitioner
        if self._partitioner:
            args.extend(['-partitioner', self._partitioner])

        # cmdenv
        for key, value in sorted(self._opts['cmdenv'].iteritems()):
            args.append('-cmdenv')
            args.append('%s=%s' % (key, value))

        # hadoop_input_format
        if (step_num == 0 and self._hadoop_input_format):
            args.extend(['-inputformat', self._hadoop_input_format])

        # hadoop_output_format
        if (step_num == num_steps - 1 and self._hadoop_output_format):
            args.extend(['-outputformat', self._hadoop_output_format])

        return args
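
This revision adds two behaviors the earlier ones lack: jobconf names are translated to the target Hadoop version's spelling, and entries whose value is None are dropped rather than serialized as 'key=None'. A quick illustration of the filtering, assuming translation leaves these names untouched:

    jobconf = {'mapred.map.tasks': None, 'mapred.reduce.tasks': '2'}
    args = []
    for key, value in sorted(jobconf.iteritems()):
        if value is not None:
            args.extend(['-D', '%s=%s' % (key, value)])
    # args == ['-D', 'mapred.reduce.tasks=2']; the None entry is skipped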
Example #5
    def _hadoop_args_for_step(self, step_num):
        """Build a list of extra arguments to the hadoop binary.

        This handles *cmdenv*, *hadoop_extra_args*, *hadoop_input_format*,
        *hadoop_output_format*, *jobconf*, and *partitioner*.

        This doesn't handle input, output, mappers, reducers, or uploading
        files.
        """
        assert 0 <= step_num < self._num_steps()

        args = []

        # hadoop_extra_args
        args.extend(self._opts['hadoop_extra_args'])

        # new-style jobconf
        version = self.get_hadoop_version()

        # translate the jobconf configuration names to match
        # the hadoop version
        jobconf = self._jobconf_for_step(step_num)

        if uses_generic_jobconf(version):
            for key, value in sorted(jobconf.items()):
                if value is not None:
                    args.extend(['-D', '%s=%s' % (key, value)])
        # old-style jobconf
        else:
            for key, value in sorted(jobconf.items()):
                if value is not None:
                    args.extend(['-jobconf', '%s=%s' % (key, value)])

        # partitioner
        if self._partitioner:
            args.extend(['-partitioner', self._partitioner])

        # cmdenv
        for key, value in sorted(self._opts['cmdenv'].items()):
            args.append('-cmdenv')
            args.append('%s=%s' % (key, value))

        # hadoop_input_format
        if (step_num == 0 and self._hadoop_input_format):
            args.extend(['-inputformat', self._hadoop_input_format])

        # hadoop_output_format
        if (step_num == self._num_steps() - 1 and self._hadoop_output_format):
            args.extend(['-outputformat', self._hadoop_output_format])

        return args
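
The jobconf merging and translation that earlier revisions did inline is now factored into _jobconf_for_step(), whose body isn't shown here. A plausible sketch, assuming it still composes the combine_dicts() merge and version translation seen inline above (the step accessor is hypothetical):

    def _jobconf_for_step(self, step_num):
        """Sketch: merge runner- and step-level jobconf, then translate
        config names for the target Hadoop version, as the earlier
        inline revisions above did."""
        step = self._get_steps()[step_num]  # hypothetical accessor
        jobconf = combine_dicts(self._opts['jobconf'], step.get('jobconf'))
        return add_translated_jobconf_for_hadoop_version(
            jobconf, self.get_hadoop_version())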
Example #6
    def _hadoop_conf_args(self, step, step_num, num_steps):
        """Build a list of extra arguments to the hadoop binary.

        This handles *cmdenv*, *hadoop_extra_args*, *hadoop_input_format*,
        *hadoop_output_format*, *jobconf*, and *partitioner*.

        This doesn't handle input, output, mappers, reducers, or uploading
        files.
        """
        assert 0 <= step_num < num_steps

        args = []

        jobconf = combine_dicts(self._opts['jobconf'], step.get('jobconf'))
        job_name = jobconf.get('mapred.job.name', None)

        # Set a default job name
        if not job_name:
            job_name = "%s > %s" % (self._script_path, self._output_dir)

        # Add the step into the job name
        if num_steps > 1:
            job_name = "%s (step %s of %s)" % (job_name, step_num + 1, num_steps)

        jobconf['mapred.job.name'] = job_name

        # hadoop_extra_args
        args.extend(self._opts['hadoop_extra_args'])

        # new-style jobconf
        version = self.get_hadoop_version()

        # translate the jobconf configuration names to match
        # the hadoop version
        jobconf = add_translated_jobconf_for_hadoop_version(jobconf,
                                                            version)
        if uses_generic_jobconf(version):
            for key, value in sorted(jobconf.iteritems()):
                args.extend(['-D', '%s=%s' % (key, value)])
        # old-style jobconf
        else:
            for key, value in sorted(jobconf.iteritems()):
                args.extend(['-jobconf', '%s=%s' % (key, value)])

        # partitioner
        if self._partitioner:
            args.extend(['-partitioner', self._partitioner])

        # cmdenv
        for key, value in sorted(self._opts['cmdenv'].iteritems()):
            args.append('-cmdenv')
            args.append('%s=%s' % (key, value))

        # hadoop_input_format
        if (step_num == 0 and self._hadoop_input_format):
            args.extend(['-inputformat', self._hadoop_input_format])

        # hadoop_output_format
        if (step_num == num_steps - 1 and self._hadoop_output_format):
            args.extend(['-outputformat', self._hadoop_output_format])

        return args
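
The job-name defaulting is easiest to check with concrete values. Using a hypothetical script path and output dir, for the middle step of a three-step job:

    # hypothetical values
    script_path, output_dir = '/home/user/mr_wc.py', 'hdfs:///tmp/out'
    job_name = "%s > %s" % (script_path, output_dir)
    job_name = "%s (step %s of %s)" % (job_name, 1 + 1, 3)  # step_num == 1
    # job_name == '/home/user/mr_wc.py > hdfs:///tmp/out (step 2 of 3)'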
Example #7
    def test_uses_generic_jobconf(self):
        assert_equal(uses_generic_jobconf('0.18'), False)
        assert_equal(uses_generic_jobconf('0.20'), True)
        assert_equal(uses_generic_jobconf('0.21'), True)