Example #1
 def test_translate_jobconf(self):
     assert_equal(translate_jobconf('user.name', '0.18'), 'user.name')
     assert_equal(translate_jobconf('mapreduce.job.user.name', '0.18'),
                  'user.name')
     assert_equal(translate_jobconf('user.name', '0.19'), 'user.name')
     assert_equal(translate_jobconf('mapreduce.job.user.name', '0.19.2'),
                  'user.name')
     assert_equal(translate_jobconf('user.name', '0.21'),
                  'mapreduce.job.user.name')
Example #2
 def test_translate_jobconf(self):
     assert_equal(translate_jobconf('user.name', '0.18'),
                  'user.name')
     assert_equal(translate_jobconf('mapreduce.job.user.name', '0.18'),
                  'user.name')
     assert_equal(translate_jobconf('user.name', '0.19'),
                  'user.name')
     assert_equal(translate_jobconf('mapreduce.job.user.name', '0.19.2'),
                  'user.name')
     assert_equal(translate_jobconf('user.name', '0.21'),
                  'mapreduce.job.user.name')
Example #3
    def _update_jobconf_for_hadoop_version(self, jobconf, hadoop_version):
        """If *jobconf* (a dict) contains jobconf variables from the wrong
        version of Hadoop, add variables for the right one.

        If *hadoop_version* is empty, do nothing.
        """
        if not hadoop_version:  # this happens for sim runner
            return

        translations = {}  # for warning, below

        for key, value in sorted(jobconf.items()):
            new_key = translate_jobconf(key, hadoop_version)
            if new_key not in jobconf:
                jobconf[new_key] = value
                translations[key] = new_key

        if translations:
            log.warning(
                "Detected hadoop configuration property names that"
                " do not match hadoop version %s:"
                "\nThey have been translated as follows\n %s",
                hadoop_version,
                '\n'.join([
                    "%s: %s" % (key, new_key) for key, new_key
                    in sorted(translations.items())]))
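The loop above never removes anything: each user-supplied name is translated to the spelling that the target Hadoop version expects, and the value is copied under that spelling if it is not already present. A minimal standalone sketch of the same idea, using the user.name / mapreduce.job.user.name pair exercised by the tests above (the mrjob.compat import path is assumed, and the value is made up):

    from mrjob.compat import translate_jobconf

    jobconf = {'user.name': 'dave'}   # pre-0.21 spelling; hypothetical value
    hadoop_version = '0.21'           # a version that expects the newer names

    for key, value in sorted(jobconf.items()):
        new_key = translate_jobconf(key, hadoop_version)
        if new_key not in jobconf:
            jobconf[new_key] = value  # keep the old key, add the translated one

    # per the tests above, jobconf is now:
    # {'user.name': 'dave', 'mapreduce.job.user.name': 'dave'}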
Example #4
    def _simulate_jobconf_for_step(self,
                                   task_type,
                                   step_num,
                                   task_num,
                                   map_split=None):
        j = {}

        # TODO: these are really poor imitations of Hadoop IDs. See #1254
        j['mapreduce.job.id'] = self._job_key
        j['mapreduce.task.id'] = 'task_%s_%s_%04d%d' % (
            self._job_key, task_type.lower(), step_num, task_num)
        j['mapreduce.task.attempt.id'] = 'attempt_%s_%s_%04d%d_0' % (
            self._job_key, task_type.lower(), step_num, task_num)

        j['mapreduce.task.ismap'] = str(task_type == 'mapper').lower()

        # TODO: is this the correct format?
        j['mapreduce.task.partition'] = str(task_num)

        j['mapreduce.task.output.dir'] = self._output_dir_for_step(step_num)

        working_dir = self._task_working_dir(task_type, step_num, task_num)
        j['mapreduce.job.local.dir'] = working_dir

        for x in ('archive', 'file'):
            named_paths = sorted(self._working_dir_mgr.name_to_path(x).items())

            # mapreduce.job.cache.archives
            # mapreduce.job.cache.files
            j['mapreduce.job.cache.%ss' % x] = ','.join(
                '%s#%s' % (path, name) for name, path in named_paths)

            # mapreduce.job.cache.local.archives
            # mapreduce.job.cache.local.files
            j['mapreduce.job.cache.local.%ss' % x] = ','.join(
                join(working_dir, name) for name, path in named_paths)

        if map_split:
            j['mapreduce.map.input.file'] = 'file://' + map_split['file']
            j['mapreduce.map.input.length'] = str(map_split['length'])
            j['mapreduce.map.input.start'] = str(map_split['start'])

        # translate to correct version

        # don't use translate_jobconf_dict(); that's meant to add keys
        # to user-supplied jobconf
        hadoop_version = self.get_hadoop_version()

        if hadoop_version:
            return {
                translate_jobconf(k, hadoop_version): v
                for k, v in j.items()
            }
        else:
            return {
                tk: v
                for k, v in j.items()
                for tk in translate_jobconf_for_all_versions(k)
            }
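When the runner does not know which Hadoop version it is emulating, the fallback branch expands every key into all of its known spellings with translate_jobconf_for_all_versions(). A rough sketch of that expansion for the pair used in the tests above (the exact return type and ordering are assumptions, not taken from this listing):

    from mrjob.compat import translate_jobconf_for_all_versions

    sorted(translate_jobconf_for_all_versions('user.name'))
    # presumably: ['mapreduce.job.user.name', 'user.name']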
Example #5
    def _subprocess_env(self,
                        step_type,
                        step_num,
                        task_num,
                        input_file=None,
                        input_start=None,
                        input_length=None):
        """Set up environment variables for a subprocess (mapper, etc.)

        This combines, in decreasing order of priority:

        * environment variables set by the **cmdenv** option
        * **jobconf** environment variables set by our job (e.g.
          ``mapreduce.task.ismap``)
        * environment variables from **jobconf** options, translated to
          whatever version of Hadoop we're emulating
        * the current environment
        * PYTHONPATH set to current working directory

        We use :py:func:`~mrjob.conf.combine_local_envs`, so ``PATH``
        environment variables are handled specially.
        """
        version = self.get_hadoop_version()

        jobconf_env = dict(
            (translate_jobconf(k, version).replace('.', '_'), str(v))
            for (k, v) in self._opts['jobconf'].iteritems())

        internal_jobconf = self._simulate_jobconf_for_step(
            step_type,
            step_num,
            task_num,
            input_file=input_file,
            input_start=input_start,
            input_length=input_length)

        internal_jobconf_env = dict(
            (translate_jobconf(k, version).replace('.', '_'), str(v))
            for (k, v) in internal_jobconf.iteritems())

        # keep the current environment because we need PATH to find binaries
        # and make PYTHONPATH work
        return combine_local_envs(os.environ, jobconf_env,
                                  internal_jobconf_env, self._get_cmdenv())
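Note how jobconf names become environment variable names here: after translation, the dots are swapped for underscores, which is the same convention Hadoop Streaming uses when exposing job configuration to task processes. For example:

    # jobconf name                               -> environment variable name
    'mapreduce.task.ismap'.replace('.', '_')     # 'mapreduce_task_ismap'
    'mapreduce.map.input.file'.replace('.', '_') # 'mapreduce_map_input_file'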
Example #6
    def _subprocess_env(self, step_type, step_num, task_num, input_file=None,
                        input_start=None, input_length=None):
        """Set up environment variables for a subprocess (mapper, etc.)

        This combines, in decreasing order of priority:

        * environment variables set by the **cmdenv** option
        * **jobconf** environment variables set by our job (e.g.
          ``mapreduce.task.ismap``)
        * environment variables from **jobconf** options, translated to
          whatever version of Hadoop we're emulating
        * the current environment
        * PYTHONPATH set to current working directory

        We use :py:func:`~mrjob.conf.combine_local_envs`, so ``PATH``
        environment variables are handled specially.
        """
        version = self.get_hadoop_version()

        jobconf_env = dict(
            (translate_jobconf(k, version).replace('.', '_'), str(v))
            for (k, v) in self._opts['jobconf'].iteritems())

        internal_jobconf = self._simulate_jobconf_for_step(
            step_type, step_num, task_num, input_file=input_file,
            input_start=input_start, input_length=input_length)

        internal_jobconf_env = dict(
            (translate_jobconf(k, version).replace('.', '_'), str(v))
            for (k, v) in internal_jobconf.iteritems())

        ironpython_env = {'IRONPYTHONPATH': os.getcwd()} if is_ironpython \
                         else {}

        # keep the current environment because we need PATH to find binaries
        # and make PYTHONPATH work
        return combine_local_envs({'PYTHONPATH': os.getcwd()},
                                  ironpython_env,
                                  os.environ,
                                  jobconf_env,
                                  internal_jobconf_env,
                                  self._get_cmdenv())
Example #7
File: sim.py Project: Affirm/mrjob
    def _simulate_jobconf_for_step(
            self, task_type, step_num, task_num, map_split=None):
        j = {}

        # TODO: these are really poor imitations of Hadoop IDs. See #1254
        j['mapreduce.job.id'] = self._job_key
        j['mapreduce.task.id'] = 'task_%s_%s_%04d%d' % (
            self._job_key, task_type.lower(), step_num, task_num)
        j['mapreduce.task.attempt.id'] = 'attempt_%s_%s_%04d%d_0' % (
            self._job_key, task_type.lower(), step_num, task_num)

        j['mapreduce.task.ismap'] = str(task_type == 'mapper').lower()

        # TODO: is this the correct format?
        j['mapreduce.task.partition'] = str(task_num)

        j['mapreduce.task.output.dir'] = self._output_dir_for_step(step_num)

        working_dir = self._task_working_dir(task_type, step_num, task_num)
        j['mapreduce.job.local.dir'] = working_dir

        for x in ('archive', 'file'):
            named_paths = sorted(self._working_dir_mgr.name_to_path(x).items())

            # mapreduce.job.cache.archives
            # mapreduce.job.cache.files
            j['mapreduce.job.cache.%ss' % x] = ','.join(
                '%s#%s' % (path, name) for name, path in named_paths)

            # mapreduce.job.cache.local.archives
            # mapreduce.job.cache.local.files
            j['mapreduce.job.cache.local.%ss' % x] = ','.join(
                join(working_dir, name) for name, path in named_paths)

        if map_split:
            # mapreduce.map.input.file
            # mapreduce.map.input.start
            # mapreduce.map.input.length
            for key, value in map_split.items():
                j['mapreduce.map.input.' + key] = str(value)

        # translate to correct version

        # don't use translate_jobconf_dict(); that's meant to add keys
        # to user-supplied jobconf
        hadoop_version = self.get_hadoop_version()

        if hadoop_version:
            return {translate_jobconf(k, hadoop_version): v
                    for k, v in j.items()}
        else:
            return {tk: v for k, v in j.items()
                    for tk in translate_jobconf_for_all_versions(k)}
Example #8
    def _args_for_streaming_step(self, step_num):
        hadoop_streaming_jar = self.get_hadoop_streaming_jar()
        if not hadoop_streaming_jar:
            raise Exception('no Hadoop streaming jar')

        mapper, combiner, reducer = (self._hadoop_streaming_commands(step_num))

        args = self.get_hadoop_bin() + ['jar', hadoop_streaming_jar]

        # set up uploading from HDFS to the working dir
        args.extend(self._upload_args())

        # if no reducer, shut off reducer tasks. This has to come before
        # extra hadoop args, which could contain jar-specific args
        # (e.g. -outputformat). See #1331.
        #
        # might want to just integrate this into _hadoop_args_for_step?
        if not reducer:
            args.extend([
                '-D',
                ('%s=0' % translate_jobconf('mapreduce.job.reduces',
                                            self.get_hadoop_version()))
            ])

        # -libjars (#198)
        if self._opts['libjars']:
            args.extend(['-libjars', ','.join(self._opts['libjars'])])

        # Add extra hadoop args first as hadoop args could be a hadoop
        # specific argument (e.g. -libjars) which must come before job
        # specific args.
        args.extend(self._hadoop_args_for_step(step_num))

        # set up input
        for input_uri in self._step_input_uris(step_num):
            args.extend(['-input', input_uri])

        # set up output
        args.append('-output')
        args.append(self._step_output_uri(step_num))

        args.append('-mapper')
        args.append(mapper)

        if combiner:
            args.append('-combiner')
            args.append(combiner)

        if reducer:
            args.append('-reducer')
            args.append(reducer)

        return args
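Put together, the method returns a flat argument list suitable for handing to a subprocess call. A hedged sketch of what that list might look like for a map-only first step, assuming a 0.21+ Hadoop version (so the -D key is not translated); every path, jar location and command below is invented for illustration:

    ['hadoop', 'jar', '/usr/lib/hadoop-mapreduce/hadoop-streaming.jar',
     '-files', 'hdfs:///user/dave/tmp/mr_my_job/files/mr_my_job.py#mr_my_job.py',
     '-D', 'mapreduce.job.reduces=0',
     '-input', 'hdfs:///user/dave/tmp/mr_my_job/input',
     '-output', 'hdfs:///user/dave/tmp/mr_my_job/step-output/0',
     '-mapper', 'python mr_my_job.py --step-num=0 --mapper']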
Example #9
    def test_translate_jobconf(self):
        self.assertEqual(translate_jobconf("user.name", "0.20"), "user.name")
        self.assertEqual(translate_jobconf("mapreduce.job.user.name", "0.20"), "user.name")
        self.assertEqual(translate_jobconf("mapreduce.job.user.name", "0.20.2"), "user.name")
        self.assertEqual(translate_jobconf("user.name", "0.21"), "mapreduce.job.user.name")

        self.assertEqual(translate_jobconf("user.name", "1.0"), "user.name")
        self.assertEqual(translate_jobconf("user.name", "2.0"), "mapreduce.job.user.name")

        self.assertEqual(translate_jobconf("foo.bar", "2.0"), "foo.bar")
Example #10
    def _args_for_streaming_step(self, step_num):
        hadoop_streaming_jar = self.get_hadoop_streaming_jar()
        if not hadoop_streaming_jar:
            raise Exception('no Hadoop streaming jar')

        mapper, combiner, reducer = (
            self._hadoop_streaming_commands(step_num))

        args = self.get_hadoop_bin() + ['jar', hadoop_streaming_jar]

        # set up uploading from HDFS to the working dir
        args.extend(
            self._upload_args(self._upload_mgr))

        # if no reducer, shut off reducer tasks. This has to come before
        # extra hadoop args, which could contain jar-specific args
        # (e.g. -outputformat). See #1331.
        #
        # might want to just integrate this into _hadoop_args_for_step?
        if not reducer:
            args.extend(['-D', ('%s=0' % translate_jobconf(
                'mapreduce.job.reduces', self.get_hadoop_version()))])

        # -libjars (#198)
        if self._opts['libjars']:
            args.extend(['-libjars', ','.join(self._opts['libjars'])])

        # Add extra hadoop args first as hadoop args could be a hadoop
        # specific argument (e.g. -libjars) which must come before job
        # specific args.
        args.extend(self._hadoop_args_for_step(step_num))

        # set up input
        for input_uri in self._hdfs_step_input_files(step_num):
            args.extend(['-input', input_uri])

        # set up output
        args.append('-output')
        args.append(self._hdfs_step_output_dir(step_num))

        args.append('-mapper')
        args.append(mapper)

        if combiner:
            args.append('-combiner')
            args.append(combiner)

        if reducer:
            args.append('-reducer')
            args.append(reducer)

        return args
Example #11
    def test_translate_jobconf(self):
        self.assertEqual(translate_jobconf('user.name', '0.18'), 'user.name')
        self.assertEqual(translate_jobconf('mapreduce.job.user.name', '0.18'),
                         'user.name')
        self.assertEqual(translate_jobconf('user.name', '0.19'), 'user.name')
        self.assertEqual(
            translate_jobconf('mapreduce.job.user.name', '0.19.2'),
            'user.name')
        self.assertEqual(translate_jobconf('user.name', '0.21'),
                         'mapreduce.job.user.name')

        self.assertEqual(translate_jobconf('user.name', '1.0'), 'user.name')
        self.assertEqual(translate_jobconf('user.name', '2.0'),
                         'mapreduce.job.user.name')

        self.assertEqual(translate_jobconf('foo.bar', '2.0'), 'foo.bar')
Example #12
File: bin.py Project: suzil/mrjob
    def _hadoop_streaming_jar_args(self, step_num):
        """The arguments that come after ``hadoop jar <streaming jar path>``
        when running a Hadoop streaming job."""
        args = []

        # get command for each part of the job
        mapper, combiner, reducer = (self._hadoop_streaming_commands(step_num))

        # set up uploading from HDFS/cloud storage to the working dir
        args.extend(self._upload_args())

        # if no reducer, shut off reducer tasks. This has to come before
        # extra hadoop args, which could contain jar-specific args
        # (e.g. -outputformat). See #1331.
        #
        # might want to just integrate this into _hadoop_args_for_step?
        if not reducer:
            args.extend([
                '-D',
                ('%s=0' % translate_jobconf('mapreduce.job.reduces',
                                            self.get_hadoop_version()))
            ])

        # Add extra hadoop args first as hadoop args could be a hadoop
        # specific argument which must come before job
        # specific args.
        args.extend(self._hadoop_args_for_step(step_num))

        # set up input
        for input_uri in self._step_input_uris(step_num):
            args.extend(['-input', input_uri])

        # set up output
        args.append('-output')
        args.append(self._step_output_uri(step_num))

        args.append('-mapper')
        args.append(mapper)

        if combiner:
            args.append('-combiner')
            args.append(combiner)

        if reducer:
            args.append('-reducer')
            args.append(reducer)

        return args
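The key passed to -D depends on which Hadoop version is being targeted, and translate_jobconf picks the right spelling. The 1.x name sketched below (mapred.reduce.tasks) is the standard Hadoop alias and is assumed here rather than taken from this listing, which only demonstrates the mapping for user.name:

    from mrjob.compat import translate_jobconf

    translate_jobconf('mapreduce.job.reduces', '2.7.1')
    # -> 'mapreduce.job.reduces'
    translate_jobconf('mapreduce.job.reduces', '1.0.3')
    # -> presumably 'mapred.reduce.tasks'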
Example #13
    def _hadoop_streaming_jar_args(self, step_num):
        """The arguments that come after ``hadoop jar <streaming jar path>``
        when running a Hadoop streaming job."""
        args = []

        # get command for each part of the job
        mapper, combiner, reducer = (
            self._hadoop_streaming_commands(step_num))

        # set up uploading from HDFS/cloud storage to the working dir
        args.extend(self._upload_args())

        # if no reducer, shut off reducer tasks. This has to come before
        # extra hadoop args, which could contain jar-specific args
        # (e.g. -outputformat). See #1331.
        #
        # might want to just integrate this into _hadoop_args_for_step?
        if not reducer:
            args.extend(['-D', ('%s=0' % translate_jobconf(
                'mapreduce.job.reduces', self.get_hadoop_version()))])

        # Add extra hadoop args first as hadoop args could be a hadoop
        # specific argument which must come before job
        # specific args.
        args.extend(self._hadoop_args_for_step(step_num))

        # set up input
        for input_uri in self._step_input_uris(step_num):
            args.extend(['-input', input_uri])

        # set up output
        args.append('-output')
        args.append(self._step_output_uri(step_num))

        args.append('-mapper')
        args.append(mapper)

        if combiner:
            args.append('-combiner')
            args.append(combiner)

        if reducer:
            args.append('-reducer')
            args.append(reducer)

        return args
Example #14
    def test_translate_jobconf(self):
        self.assertEqual(translate_jobconf('user.name', '0.18'),
                         'user.name')
        self.assertEqual(translate_jobconf('mapreduce.job.user.name', '0.18'),
                         'user.name')
        self.assertEqual(translate_jobconf('user.name', '0.19'),
                         'user.name')
        self.assertEqual(
            translate_jobconf('mapreduce.job.user.name', '0.19.2'),
            'user.name')
        self.assertEqual(translate_jobconf('user.name', '0.21'),
                         'mapreduce.job.user.name')

        self.assertEqual(translate_jobconf('user.name', '1.0'),
                         'user.name')
        self.assertEqual(translate_jobconf('user.name', '2.0'),
                         'mapreduce.job.user.name')

        self.assertEqual(translate_jobconf('foo.bar', '2.0'), 'foo.bar')
Example #15
    def _args_for_streaming_step(self, step_num):
        hadoop_streaming_jar = self.get_hadoop_streaming_jar()
        if not hadoop_streaming_jar:
            raise Exception('no Hadoop streaming jar')

        args = self.get_hadoop_bin() + ['jar', hadoop_streaming_jar]

        # set up uploading from HDFS to the working dir
        args.extend(self._upload_args(self._upload_mgr))

        # Add extra hadoop args first as hadoop args could be a hadoop
        # specific argument (e.g. -libjar) which must come before job
        # specific args.
        args.extend(self._hadoop_args_for_step(step_num))

        mapper, combiner, reducer = (self._hadoop_streaming_commands(step_num))

        # if no reducer, shut off reducer tasks
        if not reducer:
            args.extend([
                '-D',
                ('%s=0' % translate_jobconf('mapreduce.job.reduces',
                                            self.get_hadoop_version()))
            ])

        # set up input
        for input_uri in self._hdfs_step_input_files(step_num):
            args.extend(['-input', input_uri])

        # set up output
        args.append('-output')
        args.append(self._hdfs_step_output_dir(step_num))

        args.append('-mapper')
        args.append(mapper)

        if combiner:
            args.append('-combiner')
            args.append(combiner)

        if reducer:
            args.append('-reducer')
            args.append(reducer)

        return args
Example #16
    def _args_for_streaming_step(self, step_num):
        hadoop_streaming_jar = self.get_hadoop_streaming_jar()
        if not hadoop_streaming_jar:
            raise Exception('no Hadoop streaming jar')

        args = self.get_hadoop_bin() + ['jar', hadoop_streaming_jar]

        # set up uploading from HDFS to the working dir
        args.extend(
            self._upload_args(self._upload_mgr))

        # Add extra hadoop args first as hadoop args could be a hadoop
        # specific argument (e.g. -libjar) which must come before job
        # specific args.
        args.extend(self._hadoop_args_for_step(step_num))

        mapper, combiner, reducer = (
            self._hadoop_streaming_commands(step_num))

        # if no reducer, shut off reducer tasks
        if not reducer:
            args.extend(['-D', ('%s=0' % translate_jobconf(
                'mapreduce.job.reduces', self.get_hadoop_version()))])

        # set up input
        for input_uri in self._hdfs_step_input_files(step_num):
            args.extend(['-input', input_uri])

        # set up output
        args.append('-output')
        args.append(self._hdfs_step_output_dir(step_num))

        args.append('-mapper')
        args.append(mapper)

        if combiner:
            args.append('-combiner')
            args.append(combiner)

        if reducer:
            args.append('-reducer')
            args.append(reducer)

        return args
Example #17
    def _sort_values_jobconf(self):
        """Jobconf dictionary to enable sorting by value.
        """
        if not self._sort_values:
            return {}

        # translate _SORT_VALUES_JOBCONF to the correct Hadoop version,
        # without logging a warning
        hadoop_version = self.get_hadoop_version()

        jobconf = {}
        for k, v in _SORT_VALUES_JOBCONF.items():
            if hadoop_version:
                jobconf[translate_jobconf(k, hadoop_version)] = v
            else:
                for j in translate_jobconf_for_all_versions(k):
                    jobconf[j] = v

        return jobconf
Example #18
    def _process_jobconf_args(self, jobconf):
        if jobconf:
            for (conf_arg, value) in jobconf.iteritems():
                # Internally, use one canonical Hadoop version
                canon_arg = translate_jobconf(conf_arg, '0.21')

                if canon_arg == 'mapreduce.job.maps':
                    self._map_tasks = int(value)
                    if self._map_tasks < 1:
                        raise ValueError('%s should be at least 1' % conf_arg)
                elif canon_arg == 'mapreduce.job.reduces':
                    self._reduce_tasks = int(value)
                    if self._reduce_tasks < 1:
                        raise ValueError('%s should be at least 1' % conf_arg)
                elif canon_arg == 'mapreduce.job.local.dir':
                    # Hadoop supports multiple directories. Sticking with only
                    # one here
                    if not os.path.isdir(value):
                        raise IOError("Directory %s does not exist" % value)
                    self._working_dir = value
Example #19
    def _process_jobconf_args(self, jobconf):
        if jobconf:
            for (conf_arg, value) in jobconf.iteritems():
                # Internally, use one canonical Hadoop version
                canon_arg = translate_jobconf(conf_arg, '0.21')

                if canon_arg == 'mapreduce.job.maps':
                    self._map_tasks = int(value)
                    if self._map_tasks < 1:
                        raise ValueError(
                            '%s should be at least 1' % conf_arg)
                elif canon_arg == 'mapreduce.job.reduces':
                    self._reduce_tasks = int(value)
                    if self._reduce_tasks < 1:
                        raise ValueError('%s should be at least 1' % conf_arg)
                elif canon_arg == 'mapreduce.job.local.dir':
                    # Hadoop supports multiple directories. Sticking with only
                    # one here
                    if not os.path.isdir(value):
                        raise IOError("Directory %s does not exist" % value)
                    self._working_dir = value
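Here the translation runs the other way: whatever spelling the caller used is normalized to a single canonical Hadoop version ('0.21') before being compared, so both spellings of a property collapse to the same name. Using the pair from the tests above:

    from mrjob.compat import translate_jobconf

    translate_jobconf('user.name', '0.21')
    # -> 'mapreduce.job.user.name'
    translate_jobconf('mapreduce.job.user.name', '0.21')
    # -> 'mapreduce.job.user.name' (already the 0.21 spelling, so unchanged)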
Example #20
    def _simulate_jobconf_for_step(self,
                                   step_num,
                                   step_type,
                                   task_num,
                                   working_dir,
                                   input_file=None,
                                   input_start=None,
                                   input_length=None):
        """Simulate jobconf variables set by Hadoop to indicate input
        files, files uploaded, working directory, etc. for a particular step.

        Returns a dictionary mapping jobconf variable name
        (e.g. ``'mapreduce.map.input.file'``) to its value, which is always
        a string.
        """
        # By convention, we use the newer (Hadoop 2) jobconf names and
        # translate them at the very end.
        j = {}

        j['mapreduce.job.id'] = self._job_key
        j['mapreduce.task.output.dir'] = self._output_dir
        j['mapreduce.job.local.dir'] = working_dir
        # archives and files for jobconf
        cache_archives = []
        cache_files = []
        cache_local_archives = []
        cache_local_files = []

        files = self._working_dir_mgr.name_to_path('file').items()
        for name, path in files:
            cache_files.append('%s#%s' % (path, name))
            cache_local_files.append(os.path.join(working_dir, name))

        archives = self._working_dir_mgr.name_to_path('archive').items()
        for name, path in archives:
            cache_archives.append('%s#%s' % (path, name))
            cache_local_archives.append(os.path.join(working_dir, name))

        # TODO: could add mtime info (e.g.
        # mapreduce.job.cache.archives.timestamps) here too
        j['mapreduce.job.cache.files'] = (','.join(cache_files))
        j['mapreduce.job.cache.local.files'] = (','.join(cache_local_files))
        j['mapreduce.job.cache.archives'] = (','.join(cache_archives))
        j['mapreduce.job.cache.local.archives'] = (
            ','.join(cache_local_archives))

        # task and attempt IDs
        # TODO: these are a crappy imitation of task/attempt IDs (see #1254)
        j['mapreduce.task.id'] = 'task_%s_%s_%04d%d' % (
            self._job_key, step_type.lower(), step_num, task_num)
        # (we only have one attempt)
        j['mapreduce.task.attempt.id'] = 'attempt_%s_%s_%04d%d_0' % (
            self._job_key, step_type.lower(), step_num, task_num)

        # not actually sure what's correct for combiners here. It'll definitely
        # be true if we're just using pipes to simulate a combiner though
        j['mapreduce.task.ismap'] = str(step_type in ('mapper',
                                                      'combiner')).lower()

        j['mapreduce.task.partition'] = str(task_num)

        if input_file is not None:
            j['mapreduce.map.input.file'] = input_file
        if input_start is not None:
            j['mapreduce.map.input.start'] = str(input_start)
        if input_length is not None:
            j['mapreduce.map.input.length'] = str(input_length)

        version = self.get_hadoop_version()
        if version:
            # translate to correct version
            j = dict((translate_jobconf(k, version), v) for k, v in j.items())
        else:
            # use all versions
            j = dict((variant, v) for k, v in j.items()
                     for variant in translate_jobconf_for_all_versions(k))

        return j
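The simulated task and attempt IDs above are deliberately crude stand-ins for real Hadoop IDs (see the TODO referencing #1254). With a made-up job key, the format strings render roughly like this:

    job_key = 'mr_my_job.dave.20240101.000000.000000'   # hypothetical
    'task_%s_%s_%04d%d' % (job_key, 'mapper', 0, 3)
    # -> 'task_mr_my_job.dave.20240101.000000.000000_mapper_00003'
    'attempt_%s_%s_%04d%d_0' % (job_key, 'mapper', 0, 3)
    # -> 'attempt_mr_my_job.dave.20240101.000000.000000_mapper_00003_0'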
Example #21
    def _simulate_jobconf_for_step(
            self, step_num, step_type, task_num, working_dir,
            input_file=None, input_start=None, input_length=None):
        """Simulate jobconf variables set by Hadoop to indicate input
        files, files uploaded, working directory, etc. for a particular step.

        Returns a dictionary mapping jobconf variable name
        (e.g. ``'mapreduce.map.input.file'``) to its value, which is always
        a string.
        """
        # By convention, we use the newer (Hadoop 0.21+) jobconf names and
        # translate them at the very end.
        j = {}

        j['mapreduce.job.id'] = self._job_name
        j['mapreduce.task.output.dir'] = self._output_dir
        j['mapreduce.job.local.dir'] = working_dir
        # archives and files for jobconf
        cache_archives = []
        cache_files = []
        cache_local_archives = []
        cache_local_files = []

        files = self._working_dir_mgr.name_to_path('file').iteritems()
        for name, path in files:
            cache_files.append('%s#%s' % (path, name))
            cache_local_files.append(os.path.join(working_dir, name))

        archives = self._working_dir_mgr.name_to_path('archive').iteritems()
        for name, path in archives:
            cache_archives.append('%s#%s' % (path, name))
            cache_local_archives.append(os.path.join(working_dir, name))

        # TODO: could add mtime info (e.g.
        # mapreduce.job.cache.archives.timestamps) here too
        j['mapreduce.job.cache.files'] = (','.join(cache_files))
        j['mapreduce.job.cache.local.files'] = (','.join(cache_local_files))
        j['mapreduce.job.cache.archives'] = (','.join(cache_archives))
        j['mapreduce.job.cache.local.archives'] = (
            ','.join(cache_local_archives))

        # task and attempt IDs
        j['mapreduce.task.id'] = 'task_%s_%s_%05d%d' % (
            self._job_name, step_type.lower(), step_num, task_num)
        # (we only have one attempt)
        j['mapreduce.task.attempt.id'] = 'attempt_%s_%s_%05d%d_0' % (
            self._job_name, step_type.lower(), step_num, task_num)

        # not actually sure what's correct for combiners here. It'll definitely
        # be true if we're just using pipes to simulate a combiner though
        j['mapreduce.task.ismap'] = str(
            step_type in ('mapper', 'combiner')).lower()

        j['mapreduce.task.partition'] = str(task_num)

        if input_file is not None:
            j['mapreduce.map.input.file'] = input_file
        if input_start is not None:
            j['mapreduce.map.input.start'] = str(input_start)
        if input_length is not None:
            j['mapreduce.map.input.length'] = str(input_length)

        # translate to correct version
        version = self.get_hadoop_version()
        j = dict((translate_jobconf(k, version), v) for k, v in j.iteritems())

        return j