def _simulate_jobconf_for_step(self, task_type, step_num, task_num, map_split=None):
    j = {}

    # TODO: these are really poor imitations of Hadoop IDs. See #1254
    j['mapreduce.job.id'] = self._job_key
    j['mapreduce.task.id'] = 'task_%s_%s_%04d%d' % (
        self._job_key, task_type.lower(), step_num, task_num)
    j['mapreduce.task.attempt.id'] = 'attempt_%s_%s_%04d%d_0' % (
        self._job_key, task_type.lower(), step_num, task_num)

    j['mapreduce.task.ismap'] = str(task_type == 'mapper').lower()

    # TODO: is this the correct format?
    j['mapreduce.task.partition'] = str(task_num)

    j['mapreduce.task.output.dir'] = self._output_dir_for_step(step_num)

    working_dir = self._task_working_dir(task_type, step_num, task_num)
    j['mapreduce.job.local.dir'] = working_dir

    for x in ('archive', 'file'):
        named_paths = sorted(self._working_dir_mgr.name_to_path(x).items())

        # mapreduce.job.cache.archives
        # mapreduce.job.cache.files
        j['mapreduce.job.cache.%ss' % x] = ','.join(
            '%s#%s' % (path, name) for name, path in named_paths)

        # mapreduce.job.cache.local.archives
        # mapreduce.job.cache.local.files
        j['mapreduce.job.cache.local.%ss' % x] = ','.join(
            join(working_dir, name) for name, path in named_paths)

    if map_split:
        j['mapreduce.map.input.file'] = 'file://' + map_split['file']
        j['mapreduce.map.input.length'] = str(map_split['length'])
        j['mapreduce.map.input.start'] = str(map_split['start'])

    # translate to correct version

    # don't use translate_jobconf_dict(); that's meant to add keys
    # to user-supplied jobconf
    hadoop_version = self.get_hadoop_version()

    if hadoop_version:
        return {
            translate_jobconf(k, hadoop_version): v
            for k, v in j.items()
        }
    else:
        return {
            tk: v
            for k, v in j.items()
            for tk in translate_jobconf_for_all_versions(k)
        }
def _simulate_jobconf_for_step(
        self, task_type, step_num, task_num, map_split=None):
    j = {}

    # TODO: these are really poor imitations of Hadoop IDs. See #1254
    j['mapreduce.job.id'] = self._job_key
    j['mapreduce.task.id'] = 'task_%s_%s_%04d%d' % (
        self._job_key, task_type.lower(), step_num, task_num)
    j['mapreduce.task.attempt.id'] = 'attempt_%s_%s_%04d%d_0' % (
        self._job_key, task_type.lower(), step_num, task_num)

    j['mapreduce.task.ismap'] = str(task_type == 'mapper').lower()

    # TODO: is this the correct format?
    j['mapreduce.task.partition'] = str(task_num)

    j['mapreduce.task.output.dir'] = self._output_dir_for_step(step_num)

    working_dir = self._task_working_dir(task_type, step_num, task_num)
    j['mapreduce.job.local.dir'] = working_dir

    for x in ('archive', 'file'):
        named_paths = sorted(self._working_dir_mgr.name_to_path(x).items())

        # mapreduce.job.cache.archives
        # mapreduce.job.cache.files
        j['mapreduce.job.cache.%ss' % x] = ','.join(
            '%s#%s' % (path, name) for name, path in named_paths)

        # mapreduce.job.cache.local.archives
        # mapreduce.job.cache.local.files
        j['mapreduce.job.cache.local.%ss' % x] = ','.join(
            join(working_dir, name) for name, path in named_paths)

    if map_split:
        # mapreduce.map.input.file
        # mapreduce.map.input.start
        # mapreduce.map.input.length
        for key, value in map_split.items():
            j['mapreduce.map.input.' + key] = str(value)

    # translate to correct version

    # don't use translate_jobconf_dict(); that's meant to add keys
    # to user-supplied jobconf
    hadoop_version = self.get_hadoop_version()

    if hadoop_version:
        return {translate_jobconf(k, hadoop_version): v
                for k, v in j.items()}
    else:
        return {tk: v for k, v in j.items()
                for tk in translate_jobconf_for_all_versions(k)}
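# A quick standalone sketch of the simulated task/attempt ID format used
# above, with a hypothetical job key. As the TODO notes, these are only rough
# imitations of real Hadoop IDs (see #1254).
job_key = 'mr_wc.user.20230101.000000.000000'  # hypothetical
task_type, step_num, task_num = 'mapper', 0, 3

task_id = 'task_%s_%s_%04d%d' % (
    job_key, task_type.lower(), step_num, task_num)
attempt_id = 'attempt_%s_%s_%04d%d_0' % (
    job_key, task_type.lower(), step_num, task_num)

print(task_id)     # task_mr_wc.user.20230101.000000.000000_mapper_00003
print(attempt_id)  # attempt_mr_wc.user.20230101.000000.000000_mapper_00003_0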
def _sort_values_jobconf(self):
    """Jobconf dictionary to enable sorting by value."""
    if not self._sort_values:
        return {}

    # translate _SORT_VALUES_JOBCONF to the correct Hadoop version,
    # without logging a warning
    hadoop_version = self.get_hadoop_version()

    jobconf = {}
    for k, v in _SORT_VALUES_JOBCONF.items():
        if hadoop_version:
            jobconf[translate_jobconf(k, hadoop_version)] = v
        else:
            for j in translate_jobconf_for_all_versions(k):
                jobconf[j] = v

    return jobconf
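# A minimal standalone sketch of the fan-out _sort_values_jobconf() does when
# no Hadoop version is known: every key expands to all of its variants. The
# key/value pair below is a hypothetical stand-in, not the real contents of
# _SORT_VALUES_JOBCONF; the variants for 'user.name' match the test later in
# this section.
from mrjob.compat import translate_jobconf_for_all_versions

fake_sort_values_jobconf = {'user.name': 'example'}  # hypothetical stand-in

jobconf = {}
for k, v in fake_sort_values_jobconf.items():
    for variant in translate_jobconf_for_all_versions(k):
        jobconf[variant] = v

print(jobconf)
# expected: {'mapreduce.job.user.name': 'example', 'user.name': 'example'}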
def _simulate_jobconf_for_step(self, step_num, step_type, task_num,
                               working_dir, input_file=None,
                               input_start=None, input_length=None):
    """Simulate jobconf variables set by Hadoop to indicate input files,
    files uploaded, working directory, etc. for a particular step.

    Returns a dictionary mapping jobconf variable name (e.g.
    ``'mapreduce.map.input.file'``) to its value, which is always a string.
    """
    # By convention, we use the newer (Hadoop 2) jobconf names and
    # translate them at the very end.
    j = {}

    j['mapreduce.job.id'] = self._job_key
    j['mapreduce.task.output.dir'] = self._output_dir
    j['mapreduce.job.local.dir'] = working_dir

    # archives and files for jobconf
    cache_archives = []
    cache_files = []
    cache_local_archives = []
    cache_local_files = []

    files = self._working_dir_mgr.name_to_path('file').items()
    for name, path in files:
        cache_files.append('%s#%s' % (path, name))
        cache_local_files.append(os.path.join(working_dir, name))

    archives = self._working_dir_mgr.name_to_path('archive').items()
    for name, path in archives:
        cache_archives.append('%s#%s' % (path, name))
        cache_local_archives.append(os.path.join(working_dir, name))

    # TODO: could add mtime info (e.g.
    # mapreduce.job.cache.archives.timestamps) here too
    j['mapreduce.job.cache.files'] = ','.join(cache_files)
    j['mapreduce.job.cache.local.files'] = ','.join(cache_local_files)
    j['mapreduce.job.cache.archives'] = ','.join(cache_archives)
    j['mapreduce.job.cache.local.archives'] = ','.join(cache_local_archives)

    # task and attempt IDs
    # TODO: these are a crappy imitation of task/attempt IDs (see #1254)
    j['mapreduce.task.id'] = 'task_%s_%s_%04d%d' % (
        self._job_key, step_type.lower(), step_num, task_num)
    # (we only have one attempt)
    j['mapreduce.task.attempt.id'] = 'attempt_%s_%s_%04d%d_0' % (
        self._job_key, step_type.lower(), step_num, task_num)

    # not actually sure what's correct for combiners here. It'll definitely
    # be true if we're just using pipes to simulate a combiner though
    j['mapreduce.task.ismap'] = str(
        step_type in ('mapper', 'combiner')).lower()

    j['mapreduce.task.partition'] = str(task_num)

    if input_file is not None:
        j['mapreduce.map.input.file'] = input_file
    if input_start is not None:
        j['mapreduce.map.input.start'] = str(input_start)
    if input_length is not None:
        j['mapreduce.map.input.length'] = str(input_length)

    version = self.get_hadoop_version()
    if version:
        # translate to correct version
        j = dict((translate_jobconf(k, version), v)
                 for k, v in j.items())
    else:
        # use all versions
        j = dict((variant, v)
                 for k, v in j.items()
                 for variant in translate_jobconf_for_all_versions(k))

    return j
def test_translate_jobconf_for_all_versions(self):
    self.assertEqual(translate_jobconf_for_all_versions('user.name'),
                     ['mapreduce.job.user.name', 'user.name'])
    self.assertEqual(translate_jobconf_for_all_versions('foo.bar'),
                     ['foo.bar'])
def _simulate_jobconf_for_step(
        self, step_num, step_type, task_num, working_dir,
        input_file=None, input_start=None, input_length=None):
    """Simulate jobconf variables set by Hadoop to indicate input files,
    files uploaded, working directory, etc. for a particular step.

    Returns a dictionary mapping jobconf variable name (e.g.
    ``'mapreduce.map.input.file'``) to its value, which is always a string.
    """
    # By convention, we use the newer (Hadoop 2) jobconf names and
    # translate them at the very end.
    j = {}

    j['mapreduce.job.id'] = self._job_key
    j['mapreduce.task.output.dir'] = self._output_dir
    j['mapreduce.job.local.dir'] = working_dir

    # archives and files for jobconf
    cache_archives = []
    cache_files = []
    cache_local_archives = []
    cache_local_files = []

    files = self._working_dir_mgr.name_to_path('file').items()
    for name, path in files:
        cache_files.append('%s#%s' % (path, name))
        cache_local_files.append(os.path.join(working_dir, name))

    archives = self._working_dir_mgr.name_to_path('archive').items()
    for name, path in archives:
        cache_archives.append('%s#%s' % (path, name))
        cache_local_archives.append(os.path.join(working_dir, name))

    # TODO: could add mtime info (e.g.
    # mapreduce.job.cache.archives.timestamps) here too
    j['mapreduce.job.cache.files'] = ','.join(cache_files)
    j['mapreduce.job.cache.local.files'] = ','.join(cache_local_files)
    j['mapreduce.job.cache.archives'] = ','.join(cache_archives)
    j['mapreduce.job.cache.local.archives'] = ','.join(cache_local_archives)

    # task and attempt IDs
    j['mapreduce.task.id'] = 'task_%s_%s_%05d%d' % (
        self._job_key, step_type.lower(), step_num, task_num)
    # (we only have one attempt)
    j['mapreduce.task.attempt.id'] = 'attempt_%s_%s_%05d%d_0' % (
        self._job_key, step_type.lower(), step_num, task_num)

    # not actually sure what's correct for combiners here. It'll definitely
    # be true if we're just using pipes to simulate a combiner though
    j['mapreduce.task.ismap'] = str(
        step_type in ('mapper', 'combiner')).lower()

    j['mapreduce.task.partition'] = str(task_num)

    if input_file is not None:
        j['mapreduce.map.input.file'] = input_file
    if input_start is not None:
        j['mapreduce.map.input.start'] = str(input_start)
    if input_length is not None:
        j['mapreduce.map.input.length'] = str(input_length)

    version = self.get_hadoop_version()
    if version:
        # translate to correct version
        j = dict((translate_jobconf(k, version), v)
                 for k, v in j.items())
    else:
        # use all versions
        j = dict((variant, v)
                 for k, v in j.items()
                 for variant in translate_jobconf_for_all_versions(k))

    return j
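# Standalone sketch of how the mapreduce.job.cache.* values above are built:
# 'path#name' pairs for the uploaded copies, and paths under the task's
# working dir for the local copies. The names and paths here are hypothetical,
# and entries are sorted only to make the output deterministic.
import os.path


def simulate_cache_jobconf(named_paths, working_dir):
    """Return (cache_files, cache_local_files) for a {name: path} dict."""
    cache_files = ','.join(
        '%s#%s' % (path, name) for name, path in sorted(named_paths.items()))
    cache_local_files = ','.join(
        os.path.join(working_dir, name) for name in sorted(named_paths))
    return cache_files, cache_local_files


print(simulate_cache_jobconf(
    {'wordlist.txt': '/tmp/uploads/wordlist.txt'}, '/tmp/job/mapper/00000/0'))
# ('/tmp/uploads/wordlist.txt#wordlist.txt',
#  '/tmp/job/mapper/00000/0/wordlist.txt')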
def test_translate_jobconf_for_all_versions(self):
    self.assertEqual(translate_jobconf_for_all_versions("user.name"),
                     ["mapreduce.job.user.name", "user.name"])
    self.assertEqual(translate_jobconf_for_all_versions("foo.bar"),
                     ["foo.bar"])
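# Sketch of how translate_jobconf() is expected to behave for a specific
# Hadoop version (the single-version branch used by the runners above). The
# version-specific results below are my assumption; only the all-versions
# list for 'user.name' is asserted by the tests in this section. Both helpers
# are assumed to live in mrjob.compat.
from mrjob.compat import translate_jobconf

print(translate_jobconf('user.name', '2.7.1'))  # expected: 'mapreduce.job.user.name'
print(translate_jobconf('user.name', '1.0.3'))  # expected: 'user.name'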