def _spark_script_args(self, step_num, last_step_num=None):
    """Generate spark harness args for streaming steps (and handle
    other spark step types the usual way).
    """
    step = self._get_step(step_num)

    if step['type'] != 'streaming':
        return super(SparkMRJobRunner, self)._spark_script_args(
            step_num, last_step_num)

    if last_step_num is None:
        last_step_num = step_num

    args = []

    # class name
    args.append('%s.%s' % (self._job_script_module_name(),
                           self._mrjob_cls.__name__))

    # INPUT
    args.append(','.join(self._step_input_uris(step_num)))

    # OUTPUT
    # note that we use the output dir for the *last* step
    args.append(self._step_output_uri(last_step_num))

    # --counter-output-dir, to simulate counters
    args.extend(
        ['--counter-output-dir', self._counter_output_dir(step_num)])

    # --first-step-num, --last-step-num (step range)
    args.extend(['--first-step-num', str(step_num),
                 '--last-step-num', str(last_step_num)])

    # --job-args (passthrough args)
    job_args = self._mr_job_extra_args()
    if job_args:
        args.extend(['--job-args', cmd_line(job_args)])

    # --compression-codec
    jobconf = self._jobconf_for_step(step_num)

    compress_conf = jobconf_from_dict(
        jobconf, 'mapreduce.output.fileoutputformat.compress')
    codec_conf = jobconf_from_dict(
        jobconf, 'mapreduce.output.fileoutputformat.compress.codec')

    if compress_conf and compress_conf != 'false' and codec_conf:
        args.extend(['--compression-codec', codec_conf])

    # --num-reducers
    num_reducers = jobconf_from_dict(jobconf, 'mapreduce.job.reduces')
    if num_reducers and int(num_reducers) > 0:
        args.extend(['--num-reducers', str(num_reducers)])

    return args
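# Illustrative sketch (not part of the runner): the shape of the argument
# list the method above builds for a hypothetical two-step streaming job
# whose script module is 'mr_word_count'. All concrete paths and values
# here are assumed for illustration only.
#
#   ['mr_word_count.MRWordCount',                  # class name
#    'hdfs:///input/part-0,hdfs:///input/part-1',  # INPUT (step 0)
#    'hdfs:///output/',                            # OUTPUT (last step)
#    '--counter-output-dir', 'hdfs:///tmp/counters/0',
#    '--first-step-num', '0', '--last-step-num', '1',
#    '--num-reducers', '4']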
def _invoke_step(self, step_num, step_type):
    """Run the mapper or reducer for the given step.
    """
    step = self._get_step(step_num)
    if step['type'] != 'streaming':
        raise Exception("LocalMRJobRunner cannot run %s steps" %
                        step['type'])

    jobconf = self._jobconf_for_step(step_num)

    outfile_prefix = 'step-%04d-%s' % (step_num, step_type)

    # allow setting number of tasks from jobconf
    if step_type == 'reducer':
        num_tasks = int(
            jobconf_from_dict(jobconf, 'mapreduce.job.reduces',
                              self._DEFAULT_REDUCE_TASKS))
    else:
        num_tasks = int(
            jobconf_from_dict(jobconf, 'mapreduce.job.maps',
                              self._DEFAULT_MAP_TASKS))

    # get file splits for mappers and reducers
    keep_sorted = (step_type == 'reducer')
    file_splits = self._get_file_splits(self._step_input_paths(),
                                        num_tasks,
                                        keep_sorted=keep_sorted)

    # since we have grabbed the files from _prev_outfiles as input
    # to this step, reset _prev_outfiles
    self._prev_outfiles = []

    # Start the tasks associated with the step:
    # if we need to sort, then just sort all input files into one file;
    # otherwise, split the files needed for mappers and reducers
    # and set up the task environment for each

    # The correctly-ordered list of (task_num, file_name) pairs
    file_tasks = sorted([(t['task_num'], file_name)
                         for file_name, t in file_splits.items()],
                        key=lambda t: t[0])

    for task_num, input_path in file_tasks:
        # make a new working_dir for each task
        working_dir = os.path.join(self._get_local_tmp_dir(),
                                   'job_local_dir', str(step_num),
                                   step_type, str(task_num))
        self._setup_working_dir(working_dir)

        log.debug("File name %s" % input_path)

        # set up environment variables
        split_kwargs = {}
        if step_type == 'mapper':
            # mappers have extra file split info
            split_kwargs = dict(
                input_file=file_splits[input_path]['orig_name'],
                input_start=file_splits[input_path]['start'],
                input_length=file_splits[input_path]['length'])

        env = self._subprocess_env(step_num, step_type, task_num,
                                   working_dir, **split_kwargs)

        output_path = os.path.join(
            self._get_local_tmp_dir(),
            outfile_prefix + '_part-%05d' % task_num)
        log.debug('Writing to %s' % output_path)

        self._run_step(step_num, step_type, input_path, output_path,
                       working_dir, env)

        self._prev_outfiles.append(output_path)

    self._per_step_runner_finish(step_num)

    counters = self._counters[step_num]
    if counters:
        log.info(_format_counters(counters))
def test_get_missing_jobconf_not_in_table(self):
    # there was a bug where defaults didn't work for jobconf
    # variables that we don't know about
    self.assertEqual(jobconf_from_dict({}, 'user.defined'), None)
    self.assertEqual(
        jobconf_from_dict({}, 'user.defined', 'beauty'), 'beauty')
def test_default(self):
    self.assertEqual(jobconf_from_dict({}, 'user.name'), None)
    self.assertEqual(jobconf_from_dict({}, 'user.name', 'dave'), 'dave')
def test_get_new_hadoop_jobconf(self):
    jobconf = {'mapreduce.job.user.name': 'Edsger W. Dijkstra'}
    self.assertEqual(jobconf_from_dict(jobconf, 'user.name'),
                     'Edsger W. Dijkstra')
    self.assertEqual(jobconf_from_dict(jobconf, 'mapreduce.job.user.name'),
                     'Edsger W. Dijkstra')
def _invoke_step(self, step_num, step_type):
    """Run the mapper or reducer for the given step.
    """
    step = self._get_step(step_num)
    if step['type'] != 'streaming':
        raise Exception("LocalMRJobRunner cannot run %s steps" %
                        step['type'])

    jobconf = self._jobconf_for_step(step_num)

    outfile_prefix = 'step-%d-%s' % (step_num, step_type)

    # allow setting number of tasks from jobconf
    if step_type == 'reducer':
        num_tasks = int(jobconf_from_dict(
            jobconf, 'mapreduce.job.reduces', self._DEFAULT_REDUCE_TASKS))
    else:
        num_tasks = int(jobconf_from_dict(
            jobconf, 'mapreduce.job.maps', self._DEFAULT_MAP_TASKS))

    # get file splits for mappers and reducers
    keep_sorted = (step_type == 'reducer')
    file_splits = self._get_file_splits(
        self._step_input_paths(), num_tasks, keep_sorted=keep_sorted)

    # since we have grabbed the files from _prev_outfiles as input
    # to this step, reset _prev_outfiles
    self._prev_outfiles = []

    # Start the tasks associated with the step:
    # if we need to sort, then just sort all input files into one file;
    # otherwise, split the files needed for mappers and reducers
    # and set up the task environment for each

    # The correctly-ordered list of (task_num, file_name) pairs
    file_tasks = sorted([
        (t['task_num'], file_name)
        for file_name, t in file_splits.items()], key=lambda t: t[0])

    for task_num, input_path in file_tasks:
        # make a new working_dir for each task
        working_dir = os.path.join(
            self._get_local_tmp_dir(), 'job_local_dir', str(step_num),
            step_type, str(task_num))
        self._setup_working_dir(working_dir)

        log.debug("File name %s" % input_path)

        # set up environment variables
        split_kwargs = {}
        if step_type == 'mapper':
            # mappers have extra file split info
            split_kwargs = dict(
                input_file=file_splits[input_path]['orig_name'],
                input_start=file_splits[input_path]['start'],
                input_length=file_splits[input_path]['length'])

        env = self._subprocess_env(
            step_num, step_type, task_num, working_dir, **split_kwargs)

        output_path = os.path.join(
            self._get_local_tmp_dir(),
            outfile_prefix + '_part-%05d' % task_num)
        log.info('writing to %s' % output_path)

        self._run_step(step_num, step_type, input_path, output_path,
                       working_dir, env)

        self._prev_outfiles.append(output_path)

    self.per_step_runner_finish(step_num)
    self.print_counters([step_num + 1])
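# Illustrative sketch (inferred from how _invoke_step indexes the result,
# not from _get_file_splits itself): file_splits maps each split file to a
# dict carrying its task number plus the original file name and byte range
# of the split. Paths and numbers below are assumed for illustration only.
#
#   file_splits = {
#       '/tmp/job/input_part-00000': {
#           'task_num': 0,
#           'orig_name': '/tmp/job/input.txt',  # file the split came from
#           'start': 0,          # byte offset where the split begins
#           'length': 65536,     # number of bytes in the split
#       },
#   }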
def _spark_script_args(self, step_num, last_step_num=None):
    """Generate spark harness args for streaming steps (and handle
    other spark step types the usual way).
    """
    if last_step_num is None:
        last_step_num = step_num

    steps = self._get_steps()[step_num:last_step_num + 1]

    if steps[0]['type'] != 'streaming':
        return super(SparkMRJobRunner, self)._spark_script_args(
            step_num, last_step_num)

    args = []

    # class name
    args.append('%s.%s' % (self._job_script_module_name(),
                           self._mrjob_cls.__name__))

    # INPUT
    args.append(','.join(self._step_input_uris(step_num)))

    # OUTPUT
    # note that we use the output dir for the *last* step
    args.append(self._step_output_uri(last_step_num))

    # --hadoop-input-format. Pass '' to indicate we know there is none
    args.extend(['--hadoop-input-format', self._hadoop_input_format or ''])

    # --hadoop-output-format. Pass '' to indicate we know there is none
    args.extend(['--hadoop-output-format', self._hadoop_output_format or ''])

    # --sort-values
    if self._sort_values:
        args.append('--sort-values')
    else:
        args.append('--no-sort-values')

    # --steps-desc
    args.extend(['--steps-desc', json.dumps(steps)])

    # --counter-output-dir, to simulate counters
    args.extend(['--counter-output-dir',
                 self._counter_output_dir(step_num)])

    # --first-step-num, --last-step-num (step range)
    args.extend(['--first-step-num', str(step_num),
                 '--last-step-num', str(last_step_num)])

    # --job-args (passthrough args)
    # if on local[*] master, keep file upload args as-is (see #2031)
    job_args = self._mr_job_extra_args(
        local=not self._spark_executors_have_own_wd())
    if job_args:
        args.extend(['--job-args', cmd_line(job_args)])

    # --compression-codec
    jobconf = self._jobconf_for_step(step_num)

    compress_conf = jobconf_from_dict(
        jobconf, 'mapreduce.output.fileoutputformat.compress')
    codec_conf = jobconf_from_dict(
        jobconf, 'mapreduce.output.fileoutputformat.compress.codec')

    if compress_conf and compress_conf != 'false' and codec_conf:
        args.extend(['--compression-codec', codec_conf])

    # --num-reducers
    num_reducers = jobconf_from_dict(jobconf, 'mapreduce.job.reduces')
    if num_reducers and int(num_reducers) > 0:
        args.extend(['--num-reducers', str(num_reducers)])

    # --max-output-files
    if self._max_output_files:
        args.extend(['--max-output-files', str(self._max_output_files)])

    if self._opts['emulate_map_input_file']:
        args.append('--emulate-map-input-file')

    return args
def _spark_script_args(self, step_num, last_step_num=None):
    """Generate spark harness args for streaming steps (and handle
    other spark step types the usual way).
    """
    if last_step_num is None:
        last_step_num = step_num

    steps = self._get_steps()[step_num:last_step_num + 1]

    if steps[0]['type'] != 'streaming':
        return super(SparkMRJobRunner, self)._spark_script_args(
            step_num, last_step_num)

    args = []

    # class name
    args.append('%s.%s' % (self._job_script_module_name(),
                           self._mrjob_cls.__name__))

    # INPUT
    args.append(','.join(self._step_input_uris(step_num)))

    # OUTPUT
    # note that we use the output dir for the *last* step
    args.append(self._step_output_uri(last_step_num))

    # --hadoop-input-format. Pass '' to indicate we know there is none
    args.extend(['--hadoop-input-format', self._hadoop_input_format or ''])

    # --hadoop-output-format. Pass '' to indicate we know there is none
    args.extend(['--hadoop-output-format', self._hadoop_output_format or ''])

    # --sort-values
    if self._sort_values:
        args.append('--sort-values')
    else:
        args.append('--no-sort-values')

    # --steps-desc
    args.extend(['--steps-desc', json.dumps(steps)])

    # --counter-output-dir, to simulate counters
    args.extend(['--counter-output-dir',
                 self._counter_output_dir(step_num)])

    # --first-step-num, --last-step-num (step range)
    args.extend(['--first-step-num', str(step_num),
                 '--last-step-num', str(last_step_num)])

    # --job-args (passthrough args)
    # if on local[*] master, keep file upload args as-is (see #2031)
    job_args = self._mr_job_extra_args(
        local=not self._spark_executors_have_own_wd())
    if job_args:
        args.extend(['--job-args', cmd_line(job_args)])

    # --compression-codec
    jobconf = self._jobconf_for_step(step_num)

    compress_conf = jobconf_from_dict(
        jobconf, 'mapreduce.output.fileoutputformat.compress')
    codec_conf = jobconf_from_dict(
        jobconf, 'mapreduce.output.fileoutputformat.compress.codec')

    if compress_conf and compress_conf != 'false' and codec_conf:
        args.extend(['--compression-codec', codec_conf])

    # --num-reducers
    num_reducers = jobconf_from_dict(jobconf, 'mapreduce.job.reduces')
    if num_reducers and int(num_reducers) > 0:
        args.extend(['--num-reducers', str(num_reducers)])

    # --max-output-files
    if self._max_output_files:
        args.extend(['--max-output-files', str(self._max_output_files)])

    return args
def test_default(self):
    self.assertEqual(jobconf_from_dict({}, "user.name"), None)
    self.assertEqual(jobconf_from_dict({}, "user.name", "dave"), "dave")
def test_get_new_hadoop_jobconf(self):
    jobconf = {"mapreduce.job.user.name": "Edsger W. Dijkstra"}
    self.assertEqual(jobconf_from_dict(jobconf, "user.name"),
                     "Edsger W. Dijkstra")
    self.assertEqual(jobconf_from_dict(jobconf, "mapreduce.job.user.name"),
                     "Edsger W. Dijkstra")
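# Minimal usage sketch of jobconf_from_dict, assuming it is imported from
# mrjob.compat as in the runners above; the dict contents and return values
# are illustrative only. As the tests show, lookups translate between
# equivalent old- and new-style Hadoop option names, and the optional third
# argument supplies a default for keys that are missing from the dict.
#
#   from mrjob.compat import jobconf_from_dict
#
#   jobconf = {'mapreduce.job.reduces': '4'}
#
#   # direct lookup, as the runners above do for --num-reducers
#   jobconf_from_dict(jobconf, 'mapreduce.job.reduces')   # -> '4'
#
#   # lookup by the old-style name (assuming 'mapred.reduce.tasks' is the
#   # Hadoop 1 equivalent in mrjob's translation table)
#   jobconf_from_dict(jobconf, 'mapred.reduce.tasks')     # -> '4'
#
#   # missing key falls back to the default, the same pattern _invoke_step
#   # uses with self._DEFAULT_MAP_TASKS
#   jobconf_from_dict(jobconf, 'mapreduce.job.maps', 2)   # -> 2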