Example 1
    def _spark_script_args(self, step_num, last_step_num=None):
        """Generate spark harness args for streaming steps (and handle
        other spark step types the usual way).
        """
        step = self._get_step(step_num)

        if step['type'] != 'streaming':
            return super(SparkMRJobRunner,
                         self)._spark_script_args(step_num, last_step_num)

        if last_step_num is None:
            last_step_num = step_num

        args = []

        # class name
        args.append('%s.%s' %
                    (self._job_script_module_name(), self._mrjob_cls.__name__))

        # INPUT
        args.append(','.join(self._step_input_uris(step_num)))

        # OUTPUT
        # note that we use the output dir for the *last* step
        args.append(self._step_output_uri(last_step_num))

        # --counter-output-dir, to simulate counters
        args.extend(
            ['--counter-output-dir',
             self._counter_output_dir(step_num)])

        # --first-step-num, --last-step-num (step range)
        args.extend([
            '--first-step-num',
            str(step_num), '--last-step-num',
            str(last_step_num)
        ])

        # --job-args (passthrough args)
        job_args = self._mr_job_extra_args()
        if job_args:
            args.extend(['--job-args', cmd_line(job_args)])

        # --compression-codec
        jobconf = self._jobconf_for_step(step_num)

        compress_conf = jobconf_from_dict(
            jobconf, 'mapreduce.output.fileoutputformat.compress')
        codec_conf = jobconf_from_dict(
            jobconf, 'mapreduce.output.fileoutputformat.compress.codec')

        if compress_conf and compress_conf != 'false' and codec_conf:
            args.extend(['--compression-codec', codec_conf])

        # --num-reducers
        num_reducers = jobconf_from_dict(jobconf, 'mapreduce.job.reduces')
        if num_reducers and int(num_reducers) > 0:
            args.extend(['--num-reducers', str(num_reducers)])

        return args
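For orientation, the argument list this method builds for a hypothetical single streaming step could look roughly like the sketch below. The module name, class name, and URIs are invented for illustration; only the flag names come from the code above, and the commented-out flags are appended only when passthrough args or jobconf values call for them.

    # Illustrative only: the rough shape of the harness args for a
    # hypothetical one-step streaming job. All concrete values are made up.
    harness_args = [
        'mr_word_count.MRWordCount',     # <job script module>.<MRJob class>
        'hdfs:///tmp/step-input',        # INPUT (comma-joined step input URIs)
        'hdfs:///tmp/job-output',        # OUTPUT (output dir of the *last* step)
        '--counter-output-dir', 'hdfs:///tmp/counters',
        '--first-step-num', '0',
        '--last-step-num', '0',
        # '--job-args', '<quoted passthrough args>',
        # '--compression-codec', 'org.apache.hadoop.io.compress.GzipCodec',
        # '--num-reducers', '4',
    ]

    print(' '.join(harness_args))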
Example 2
    def _invoke_step(self, step_num, step_type):
        """Run the mapper or reducer for the given step.
        """
        step = self._get_step(step_num)

        if step['type'] != 'streaming':
            raise Exception("LocalMRJobRunner cannot run %s steps" %
                            step['type'])

        jobconf = self._jobconf_for_step(step_num)

        outfile_prefix = 'step-%04d-%s' % (step_num, step_type)

        # allow setting number of tasks from jobconf
        if step_type == 'reducer':
            num_tasks = int(
                jobconf_from_dict(jobconf, 'mapreduce.job.reduces',
                                  self._DEFAULT_REDUCE_TASKS))
        else:
            num_tasks = int(
                jobconf_from_dict(jobconf, 'mapreduce.job.maps',
                                  self._DEFAULT_MAP_TASKS))

        # get file splits for mappers and reducers
        keep_sorted = (step_type == 'reducer')
        file_splits = self._get_file_splits(self._step_input_paths(),
                                            num_tasks,
                                            keep_sorted=keep_sorted)

        # since we have grabbed the files from _prev_outfiles as input
        # to this step, reset _prev_outfiles
        self._prev_outfiles = []

        # Start the tasks associated with the step:
        # if we need to sort, then just sort all input files into one file
        # otherwise, split the files needed for mappers and reducers
        # and set up the task environment for each

        # The correctly-ordered list of task_num, file_name pairs
        file_tasks = sorted([(t['task_num'], file_name)
                             for file_name, t in file_splits.items()],
                            key=lambda t: t[0])

        for task_num, input_path in file_tasks:
            # make a new working_dir for each task
            working_dir = os.path.join(self._get_local_tmp_dir(),
                                       'job_local_dir', str(step_num),
                                       step_type, str(task_num))
            self._setup_working_dir(working_dir)

            log.debug("File name %s" % input_path)
            # set up environment variables
            split_kwargs = {}
            if step_type == 'mapper':
                # mappers have extra file split info
                split_kwargs = dict(
                    input_file=file_splits[input_path]['orig_name'],
                    input_start=file_splits[input_path]['start'],
                    input_length=file_splits[input_path]['length'])

            env = self._subprocess_env(step_num, step_type, task_num,
                                       working_dir, **split_kwargs)

            output_path = os.path.join(
                self._get_local_tmp_dir(),
                outfile_prefix + '_part-%05d' % task_num)
            log.debug('Writing to %s' % output_path)

            self._run_step(step_num, step_type, input_path, output_path,
                           working_dir, env)

            self._prev_outfiles.append(output_path)

        self._per_step_runner_finish(step_num)
        counters = self._counters[step_num]
        if counters:
            log.info(_format_counters(counters))
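The per-task paths built above follow a simple naming scheme. The snippet below is a self-contained sketch of that scheme for step 0, mapper task 3; the temp dir is hypothetical, standing in for whatever _get_local_tmp_dir() returns.

    import os

    # Sketch of the per-task path layout used above (step 0, mapper, task 3).
    # tmp_dir is a hypothetical stand-in for _get_local_tmp_dir().
    tmp_dir = '/tmp/mrjob.example'
    step_num, step_type, task_num = 0, 'mapper', 3

    working_dir = os.path.join(tmp_dir, 'job_local_dir',
                               str(step_num), step_type, str(task_num))
    outfile_prefix = 'step-%04d-%s' % (step_num, step_type)
    output_path = os.path.join(tmp_dir, outfile_prefix + '_part-%05d' % task_num)

    print(working_dir)   # /tmp/mrjob.example/job_local_dir/0/mapper/3
    print(output_path)   # /tmp/mrjob.example/step-0000-mapper_part-00003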
Example 3
 def test_get_missing_jobconf_not_in_table(self):
     # there was a bug where defaults didn't work for jobconf
     # variables that we don't know about
     self.assertEqual(jobconf_from_dict({}, 'user.defined'), None)
     self.assertEqual(jobconf_from_dict({}, 'user.defined', 'beauty'),
                      'beauty')
Example 4
 def test_default(self):
     self.assertEqual(jobconf_from_dict({}, 'user.name'), None)
     self.assertEqual(jobconf_from_dict({}, 'user.name', 'dave'), 'dave')
Example 5
 def test_get_new_hadoop_jobconf(self):
     jobconf = {'mapreduce.job.user.name': 'Edsger W. Dijkstra'}
     self.assertEqual(jobconf_from_dict(jobconf, 'user.name'),
                      'Edsger W. Dijkstra')
     self.assertEqual(jobconf_from_dict(jobconf, 'mapreduce.job.user.name'),
                      'Edsger W. Dijkstra')
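Taken together, the three tests above pin down the contract of jobconf_from_dict: a variable can be looked up under either its Hadoop 1 or Hadoop 2 name, and the default is returned even for variables that mrjob's translation table does not know about. A minimal sketch of that contract (not mrjob's actual implementation, and with only one translation pair filled in) might look like this:

    # Sketch only: NOT mrjob's implementation, just the behavior the tests
    # above exercise. Real mrjob ships a full old-name/new-name table.
    _NEW_TO_OLD = {'mapreduce.job.user.name': 'user.name'}
    _OLD_TO_NEW = {old: new for new, old in _NEW_TO_OLD.items()}

    def sketch_jobconf_from_dict(jobconf, name, default=None):
        # try the name as given, then its translated counterpart
        for key in (name, _NEW_TO_OLD.get(name), _OLD_TO_NEW.get(name)):
            if key is not None and key in jobconf:
                return jobconf[key]
        return default

    assert sketch_jobconf_from_dict(
        {'mapreduce.job.user.name': 'Edsger W. Dijkstra'},
        'user.name') == 'Edsger W. Dijkstra'
    assert sketch_jobconf_from_dict({}, 'user.defined', 'beauty') == 'beauty'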
Example 9
    def _invoke_step(self, step_num, step_type):
        """Run the mapper or reducer for the given step.
        """
        step = self._get_step(step_num)

        if step['type'] != 'streaming':
            raise Exception("LocalMRJobRunner cannot run %s steps" %
                            step['type'])

        jobconf = self._jobconf_for_step(step_num)

        outfile_prefix = 'step-%d-%s' % (step_num, step_type)

        # allow setting number of tasks from jobconf
        if step_type == 'reducer':
            num_tasks = int(jobconf_from_dict(
                jobconf, 'mapreduce.job.reduces', self._DEFAULT_REDUCE_TASKS))
        else:
            num_tasks = int(jobconf_from_dict(
                jobconf, 'mapreduce.job.maps', self._DEFAULT_MAP_TASKS))

        # get file splits for mappers and reducers
        keep_sorted = (step_type == 'reducer')
        file_splits = self._get_file_splits(
            self._step_input_paths(), num_tasks, keep_sorted=keep_sorted)

        # since we have grabbed the files from _prev_outfiles as input
        # to this step, reset _prev_outfiles
        self._prev_outfiles = []

        # Start the tasks associated with the step:
        # if we need to sort, then just sort all input files into one file
        # otherwise, split the files needed for mappers and reducers
        # and set up the task environment for each

        # The correctly-ordered list of task_num, file_name pairs
        file_tasks = sorted([
            (t['task_num'], file_name) for file_name, t
            in file_splits.items()], key=lambda t: t[0])

        for task_num, input_path in file_tasks:
            # make a new working_dir for each task
            working_dir = os.path.join(
                self._get_local_tmp_dir(),
                'job_local_dir', str(step_num), step_type, str(task_num))
            self._setup_working_dir(working_dir)

            log.debug("File name %s" % input_path)
            # set up environment variables
            split_kwargs = {}
            if step_type == 'mapper':
                # mappers have extra file split info
                split_kwargs = dict(
                    input_file=file_splits[input_path]['orig_name'],
                    input_start=file_splits[input_path]['start'],
                    input_length=file_splits[input_path]['length'])

            env = self._subprocess_env(
                step_num, step_type, task_num, working_dir, **split_kwargs)

            output_path = os.path.join(
                self._get_local_tmp_dir(),
                outfile_prefix + '_part-%05d' % task_num)
            log.info('writing to %s' % output_path)

            self._run_step(step_num, step_type, input_path, output_path,
                           working_dir, env)

            self._prev_outfiles.append(output_path)

        self.per_step_runner_finish(step_num)
        self.print_counters([step_num + 1])
Example 10
    def _spark_script_args(self, step_num, last_step_num=None):
        """Generate spark harness args for streaming steps (and handle
        other spark step types the usual way).
        """
        if last_step_num is None:
            last_step_num = step_num

        steps = self._get_steps()[step_num:last_step_num + 1]

        if steps[0]['type'] != 'streaming':
            return super(SparkMRJobRunner, self)._spark_script_args(
                step_num, last_step_num)

        args = []

        # class name
        args.append('%s.%s' % (self._job_script_module_name(),
                               self._mrjob_cls.__name__))

        # INPUT
        args.append(
            ','.join(self._step_input_uris(step_num)))

        # OUTPUT
        # note that we use the output dir for the *last* step
        args.append(
            self._step_output_uri(last_step_num))

        # --hadoop-input-format. Pass '' to indicate we know there is none
        args.extend(['--hadoop-input-format',
                     self._hadoop_input_format or ''])

        # --hadoop-output-format. Pass '' to indicate we know there is none
        args.extend(['--hadoop-output-format',
                     self._hadoop_output_format or ''])

        # --sort-values
        if self._sort_values:
            args.append('--sort-values')
        else:
            args.append('--no-sort-values')

        # --steps-desc
        args.extend(['--steps-desc', json.dumps(steps)])

        # --counter-output-dir, to simulate counters
        args.extend(['--counter-output-dir',
                     self._counter_output_dir(step_num)])

        # --first-step-num, --last-step-num (step range)
        args.extend(['--first-step-num', str(step_num),
                     '--last-step-num', str(last_step_num)])

        # --job-args (passthrough args)

        # if on local[*] master, keep file upload args as-is (see #2031)
        job_args = self._mr_job_extra_args(
            local=not self._spark_executors_have_own_wd())

        if job_args:
            args.extend(['--job-args', cmd_line(job_args)])

        # --compression-codec
        jobconf = self._jobconf_for_step(step_num)

        compress_conf = jobconf_from_dict(
            jobconf, 'mapreduce.output.fileoutputformat.compress')
        codec_conf = jobconf_from_dict(
            jobconf, 'mapreduce.output.fileoutputformat.compress.codec')

        if compress_conf and compress_conf != 'false' and codec_conf:
            args.extend(['--compression-codec', codec_conf])

        # --num-reducers
        num_reducers = jobconf_from_dict(jobconf, 'mapreduce.job.reduces')
        if num_reducers and int(num_reducers) > 0:
            args.extend(['--num-reducers', str(num_reducers)])

        # --max-output-files
        if self._max_output_files:
            args.extend(['--max-output-files',
                         str(self._max_output_files)])

        if self._opts['emulate_map_input_file']:
            args.append('--emulate-map-input-file')

        return args
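To see the two jobconf-driven flags at the end in isolation, here is a self-contained sketch that runs a hand-written jobconf dict through the same checks; it assumes a recent mrjob install where jobconf_from_dict is importable from mrjob.compat.

    from mrjob.compat import jobconf_from_dict

    # Hand-written jobconf, purely for illustration.
    jobconf = {
        'mapreduce.output.fileoutputformat.compress': 'true',
        'mapreduce.output.fileoutputformat.compress.codec':
            'org.apache.hadoop.io.compress.GzipCodec',
        'mapreduce.job.reduces': '4',
    }

    args = []

    compress_conf = jobconf_from_dict(
        jobconf, 'mapreduce.output.fileoutputformat.compress')
    codec_conf = jobconf_from_dict(
        jobconf, 'mapreduce.output.fileoutputformat.compress.codec')
    if compress_conf and compress_conf != 'false' and codec_conf:
        args.extend(['--compression-codec', codec_conf])

    num_reducers = jobconf_from_dict(jobconf, 'mapreduce.job.reduces')
    if num_reducers and int(num_reducers) > 0:
        args.extend(['--num-reducers', str(num_reducers)])

    print(args)
    # ['--compression-codec', 'org.apache.hadoop.io.compress.GzipCodec',
    #  '--num-reducers', '4']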
Example 11
File: runner.py Project: Yelp/mrjob
    def _spark_script_args(self, step_num, last_step_num=None):
        """Generate spark harness args for streaming steps (and handle
        other spark step types the usual way).
        """
        if last_step_num is None:
            last_step_num = step_num

        steps = self._get_steps()[step_num:last_step_num + 1]

        if steps[0]['type'] != 'streaming':
            return super(SparkMRJobRunner, self)._spark_script_args(
                step_num, last_step_num)

        args = []

        # class name
        args.append('%s.%s' % (self._job_script_module_name(),
                               self._mrjob_cls.__name__))

        # INPUT
        args.append(
            ','.join(self._step_input_uris(step_num)))

        # OUTPUT
        # note that we use the output dir for the *last* step
        args.append(
            self._step_output_uri(last_step_num))

        # --hadoop-input-format. Pass '' to indicate we know there is none
        args.extend(['--hadoop-input-format',
                     self._hadoop_input_format or ''])

        # --hadoop-output-format. Pass '' to indicate we know there is none
        args.extend(['--hadoop-output-format',
                     self._hadoop_output_format or ''])

        # --sort-values
        if self._sort_values:
            args.append('--sort-values')
        else:
            args.append('--no-sort-values')

        # --steps-desc
        args.extend(['--steps-desc', json.dumps(steps)])

        # --counter-output-dir, to simulate counters
        args.extend(['--counter-output-dir',
                     self._counter_output_dir(step_num)])

        # --first-step-num, --last-step-num (step range)
        args.extend(['--first-step-num', str(step_num),
                     '--last-step-num', str(last_step_num)])

        # --job-args (passthrough args)

        # if on local[*] master, keep file upload args as-is (see #2031)
        job_args = self._mr_job_extra_args(
            local=not self._spark_executors_have_own_wd())

        if job_args:
            args.extend(['--job-args', cmd_line(job_args)])

        # --compression-codec
        jobconf = self._jobconf_for_step(step_num)

        compress_conf = jobconf_from_dict(
            jobconf, 'mapreduce.output.fileoutputformat.compress')
        codec_conf = jobconf_from_dict(
            jobconf, 'mapreduce.output.fileoutputformat.compress.codec')

        if compress_conf and compress_conf != 'false' and codec_conf:
            args.extend(['--compression-codec', codec_conf])

        # --num-reducers
        num_reducers = jobconf_from_dict(jobconf, 'mapreduce.job.reduces')
        if num_reducers and int(num_reducers) > 0:
            args.extend(['--num-reducers', str(num_reducers)])

        # --max-output-files
        if self._max_output_files:
            args.extend(['--max-output-files',
                         str(self._max_output_files)])

        return args