Example #1
    def _wait_for_process(self, proc_dict, step_num):
        # handle counters, status msgs, and other stuff on stderr
        proc = proc_dict['proc']

        stderr_lines = self._process_stderr_from_script(
            proc.stderr, step_num=step_num)
        tb_lines = _find_python_traceback(stderr_lines)

        # proc.stdout isn't always defined
        if proc.stdout:
            proc.stdout.close()
        proc.stderr.close()

        returncode = proc.wait()

        if returncode != 0:
            # show counters before raising exception
            counters = self._counters[step_num]
            if counters:
                log.info(_format_counters(counters))

            # try to throw a useful exception
            if tb_lines:
                for line in tb_lines:
                    log.error(line.rstrip('\r\n'))

            reason = str(
                CalledProcessError(returncode, proc_dict['args']))
            raise StepFailedException(
                reason=reason, step_num=step_num,
                num_steps=len(self._get_steps()))
Example #2
    def _wait_for_process(self, proc_dict, step_num):
        # handle counters, status msgs, and other stuff on stderr
        proc = proc_dict['proc']

        stderr_lines = self._process_stderr_from_script(proc.stderr,
                                                        step_num=step_num)
        tb_lines = _find_python_traceback(stderr_lines)

        # proc.stdout isn't always defined
        if proc.stdout:
            proc.stdout.close()
        proc.stderr.close()

        returncode = proc.wait()

        if returncode != 0:
            # show counters before raising exception
            counters = self._counters[step_num]
            if counters:
                log.info(_format_counters(counters))

            # try to throw a useful exception
            if tb_lines:
                for line in tb_lines:
                    log.error(line.rstrip('\r\n'))

            reason = str(CalledProcessError(returncode, proc_dict['args']))
            raise StepFailedException(reason=reason,
                                      step_num=step_num,
                                      num_steps=len(self._get_steps()))
Example #3
 def test_indent(self):
     self.assertEqual(_format_counters(self.COUNTERS, indent='  '),
                      ('Counters: 3\n'
                       '  File System Counters\n'
                       '    FILE: Number of bytes read=8\n'
                       '    FILE: Number of bytes written=359982\n'
                       '  Job Counters\n'
                       '    Launched map tasks=2'))
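These tests compare against a class-level COUNTERS fixture that the snippets don't show. Judging from the expected output, it presumably looks something like this (reconstructed for illustration, not copied from the test case):

# presumed shape of the COUNTERS fixture used by the tests (hypothetical)
COUNTERS = {
    'File System Counters': {
        'FILE: Number of bytes read': 8,
        'FILE: Number of bytes written': 359982,
    },
    'Job Counters': {
        'Launched map tasks': 2,
    },
}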
Example #4
 def test_basic(self):
     self.assertEqual(_format_counters(self.COUNTERS),
                      ('Counters: 3\n'
                       '\tFile System Counters\n'
                       '\t\tFILE: Number of bytes read=8\n'
                       '\t\tFILE: Number of bytes written=359982\n'
                       '\tJob Counters\n'
                       '\t\tLaunched map tasks=2'))
Example #5
 def test_indent(self):
     self.assertEqual(
         _format_counters(self.COUNTERS, indent='  '),
         ('Counters: 3\n'
          '  File System Counters\n'
          '    FILE: Number of bytes read=8\n'
          '    FILE: Number of bytes written=359982\n'
          '  Job Counters\n'
          '    Launched map tasks=2'))
Example #6
 def test_basic(self):
     self.assertEqual(
         _format_counters(self.COUNTERS),
         ('Counters: 3\n'
          '\tFile System Counters\n'
          '\t\tFILE: Number of bytes read=8\n'
          '\t\tFILE: Number of bytes written=359982\n'
          '\tJob Counters\n'
          '\t\tLaunched map tasks=2'))
Example #7
 def test_custom_desc(self):
     self.assertEqual(
         _format_counters(self.COUNTERS, desc='Counters for step 1'),
         ('Counters for step 1: 3\n'
          '\tFile System Counters\n'
          '\t\tFILE: Number of bytes read=8\n'
          '\t\tFILE: Number of bytes written=359982\n'
          '\tJob Counters\n'
          '\t\tLaunched map tasks=2'))
Example #8
    def _log_counters(self, log_interpretation, step_num):
        """Utility for logging counters (if any) for a step."""
        step_type = self._get_step(step_num)['type']

        if not _is_spark_step_type(step_type):
            counters = self._pick_counters(log_interpretation, step_type)
            if counters:
                log.info(_format_counters(counters))
            else:
                log.warning('No counters found')
Example #9
 def test_empty_group(self):
     # counter groups should always have at least one counter
     self.assertEqual(
         _format_counters({
             'File System Counters': {},
             'Job Counters': {
                 'Launched map tasks': 2,
             },
         }), ('Counters: 1\n'
              '\tJob Counters\n'
              '\t\tLaunched map tasks=2'))
Example #10
    def _log_counters(self, log_interpretation, step_num):
        """Utility for logging counters (if any) for a step."""
        step_type = self._get_step(step_num)['type']

        if not self._step_type_uses_spark(step_type):
            counters = self._pick_counters(log_interpretation, step_type)
            if counters:
                log.info(_format_counters(counters))
            elif self._read_logs():
                # should only log this if we actually looked for counters
                log.warning('No counters found')
Example #11
 def test_indent(self):
     self.assertEqual(
         _format_counters(self.COUNTERS, indent="  "),
         (
             "Counters: 3\n"
             "  File System Counters\n"
             "    FILE: Number of bytes read=8\n"
             "    FILE: Number of bytes written=359982\n"
             "  Job Counters\n"
             "    Launched map tasks=2"
         ),
     )
Example #12
    def _log_counters(self, log_interpretation, step_num):
        """Utility for logging counters (if any) for a step."""
        step_type = self._get_step(step_num)['type']

        if not _is_spark_step_type(step_type):
            counters = self._pick_counters(
                log_interpretation, step_type)
            if counters:
                log.info(_format_counters(counters))
            elif self._read_logs():
                # should only log this if we actually looked for counters
                log.warning('No counters found')
Example #13
 def test_empty_group(self):
     # counter groups should always have at least one counter
     self.assertEqual(
         _format_counters({
             'File System Counters': {},
             'Job Counters': {
                 'Launched map tasks': 2,
             },
         }),
         ('Counters: 1\n'
          '\tJob Counters\n'
          '\t\tLaunched map tasks=2'))
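Taken together, the tests above pin down the output format: the header is '<desc>: N', where N counts individual counters (not groups), group names get one level of indent, counter=value lines get two, and empty groups are skipped. A minimal sketch consistent with that behavior (an illustration only, not mrjob's actual implementation) might be:

def _format_counters(counters, desc='Counters', indent='\t'):
    """Sketch: render a {group: {counter: value}} dict Hadoop-style."""
    # the header counts individual counters, not groups
    num_counters = sum(len(group) for group in counters.values())
    lines = ['%s: %d' % (desc, num_counters)]

    for group, group_counters in sorted(counters.items()):
        if not group_counters:
            continue  # empty groups are omitted entirely
        lines.append(indent + group)
        for counter, value in sorted(group_counters.items()):
            lines.append(indent * 2 + '%s=%s' % (counter, value))

    return '\n'.join(lines)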
Example #14
    def _run_step_on_spark(self, step, step_num, last_step_num=None):
        if self._opts['upload_archives'] and self._spark_master() != 'yarn':
            log.warning('Spark master %r will probably ignore archives' %
                        self._spark_master())

        spark_submit_args = self._args_for_spark_step(step_num, last_step_num)

        env = dict(os.environ)
        env.update(self._spark_cmdenv(step_num))

        returncode, step_interpretation = self._run_spark_submit(
            spark_submit_args, env, record_callback=_log_log4j_record)

        counters = None
        if step['type'] == 'streaming':
            counter_file = self.fs.join(self._counter_output_dir(step_num),
                                        'part-*')
            counter_json = b''.join(self.fs.cat(counter_file))
            if counter_json.strip():
                # json.loads() on Python 3.4/3.5 can't take bytes
                counters = json.loads(to_unicode(counter_json))

        if isinstance(counters, list):
            self._counters.extend(counters)

            # desc_num is 1-indexed user-readable step num
            for desc_num, counter_dict in enumerate(counters,
                                                    start=(step_num + 1)):
                if counter_dict:
                    log.info(
                        _format_counters(counter_dict,
                                         desc=('Counters for step %d' %
                                               desc_num)))

        # for non-streaming steps, there are no counters.
        # pad self._counters to match number of steps
        while len(self._counters) < (last_step_num or step_num) + 1:
            self._counters.append({})

        if returncode:
            error = _pick_error(dict(step=step_interpretation))
            if error:
                _log_probable_cause_of_failure(log, error)

            reason = str(CalledProcessError(returncode, spark_submit_args))
            raise StepFailedException(reason=reason,
                                      step_num=step_num,
                                      last_step_num=last_step_num,
                                      num_steps=self._num_steps())
Example #15
File: runner.py Project: Yelp/mrjob
    def _run_step_on_spark(self, step, step_num, last_step_num=None):
        if self._opts['upload_archives'] and self._spark_master() != 'yarn':
            log.warning('Spark master %r will probably ignore archives' %
                        self._spark_master())

        spark_submit_args = self._args_for_spark_step(step_num, last_step_num)

        env = dict(os.environ)
        env.update(self._spark_cmdenv(step_num))

        returncode = self._run_spark_submit(spark_submit_args, env,
                                            record_callback=_log_log4j_record)

        counters = None
        if step['type'] == 'streaming':
            counter_file = self.fs.join(
                self._counter_output_dir(step_num), 'part-*')
            counter_json = b''.join(self.fs.cat(counter_file))
            if counter_json.strip():
                # json.loads() on Python 3.4/3.5 can't take bytes
                counters = json.loads(to_unicode(counter_json))

        if isinstance(counters, list):
            self._counters.extend(counters)

            # desc_num is 1-indexed user-readable step num
            for desc_num, counter_dict in enumerate(
                    counters, start=(step_num + 1)):
                if counter_dict:
                    log.info(_format_counters(
                        counter_dict,
                        desc=('Counters for step %d' % desc_num)))

        # for non-streaming steps, there are no counters.
        # pad self._counters to match number of steps
        while len(self._counters) < (last_step_num or step_num) + 1:
            self._counters.append({})

        if returncode:
            reason = str(CalledProcessError(returncode, spark_submit_args))
            raise StepFailedException(
                reason=reason, step_num=step_num, last_step_num=last_step_num,
                num_steps=self._num_steps())
Example #16
    def _run_job_in_hadoop(self):
        for step_num, step in enumerate(self._get_steps()):
            self._warn_about_spark_archives(step)

            step_args = self._args_for_step(step_num)
            env = _fix_env(self._env_for_step(step_num))

            # log this *after* _args_for_step(), which can start a search
            # for the Hadoop streaming jar
            log.info('Running step %d of %d...' %
                     (step_num + 1, self._num_steps()))
            log.debug('> %s' % cmd_line(step_args))
            log.debug('  with environment: %r' % sorted(env.items()))

            log_interpretation = {}
            self._log_interpretations.append(log_interpretation)

            # try to use a PTY if it's available
            try:
                pid, master_fd = pty.fork()
            except (AttributeError, OSError):
                # no PTYs, just use Popen

                # user won't get much feedback for a while, so tell them
                # Hadoop is running
                log.debug('No PTY available, using Popen() to invoke Hadoop')

                step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE, env=env)

                step_interpretation = _interpret_hadoop_jar_command_stderr(
                    step_proc.stderr,
                    record_callback=_log_record_from_hadoop)

                # there shouldn't be much output to STDOUT
                for line in step_proc.stdout:
                    _log_line_from_hadoop(to_unicode(line).strip('\r\n'))

                step_proc.stdout.close()
                step_proc.stderr.close()

                returncode = step_proc.wait()
            else:
                # we have PTYs
                if pid == 0:  # we are the child process
                    os.execvpe(step_args[0], step_args, env)
                else:
                    log.debug('Invoking Hadoop via PTY')

                    with os.fdopen(master_fd, 'rb') as master:
                        # reading from master gives us the subprocess's
                        # stderr and stdout (it's a fake terminal)
                        step_interpretation = (
                            _interpret_hadoop_jar_command_stderr(
                                master,
                                record_callback=_log_record_from_hadoop))
                        _, returncode = os.waitpid(pid, 0)

            # make sure output_dir is filled
            if 'output_dir' not in step_interpretation:
                step_interpretation['output_dir'] = (
                    self._step_output_uri(step_num))

            log_interpretation['step'] = step_interpretation

            step_type = step['type']

            if not _is_spark_step_type(step_type):
                counters = self._pick_counters(log_interpretation, step_type)
                if counters:
                    log.info(_format_counters(counters))
                else:
                    log.warning('No counters found')

            if returncode:
                error = self._pick_error(log_interpretation, step_type)
                if error:
                    log.error('Probable cause of failure:\n\n%s\n' %
                              _format_error(error))

                # use CalledProcessError's well-known message format
                reason = str(CalledProcessError(returncode, step_args))
                raise StepFailedException(
                    reason=reason, step_num=step_num,
                    num_steps=self._num_steps())
Example #17
 def test_round_trip(self):
     # are we outputting counters in the same format as the Hadoop binary?
     self.assertEqual(
         _parse_indented_counters(
             _format_counters(self.COUNTERS).splitlines()),
         self.COUNTERS)
Example #18
 def test_empty_group(self):
     # counter groups should always have at least one counter
     self.assertEqual(
         _format_counters({"File System Counters": {}, "Job Counters": {"Launched map tasks": 2}}),
         ("Counters: 1\n" "\tJob Counters\n" "\t\tLaunched map tasks=2"),
     )
Example #19
 def test_empty(self):
     self.assertEqual(_format_counters({}), "Counters: 0")
Example #20
    def _run_job_in_hadoop(self):
        for step_num, step in enumerate(self._get_steps()):
            self._warn_about_spark_archives(step)

            step_args = self._args_for_step(step_num)
            env = self._env_for_step(step_num)

            # log this *after* _args_for_step(), which can start a search
            # for the Hadoop streaming jar
            log.info('Running step %d of %d...' %
                     (step_num + 1, self._num_steps()))
            log.debug('> %s' % cmd_line(step_args))
            log.debug('  with environment: %r' % sorted(env.items()))

            log_interpretation = {}
            self._log_interpretations.append(log_interpretation)

            # try to use a PTY if it's available
            try:
                pid, master_fd = pty.fork()
            except (AttributeError, OSError):
                # no PTYs, just use Popen

                # user won't get much feedback for a while, so tell them
                # Hadoop is running
                log.debug('No PTY available, using Popen() to invoke Hadoop')

                step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE, env=env)

                step_interpretation = _interpret_hadoop_jar_command_stderr(
                    step_proc.stderr, record_callback=_log_record_from_hadoop)

                # there shouldn't be much output to STDOUT
                for line in step_proc.stdout:
                    _log_line_from_hadoop(to_string(line).strip('\r\n'))

                step_proc.stdout.close()
                step_proc.stderr.close()

                returncode = step_proc.wait()
            else:
                # we have PTYs
                if pid == 0:  # we are the child process
                    os.execvpe(step_args[0], step_args, env)
                else:
                    log.debug('Invoking Hadoop via PTY')

                    with os.fdopen(master_fd, 'rb') as master:
                        # reading from master gives us the subprocess's
                        # stderr and stdout (it's a fake terminal)
                        step_interpretation = (
                            _interpret_hadoop_jar_command_stderr(
                                master,
                                record_callback=_log_record_from_hadoop))
                        _, returncode = os.waitpid(pid, 0)

            # make sure output_dir is filled
            if 'output_dir' not in step_interpretation:
                step_interpretation['output_dir'] = (
                    self._step_output_uri(step_num))

            log_interpretation['step'] = step_interpretation

            step_type = step['type']

            if not _is_spark_step_type(step_type):
                counters = self._pick_counters(log_interpretation, step_type)
                if counters:
                    log.info(_format_counters(counters))
                else:
                    log.warning('No counters found')

            if returncode:
                error = self._pick_error(log_interpretation, step_type)
                if error:
                    log.error('Probable cause of failure:\n\n%s\n' %
                              _format_error(error))

                # use CalledProcessError's well-known message format
                reason = str(CalledProcessError(returncode, step_args))
                raise StepFailedException(reason=reason,
                                          step_num=step_num,
                                          num_steps=self._num_steps())
Example #21
    def _invoke_step(self, step_num, step_type):
        """Run the mapper or reducer for the given step.
        """
        step = self._get_step(step_num)

        if step['type'] != 'streaming':
            raise Exception("LocalMRJobRunner cannot run %s steps" %
                            step['type'])

        jobconf = self._jobconf_for_step(step_num)

        outfile_prefix = 'step-%04d-%s' % (step_num, step_type)

        # allow setting number of tasks from jobconf
        if step_type == 'reducer':
            num_tasks = int(
                jobconf_from_dict(jobconf, 'mapreduce.job.reduces',
                                  self._DEFAULT_REDUCE_TASKS))
        else:
            num_tasks = int(
                jobconf_from_dict(jobconf, 'mapreduce.job.maps',
                                  self._DEFAULT_MAP_TASKS))

        # get file splits for mappers and reducers
        keep_sorted = (step_type == 'reducer')
        file_splits = self._get_file_splits(self._step_input_paths(),
                                            num_tasks,
                                            keep_sorted=keep_sorted)

        # since we have grabbed the files from _prev_outfiles as input
        # to this step, reset _prev_outfiles
        self._prev_outfiles = []

        # Start the tasks associated with the step:
        # if we need to sort, then just sort all input files into one file
        # otherwise, split the files needed for mappers and reducers
        # and setup the task environment for each

        # The correctly-ordered list of task_num, file_name pairs
        file_tasks = sorted([(t['task_num'], file_name)
                             for file_name, t in file_splits.items()],
                            key=lambda t: t[0])

        for task_num, input_path in file_tasks:
            # make a new working_dir for each task
            working_dir = os.path.join(self._get_local_tmp_dir(),
                                       'job_local_dir', str(step_num),
                                       step_type, str(task_num))
            self._setup_working_dir(working_dir)

            log.debug("File name %s" % input_path)
            # setup environment variables
            split_kwargs = {}
            if step_type == 'mapper':
                # mappers have extra file split info
                split_kwargs = dict(
                    input_file=file_splits[input_path]['orig_name'],
                    input_start=file_splits[input_path]['start'],
                    input_length=file_splits[input_path]['length'])

            env = self._subprocess_env(step_num, step_type, task_num,
                                       working_dir, **split_kwargs)

            output_path = os.path.join(
                self._get_local_tmp_dir(),
                outfile_prefix + '_part-%05d' % task_num)
            log.debug('Writing to %s' % output_path)

            self._run_step(step_num, step_type, input_path, output_path,
                           working_dir, env)

            self._prev_outfiles.append(output_path)

        self._per_step_runner_finish(step_num)
        counters = self._counters[step_num]
        if counters:
            log.info(_format_counters(counters))
Example #22
File: sim.py Project: Affirm/mrjob
 def _log_counters(self, step_num):
     counters = self.counters()[step_num]
     if counters:
         log.info('\n%s\n' % _format_counters(counters))
Example #23
 def test_empty(self):
     self.assertEqual(_format_counters({}), 'Counters: 0')
Example #24
 def test_round_trip(self):
     # are we outputting counters in the same format as the Hadoop binary?
     self.assertEqual(
         _parse_indented_counters(
             _format_counters(self.COUNTERS).splitlines()), self.COUNTERS)
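The round-trip test implies that _parse_indented_counters() can invert the default tab-indented output. mrjob's real parser is presumably more general (it also has to read counters out of Hadoop's own log output); a hypothetical toy inverse that would satisfy just this round trip looks like:

def _parse_tab_indented_counters(lines):
    """Toy inverse of the formatter sketch above, for the default '\t' indent."""
    counters = {}
    group = None

    for line in lines:
        if line.startswith('\t\t'):
            # counter line: '\t\t<name>=<value>'
            name, _, value = line[2:].rpartition('=')
            counters[group][name] = int(value)
        elif line.startswith('\t'):
            # group line: '\t<group name>'
            group = line[1:]
            counters[group] = {}
        # the leading 'Counters: N' header carries no extra data

    return counters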
Example #25
    def _invoke_step(self, step_num, step_type):
        """Run the mapper or reducer for the given step.
        """
        step = self._get_step(step_num)

        if step['type'] != 'streaming':
            raise Exception("LocalMRJobRunner cannot run %s steps" %
                            step['type'])

        jobconf = self._jobconf_for_step(step_num)

        outfile_prefix = 'step-%d-%s' % (step_num, step_type)

        # allow setting number of tasks from jobconf
        if step_type == 'reducer':
            num_tasks = int(jobconf_from_dict(
                jobconf, 'mapreduce.job.reduces', self._DEFAULT_REDUCE_TASKS))
        else:
            num_tasks = int(jobconf_from_dict(
                jobconf, 'mapreduce.job.maps', self._DEFAULT_MAP_TASKS))

        # get file splits for mappers and reducers
        keep_sorted = (step_type == 'reducer')
        file_splits = self._get_file_splits(
            self._step_input_paths(), num_tasks, keep_sorted=keep_sorted)

        # since we have grabbed the files from _prev_outfiles as input
        # to this step, reset _prev_outfiles
        self._prev_outfiles = []

        # Start the tasks associated with the step:
        # if we need to sort, then just sort all input files into one file
        # otherwise, split the files needed for mappers and reducers
        # and setup the task environment for each

        # The correctly-ordered list of task_num, file_name pairs
        file_tasks = sorted([
            (t['task_num'], file_name) for file_name, t
            in file_splits.items()], key=lambda t: t[0])

        for task_num, input_path in file_tasks:
            # make a new working_dir for each task
            working_dir = os.path.join(
                self._get_local_tmp_dir(),
                'job_local_dir', str(step_num), step_type, str(task_num))
            self._setup_working_dir(working_dir)

            log.debug("File name %s" % input_path)
            # setup environment variables
            split_kwargs = {}
            if step_type == 'mapper':
                # mappers have extra file split info
                split_kwargs = dict(
                    input_file=file_splits[input_path]['orig_name'],
                    input_start=file_splits[input_path]['start'],
                    input_length=file_splits[input_path]['length'])

            env = self._subprocess_env(
                step_num, step_type, task_num, working_dir, **split_kwargs)

            output_path = os.path.join(
                self._get_local_tmp_dir(),
                outfile_prefix + '_part-%05d' % task_num)
            log.info('writing to %s' % output_path)

            self._run_step(step_num, step_type, input_path, output_path,
                           working_dir, env)

            self._prev_outfiles.append(output_path)

        self.per_step_runner_finish(step_num)
        counters = self._counters[step_num]
        if counters:
            log.info(_format_counters(counters))