Example #1
    def _run_step_on_spark(self, step, step_num):
        """Set up a fake working directory and environment, and call the Spark
        method."""
        # this is kind of a Spark-specific mash-up of _run_streaming_step()
        # (in sim.py) and _invoke_task_func(), above

        # don't create the output dir for the step; that's Spark's job

        # breaking the Spark step down into tasks is pyspark's job, so
        # we just have a single dummy task

        self.fs.mkdir(self._task_dir('spark', step_num, task_num=0))
        # could potentially parse this for cause of error
        stderr_path = self._task_stderr_path('spark', step_num, task_num=0)
        stdout_path = self._task_output_path('spark', step_num, task_num=0)

        self._create_dist_cache_dir(step_num)
        wd = self._setup_working_dir('spark', step_num, task_num=0)

        # use abspath() on input URIs before changing working dir
        task_args = self._spark_script_args(step_num)

        with open(stdout_path, 'wb') as stdout, \
                open(stderr_path, 'wb') as stderr:
            with save_current_environment(), save_cwd(), save_sys_path(), \
                    save_sys_std():
                os.environ.update(_fix_env(self._opts['cmdenv']))
                os.chdir(wd)
                sys.path = [os.getcwd()] + sys.path

                # pretend we redirected stdout and stderr
                sys.stdout, sys.stderr = stdout, stderr

                task = self._mrjob_cls(task_args)
                task.execute()
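
Note on the save_* helpers: this example relies on context managers (save_current_environment, save_cwd, save_sys_path, save_sys_std) to snapshot process-level state and restore it afterwards. As a rough sketch of the pattern only (an illustration, not mrjob's actual implementation), such a helper could look like:

    import sys
    from contextlib import contextmanager

    @contextmanager
    def save_sys_std():
        # snapshot sys.stdout/sys.stderr and restore them on exit,
        # even if the wrapped block raises (illustrative sketch only)
        saved_out, saved_err = sys.stdout, sys.stderr
        try:
            yield
        finally:
            sys.stdout, sys.stderr = saved_out, saved_err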
Example #2
    def _run_task_func(self, task_type, step_num, task_num, map_split=None):
        """Returns a no-args function that runs one mapper, reducer, or
         combiner.

        This sets up everything the task needs to run, then passes it off to
        :py:meth:`_invoke_task_func`.
        """
        input_path = self._task_input_path(task_type, step_num, task_num)
        stderr_path = self._task_stderr_path(task_type, step_num, task_num)
        output_path = self._task_output_path(task_type, step_num, task_num)
        wd = self._setup_working_dir(task_type, step_num, task_num)
        env = _fix_env(
            self._env_for_task(task_type, step_num, task_num, map_split))

        return partial(_run_task,
                       self._invoke_task_func(task_type, step_num,
                                              task_num), task_type, step_num,
                       task_num, input_path, output_path, stderr_path, wd, env)
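
The point of returning functools.partial(...) here is to bind every argument up front and hand back a zero-argument callable that can be invoked later (for example from a worker pool) without re-deriving paths or environment. A tiny standalone illustration of that pattern, where run_one is a hypothetical stand-in for _run_task:

    from functools import partial

    def run_one(func, task_type, step_num, task_num):
        # hypothetical stand-in for _run_task: call the bound task function
        print('running %s %d-%d' % (task_type, step_num, task_num))
        return func()

    # bind all arguments now; invoke with no arguments later
    task_func = partial(run_one, lambda: 'ok', 'mapper', 0, 0)
    task_func()  # prints "running mapper 0-0" and returns 'ok'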
Example #3
    def _run_job_in_hadoop(self):
        for step_num, step in enumerate(self._get_steps()):
            self._warn_about_spark_archives(step)

            step_type = step['type']
            step_args = self._args_for_step(step_num)
            env = _fix_env(self._env_for_step(step_num))

            # log this *after* _args_for_step(), which can start a search
            # for the Hadoop streaming jar
            log.info('Running step %d of %d...' %
                     (step_num + 1, self._num_steps()))
            log.debug('> %s' % cmd_line(step_args))
            log.debug('  with environment: %r' % sorted(env.items()))

            log_interpretation = {}
            self._log_interpretations.append(log_interpretation)

            if self._step_type_uses_spark(step_type):
                returncode, step_interpretation = self._run_spark_submit(
                    step_args, env, record_callback=_log_log4j_record)
            else:
                returncode, step_interpretation = self._run_hadoop(
                    step_args, env, record_callback=_log_record_from_hadoop)

            # make sure output_dir is filled (used for history log)
            if 'output_dir' not in step_interpretation:
                step_interpretation['output_dir'] = (
                    self._step_output_uri(step_num))

            log_interpretation['step'] = step_interpretation

            self._log_counters(log_interpretation, step_num)

            if returncode:
                error = self._pick_error(log_interpretation, step_type)
                if error:
                    _log_probable_cause_of_failure(log, error)

                # use CalledProcessError's well-known message format
                reason = str(CalledProcessError(returncode, step_args))
                raise StepFailedException(
                    reason=reason, step_num=step_num,
                    num_steps=self._num_steps())
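
The reason string reuses CalledProcessError's well-known message rather than inventing a new format. For reference, on Python 3 that message looks like this (standard-library behavior; the command shown is only an illustration):

    from subprocess import CalledProcessError

    reason = str(CalledProcessError(1, ['hadoop', 'jar', 'streaming.jar']))
    # "Command '['hadoop', 'jar', 'streaming.jar']' returned non-zero exit status 1."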
Example #4
    def _run_task_func(self, task_type, step_num, task_num, map_split=None):
        """Returns a no-args function that runs one mapper, reducer, or
         combiner.

        This sets up everything the task needs to run, then passes it off to
        :py:meth:`_invoke_task_func`.
        """
        input_path = self._task_input_path(task_type, step_num, task_num)
        stderr_path = self._task_stderr_path(task_type, step_num, task_num)
        output_path = self._task_output_path(task_type, step_num, task_num)
        wd = self._setup_working_dir(task_type, step_num, task_num)
        env = _fix_env(
            self._env_for_task(task_type, step_num, task_num, map_split))

        return partial(
            _run_task,
            self._invoke_task_func(task_type, step_num, task_num),
            task_type, step_num, task_num,
            input_path, output_path, stderr_path, wd, env)
Example #5
    def _run_job_in_hadoop(self):
        for step_num, step in enumerate(self._get_steps()):
            self._warn_about_spark_archives(step)

            step_args = self._args_for_step(step_num)
            env = _fix_env(self._env_for_step(step_num))

            # log this *after* _args_for_step(), which can start a search
            # for the Hadoop streaming jar
            log.info('Running step %d of %d...' %
                     (step_num + 1, self._num_steps()))
            log.debug('> %s' % cmd_line(step_args))
            log.debug('  with environment: %r' % sorted(env.items()))

            log_interpretation = {}
            self._log_interpretations.append(log_interpretation)

            # try to use a PTY if it's available
            try:
                pid, master_fd = pty.fork()
            except (AttributeError, OSError):
                # no PTYs, just use Popen

                # user won't get much feedback for a while, so tell them
                # Hadoop is running
                log.debug('No PTY available, using Popen() to invoke Hadoop')

                step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE, env=env)

                step_interpretation = _interpret_hadoop_jar_command_stderr(
                    step_proc.stderr,
                    record_callback=_log_record_from_hadoop)

                # there shouldn't be much output to STDOUT
                for line in step_proc.stdout:
                    _log_line_from_hadoop(to_unicode(line).strip('\r\n'))

                step_proc.stdout.close()
                step_proc.stderr.close()

                returncode = step_proc.wait()
            else:
                # we have PTYs
                if pid == 0:  # we are the child process
                    os.execvpe(step_args[0], step_args, env)
                else:
                    log.debug('Invoking Hadoop via PTY')

                    with os.fdopen(master_fd, 'rb') as master:
                        # reading from master gives us the subprocess's
                        # stderr and stdout (it's a fake terminal)
                        step_interpretation = (
                            _interpret_hadoop_jar_command_stderr(
                                master,
                                record_callback=_log_record_from_hadoop))
                        _, returncode = os.waitpid(pid, 0)

            # make sure output_dir is filled
            if 'output_dir' not in step_interpretation:
                step_interpretation['output_dir'] = (
                    self._step_output_uri(step_num))

            log_interpretation['step'] = step_interpretation

            step_type = step['type']

            if not _is_spark_step_type(step_type):
                counters = self._pick_counters(log_interpretation, step_type)
                if counters:
                    log.info(_format_counters(counters))
                else:
                    log.warning('No counters found')

            if returncode:
                error = self._pick_error(log_interpretation, step_type)
                if error:
                    log.error('Probable cause of failure:\n\n%s\n' %
                              _format_error(error))

                # use CalledProcessError's well-known message format
                reason = str(CalledProcessError(returncode, step_args))
                raise StepFailedException(
                    reason=reason, step_num=step_num,
                    num_steps=self._num_steps())
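
The PTY branch is the standard pty.fork() pattern: the child exec()s the command, and the parent reads the command's combined stdout/stderr from the master fd before reaping the child. A self-contained, Unix-only sketch of that pattern with a trivial command (note that os.waitpid() returns a raw wait status; the example above simply treats any nonzero status as failure):

    import os
    import pty

    pid, master_fd = pty.fork()
    if pid == 0:
        # child: replace ourselves with the command
        os.execvpe('echo', ['echo', 'hello from a pty'], os.environ)
    else:
        # parent: the master fd carries the child's stdout and stderr
        chunks = []
        while True:
            try:
                data = os.read(master_fd, 1024)
            except OSError:  # reading past EOF raises EIO on Linux
                data = b''
            if not data:
                break
            chunks.append(data)
        os.close(master_fd)
        _, status = os.waitpid(pid, 0)
        exit_code = os.WEXITSTATUS(status) if os.WIFEXITED(status) else -1
        print(b''.join(chunks).decode().strip(), 'exit code:', exit_code)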
Example #6
    def _run_job_in_hadoop(self):
        for step_num, step in enumerate(self._get_steps()):
            self._warn_about_spark_archives(step)

            step_args = self._args_for_step(step_num)
            env = _fix_env(self._env_for_step(step_num))

            # log this *after* _args_for_step(), which can start a search
            # for the Hadoop streaming jar
            log.info('Running step %d of %d...' %
                     (step_num + 1, self._num_steps()))
            log.debug('> %s' % cmd_line(step_args))
            log.debug('  with environment: %r' % sorted(env.items()))

            log_interpretation = {}
            self._log_interpretations.append(log_interpretation)

            # try to use a PTY if it's available
            try:
                pid, master_fd = pty.fork()
            except (AttributeError, OSError):
                # no PTYs, just use Popen

                # user won't get much feedback for a while, so tell them
                # Hadoop is running
                log.debug('No PTY available, using Popen() to invoke Hadoop')

                step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE, env=env)

                step_interpretation = _interpret_hadoop_jar_command_stderr(
                    step_proc.stderr, record_callback=_log_record_from_hadoop)

                # there shouldn't be much output to STDOUT
                for line in step_proc.stdout:
                    _log_line_from_driver(to_unicode(line).strip('\r\n'))

                step_proc.stdout.close()
                step_proc.stderr.close()

                returncode = step_proc.wait()
            else:
                # we have PTYs
                if pid == 0:  # we are the child process
                    os.execvpe(step_args[0], step_args, env)
                else:
                    log.debug('Invoking Hadoop via PTY')

                    with os.fdopen(master_fd, 'rb') as master:
                        # reading from master gives us the subprocess's
                        # stderr and stdout (it's a fake terminal)
                        step_interpretation = (
                            _interpret_hadoop_jar_command_stderr(
                                master,
                                record_callback=_log_record_from_hadoop))
                        _, returncode = os.waitpid(pid, 0)

            # make sure output_dir is filled
            if 'output_dir' not in step_interpretation:
                step_interpretation['output_dir'] = (
                    self._step_output_uri(step_num))

            log_interpretation['step'] = step_interpretation

            self._log_counters(log_interpretation, step_num)

            step_type = step['type']

            if returncode:
                error = self._pick_error(log_interpretation, step_type)
                if error:
                    log.error('Probable cause of failure:\n\n%s\n' %
                              _format_error(error))

                # use CalledProcessError's well-known message format
                reason = str(CalledProcessError(returncode, step_args))
                raise StepFailedException(reason=reason,
                                          step_num=step_num,
                                          num_steps=self._num_steps())