Example #1
def _parse_counters_0_18(counter_string):
    # 0.18 counters look like this:
    # GroupName.CounterName:Value,Group1.Crackers:3,Group2.Nerf:243,...
    groups = _COUNTER_RE_0_18.finditer(counter_string)
    if groups is None:
        log.warning("Cannot parse Hadoop counter string: %s" % counter_string)

    for m in groups:
        yield (to_string(m.group("group")), to_string(m.group("name")), int(m.group("value")))
Example #2
def _parse_counters_0_18(counter_string):
    # 0.18 counters look like this:
    # GroupName.CounterName:Value,Group1.Crackers:3,Group2.Nerf:243,...
    groups = _COUNTER_RE_0_18.finditer(counter_string)
    if groups is None:
        log.warning('Cannot parse Hadoop counter string: %s' % counter_string)

    for m in groups:
        yield (to_string(m.group('group')), to_string(m.group('name')),
               int(m.group('value')))
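
A hedged sketch of how the 0.18 format described in the comment above can be parsed. The pattern here is a hypothetical stand-in, not necessarily mrjob's actual _COUNTER_RE_0_18:

# Hypothetical stand-in for _COUNTER_RE_0_18; the real mrjob pattern may
# differ. It yields (group, name, value) tuples from the comma-separated
# 0.18 format shown in the comment above.
import re

_COUNTER_RE_0_18 = re.compile(
    r'(?P<group>[^,]+?)\.(?P<name>[^,:]+):(?P<value>\d+)')

counter_string = 'Group1.Crackers:3,Group2.Nerf:243'
for m in _COUNTER_RE_0_18.finditer(counter_string):
    print(m.group('group'), m.group('name'), int(m.group('value')))
# Group1 Crackers 3
# Group2 Nerf 243
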
Example #3
    def invoke_hadoop(self,
                      args,
                      ok_returncodes=None,
                      ok_stderr=None,
                      return_stdout=False):
        """Run the given hadoop command, raising an exception on non-zero
        return code. This only works for commands whose output we don't
        care about.

        Args:
        ok_returncodes -- a list/tuple/set of return codes we expect to
            get back from hadoop (e.g. [0,1]). By default, we only expect 0.
            If we get an unexpected return code, we raise a CalledProcessError.
        ok_stderr -- don't log STDERR or raise CalledProcessError if stderr
            matches a regex in this list (even if the returncode is bad)
        return_stdout -- return the stdout from the hadoop command rather
            than logging it. If this is False, we return the returncode
            instead.
        """
        args = self._hadoop_bin + args

        log.debug('> %s' % cmd_line(args))

        proc = Popen(args, stdout=PIPE, stderr=PIPE)
        stdout, stderr = proc.communicate()

        log_func = log.debug if proc.returncode == 0 else log.error
        if not return_stdout:
            for line in BytesIO(stdout):
                log_func('STDOUT: ' + to_string(line.rstrip(b'\r\n')))

        # check if STDERR is okay
        stderr_is_ok = False
        if ok_stderr:
            for stderr_re in ok_stderr:
                if stderr_re.match(stderr):
                    stderr_is_ok = True
                    break

        if not stderr_is_ok:
            for line in BytesIO(stderr):
                log_func('STDERR: ' + to_string(line.rstrip(b'\r\n')))

        ok_returncodes = ok_returncodes or [0]

        if not stderr_is_ok and proc.returncode not in ok_returncodes:
            raise CalledProcessError(proc.returncode, args)

        if return_stdout:
            return stdout
        else:
            return proc.returncode
Example #4
    def invoke_hadoop(self, args, ok_returncodes=None, ok_stderr=None,
                      return_stdout=False):
        """Run the given hadoop command, raising an exception on non-zero
        return code. This only works for commands whose output we don't
        care about.

        Args:
        ok_returncodes -- a list/tuple/set of return codes we expect to
            get back from hadoop (e.g. [0,1]). By default, we only expect 0.
            If we get an unexpected return code, we raise a CalledProcessError.
        ok_stderr -- don't log STDERR or raise CalledProcessError if stderr
            matches a regex in this list (even if the returncode is bad)
        return_stdout -- return the stdout from the hadoop command rather
            than logging it. If this is False, we return the returncode
            instead.
        """
        args = self._hadoop_bin + args

        log.debug('> %s' % cmd_line(args))

        proc = Popen(args, stdout=PIPE, stderr=PIPE)
        stdout, stderr = proc.communicate()

        log_func = log.debug if proc.returncode == 0 else log.error
        if not return_stdout:
            for line in BytesIO(stdout):
                log_func('STDOUT: ' + to_string(line.rstrip(b'\r\n')))

        # check if STDERR is okay
        stderr_is_ok = False
        if ok_stderr:
            for stderr_re in ok_stderr:
                if stderr_re.match(stderr):
                    stderr_is_ok = True
                    break

        if not stderr_is_ok:
            for line in BytesIO(stderr):
                log_func('STDERR: ' + to_string(line.rstrip(b'\r\n')))

        ok_returncodes = ok_returncodes or [0]

        if not stderr_is_ok and proc.returncode not in ok_returncodes:
            raise CalledProcessError(proc.returncode, args)

        if return_stdout:
            return stdout
        else:
            return proc.returncode
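
The docstring above describes the calling convention; as a standalone illustration of the same pattern (accept a whitelist of return codes, optionally return stdout instead of logging it), here is a pared-down sketch that uses plain `echo` so it runs without Hadoop:

# Minimal sketch of the pattern invoke_hadoop() follows, with none of the
# stderr handling; uses `echo` so it runs on any POSIX system.
from subprocess import Popen, PIPE, CalledProcessError

def run_checked(args, ok_returncodes=(0,), return_stdout=False):
    proc = Popen(args, stdout=PIPE, stderr=PIPE)
    stdout, _ = proc.communicate()
    if proc.returncode not in ok_returncodes:
        raise CalledProcessError(proc.returncode, args)
    return stdout if return_stdout else proc.returncode

print(run_checked(['echo', 'hello'], return_stdout=True))  # b'hello\n'
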
Example #5
    def _process_stderr_from_streaming(self, stderr):
        def treat_eio_as_eof(iter):
            # on Linux, the PTY gives us a specific IOError when the
            # child process exits, rather than EOF.
            while True:
                try:
                    yield next(iter)  # okay for StopIteration to bubble up
                except IOError as e:
                    if e.errno == errno.EIO:
                        return
                    else:
                        raise

        for line in treat_eio_as_eof(stderr):
            line = HADOOP_STREAMING_OUTPUT_RE.match(line).group(2)
            log.info('HADOOP: ' + to_string(line))

            if b'Streaming Job Failed!' in line:
                raise Exception(line)

            # The job identifier is printed to stderr. We only want to parse it
            # once because we know how many steps we have and just want to know
            # what Hadoop thinks the first step's number is.
            m = HADOOP_JOB_TIMESTAMP_RE.match(line)
            if m and self._job_timestamp is None:
                self._job_timestamp = m.group('timestamp')
                self._start_step_num = int(m.group('step_num'))
Example #6
def find_hadoop_java_stack_trace(lines):
    """Scan a log file or other iterable for a java stack trace from Hadoop,
    and return it as a list of lines (bytes).

    In logs from EMR, we find java stack traces in ``task-attempts/*/syslog``

    Sample stack trace::

        2010-07-27 18:25:48,397 WARN org.apache.hadoop.mapred.TaskTracker (main): Error running child
        java.lang.OutOfMemoryError: Java heap space
                at org.apache.hadoop.mapred.IFile$Reader.readNextBlock(IFile.java:270)
                at org.apache.hadoop.mapred.IFile$Reader.next(IFile.java:332)
                at org.apache.hadoop.mapred.Merger$Segment.next(Merger.java:147)
                at org.apache.hadoop.mapred.Merger$MergeQueue.adjustPriorityQueue(Merger.java:238)
                at org.apache.hadoop.mapred.Merger$MergeQueue.next(Merger.java:255)
                at org.apache.hadoop.mapred.Merger.writeFile(Merger.java:86)
                at org.apache.hadoop.mapred.Merger$MergeQueue.merge(Merger.java:377)
                at org.apache.hadoop.mapred.Merger.merge(Merger.java:58)
                at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:277)
                at org.apache.hadoop.mapred.TaskTracker$Child.main(TaskTracker.java:2216)

    (We omit the "Error running child" line from the results)
    """
    for line in lines:
        if line.rstrip(b'\r\n').endswith(b"Error running child"):
            st_lines = []
            for line in lines:
                st_lines.append(line)
                for line in lines:
                    if not line.startswith(b'        at '):
                        break
                    st_lines.append(line)
                return [to_string(line) for line in st_lines]
    else:
        return None
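
A usage sketch for the function above, with hypothetical log data. Note that the nested `for line in lines` loops rely on `lines` being a single shared iterator (e.g. a file object), so they keep consuming from where the outer loop stopped:

# Usage sketch; assumes find_hadoop_java_stack_trace (and the to_string it
# calls) are importable from this module. The log lines are made up.
syslog_lines = iter([
    b'2010-07-27 18:25:48,397 WARN ... Error running child\n',
    b'java.lang.OutOfMemoryError: Java heap space\n',
    b'        at org.apache.hadoop.mapred.IFile$Reader.next(IFile.java:332)\n',
    b'        at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:277)\n',
])
trace = find_hadoop_java_stack_trace(syslog_lines)
# trace[0] == 'java.lang.OutOfMemoryError: Java heap space\n'
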
Example #7
def _ssh_ls(ssh_bin,
            address,
            ec2_key_pair_file,
            path,
            keyfile=None,
            sudo=False):
    """Recursively list files under ``path`` on the specified SSH host.
    Return the listing as a list of paths (strings). Raises ``IOError`` if the
    path doesn't exist or SSH access fails.

    :param ssh_bin: Path to ``ssh`` binary
    :param address: Address of your job's master node
    :param ec2_key_pair_file: Path to the key pair file (argument to ``-i``)
    :param path: Path on the remote host to list
    :param keyfile: Name of the EMR private key file on the master node in case
                    ``path`` exists on one of the slave nodes
    :param sudo: if true, run command with ``sudo``
    """
    cmd_args = ['find', '-L', path, '-type', 'f']
    if sudo:
        cmd_args = ['sudo'] + cmd_args

    out = to_string(
        _check_output(*_ssh_run_with_recursion(
            ssh_bin, address, ec2_key_pair_file, keyfile, cmd_args)))
    if 'No such file or directory' in out:
        raise IOError("No such file or directory: %s" % path)
    return out.split('\n')
Example #8
    def _run_job_in_hadoop(self):
        self._counters = []

        for step_num in range(self._num_steps()):
            log.debug("running step %d of %d" % (step_num + 1, self._num_steps()))

            step_args = self._args_for_step(step_num)

            log.debug("> %s" % cmd_line(step_args))

            # try to use a PTY if it's available
            try:
                pid, master_fd = pty.fork()
            except (AttributeError, OSError):
                # no PTYs, just use Popen
                step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE)

                self._process_stderr_from_streaming(step_proc.stderr)

                # there shouldn't be much output to STDOUT
                for line in step_proc.stdout:
                    log.error("STDOUT: " + to_string(line.strip(b"\n")))

                returncode = step_proc.wait()
            else:
                # we have PTYs
                if pid == 0:  # we are the child process
                    os.execvp(step_args[0], step_args)
                else:
                    with os.fdopen(master_fd, "rb") as master:
                        # reading from master gives us the subprocess's
                        # stderr and stdout (it's a fake terminal)
                        self._process_stderr_from_streaming(master)
                        _, returncode = os.waitpid(pid, 0)

            if returncode == 0:
                # parsing needs step number for whole job
                self._fetch_counters([step_num + self._start_step_num])
                # printing needs step number relevant to this run of mrjob
                self.print_counters([step_num + 1])
            else:
                msg = "Job failed with return code %d: %s" % (returncode, step_args)
                log.error(msg)
                # look for a Python traceback
                cause = self._find_probable_cause_of_failure([step_num + self._start_step_num])
                if cause:
                    # log cause, and put it in exception
                    cause_msg = []  # lines to log and put in exception
                    cause_msg.append("Probable cause of failure (from %s):" % cause["log_file_uri"])
                    cause_msg.extend(line.strip("\n") for line in cause["lines"])
                    if cause["input_uri"]:
                        cause_msg.append("(while reading from %s)" % cause["input_uri"])

                    for line in cause_msg:
                        log.error(line)

                    # add cause_msg to exception message
                    msg += "\n" + "\n".join(cause_msg) + "\n"

                raise CalledProcessError(returncode, step_args)
Example #9
def find_hadoop_java_stack_trace(lines):
    """Scan a log file or other iterable for a java stack trace from Hadoop,
    and return it as a list of lines (bytes).

    In logs from EMR, we find java stack traces in ``task-attempts/*/syslog``

    Sample stack trace::

        2010-07-27 18:25:48,397 WARN org.apache.hadoop.mapred.TaskTracker (main): Error running child
        java.lang.OutOfMemoryError: Java heap space
                at org.apache.hadoop.mapred.IFile$Reader.readNextBlock(IFile.java:270)
                at org.apache.hadoop.mapred.IFile$Reader.next(IFile.java:332)
                at org.apache.hadoop.mapred.Merger$Segment.next(Merger.java:147)
                at org.apache.hadoop.mapred.Merger$MergeQueue.adjustPriorityQueue(Merger.java:238)
                at org.apache.hadoop.mapred.Merger$MergeQueue.next(Merger.java:255)
                at org.apache.hadoop.mapred.Merger.writeFile(Merger.java:86)
                at org.apache.hadoop.mapred.Merger$MergeQueue.merge(Merger.java:377)
                at org.apache.hadoop.mapred.Merger.merge(Merger.java:58)
                at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:277)
                at org.apache.hadoop.mapred.TaskTracker$Child.main(TaskTracker.java:2216)

    (We omit the "Error running child" line from the results)
    """
    for line in lines:
        if line.rstrip(b'\r\n').endswith(b"Error running child"):
            st_lines = []
            for line in lines:
                st_lines.append(line)
                for line in lines:
                    if not line.startswith(b'        at '):
                        break
                    st_lines.append(line)
                return [to_string(line) for line in st_lines]
    else:
        return None
Example #10
    def _process_stderr_from_streaming(self, stderr):
        def treat_eio_as_eof(iter):
            # on Linux, the PTY gives us a specific IOError when the
            # child process exits, rather than EOF.
            while True:
                try:
                    yield next(iter)  # okay for StopIteration to bubble up
                except IOError as e:
                    if e.errno == errno.EIO:
                        return
                    else:
                        raise

        for line in treat_eio_as_eof(stderr):
            line = HADOOP_STREAMING_OUTPUT_RE.match(line).group(2)
            log.info("HADOOP: " + to_string(line))

            if b"Streaming Job Failed!" in line:
                raise Exception(line)

            # The job identifier is printed to stderr. We only want to parse it
            # once because we know how many steps we have and just want to know
            # what Hadoop thinks the first step's number is.
            m = HADOOP_JOB_TIMESTAMP_RE.match(line)
            if m and self._job_timestamp is None:
                self._job_timestamp = m.group("timestamp")
                self._start_step_num = int(m.group("step_num"))
Example #11
 def stderr_to_log(lines):
     for line in lines:
         line = to_string(line)
         if _HADOOP_NON_LOG_LINE_RE.match(line):
             # use error because this is usually "Streaming Command Failed!"
             _log_line_from_hadoop(line, level=logging.ERROR)
         else:
             yield line
Example #12
 def stderr_to_log(lines):
     for line in lines:
         line = to_string(line)
         if _HADOOP_NON_LOG_LINE_RE.match(line):
             # use error because this is usually "Streaming Command Failed!"
             _log_line_from_hadoop(line, level=logging.ERROR)
         else:
             yield line
Example #13
def parse_mr_job_stderr(stderr, counters=None):
    """Parse counters and status messages out of MRJob output.

    :param stderr: a filehandle, a list of lines (bytes), or bytes
    :param counters: Counters so far, to update; a map from group (string) to
                     counter name (string) to count.

    Returns a dictionary with the keys *counters*, *statuses*, *other*:

    - *counters*: counters so far; same format as above
    - *statuses*: a list of status messages encountered
    - *other*: lines (strings) that aren't either counters or status messages
    """
    # For the corresponding code in Hadoop Streaming, see ``incrCounter()`` in
    # http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/contrib/streaming/src/java/org/apache/hadoop/streaming/PipeMapRed.java?view=markup  # noqa
    if isinstance(stderr, bytes):
        stderr = BytesIO(stderr)

    if counters is None:
        counters = {}
    statuses = []
    other = []

    for line in stderr:
        m = _COUNTER_RE.match(line.rstrip(b'\r\n'))
        if m:
            group, counter, amount_str = m.groups()

            # don't leave these as bytes on Python 3
            group = to_string(group)
            counter = to_string(counter)

            counters.setdefault(group, {})
            counters[group].setdefault(counter, 0)
            counters[group][counter] += int(amount_str)
            continue

        m = _STATUS_RE.match(line.rstrip(b'\r\n'))
        if m:
            # don't leave as bytes on Python 3
            statuses.append(to_string(m.group(1)))
            continue

        other.append(to_string(line))

    return {'counters': counters, 'statuses': statuses, 'other': other}
Example #14
def parse_mr_job_stderr(stderr, counters=None):
    """Parse counters and status messages out of MRJob output.

    :param stderr: a filehandle, a list of lines (bytes), or bytes
    :param counters: Counters so far, to update; a map from group (string) to
                     counter name (string) to count.

    Returns a dictionary with the keys *counters*, *statuses*, *other*:

    - *counters*: counters so far; same format as above
    - *statuses*: a list of status messages encountered
    - *other*: lines (strings) that aren't either counters or status messages
    """
    # For the corresponding code in Hadoop Streaming, see ``incrCounter()`` in
    # http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/contrib/streaming/src/java/org/apache/hadoop/streaming/PipeMapRed.java?view=markup  # noqa
    if isinstance(stderr, bytes):
        stderr = BytesIO(stderr)

    if counters is None:
        counters = {}
    statuses = []
    other = []

    for line in stderr:
        m = _COUNTER_RE.match(line.rstrip(b'\r\n'))
        if m:
            group, counter, amount_str = m.groups()

            # don't leave these as bytes on Python 3
            group = to_string(group)
            counter = to_string(counter)

            counters.setdefault(group, {})
            counters[group].setdefault(counter, 0)
            counters[group][counter] += int(amount_str)
            continue

        m = _STATUS_RE.match(line.rstrip(b'\r\n'))
        if m:
            # don't leave as bytes on Python 3
            statuses.append(to_string(m.group(1)))
            continue

        other.append(to_string(line))

    return {'counters': counters, 'statuses': statuses, 'other': other}
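
A quick usage sketch of the parser above, assuming the standard Hadoop Streaming reporter lines that _COUNTER_RE and _STATUS_RE are built to match:

# reporter:counter:<group>,<name>,<amount> and reporter:status:<message> are
# the conventional Hadoop Streaming forms; counts for the same counter add up.
stderr = (b'reporter:counter:wordcount,lines,1\n'
          b'reporter:counter:wordcount,lines,2\n'
          b'reporter:status:processing split 3\n'
          b'stray debug output\n')

result = parse_mr_job_stderr(stderr)
# result['counters'] == {'wordcount': {'lines': 3}}
# result['statuses'] == ['processing split 3']
# result['other'] == ['stray debug output\n']
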
Example #15
def _cat_log(fs, path):
    """fs.cat() the given log, converting lines to strings, and logging
    errors."""
    try:
        for line in fs.cat(path):
            yield to_string(line)
    except IOError as e:
        log.warning("couldn't cat() %s: %r" % (path, e))
Example #16
def _cat_log(fs, path):
    """fs.cat() the given log, converting lines to strings, and logging
    errors."""
    try:
        for line in fs.cat(path):
            yield to_string(line)
    except IOError as e:
        log.warning("couldn't cat() %s: %r" % (path, e))
Example #17
    def ls(self, path_glob):
        components = urlparse(path_glob)
        hdfs_prefix = '%s://%s' % (components.scheme, components.netloc)

        version = self.get_hadoop_version()

        # use ls -R on Hadoop 2 (see #1152)
        if uses_yarn(version):
            args = ['fs', '-ls', '-R', path_glob]
        else:
            args = ['fs', '-lsr', path_glob]

        try:
            stdout = self.invoke_hadoop(args,
                                        return_stdout=True,
                                        ok_stderr=[_HADOOP_LS_NO_SUCH_FILE])
        except CalledProcessError:
            raise IOError("Could not ls %s" % path_glob)

        for line in BytesIO(stdout):
            line = line.rstrip(b'\r\n')

            # ignore total item count
            if line.startswith(b'Found '):
                continue

            fields = line.split(b' ')

            # Throw out directories
            if fields[0].startswith(b'd'):
                continue

            # Try to figure out which part of the line is the path
            # Expected lines:
            #
            # HDFS:
            # -rw-r--r--   3 dave users       3276 2010-01-13 14:00 /foo/bar
            #
            # S3:
            # -rwxrwxrwx   1          3276 010-01-13 14:00 /foo/bar
            path_index = None
            for index, field in enumerate(fields):
                # look for time field, and pick one after that
                # (can't use field[2] because that's an int in Python 3)
                if len(field) == 5 and field[2:3] == b':':
                    path_index = (index + 1)
            if not path_index:
                raise IOError("Could not locate path in string %r" % line)

            path = to_string(line.split(b' ', path_index)[-1])
            # handle fully qualified URIs from newer versions of Hadoop ls
            # (see Pull Request #577)
            if is_uri(path):
                yield path
            else:
                yield hdfs_prefix + path
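
The path-locating heuristic in the loop above can be shown in isolation. This takes the sample HDFS line from the comment (with a space added to the path to show why the bounded split matters):

# Find the HH:MM time field, then split only that many times so a path that
# itself contains spaces survives intact.
line = b'-rw-r--r--   3 dave users       3276 2010-01-13 14:00 /foo/bar baz'
fields = line.split(b' ')

path_index = None
for index, field in enumerate(fields):
    if len(field) == 5 and field[2:3] == b':':
        path_index = index + 1

print(line.split(b' ', path_index)[-1])  # b'/foo/bar baz'
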
Example #18
def _cat_log(fs, path):
    """fs.cat() the given log, converting lines to strings, and logging
    errors."""
    try:
        if not fs.exists(path):
            return
        for line in fs.cat(path):
            yield to_string(line)
    except (IOError, OSError) as e:
        log.warning("couldn't cat() %s: %r" % (path, e))
Example #19
 def yield_lines():
     try:
         for line in stderr:
             yield to_string(line)
     except IOError as e:
         # this is just the PTY's way of saying goodbye
         if e.errno == errno.EIO:
             return
         else:
             raise
Example #20
def find_python_traceback(lines):
    """Scan a log file or other iterable for a Python traceback,
    and return it as a list of lines (bytes).

    In logs from EMR, we find python tracebacks in ``task-attempts/*/stderr``
    """
    # Essentially, we detect the start of the traceback, and continue
    # until we find a non-indented line, with some special rules for exceptions
    # from subprocesses.

    # Lines to pass back representing entire error found
    all_tb_lines = []

    # This is used to store a working list of lines in a single traceback
    tb_lines = []

    # This is used to store a working list of non-traceback lines between the
    # current traceback and the previous one
    non_tb_lines = []

    # Track whether or not we are in a traceback rather than consuming the
    # iterator
    in_traceback = False

    for line in lines:
        # don't return bytes in Python 3
        line = to_string(line)

        if in_traceback:
            tb_lines.append(line)

            # If no indentation, this is the last line of the traceback
            if line.lstrip() == line:
                in_traceback = False

                if line.startswith('subprocess.CalledProcessError'):
                    # CalledProcessError may mean that the subprocess printed
                    # errors to stderr which we can show the user
                    all_tb_lines += non_tb_lines

                all_tb_lines += tb_lines

                # Reset all working lists
                tb_lines = []
                non_tb_lines = []
        else:
            if line.startswith('Traceback (most recent call last):'):
                tb_lines.append(line)
                in_traceback = True
            else:
                non_tb_lines.append(line)
    if all_tb_lines:
        return all_tb_lines
    else:
        return None
Example #21
def find_python_traceback(lines):
    """Scan a log file or other iterable for a Python traceback,
    and return it as a list of lines (bytes).

    In logs from EMR, we find python tracebacks in ``task-attempts/*/stderr``
    """
    # Essentially, we detect the start of the traceback, and continue
    # until we find a non-indented line, with some special rules for exceptions
    # from subprocesses.

    # Lines to pass back representing entire error found
    all_tb_lines = []

    # This is used to store a working list of lines in a single traceback
    tb_lines = []

    # This is used to store a working list of non-traceback lines between the
    # current traceback and the previous one
    non_tb_lines = []

    # Track whether or not we are in a traceback rather than consuming the
    # iterator
    in_traceback = False

    for line in lines:
        # don't return bytes in Python 3
        line = to_string(line)

        if in_traceback:
            tb_lines.append(line)

            # If no indentation, this is the last line of the traceback
            if line.lstrip() == line:
                in_traceback = False

                if line.startswith('subprocess.CalledProcessError'):
                    # CalledProcessError may mean that the subprocess printed
                    # errors to stderr which we can show the user
                    all_tb_lines += non_tb_lines

                all_tb_lines += tb_lines

                # Reset all working lists
                tb_lines = []
                non_tb_lines = []
        else:
            if line.startswith('Traceback (most recent call last):'):
                tb_lines.append(line)
                in_traceback = True
            else:
                non_tb_lines.append(line)
    if all_tb_lines:
        return all_tb_lines
    else:
        return None
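
A usage sketch for the function above, with made-up stderr content; it assumes find_python_traceback (and to_string) are importable from this module:

# One traceback surrounded by unrelated lines; the function returns just the
# traceback lines, decoded to str, and drops the warning.
stderr_lines = [
    b'some harmless warning\n',
    b'Traceback (most recent call last):\n',
    b'  File "mr_word_count.py", line 18, in mapper\n',
    b'    yield word, 1 / 0\n',
    b'ZeroDivisionError: division by zero\n',
]
tb = find_python_traceback(stderr_lines)
# tb == the four traceback lines as strings
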
Example #22
    def ls(self, path_glob):
        components = urlparse(path_glob)
        hdfs_prefix = '%s://%s' % (components.scheme, components.netloc)

        version = self.get_hadoop_version()

        # use ls -R on Hadoop 2 (see #1152)
        if uses_yarn(version):
            args = ['fs', '-ls', '-R', path_glob]
        else:
            args = ['fs', '-lsr', path_glob]

        try:
            stdout = self.invoke_hadoop(args, return_stdout=True,
                                        ok_stderr=[_HADOOP_LS_NO_SUCH_FILE])
        except CalledProcessError:
            raise IOError("Could not ls %s" % path_glob)

        for line in BytesIO(stdout):
            line = line.rstrip(b'\r\n')

            # ignore total item count
            if line.startswith(b'Found '):
                continue

            fields = line.split(b' ')

            # Throw out directories
            if fields[0].startswith(b'd'):
                continue

            # Try to figure out which part of the line is the path
            # Expected lines:
            #
            # HDFS:
            # -rw-r--r--   3 dave users       3276 2010-01-13 14:00 /foo/bar
            #
            # S3:
            # -rwxrwxrwx   1          3276 010-01-13 14:00 /foo/bar
            path_index = None
            for index, field in enumerate(fields):
                # look for time field, and pick one after that
                # (can't use field[2] because that's an int in Python 3)
                if len(field) == 5 and field[2:3] == b':':
                    path_index = (index + 1)
            if not path_index:
                raise IOError("Could not locate path in string %r" % line)

            path = to_string(line.split(b' ', path_index)[-1])
            # handle fully qualified URIs from newer versions of Hadoop ls
            # (see Pull Request #577)
            if is_uri(path):
                yield path
            else:
                yield hdfs_prefix + path
Example #23
def ssh_terminate_single_job(ssh_bin, address, ec2_key_pair_file):
    """Terminate the only job running the Hadoop cluster with master node
    *address* using 'hadoop job -kill JOB_ID'. Return string output of command
    or None if there was no job to termiante. Raise :py:class:`IOError` if some
    other error occurred.

    :param ssh_bin: Path to ``ssh`` binary
    :param address: Address of your job's master node (obtained via
                    :py:meth:`boto.emr.EmrConnection.describe_jobflow`)
    :param ec2_key_pair_file: Path to the key pair file (argument to ``-i``)

    :return: string output of ``hadoop job -kill``, or ``None`` if no job was
             running
    """
    job_list_out = to_string(
        check_output(*ssh_run(ssh_bin, address, ec2_key_pair_file,
                              ['hadoop', 'job', '-list'])))
    job_list_lines = job_list_out.splitlines()

    def job_list_output_error():
        raise IOError('Could not read results of "hadoop job -list" and so'
                      ' could not terminate job:\n%s' % job_list_out)

    num_jobs_match = HADOOP_JOB_LIST_NUM_RE.match(job_list_lines[0])
    if not num_jobs_match:
        job_list_output_error()
    if int(num_jobs_match.group(1)) > 1:
        raise IOError('More than one job is running; unclear which one to'
                      ' terminate, so not terminating any jobs')
    if int(num_jobs_match.group(1)) == 0:
        return None

    job_info_match = HADOOP_JOB_LIST_INFO_RE.match(job_list_lines[2])
    if not job_info_match:
        job_list_output_error()
    job_id = to_string(job_info_match.group(1))

    job_kill_out = to_string(
        check_output(*ssh_run(ssh_bin, address, ec2_key_pair_file,
                              ['hadoop', 'job', '-kill', job_id])))

    return job_kill_out
Example #24
        def cleanup():
            # this does sometimes happen; see #1396
            for line in cat_proc.stderr:
                log.error('STDERR: ' + to_string(line.rstrip(b'\r\n')))

            cat_proc.stdout.close()
            cat_proc.stderr.close()

            returncode = cat_proc.wait()

            if returncode != 0:
                raise IOError("Could not stream %s" % filename)
Example #25
def _run_on_all_nodes(runner, output_dir, cmd_args, print_stderr=True):
    """Given an :py:class:`EMRJobRunner`, run the command specified by
    *cmd_args* on all nodes in the cluster and save the stdout and stderr of
    each run to subdirectories of *output_dir*.

    You should probably have run :py:meth:`_enable_slave_ssh_access()` on the
    runner before calling this function.
    """
    master_addr = runner._address_of_master()
    addresses = [master_addr]

    ssh_bin = runner._opts['ssh_bin']
    ec2_key_pair_file = runner._opts['ec2_key_pair_file']

    keyfile = None
    slave_addrs = runner.fs.ssh_slave_hosts(master_addr)

    if slave_addrs:
        addresses += [
            '%s!%s' % (master_addr, slave_addr) for slave_addr in slave_addrs
        ]
        # copying key file like a boss (name of keyfile doesn't really matter)
        keyfile = 'mrboss-%s.pem' % random_identifier()
        _ssh_copy_key(ssh_bin, master_addr, ec2_key_pair_file, keyfile)

    for addr in addresses:

        stdout, stderr = _ssh_run_with_recursion(
            ssh_bin,
            addr,
            ec2_key_pair_file,
            keyfile,
            cmd_args,
        )

        if print_stderr:
            print('---')
            print('Command completed on %s.' % addr)
            print(to_string(stderr), end=' ')

        if '!' in addr:
            base_dir = os.path.join(output_dir, 'slave ' + addr.split('!')[1])
        else:
            base_dir = os.path.join(output_dir, 'master')

        if not os.path.exists(base_dir):
            os.makedirs(base_dir)

        with open(os.path.join(base_dir, 'stdout'), 'wb') as f:
            f.write(stdout)

        with open(os.path.join(base_dir, 'stderr'), 'wb') as f:
            f.write(stderr)
Example #26
        def cleanup():
            # this does sometimes happen; see #1396
            for line in cat_proc.stderr:
                log.error('STDERR: ' + to_string(line.rstrip(b'\r\n')))

            cat_proc.stdout.close()
            cat_proc.stderr.close()

            returncode = cat_proc.wait()

            if returncode != 0:
                raise IOError("Could not stream %s" % filename)
Example #27
def ssh_terminate_single_job(ssh_bin, address, ec2_key_pair_file):
    """Terminate the only job running the Hadoop cluster with master node
    *address* using 'hadoop job -kill JOB_ID'. Return string output of command
    or None if there was no job to termiante. Raise :py:class:`IOError` if some
    other error occurred.

    :param ssh_bin: Path to ``ssh`` binary
    :param address: Address of your job's master node (obtained via
                    :py:meth:`boto.emr.EmrConnection.describe_jobflow`)
    :param ec2_key_pair_file: Path to the key pair file (argument to ``-i``)

    :return: string output of ``hadoop job -kill``, or ``None`` if no job was
             running
    """
    job_list_out = to_string(check_output(*ssh_run(
        ssh_bin, address, ec2_key_pair_file, ['hadoop', 'job', '-list'])))
    job_list_lines = job_list_out.splitlines()

    def job_list_output_error():
        raise IOError('Could not read results of "hadoop job -list" and so'
                      ' could not terminate job:\n%s' % job_list_out)

    num_jobs_match = HADOOP_JOB_LIST_NUM_RE.match(job_list_lines[0])
    if not num_jobs_match:
        job_list_output_error()
    if int(num_jobs_match.group(1)) > 1:
        raise IOError('More than one job is running; unclear which one to'
                      ' terminate, so not terminating any jobs')
    if int(num_jobs_match.group(1)) == 0:
        return None

    job_info_match = HADOOP_JOB_LIST_INFO_RE.match(job_list_lines[2])
    if not job_info_match:
        job_list_output_error()
    job_id = to_string(job_info_match.group(1))

    job_kill_out = to_string(check_output(*ssh_run(
        ssh_bin, address, ec2_key_pair_file,
        ['hadoop', 'job', '-kill', job_id])))

    return job_kill_out
Example #28
def _run_on_all_nodes(runner, output_dir, cmd_args, print_stderr=True):
    """Given an :py:class:`EMRJobRunner`, run the command specified by
    *cmd_args* on all nodes in the cluster and save the stdout and stderr of
    each run to subdirectories of *output_dir*.

    You should probably have run :py:meth:`_enable_slave_ssh_access()` on the
    runner before calling this function.
    """
    master_addr = runner._address_of_master()
    addresses = [master_addr]

    ssh_bin = runner._opts['ssh_bin']
    ec2_key_pair_file = runner._opts['ec2_key_pair_file']

    keyfile = None
    slave_addrs = runner.fs.ssh_slave_hosts(master_addr)

    if slave_addrs:
        addresses += ['%s!%s' % (master_addr, slave_addr)
                      for slave_addr in slave_addrs]
        # copying key file like a boss (name of keyfile doesn't really matter)
        keyfile = 'mrboss-%s.pem' % random_identifier()
        _ssh_copy_key(ssh_bin, master_addr, ec2_key_pair_file, keyfile)

    for addr in addresses:

        stdout, stderr = _ssh_run_with_recursion(
            ssh_bin,
            addr,
            ec2_key_pair_file,
            keyfile,
            cmd_args,
        )

        if print_stderr:
            print('---')
            print('Command completed on %s.' % addr)
            print(to_string(stderr), end=' ')

        if '!' in addr:
            base_dir = os.path.join(output_dir, 'slave ' + addr.split('!')[1])
        else:
            base_dir = os.path.join(output_dir, 'master')

        if not os.path.exists(base_dir):
            os.makedirs(base_dir)

        with open(os.path.join(base_dir, 'stdout'), 'wb') as f:
            f.write(stdout)

        with open(os.path.join(base_dir, 'stderr'), 'wb') as f:
            f.write(stderr)
Example #29
def _ssh_slave_addresses(ssh_bin, master_address, ec2_key_pair_file):
    """Get the IP addresses of the slave nodes. Fails silently because it
    makes testing easier and if things are broken they will fail before this
    function is called.
    """
    if not ec2_key_pair_file or not os.path.exists(ec2_key_pair_file):
        return []   # this is a testing environment

    cmd = "hadoop dfsadmin -report | grep ^Name | cut -f2 -d: | cut -f2 -d' '"
    args = ['bash -c "%s"' % cmd]
    ips = to_string(_check_output(
        *_ssh_run(ssh_bin, master_address, ec2_key_pair_file, args)))
    return [ip for ip in ips.split('\n') if ip]
Example #30
def ssh_slave_addresses(ssh_bin, master_address, ec2_key_pair_file):
    """Get the IP addresses of the slave nodes. Fails silently because it
    makes testing easier and if things are broken they will fail before this
    function is called.
    """
    if not ec2_key_pair_file or not os.path.exists(ec2_key_pair_file):
        return []   # this is a testing environment

    cmd = "hadoop dfsadmin -report | grep ^Name | cut -f2 -d: | cut -f2 -d' '"
    args = ['bash -c "%s"' % cmd]
    ips = to_string(check_output(
        *ssh_run(ssh_bin, master_address, ec2_key_pair_file, args)))
    return [ip for ip in ips.split('\n') if ip]
Example #31
    def get_hadoop_version(self):
        """Invoke the hadoop executable to determine its version"""
        # mkdir() needs this
        if not self._hadoop_version:
            stdout = self.invoke_hadoop(['version'], return_stdout=True)
            if stdout:
                first_line = stdout.split(b'\n')[0]
                m = _HADOOP_VERSION_RE.match(first_line)
                if m:
                    self._hadoop_version = to_string(m.group('version'))
                    log.info("Using Hadoop version %s" % self._hadoop_version)
                else:
                    raise Exception('Unable to determine Hadoop version.')

        return self._hadoop_version
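
_HADOOP_VERSION_RE itself is not shown in this example. A hypothetical stand-in (the real mrjob pattern may differ) that pulls the version out of the first line of `hadoop version` output:

# The first line of `hadoop version` looks like b'Hadoop 2.7.3'; this
# stand-in pattern is an assumption, not mrjob's actual regex.
import re

_HADOOP_VERSION_RE = re.compile(br'^Hadoop (?P<version>[\d.]+)')

m = _HADOOP_VERSION_RE.match(b'Hadoop 2.7.3')
print(m.group('version'))  # b'2.7.3'
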
Example #32
def _parse_counters_0_20(counter_string):
    # 0.20 counters look like this:
    # {(groupid)(groupname)[(counterid)(countername)(countervalue)][...]...}
    groups = _GROUP_RE_0_20.findall(counter_string)
    if not groups:
        log.warning('Cannot parse Hadoop counter string: %s' % counter_string)

    for group_id, group_name, counter_str in groups:
        matches = _COUNTER_RE_0_20.findall(counter_str)

        try:
            group_name = counter_unescape(group_name)
        except ValueError:
            log.warning("Could not decode group name %r" % group_name)
            group_name = to_string(group_name)

        for counter_id, counter_name, counter_value in matches:
            try:
                counter_name = counter_unescape(counter_name)
            except ValueError:
                log.warning("Could not decode counter name %r" % counter_name)
                counter_name = to_string(counter_name)

            yield group_name, counter_name, int(counter_value)
Example #33
    def get_hadoop_version(self):
        """Invoke the hadoop executable to determine its version"""
        # mkdir() needs this
        if not self._hadoop_version:
            stdout = self.invoke_hadoop(['version'], return_stdout=True)
            if stdout:
                first_line = stdout.split(b'\n')[0]
                m = _HADOOP_VERSION_RE.match(first_line)
                if m:
                    self._hadoop_version = to_string(m.group('version'))
                    log.info("Using Hadoop version %s" % self._hadoop_version)
                else:
                    raise Exception('Unable to determine Hadoop version.')

        return self._hadoop_version
Example #34
def _parse_counters_0_20(counter_string):
    # 0.20 counters look like this:
    # {(groupid)(groupname)[(counterid)(countername)(countervalue)][...]...}
    groups = _GROUP_RE_0_20.findall(counter_string)
    if not groups:
        log.warning('Cannot parse Hadoop counter string: %s' % counter_string)

    for group_id, group_name, counter_str in groups:
        matches = _COUNTER_RE_0_20.findall(counter_str)

        try:
            group_name = counter_unescape(group_name)
        except ValueError:
            log.warning("Could not decode group name %r" % group_name)
            group_name = to_string(group_name)

        for counter_id, counter_name, counter_value in matches:
            try:
                counter_name = counter_unescape(counter_name)
            except ValueError:
                log.warning("Could not decode counter name %r" % counter_name)
                counter_name = to_string(counter_name)

            yield group_name, counter_name, int(counter_value)
Example #35
def find_input_uri_for_mapper(lines):
    """Scan a log file or other iterable for the path of an input file
    for the first mapper on Hadoop. Just returns the path, or None if
    no match.

    In logs from EMR, we find python tracebacks in ``task-attempts/*/syslog``

    Matching log lines look like::

        2010-07-27 17:54:54,344 INFO org.apache.hadoop.fs.s3native.NativeS3FileSystem (main): Opening 's3://yourbucket/logs/2010/07/23/log2-00077.gz' for reading
    """
    val = None
    for line in lines:
        match = _OPENING_FOR_READING_RE.match(line)
        if match:
            val = to_string(match.group(1))
    return val
Example #36
def find_input_uri_for_mapper(lines):
    """Scan a log file or other iterable for the path of an input file
    for the first mapper on Hadoop. Just returns the path, or None if
    no match.

    In logs from EMR, we find python tracebacks in ``task-attempts/*/syslog``

    Matching log lines look like::

        2010-07-27 17:54:54,344 INFO org.apache.hadoop.fs.s3native.NativeS3FileSystem (main): Opening 's3://yourbucket/logs/2010/07/23/log2-00077.gz' for reading
    """
    val = None
    for line in lines:
        match = _OPENING_FOR_READING_RE.match(line)
        if match:
            val = to_string(match.group(1))
    return val
Beispiel #37
0
def find_job_log_multiline_error(lines):
    """Scan a log file for an arbitrary multi-line error. Return it as a list
    of lines, or None if nothing was found.

    Here is an example error::

        MapAttempt TASK_TYPE="MAP" TASKID="task_201106280040_0001_m_000218" TASK_ATTEMPT_ID="attempt_201106280040_0001_m_000218_5" TASK_STATUS="FAILED" FINISH_TIME="1309246900665" HOSTNAME="/default-rack/ip-10-166-239-133.us-west-1.compute.internal" ERROR="Error initializing attempt_201106280040_0001_m_000218_5:
        java.io.IOException: Cannot run program "bash": java.io.IOException: error=12, Cannot allocate memory
            at java.lang.ProcessBuilder.start(ProcessBuilder.java:460)
            at org.apache.hadoop.util.Shell.runCommand(Shell.java:149)
            at org.apache.hadoop.util.Shell.run(Shell.java:134)
            at org.apache.hadoop.fs.DF.getAvailable(DF.java:73)
            at org.apache.hadoop.fs.LocalDirAllocator$AllocatorPerContext.getLocalPathForWrite(LocalDirAllocator.java:296)
            at org.apache.hadoop.fs.LocalDirAllocator.getLocalPathForWrite(LocalDirAllocator.java:124)
            at org.apache.hadoop.mapred.TaskTracker.localizeJob(TaskTracker.java:648)
            at org.apache.hadoop.mapred.TaskTracker.startNewTask(TaskTracker.java:1320)
            at org.apache.hadoop.mapred.TaskTracker.offerService(TaskTracker.java:956)
            at org.apache.hadoop.mapred.TaskTracker.run(TaskTracker.java:1357)
            at org.apache.hadoop.mapred.TaskTracker.main(TaskTracker.java:2361)
        Caused by: java.io.IOException: java.io.IOException: error=12, Cannot allocate memory
            at java.lang.UNIXProcess.<init>(UNIXProcess.java:148)
            at java.lang.ProcessImpl.start(ProcessImpl.java:65)
            at java.lang.ProcessBuilder.start(ProcessBuilder.java:453)
            ... 10 more
        "

    The first line returned includes only the text after ``ERROR="``; the
    final line containing just ``"`` is discarded.

    These errors are parsed from jobs/\*.jar.
    """
    for line in lines:
        m = _MULTILINE_JOB_LOG_ERROR_RE.match(line)
        if m:
            st_lines = []
            if m.group('first_line'):
                st_lines.append(m.group('first_line'))
            for line in lines:
                st_lines.append(line)
                for line in lines:
                    if line.strip() == b'"':
                        break
                    st_lines.append(line)
                return [to_string(line) for line in st_lines]
    return None
Example #38
def find_job_log_multiline_error(lines):
    """Scan a log file for an arbitrary multi-line error. Return it as a list
    of lines, or None if nothing was found.

    Here is an example error::

        MapAttempt TASK_TYPE="MAP" TASKID="task_201106280040_0001_m_000218" TASK_ATTEMPT_ID="attempt_201106280040_0001_m_000218_5" TASK_STATUS="FAILED" FINISH_TIME="1309246900665" HOSTNAME="/default-rack/ip-10-166-239-133.us-west-1.compute.internal" ERROR="Error initializing attempt_201106280040_0001_m_000218_5:
        java.io.IOException: Cannot run program "bash": java.io.IOException: error=12, Cannot allocate memory
            at java.lang.ProcessBuilder.start(ProcessBuilder.java:460)
            at org.apache.hadoop.util.Shell.runCommand(Shell.java:149)
            at org.apache.hadoop.util.Shell.run(Shell.java:134)
            at org.apache.hadoop.fs.DF.getAvailable(DF.java:73)
            at org.apache.hadoop.fs.LocalDirAllocator$AllocatorPerContext.getLocalPathForWrite(LocalDirAllocator.java:296)
            at org.apache.hadoop.fs.LocalDirAllocator.getLocalPathForWrite(LocalDirAllocator.java:124)
            at org.apache.hadoop.mapred.TaskTracker.localizeJob(TaskTracker.java:648)
            at org.apache.hadoop.mapred.TaskTracker.startNewTask(TaskTracker.java:1320)
            at org.apache.hadoop.mapred.TaskTracker.offerService(TaskTracker.java:956)
            at org.apache.hadoop.mapred.TaskTracker.run(TaskTracker.java:1357)
            at org.apache.hadoop.mapred.TaskTracker.main(TaskTracker.java:2361)
        Caused by: java.io.IOException: java.io.IOException: error=12, Cannot allocate memory
            at java.lang.UNIXProcess.<init>(UNIXProcess.java:148)
            at java.lang.ProcessImpl.start(ProcessImpl.java:65)
            at java.lang.ProcessBuilder.start(ProcessBuilder.java:453)
            ... 10 more
        "

    The first line returned includes only the text after ``ERROR="``; the
    final line containing just ``"`` is discarded.

    These errors are parsed from jobs/\*.jar.
    """
    for line in lines:
        m = _MULTILINE_JOB_LOG_ERROR_RE.match(line)
        if m:
            st_lines = []
            if m.group('first_line'):
                st_lines.append(m.group('first_line'))
            for line in lines:
                st_lines.append(line)
                for line in lines:
                    if line.strip() == b'"':
                        break
                    st_lines.append(line)
                return [to_string(line) for line in st_lines]
    return None
Example #39
def _wrap_streaming_pty_output(lines):
    """Make output from PTY running hadoop streaming behave like stderr.

    This screens out and logs lines that look like they came from stdout,
    and treats the EIO error as EOF.
    """
    while True:
        try:
            line = next(lines)  # okay to get StopIteration
            if _HADOOP_STDOUT_RE.match(line):
                _log_line_from_hadoop(to_string(line).rstrip('\r\n'))
            else:
                yield line
        except IOError as e:
            if e.errno == errno.EIO:
                return
            else:
                raise
Example #40
def find_interesting_hadoop_streaming_error(lines):
    """Scan a log file or other iterable for a hadoop streaming error
    other than "Job not Successful!". Return the error as a string, or None
    if nothing found.

    In logs from EMR, we find java stack traces in ``steps/*/syslog``

    Example line::

        2010-07-27 19:53:35,451 ERROR org.apache.hadoop.streaming.StreamJob (main): Error launching job , Output path already exists : Output directory s3://yourbucket/logs/2010/07/23/ already exists and is not empty
    """
    for line in lines:
        match = _HADOOP_STREAMING_ERROR_RE.match(line) or _HADOOP_STREAMING_ERROR_RE_2.match(line)
        if match:
            msg = to_string(match.group(1))
            if msg != "Job not Successful!":
                return msg
    return None
Example #41
def _ssh_ls(ssh_bin, address, ec2_key_pair_file, path, keyfile=None):
    """Recursively list files under ``path`` on the specified SSH host.
    Return the listing as a list of paths (strings). Raises ``IOError`` if the
    path doesn't exist or SSH access fails.

    :param ssh_bin: Path to ``ssh`` binary
    :param address: Address of your job's master node
    :param ec2_key_pair_file: Path to the key pair file (argument to ``-i``)
    :param path: Path on the remote host to list
    :param keyfile: Name of the EMR private key file on the master node in case
                    ``path`` exists on one of the slave nodes
    """
    out = to_string(_check_output(*_ssh_run_with_recursion(
        ssh_bin, address, ec2_key_pair_file, keyfile,
        ['find', '-L', path, '-type', 'f'])))
    if 'No such file or directory' in out:
        raise IOError("No such file or directory: %s" % path)
    return out.split('\n')
Example #42
def _wrap_streaming_pty_output(lines):
    """Make output from PTY running hadoop streaming behave like stderr.

    This screens out and logs lines that look like they came from stdout,
    and treats the EIO error as EOF.
    """
    while True:
        try:
            line = next(lines)  # okay to get StopIteration
            if _HADOOP_STDOUT_RE.match(line):
                _log_line_from_hadoop(to_string(line).rstrip('\r\n'))
            else:
                yield line
        except IOError as e:
            if e.errno == errno.EIO:
                return
            else:
                raise
Example #43
def find_interesting_hadoop_streaming_error(lines):
    """Scan a log file or other iterable for a hadoop streaming error
    other than "Job not Successful!". Return the error as a string, or None
    if nothing found.

    In logs from EMR, we find java stack traces in ``steps/*/syslog``

    Example line::

        2010-07-27 19:53:35,451 ERROR org.apache.hadoop.streaming.StreamJob (main): Error launching job , Output path already exists : Output directory s3://yourbucket/logs/2010/07/23/ already exists and is not empty
    """
    for line in lines:
        match = (_HADOOP_STREAMING_ERROR_RE.match(line)
                 or _HADOOP_STREAMING_ERROR_RE_2.match(line))
        if match:
            msg = to_string(match.group(1))
            if msg != 'Job not Successful!':
                return msg
    return None
Example #44
def ssh_ls(ssh_bin, address, ec2_key_pair_file, path, keyfile=None):
    """Recursively list files under ``path`` on the specified SSH host.
    Return the listing as a list of paths (strings). Raises ``IOError`` if the
    path doesn't exist or SSH access fails.

    :param ssh_bin: Path to ``ssh`` binary
    :param address: Address of your job's master node (obtained via
                    :py:meth:`boto.emr.EmrConnection.describe_jobflow`)
    :param ec2_key_pair_file: Path to the key pair file (argument to ``-i``)
    :param path: Path on the remote host to list
    :param keyfile: Name of the EMR private key file on the master node in case
                    ``path`` exists on one of the slave nodes
    """
    out = to_string(
        check_output(*ssh_run_with_recursion(
            ssh_bin, address, ec2_key_pair_file, keyfile,
            ['find', '-L', path, '-type', 'f'])))
    if 'No such file or directory' in out:
        raise IOError("No such file or directory: %s" % path)
    return out.split('\n')
Example #45
 def test_non_ascii_unicode(self):
     self.assertEqual(to_string(u'café'), u'café')
Example #46
 def test_ascii_unicode(self):
     self.assertEqual(to_string(u'foo'), u'foo')
Example #47
 def test_latin_1_bytes(self):
     self.assertEqual(to_string(b'caf\xe9'), 'caf\xe9')
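
The tests above pin down to_string()'s contract: unicode strings pass through unchanged, and bytes are decoded, with a lone latin-1 byte like b'\xe9' still coming back as 'é'. A minimal sketch that satisfies these tests (the real mrjob implementation may differ in detail):

# Decode bytes as UTF-8, falling back to latin-1; pass unicode through.
def to_string(s):
    if isinstance(s, bytes):
        try:
            return s.decode('utf_8')
        except UnicodeDecodeError:
            return s.decode('latin_1')
    return s

assert to_string(u'café') == u'café'
assert to_string(b'caf\xe9') == u'caf\xe9'
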
Example #48
    def _run_job_in_hadoop(self):
        for step_num in range(self._num_steps()):
            step_args = self._args_for_step(step_num)

            # log this *after* _args_for_step(), which can start a search
            # for the Hadoop streaming jar
            log.info('Running step %d of %d' %
                     (step_num + 1, self._num_steps()))
            log.debug('> %s' % cmd_line(step_args))

            # try to use a PTY if it's available
            try:
                pid, master_fd = pty.fork()
            except (AttributeError, OSError):
                # no PTYs, just use Popen

                # user won't get much feedback for a while, so tell them
                # Hadoop is running
                log.debug('No PTY available, using Popen() to invoke Hadoop')

                step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE)

                step_info = _process_stderr_from_streaming(step_proc.stderr)

                # there shouldn't be much output to STDOUT
                for line in step_proc.stdout:
                    _log_line_from_hadoop(to_string(line).strip('\r\n'))

                step_proc.stdout.close()
                step_proc.stderr.close()

                returncode = step_proc.wait()
            else:
                # we have PTYs
                if pid == 0:  # we are the child process
                    os.execvp(step_args[0], step_args)
                else:
                    log.debug('Invoking Hadoop via PTY')

                    with os.fdopen(master_fd, 'rb') as master:
                        # reading from master gives us the subprocess's
                        # stderr and stdout (it's a fake terminal)
                        step_info = _process_stderr_from_streaming(
                            _wrap_streaming_pty_output(master))
                        _, returncode = os.waitpid(pid, 0)

            # make sure output_dir is filled
            if not step_info['output_dir']:
                step_info['output_dir'] = self._hdfs_step_output_dir(step_num)

            if not step_info['counters']:
                pass  # TODO: fetch counters; see _fetch_counters()

            self._steps_info.append(step_info)

            # just print counters for this one step
            self._print_counters(step_nums=[step_num])

            if returncode:
                err_lines = [
                    'Job failed with return code %d: %s' %
                    (returncode, cmd_line(step_args))
                ]

                cause = self._find_probable_cause_of_failure(**step_info)

                if cause:
                    err_lines.append('')  # pad with empty line
                    err_lines.extend(_format_cause_of_failure(cause))

                for err_line in err_lines:
                    log.error(err_line)
                raise Exception('\n'.join(err_lines) + '\n')
Example #49
 def test_ascii_unicode(self):
     self.assertEqual(to_string(u'foo'), u'foo')
Example #50
 def test_non_ascii_unicode(self):
     self.assertEqual(to_string(u'café'), u'café')
Example #51
    def _run_job_in_hadoop(self):
        for step_num in range(self._num_steps()):
            step_args = self._args_for_step(step_num)

            # log this *after* _args_for_step(), which can start a search
            # for the Hadoop streaming jar
            log.info('Running step %d of %d' %
                      (step_num + 1, self._num_steps()))
            log.debug('> %s' % cmd_line(step_args))

            log_interpretation = {}
            self._log_interpretations.append(log_interpretation)

            # try to use a PTY if it's available
            try:
                pid, master_fd = pty.fork()
            except (AttributeError, OSError):
                # no PTYs, just use Popen

                # user won't get much feedback for a while, so tell them
                # Hadoop is running
                log.debug('No PTY available, using Popen() to invoke Hadoop')

                step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE)

                step_interpretation = _interpret_hadoop_jar_command_stderr(
                    step_proc.stderr,
                    record_callback=_log_record_from_hadoop)

                # there shouldn't be much output to STDOUT
                for line in step_proc.stdout:
                    _log_line_from_hadoop(to_string(line).strip('\r\n'))

                step_proc.stdout.close()
                step_proc.stderr.close()

                returncode = step_proc.wait()
            else:
                # we have PTYs
                if pid == 0:  # we are the child process
                    os.execvp(step_args[0], step_args)
                else:
                    log.debug('Invoking Hadoop via PTY')

                    with os.fdopen(master_fd, 'rb') as master:
                        # reading from master gives us the subprocess's
                        # stderr and stdout (it's a fake terminal)
                        step_interpretation = (
                            _interpret_hadoop_jar_command_stderr(
                                master,
                                record_callback=_log_record_from_hadoop))
                        _, returncode = os.waitpid(pid, 0)

            # make sure output_dir is filled
            if 'output_dir' not in step_interpretation:
                step_interpretation['output_dir'] = (
                    self._hdfs_step_output_dir(step_num))

            log_interpretation['step'] = step_interpretation

            counters = self._pick_counters(log_interpretation)
            if counters:
                log.info(_format_counters(counters))
            else:
                log.warning('No counters found')

            if returncode:
                error = self._pick_error(log_interpretation)
                if error:
                    log.error('Probable cause of failure:\n\n%s\n' %
                              _format_error(error))

                # use CalledProcessError's well-known message format
                reason = str(CalledProcessError(returncode, step_args))
                raise StepFailedException(
                    reason=reason, step_num=step_num,
                    num_steps=self._num_steps())
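The comment in Beispiel #51 about CalledProcessError's "well-known message format" refers to the string that class builds from the return code and the command. A quick illustration; the hadoop arguments here are made up, and the exact wording varies slightly between Python versions.

from subprocess import CalledProcessError

step_args = ['hadoop', 'jar', 'hadoop-streaming.jar']  # illustrative only
reason = str(CalledProcessError(1, step_args))
print(reason)
# Command '['hadoop', 'jar', 'hadoop-streaming.jar']' returned non-zero exit status 1.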
Beispiel #52
0
    def _process_streaming_stderr_lines(self, lines):
        """Process lines (with \r and \n stripped) from Hadoop binary's
        stderr. Return a dict of counters for the step.

        This handles counter parsing, debug printouts, and the job timestamp.
        """
        # counter-parsing state
        step_counters = {}
        parsing_counters = False
        counter_group = None
        counter_group_indent = None

        for line in lines:
            line = _HADOOP_STREAMING_OUTPUT_RE.match(line).group(2)

            # don't print HADOOP: <counter stuff>, since we print
            # counters later anyway

            # start of counters
            if not (parsing_counters or step_counters):
                m = _HADOOP_COUNTERS_START_RE.match(line)
                if m:
                    parsing_counters = True
                    log.info('Parsing counters from hadoop output')
                    continue

            if parsing_counters:
                if not (counter_group is None or counter_group_indent is None):
                    m = _HADOOP_COUNTER_RE.match(line)
                    if m and len(m.group('indent')) > counter_group_indent:

                        counter = to_string(m.group('counter'))
                        amount = int(m.group('amount'))

                        log.debug('  counter: %s=%d' % (counter, amount))

                        step_counters.setdefault(counter_group, {})
                        step_counters[counter_group][counter] = amount

                        continue

                m = _HADOOP_COUNTER_GROUP_RE.match(line)
                if m:
                    counter_group = to_string(m.group('group'))
                    counter_group_indent = len(m.group('indent'))

                    log.debug('  counter group: %s' % counter_group)

                    continue
                else:
                    parsing_counters = False

            log.info('HADOOP: ' + to_string(line))
            if b'Streaming Job Failed!' in line:
                raise Exception(line)

            # The job timestamp and step number are printed to stderr. We only
            # need to parse them once: we already know how many steps there
            # are, so all we need is what Hadoop thinks the first step's
            # number is.
            if self._job_timestamp is None:
                m = _HADOOP_JOB_TIMESTAMP_RE.match(line)
                if m:
                    self._job_timestamp = m.group('timestamp')
                    self._start_step_num = int(m.group('step_num'))

        return step_counters
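Beispiel #52 leans on module-level patterns (_HADOOP_COUNTERS_START_RE, _HADOOP_COUNTER_GROUP_RE, _HADOOP_COUNTER_RE) that are defined elsewhere and not shown here. Hadoop streaming prints counters on stderr as an indented block, with group names at one indent level and counter=value pairs nested below them; the parser tracks the group's indent so it knows when a group ends. The snippet below uses a pair of purely illustrative regexes, not mrjob's actual patterns, to show that indentation-based walk on a tiny sample of output.

import re

# illustrative stand-ins -- mrjob's real patterns may differ
GROUP_RE = re.compile(r'^(?P<indent>\s+)(?P<group>\S.*?)\s*$')
COUNTER_RE = re.compile(r'^(?P<indent>\s+)(?P<counter>.+?)=(?P<amount>\d+)\s*$')

sample = [
    'Counters: 3',                           # start marker
    '        Job Counters ',                 # a counter group
    '                Launched map tasks=2',  # counters, indented deeper
    '                Launched reduce tasks=1',
]

counters = {}
group, group_indent = None, None

for line in sample[1:]:  # the real parser also matches the 'Counters: N' line
    m = COUNTER_RE.match(line)
    if m and group is not None and len(m.group('indent')) > group_indent:
        counters.setdefault(group, {})[m.group('counter')] = (
            int(m.group('amount')))
        continue
    m = GROUP_RE.match(line)
    if m:
        group = m.group('group')
        group_indent = len(m.group('indent'))

print(counters)
# {'Job Counters': {'Launched map tasks': 2, 'Launched reduce tasks': 1}}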
Beispiel #53
0
    def _run_job_in_hadoop(self):
        for step_num in range(self._num_steps()):
            step_args = self._args_for_step(step_num)

            # log this *after* _args_for_step(), which can start a search
            # for the Hadoop streaming jar
            log.info('Running step %d of %d' %
                     (step_num + 1, self._num_steps()))
            log.debug('> %s' % cmd_line(step_args))

            # try to use a PTY if it's available
            try:
                pid, master_fd = pty.fork()
            except (AttributeError, OSError):
                # no PTYs, just use Popen

                # user won't get much feedback for a while, so tell them
                # Hadoop is running
                log.debug('No PTY available, using Popen() to invoke Hadoop')

                step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE)

                step_info = _process_stderr_from_streaming(
                    step_proc.stderr)

                # there shouldn't be much output to STDOUT
                for line in step_proc.stdout:
                    _log_line_from_hadoop(to_string(line).strip('\r\n'))

                step_proc.stdout.close()
                step_proc.stderr.close()

                returncode = step_proc.wait()
            else:
                # we have PTYs
                if pid == 0:  # we are the child process
                    os.execvp(step_args[0], step_args)
                else:
                    log.debug('Invoking Hadoop via PTY')

                    with os.fdopen(master_fd, 'rb') as master:
                        # reading from master gives us the subprocess's
                        # stderr and stdout (it's a fake terminal)
                        step_info = _process_stderr_from_streaming(
                            _wrap_streaming_pty_output(master))
                        _, returncode = os.waitpid(pid, 0)

            # make sure output_dir is filled
            if not step_info['output_dir']:
                step_info['output_dir'] = self._hdfs_step_output_dir(step_num)

            if not step_info['counters']:
                log.info('Attempting to read counters from history log')
                history = self._interpret_history_log(step_info)
                if history:
                    step_info['counters'] = history['counters']

            self._steps_info.append(step_info)

            # just print counters for this one step
            self._print_counters(step_nums=[step_num])

            if returncode:
                err_lines = [
                    'Job failed with return code %d: %s' %
                    (returncode, cmd_line(step_args))]

                cause = self._find_probable_cause_of_failure(**step_info)

                if cause:
                    err_lines.append('')  # pad with empty line
                    err_lines.extend(_format_cause_of_failure(cause))

                for err_line in err_lines:
                    log.error(err_line)
                raise Exception('\n'.join(err_lines) + '\n')
Beispiel #54
0
    def _run_job_in_hadoop(self):
        self._counters = []

        for step_num in range(self._num_steps()):
            log.debug('running step %d of %d' %
                      (step_num + 1, self._num_steps()))

            step_args = self._args_for_step(step_num)

            log.debug('> %s' % cmd_line(step_args))

            # try to use a PTY if it's available
            try:
                pid, master_fd = pty.fork()
            except (AttributeError, OSError):
                # no PTYs, just use Popen
                step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE)

                step_counters = self._process_stderr_from_streaming(
                    step_proc.stderr)

                # there shouldn't be much output to STDOUT
                for line in step_proc.stdout:
                    log.error('STDOUT: ' + to_string(line.strip(b'\n')))

                step_proc.stdout.close()
                step_proc.stderr.close()

                returncode = step_proc.wait()
            else:
                # we have PTYs
                if pid == 0:  # we are the child process
                    os.execvp(step_args[0], step_args)
                else:
                    with os.fdopen(master_fd, 'rb') as master:
                        # reading from master gives us the subprocess's
                        # stderr and stdout (it's a fake terminal)
                        step_counters = self._process_stderr_from_streaming(
                            master)
                        _, returncode = os.waitpid(pid, 0)

            # TODO: looks like we are parsing counters
            # but they don't get printed out.
            if step_counters:
                self._counters.append(step_counters)
            else:
                self._fetch_counters([step_num + self._start_step_num])

            # just print counters for this one step
            self._print_counters(step_nums=[step_num])

            if returncode:
                msg = ('Job failed with return code %d: %s' %
                       (returncode, step_args))
                log.error(msg)
                # look for a Python traceback
                cause = self._find_probable_cause_of_failure(
                    [step_num + self._start_step_num])
                if cause:
                    # log cause, and put it in exception
                    cause_msg = []  # lines to log and put in exception
                    cause_msg.append('Probable cause of failure (from %s):' %
                                     cause['log_file_uri'])
                    cause_msg.extend(line.strip('\n')
                                     for line in cause['lines'])
                    if cause['input_uri']:
                        cause_msg.append('(while reading from %s)' %
                                         cause['input_uri'])

                    for line in cause_msg:
                        log.error(line)

                    # add cause_msg to exception message
                    msg += '\n' + '\n'.join(cause_msg) + '\n'

                raise CalledProcessError(returncode, step_args)
Beispiel #55
0
    def _run_job_in_hadoop(self):
        for step_num, step in enumerate(self._get_steps()):
            self._warn_about_spark_archives(step)

            step_args = self._args_for_step(step_num)
            env = self._env_for_step(step_num)

            # log this *after* _args_for_step(), which can start a search
            # for the Hadoop streaming jar
            log.info('Running step %d of %d...' %
                     (step_num + 1, self._num_steps()))
            log.debug('> %s' % cmd_line(step_args))
            log.debug('  with environment: %r' % sorted(env.items()))

            log_interpretation = {}
            self._log_interpretations.append(log_interpretation)

            # try to use a PTY if it's available
            try:
                pid, master_fd = pty.fork()
            except (AttributeError, OSError):
                # no PTYs, just use Popen

                # user won't get much feedback for a while, so tell them
                # Hadoop is running
                log.debug('No PTY available, using Popen() to invoke Hadoop')

                step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE, env=env)

                step_interpretation = _interpret_hadoop_jar_command_stderr(
                    step_proc.stderr, record_callback=_log_record_from_hadoop)

                # there shouldn't be much output to STDOUT
                for line in step_proc.stdout:
                    _log_line_from_hadoop(to_string(line).strip('\r\n'))

                step_proc.stdout.close()
                step_proc.stderr.close()

                returncode = step_proc.wait()
            else:
                # we have PTYs
                if pid == 0:  # we are the child process
                    os.execvpe(step_args[0], step_args, env)
                else:
                    log.debug('Invoking Hadoop via PTY')

                    with os.fdopen(master_fd, 'rb') as master:
                        # reading from master gives us the subprocess's
                        # stderr and stdout (it's a fake terminal)
                        step_interpretation = (
                            _interpret_hadoop_jar_command_stderr(
                                master,
                                record_callback=_log_record_from_hadoop))
                        _, returncode = os.waitpid(pid, 0)

            # make sure output_dir is filled
            if 'output_dir' not in step_interpretation:
                step_interpretation['output_dir'] = (
                    self._step_output_uri(step_num))

            log_interpretation['step'] = step_interpretation

            step_type = step['type']

            if not _is_spark_step_type(step_type):
                counters = self._pick_counters(log_interpretation, step_type)
                if counters:
                    log.info(_format_counters(counters))
                else:
                    log.warning('No counters found')

            if returncode:
                error = self._pick_error(log_interpretation, step_type)
                if error:
                    log.error('Probable cause of failure:\n\n%s\n' %
                              _format_error(error))

                # use CalledProcessError's well-known message format
                reason = str(CalledProcessError(returncode, step_args))
                raise StepFailedException(reason=reason,
                                          step_num=step_num,
                                          num_steps=self._num_steps())
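Beispiel #55 is the only variant here that threads a per-step environment through to Hadoop: env=env on the Popen() fallback and os.execvpe() (rather than os.execvp()) in the PTY child. The tiny sketch below, with a made-up command and environment, shows the difference: execvp() lets the child inherit the parent's environment, while execvpe() replaces it with the mapping you pass in.

import os

args = ['printenv', 'MY_FLAG']         # made-up command for illustration
env = dict(os.environ, MY_FLAG='1')    # hypothetical per-step environment

pid = os.fork()
if pid == 0:
    # os.execvp(args[0], args) would inherit the parent's environment;
    # os.execvpe() uses exactly the mapping given, so MY_FLAG is visible
    os.execvpe(args[0], args, env)
else:
    os.waitpid(pid, 0)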
Beispiel #56
0
    def _run_job_in_hadoop(self):
        self._counters = []

        for step_num in range(self._num_steps()):
            log.debug('running step %d of %d' %
                      (step_num + 1, self._num_steps()))

            step_args = self._args_for_step(step_num)

            log.debug('> %s' % cmd_line(step_args))

            # try to use a PTY if it's available
            try:
                pid, master_fd = pty.fork()
            except (AttributeError, OSError):
                # no PTYs, just use Popen
                step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE)

                self._process_stderr_from_streaming(step_proc.stderr)

                # there shouldn't be much output to STDOUT
                for line in step_proc.stdout:
                    log.error('STDOUT: ' + to_string(line.strip(b'\n')))

                returncode = step_proc.wait()
            else:
                # we have PTYs
                if pid == 0:  # we are the child process
                    os.execvp(step_args[0], step_args)
                else:
                    with os.fdopen(master_fd, 'rb') as master:
                        # reading from master gives us the subprocess's
                        # stderr and stdout (it's a fake terminal)
                        self._process_stderr_from_streaming(master)
                        _, returncode = os.waitpid(pid, 0)

            if returncode == 0:
                # parsing needs step number for whole job
                self._fetch_counters([step_num + self._start_step_num])
                # printing needs step number relevant to this run of mrjob
                self.print_counters([step_num + 1])
            else:
                msg = ('Job failed with return code %d: %s' %
                       (returncode, step_args))
                log.error(msg)
                # look for a Python traceback
                cause = self._find_probable_cause_of_failure(
                    [step_num + self._start_step_num])
                if cause:
                    # log cause, and put it in exception
                    cause_msg = []  # lines to log and put in exception
                    cause_msg.append('Probable cause of failure (from %s):' %
                                     cause['log_file_uri'])
                    cause_msg.extend(
                        line.strip('\n') for line in cause['lines'])
                    if cause['input_uri']:
                        cause_msg.append('(while reading from %s)' %
                                         cause['input_uri'])

                    for line in cause_msg:
                        log.error(line)

                    # add cause_msg to exception message
                    msg += '\n' + '\n'.join(cause_msg) + '\n'

                raise CalledProcessError(returncode, step_args)
Beispiel #57
0
    def test_latin_1_bytes(self):
        self.assertEqual(to_string(b'caf\xe9'), 'caf\xe9')
Beispiel #58
0
    def _run_job_in_hadoop(self):
        for step_num in range(self._num_steps()):
            step_args = self._args_for_step(step_num)

            # log this *after* _args_for_step(), which can start a search
            # for the Hadoop streaming jar
            log.info('Running step %d of %d' %
                     (step_num + 1, self._num_steps()))
            log.debug('> %s' % cmd_line(step_args))

            log_interpretation = {}
            self._log_interpretations.append(log_interpretation)

            # try to use a PTY if it's available
            try:
                pid, master_fd = pty.fork()
            except (AttributeError, OSError):
                # no PTYs, just use Popen

                # user won't get much feedback for a while, so tell them
                # Hadoop is running
                log.debug('No PTY available, using Popen() to invoke Hadoop')

                step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE)

                step_interpretation = _interpret_hadoop_jar_command_stderr(
                    step_proc.stderr, record_callback=_log_record_from_hadoop)

                # there shouldn't be much output to STDOUT
                for line in step_proc.stdout:
                    _log_line_from_hadoop(to_string(line).strip('\r\n'))

                step_proc.stdout.close()
                step_proc.stderr.close()

                returncode = step_proc.wait()
            else:
                # we have PTYs
                if pid == 0:  # we are the child process
                    os.execvp(step_args[0], step_args)
                else:
                    log.debug('Invoking Hadoop via PTY')

                    with os.fdopen(master_fd, 'rb') as master:
                        # reading from master gives us the subprocess's
                        # stderr and stdout (it's a fake terminal)
                        step_interpretation = (
                            _interpret_hadoop_jar_command_stderr(
                                master,
                                record_callback=_log_record_from_hadoop))
                        _, returncode = os.waitpid(pid, 0)

            # make sure output_dir is filled
            if 'output_dir' not in step_interpretation:
                step_interpretation['output_dir'] = (
                    self._hdfs_step_output_dir(step_num))

            log_interpretation['step'] = step_interpretation

            if 'counters' not in step_interpretation:
                log.info('Attempting to read counters from history log')
                self._interpret_history_log(log_interpretation)

            # just print counters for this one step
            self._print_counters(step_nums=[step_num])

            if returncode:
                error = self._pick_error(log_interpretation)
                if error:
                    log.error('Probable cause of failure:\n\n%s\n' %
                              _format_error(error))

                raise CalledProcessError(returncode, step_args)