Example #1
    def invoke_hadoop(self,
                      args,
                      ok_returncodes=None,
                      ok_stderr=None,
                      return_stdout=False):
        """Run the given hadoop command, raising an exception on non-zero
        return code. This only works for commands whose output we don't
        care about.

        Args:
        ok_returncodes -- a list/tuple/set of return codes we expect to
            get back from hadoop (e.g. [0,1]). By default, we only expect 0.
            If we get an unexpected return code, we raise a CalledProcessError.
        ok_stderr -- don't log STDERR or raise CalledProcessError if stderr
            matches a regex in this list (even if the returncode is bad)
        return_stdout -- return the stdout from the hadoop command rather
            than logging it. If this is False, we return the returncode
            instead.
        """
        args = self.get_hadoop_bin() + args

        log.debug('> %s' % cmd_line(args))

        proc = Popen(args, stdout=PIPE, stderr=PIPE)
        stdout, stderr = proc.communicate()

        log_func = log.debug if proc.returncode == 0 else log.error
        if not return_stdout:
            for line in BytesIO(stdout):
                log_func('STDOUT: ' + to_unicode(line.rstrip(b'\r\n')))

        # check if STDERR is okay
        stderr_is_ok = False
        if ok_stderr:
            for stderr_re in ok_stderr:
                if stderr_re.match(stderr):
                    stderr_is_ok = True
                    break

        if not stderr_is_ok:
            for line in BytesIO(stderr):
                log_func('STDERR: ' + to_unicode(line.rstrip(b'\r\n')))

        ok_returncodes = ok_returncodes or [0]

        if not stderr_is_ok and proc.returncode not in ok_returncodes:
            raise CalledProcessError(proc.returncode, args)

        if return_stdout:
            return stdout
        else:
            return proc.returncode
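For orientation, here is a hedged sketch of how invoke_hadoop() gets called from other methods of the same class, mirroring the get_hadoop_version() and ls() examples further down; path_glob and _HADOOP_LS_NO_SUCH_FILE (a compiled regex) are assumed to be defined by the surrounding mrjob code rather than shown here.

    # inside another method of the same class, not a standalone script

    # capture `hadoop version` output as bytes
    stdout = self.invoke_hadoop(['version'], return_stdout=True)

    # list a path recursively; a "no such file" message on stderr is
    # tolerated (no logging, no CalledProcessError even on a bad return code)
    stdout = self.invoke_hadoop(
        ['fs', '-ls', '-R', path_glob],
        return_stdout=True,
        ok_stderr=[_HADOOP_LS_NO_SUCH_FILE])
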
Example #3
def _ssh_ls(ssh_bin,
            address,
            ec2_key_pair_file,
            path,
            keyfile=None,
            sudo=False):
    """Recursively list files under ``path`` on the specified SSH host.
    Return a list of the file paths found. Raises ``IOError`` if the
    path doesn't exist or SSH access fails.

    :param ssh_bin: Path to ``ssh`` binary
    :param address: Address of your job's master node
    :param ec2_key_pair_file: Path to the key pair file (argument to ``-i``)
    :param path: Path on the remote host to list
    :param keyfile: Name of the EMR private key file on the master node in case
                    ``path`` exists on one of the slave nodes
    :param sudo: if true, run command with ``sudo``
    """
    cmd_args = ['find', '-L', path, '-type', 'f']
    if sudo:
        cmd_args = ['sudo'] + cmd_args

    out = to_unicode(
        _check_output(*_ssh_run_with_recursion(
            ssh_bin, address, ec2_key_pair_file, keyfile, cmd_args)))
    if 'No such file or directory' in out:
        raise IOError("No such file or directory: %s" % path)
    return out.split('\n')
Example #4
def parse_mr_job_stderr(stderr, counters=None):
    """Parse counters and status messages out of MRJob output.

    :param stderr: a filehandle, a list of lines (bytes), or bytes
    :param counters: Counters so far, to update; a map from group (string)
                     to counter name (string) to count.

    Returns a dictionary with the keys *counters*, *statuses*, *other*:

    - *counters*: counters so far; same format as above
    - *statuses*: a list of status messages encountered
    - *other*: lines (strings) that aren't either counters or status messages
    """
    # For the corresponding code in Hadoop Streaming, see ``incrCounter()`` in
    # http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/contrib/streaming/src/java/org/apache/hadoop/streaming/PipeMapRed.java?view=markup  # noqa
    if isinstance(stderr, bytes):
        stderr = BytesIO(stderr)

    if counters is None:
        counters = {}
    statuses = []
    other = []

    for line in stderr:
        m = _COUNTER_RE.match(line.rstrip(b'\r\n'))
        if m:
            group, counter, amount_str = m.groups()

            # don't leave these as bytes on Python 3
            group = to_unicode(group)
            counter = to_unicode(counter)

            counters.setdefault(group, {})
            counters[group].setdefault(counter, 0)
            counters[group][counter] += int(amount_str)
            continue

        m = _STATUS_RE.match(line.rstrip(b'\r\n'))
        if m:
            # don't leave as bytes on Python 3
            statuses.append(to_unicode(m.group(1)))
            continue

        other.append(to_unicode(line))

    return {'counters': counters, 'statuses': statuses, 'other': other}
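A small, hedged usage sketch for parse_mr_job_stderr(): the input below follows Hadoop Streaming's reporter protocol (reporter:counter:<group>,<counter>,<amount> and reporter:status:<message>), which is what the _COUNTER_RE and _STATUS_RE regexes defined elsewhere in mrjob are built to match; the import path assumes the function lives in mrjob.parse.

    from mrjob.parse import parse_mr_job_stderr

    stderr = (b'reporter:counter:words,total,3\n'
              b'reporter:status:processing chunk 1\n'
              b'some other log line\n')

    result = parse_mr_job_stderr(stderr)
    # result['counters'] -> {'words': {'total': 3}}
    # result['statuses'] -> ['processing chunk 1']
    # result['other']    -> ['some other log line\n']
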
Example #5
    def ls(self, path_glob):
        components = urlparse(path_glob)
        hdfs_prefix = '%s://%s' % (components.scheme, components.netloc)

        version = self.get_hadoop_version()

        # use ls -R on Hadoop 2 (see #1152)
        if uses_yarn(version):
            args = ['fs', '-ls', '-R', path_glob]
        else:
            args = ['fs', '-lsr', path_glob]

        try:
            stdout = self.invoke_hadoop(args,
                                        return_stdout=True,
                                        ok_stderr=[_HADOOP_LS_NO_SUCH_FILE])
        except CalledProcessError:
            raise IOError("Could not ls %s" % path_glob)

        for line in BytesIO(stdout):
            line = line.rstrip(b'\r\n')

            # ignore total item count
            if line.startswith(b'Found '):
                continue

            fields = line.split(b' ')

            # Throw out directories
            if fields[0].startswith(b'd'):
                continue

            # Try to figure out which part of the line is the path
            # Expected lines:
            #
            # HDFS:
            # -rw-r--r--   3 dave users       3276 2010-01-13 14:00 /foo/bar
            #
            # S3:
            # -rwxrwxrwx   1          3276 2010-01-13 14:00 /foo/bar
            path_index = None
            for index, field in enumerate(fields):
                # look for time field, and pick one after that
                # (can't use field[2] because that's an int in Python 3)
                if len(field) == 5 and field[2:3] == b':':
                    path_index = (index + 1)
            if not path_index:
                raise IOError("Could not locate path in string %r" % line)

            path = to_unicode(line.split(b' ', path_index)[-1])
            # handle fully qualified URIs from newer versions of Hadoop ls
            # (see Pull Request #577)
            if is_uri(path):
                yield path
            else:
                yield hdfs_prefix + path
Example #6
    def yield_lines():
        try:
            for line in stderr:
                yield to_unicode(line)
        except IOError as e:
            # this is just the PTY's way of saying goodbye
            if e.errno == errno.EIO:
                return
            else:
                raise
Example #7
def _cat_log(fs, path):
    """fs.cat() the given log, converting lines to strings, and logging
    errors."""
    try:
        if not fs.exists(path):
            return
        for line in fs.cat(path):
            yield to_unicode(line)
    except (IOError, OSError) as e:
        log.warning("couldn't cat() %s: %r" % (path, e))
Example #9
    def _get_new_driver_output_lines(self, driver_output_uri):
        """Get a list of complete job driver output lines that are
        new since the last time we checked.
        """
        state = self._driver_output_state.setdefault(
            driver_output_uri,
            dict(log_uri=None, pos=0, buffer=b''))

        # driver output is in logs with names like driveroutput.000000000
        log_uris = sorted(self.fs.ls(driver_output_uri + '*'))

        for log_uri in log_uris:
            # initialize log_uri with first URI we see
            if state['log_uri'] is None:
                # log the location of job driver output just once
                log.info(
                    '  Parsing job driver output from %s*' % driver_output_uri)
                state['log_uri'] = log_uri

            # skip log files already parsed
            if log_uri < state['log_uri']:
                continue

            # when parsing the next file, reset *pos*
            elif log_uri > state['log_uri']:
                state['pos'] = 0
                state['log_uri'] = log_uri

            log_blob = self.fs.gcs._get_blob(log_uri)

            try:
                new_data = log_blob.download_as_string(start=state['pos'])
            except (google.api_core.exceptions.NotFound,
                    google.api_core.exceptions.RequestRangeNotSatisfiable):
                # blob was just created, or no more data is available
                break

            state['buffer'] += new_data
            state['pos'] += len(new_data)

        # convert buffer into lines, saving leftovers for next time
        stream = BytesIO(state['buffer'])
        state['buffer'] = b''

        lines = []

        for line_bytes in stream:
            if line_bytes.endswith(b'\n'):
                lines.append(to_unicode(line_bytes))
            else:
                # leave final partial line (if any) in buffer
                state['buffer'] = line_bytes

        return lines
Example #11
def _cat_log_lines(fs, path):
    """Yield lines from the given log.

    Log errors rather than raising them.
    """
    try:
        if not fs.exists(path):
            return
        for line in to_lines(fs.cat(path)):
            yield to_unicode(line)
    except (IOError, OSError) as e:
        log.warning("couldn't cat() %s: %r" % (path, e))
Example #12
def _yield_lines_from_pty_or_pipe(stderr):
    """Yield lines from a PTY or pipe, converting to unicode and gracefully
    handling errno.EIO"""
    try:
        for line in stderr:
            yield to_unicode(line)
    except IOError as e:
        # this is just the PTY's way of saying goodbye
        if e.errno == errno.EIO:
            return
        else:
            raise
Example #14
def _run_on_all_nodes(runner, output_dir, cmd_args, print_stderr=True):
    """Given an :py:class:`EMRJobRunner`, run the command specified by
    *cmd_args* on all nodes in the cluster and save the stdout and stderr of
    each run to subdirectories of *output_dir*.

    You should probably have run :py:meth:`_enable_slave_ssh_access()` on the
    runner before calling this function.
    """
    master_addr = runner._address_of_master()
    addresses = [master_addr]

    ssh_bin = runner._opts['ssh_bin']
    ec2_key_pair_file = runner._opts['ec2_key_pair_file']

    keyfile = None
    slave_addrs = runner._ssh_worker_hosts()

    if slave_addrs:
        addresses += [
            '%s!%s' % (master_addr, slave_addr) for slave_addr in slave_addrs
        ]
        # copying key file like a boss (name of keyfile doesn't really matter)
        keyfile = 'mrboss-%s.pem' % random_identifier()
        _ssh_copy_key(ssh_bin, master_addr, ec2_key_pair_file, keyfile)

    for addr in addresses:

        stdout, stderr = _ssh_run_with_recursion(
            ssh_bin,
            addr,
            ec2_key_pair_file,
            keyfile,
            cmd_args,
        )

        if print_stderr:
            print('---')
            print('Command completed on %s.' % addr)
            print(to_unicode(stderr), end=' ')

        if '!' in addr:
            base_dir = os.path.join(output_dir, 'slave ' + addr.split('!')[1])
        else:
            base_dir = os.path.join(output_dir, 'master')

        if not os.path.exists(base_dir):
            os.makedirs(base_dir)

        with open(os.path.join(base_dir, 'stdout'), 'wb') as f:
            f.write(stdout)

        with open(os.path.join(base_dir, 'stderr'), 'wb') as f:
            f.write(stderr)
Example #15
    def ls(self, path_glob):
        m = _SSH_URI_RE.match(path_glob)
        addr = m.group('hostname')
        path_to_ls = m.group('filesystem_path')

        p = self._ssh_launch(addr, ['find', '-L', path_to_ls, '-type', 'f'])

        for line in p.stdout:
            path = to_unicode(line).rstrip('\n')
            yield 'ssh://%s%s' % (addr, path)

        self._ssh_finish_run(p)
Example #17
        def cleanup():
            # this does sometimes happen; see #1396
            for line in cat_proc.stderr:
                log.error('STDERR: ' + to_unicode(line.rstrip(b'\r\n')))

            cat_proc.stdout.close()
            cat_proc.stderr.close()

            returncode = cat_proc.wait()

            if returncode != 0:
                raise IOError("Could not stream %s" % filename)
Example #18
    def parse_doc(self, input_path, input_uri):
        """Mapper: parse documents and emit ngram information.

        Input: Text files whose name contains a unique document ID and
        category information (see :py:func:`parse_doc_filename`).

        Output:
        ``('ngram', (n, ngram)), (count, cats)`` OR
        ``('doc', doc_id), doc``

        n: ngram length
        ngram: ngram encoded as a string (e.g. "pad thai")
            or None to indicate ANY ngram.
        count:  # of times an ngram appears in the document
        cats: a map from category name to a boolean indicating whether
            this document is in the category

        doc_id: (hopefully) unique document ID
        doc: the encoded document. We'll fill these fields:
            ngram_counts: map from (n, ngram) to  # of times ngram appears
                in the document, using (n, None) to represent the total
                number of times ANY ngram of that size appears (essentially
                number of words)
            in_test_set: boolean indicating if this doc is in the test set
            id: SHA1 hash of doc text (if not already filled)
        """
        # fill *id* and *cats*
        doc = parse_doc_filename(input_uri)

        with open(input_path) as f:
            text = to_unicode(f.read())

        # pick test/training docs
        if self.options.no_test_set:
            doc['in_test_set'] = False
        else:
            doc_hash = hashlib.sha1(text.encode('utf-8')).hexdigest()
            doc['in_test_set'] = bool(int(doc_hash[-1], 16) % 2)

        # map from (n, ngram) to number of times it appears
        ngram_counts = count_ngrams(text, self.options.max_ngram_size,
                                    self.stop_words)

        # yield the number of times the ngram appears in this doc
        # and the categories for this document, so we can train the classifier
        if not doc['in_test_set']:
            for (n, ngram), count in ngram_counts.items():
                yield ('ngram', (n, ngram)), (count, doc['cats'])

        # yield the document itself, for safekeeping
        doc['ngram_counts'] = list(ngram_counts.items())
        yield ('doc', doc['id']), doc
Example #19
    def ls(self, uri_glob):
        m = _SSH_URI_RE.match(uri_glob)
        addr = m.group('hostname')
        path_to_ls = m.group('filesystem_path')

        p = self._ssh_launch(
            addr, ['find', '-L', path_to_ls, '-type', 'f'])

        for line in p.stdout:
            path = to_unicode(line).rstrip('\n')
            yield 'ssh://%s%s' % (addr, path)

        self._ssh_finish_run(p)
Example #20
    def _run_hadoop(self, hadoop_args, env, record_callback):
        # try to use a PTY if it's available
        try:
            pid, master_fd = pty.fork()
        except (AttributeError, OSError):
            # no PTYs, just use Popen

            # user won't get much feedback for a while, so tell them
            # Hadoop is running
            log.debug('No PTY available, using Popen() to invoke Hadoop')

            step_proc = Popen(hadoop_args, stdout=PIPE, stderr=PIPE, env=env)

            step_interpretation = _interpret_hadoop_jar_command_stderr(
                step_proc.stderr,
                record_callback=_log_record_from_hadoop)

            # there shouldn't be much output to STDOUT
            for line in step_proc.stdout:
                _log_line_from_driver(to_unicode(line).strip('\r\n'))

            step_proc.stdout.close()
            step_proc.stderr.close()

            returncode = step_proc.wait()
        else:
            # we have PTYs
            if pid == 0:  # we are the child process
                try:
                    os.execvpe(hadoop_args[0], hadoop_args, env)
                    # now we are no longer Python
                except OSError as ex:
                    # use _exit() so we don't do cleanup, etc. that's
                    # the parent process's job
                    os._exit(ex.errno)
                finally:
                    # if we got some other exception, still exit hard
                    os._exit(-1)
            else:
                log.debug('Invoking Hadoop via PTY')

                with os.fdopen(master_fd, 'rb') as master:
                    # reading from master gives us the subprocess's
                    # stderr and stdout (it's a fake terminal)
                    step_interpretation = (
                        _interpret_hadoop_jar_command_stderr(
                            _eio_to_eof(master),
                            record_callback=_log_record_from_hadoop))
                    _, returncode = os.waitpid(pid, 0)

        return returncode, step_interpretation
Example #21
def _find_python_traceback(lines):
    """Scan subprocess stderr for Python traceback."""
    # Essentially, we detect the start of the traceback, and continue
    # until we find a non-indented line, with some special rules for exceptions
    # from subprocesses.

    # Lines to pass back representing entire error found
    all_tb_lines = []

    # This is used to store a working list of lines in a single traceback
    tb_lines = []

    # This is used to store a working list of non-traceback lines between the
    # current traceback and the previous one
    non_tb_lines = []

    # Track whether or not we are in a traceback rather than consuming the
    # iterator
    in_traceback = False

    for line in lines:
        # don't return bytes in Python 3
        line = to_unicode(line)

        if in_traceback:
            tb_lines.append(line)

            # If no indentation, this is the last line of the traceback
            if line.lstrip() == line:
                in_traceback = False

                if line.startswith('subprocess.CalledProcessError'):
                    # CalledProcessError may mean that the subprocess printed
                    # errors to stderr which we can show the user
                    all_tb_lines += non_tb_lines

                all_tb_lines += tb_lines

                # Reset all working lists
                tb_lines = []
                non_tb_lines = []
        else:
            if line.startswith('Traceback (most recent call last):'):
                tb_lines.append(line)
                in_traceback = True
            else:
                non_tb_lines.append(line)
    if all_tb_lines:
        return all_tb_lines
    else:
        return None
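To make the scanning logic above concrete, here is a hedged, self-contained illustration; the input lines (and the file name in them) are hypothetical.

    lines = [
        b'log noise from the subprocess\n',
        b'Traceback (most recent call last):\n',
        b'  File "mr_your_job.py", line 10, in <module>\n',
        b'    main()\n',
        b'ValueError: bad input\n',
        b'more log noise\n',
    ]

    # each line is converted to unicode; the traceback starts at the
    # 'Traceback (most recent call last):' line and ends at the first
    # non-indented line, so only the four traceback lines come back
    assert _find_python_traceback(lines) == [
        'Traceback (most recent call last):\n',
        '  File "mr_your_job.py", line 10, in <module>\n',
        '    main()\n',
        'ValueError: bad input\n',
    ]
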
Example #22
    def _get_new_driver_output_lines(self, driver_output_uri):
        """Get a list of complete job driver output lines that are
        new since the last time we checked.
        """
        state = self._driver_output_state.setdefault(
            driver_output_uri, dict(log_uri=None, pos=0, buffer=b''))

        # driver output is in logs with names like driveroutput.000000000
        log_uris = sorted(self.fs.ls(driver_output_uri + '*'))

        for log_uri in log_uris:
            # initialize log_uri with first URI we see
            if state['log_uri'] is None:
                state['log_uri'] = log_uri

            # skip log files already parsed
            if log_uri < state['log_uri']:
                continue

            # when parsing the next file, reset *pos*
            elif log_uri > state['log_uri']:
                state['pos'] = 0
                state['log_uri'] = log_uri

            log_blob = self.fs._get_blob(log_uri)

            try:
                # TODO: use start= kwarg once google-cloud-storage 1.9 is out
                new_data = log_blob.download_as_string()[state['pos']:]
            except google.api_core.exceptions.NotFound:
                # handle race condition where blob was just created
                break

            state['buffer'] += new_data
            state['pos'] += len(new_data)

        # convert buffer into lines, saving leftovers for next time
        stream = BytesIO(state['buffer'])
        state['buffer'] = b''

        lines = []

        for line_bytes in stream:
            if line_bytes.endswith(b'\n'):
                lines.append(to_unicode(line_bytes))
            else:
                # leave final partial line (if any) in buffer
                state['buffer'] = line_bytes

        return lines
Example #23
    def _ssh_run(self, address, cmd_args, stdin=None):
        """Run the given SSH command, and raise an IOError if it fails.
        Return ``(stdout, stderr)``

        Use this for commands with a bounded amount of output.
        """
        p = self._ssh_launch(address, cmd_args, stdin=stdin)

        stdout, stderr = p.communicate()

        if p.returncode != 0:
            raise IOError(to_unicode(stderr))

        return stdout, stderr
Example #24
    def _ssh_run(self, address, cmd_args):
        """Run the given SSH command, and raise an IOError if it fails.
        Return ``(stdout, stderr)``

        Use this for commands with a bounded amount of output.
        """
        p = self._ssh_launch(address, cmd_args)

        stdout, stderr = p.communicate()

        if p.returncode != 0:
            raise IOError(to_unicode(stderr))

        return stdout, stderr
Example #25
    def _run_step_on_spark(self, step, step_num, last_step_num=None):
        if self._opts['upload_archives'] and self._spark_master() != 'yarn':
            log.warning('Spark master %r will probably ignore archives' %
                        self._spark_master())

        spark_submit_args = self._args_for_spark_step(step_num, last_step_num)

        env = dict(os.environ)
        env.update(self._spark_cmdenv(step_num))

        returncode, step_interpretation = self._run_spark_submit(
            spark_submit_args, env, record_callback=_log_log4j_record)

        counters = None
        if step['type'] == 'streaming':
            counter_file = self.fs.join(self._counter_output_dir(step_num),
                                        'part-*')
            counter_json = b''.join(self.fs.cat(counter_file))
            if counter_json.strip():
                # json.loads() on Python 3.4/3.5 can't take bytes
                counters = json.loads(to_unicode(counter_json))

        if isinstance(counters, list):
            self._counters.extend(counters)

            # desc_num is 1-indexed user-readable step num
            for desc_num, counter_dict in enumerate(counters,
                                                    start=(step_num + 1)):
                if counter_dict:
                    log.info(
                        _format_counters(counter_dict,
                                         desc=('Counters for step %d' %
                                               desc_num)))

        # for non-streaming steps, there are no counters.
        # pad self._counters to match number of steps
        while len(self._counters) < (last_step_num or step_num) + 1:
            self._counters.append({})

        if returncode:
            error = _pick_error(dict(step=step_interpretation))
            if error:
                _log_probable_cause_of_failure(log, error)

            reason = str(CalledProcessError(returncode, spark_submit_args))
            raise StepFailedException(reason=reason,
                                      step_num=step_num,
                                      last_step_num=last_step_num,
                                      num_steps=self._num_steps())
Example #26
    def get_hadoop_version(self):
        """Invoke the hadoop executable to determine its version"""
        # mkdir() needs this
        if not self._hadoop_version:
            stdout = self.invoke_hadoop(['version'], return_stdout=True)
            if stdout:
                first_line = stdout.split(b'\n')[0]
                m = _HADOOP_VERSION_RE.match(first_line)
                if m:
                    self._hadoop_version = to_unicode(m.group('version'))
                    log.info("Using Hadoop version %s" % self._hadoop_version)
                else:
                    raise Exception('Unable to determine Hadoop version.')

        return self._hadoop_version
Example #28
    def _ssh_add_key(self):
        """Add ``self._ec2_key_pair_file`` to the ssh agent with ``ssh-add``.
        """
        args = self._ssh_add_bin + ['-t', '60', self._ec2_key_pair_file]

        log.debug('  > ' + cmd_line(args))

        try:
            p = Popen(args, stdout=PIPE, stderr=PIPE)
        except OSError as ex:
            raise IOError(ex.strerror)

        stdout, stderr = p.communicate()

        if p.returncode != 0:
            raise IOError(to_unicode(stderr))
Example #29
    def _run_step_on_spark(self, step, step_num, last_step_num=None):
        if self._opts['upload_archives'] and self._spark_master() != 'yarn':
            log.warning('Spark master %r will probably ignore archives' %
                        self._spark_master())

        spark_submit_args = self._args_for_spark_step(step_num, last_step_num)

        env = dict(os.environ)
        env.update(self._spark_cmdenv(step_num))

        returncode = self._run_spark_submit(spark_submit_args, env,
                                            record_callback=_log_log4j_record)

        counters = None
        if step['type'] == 'streaming':
            counter_file = self.fs.join(
                self._counter_output_dir(step_num), 'part-*')
            counter_json = b''.join(self.fs.cat(counter_file))
            if counter_json.strip():
                # json.loads() on Python 3.4/3.5 can't take bytes
                counters = json.loads(to_unicode(counter_json))

        if isinstance(counters, list):
            self._counters.extend(counters)

            # desc_num is 1-indexed user-readable step num
            for desc_num, counter_dict in enumerate(
                    counters, start=(step_num + 1)):
                if counter_dict:
                    log.info(_format_counters(
                        counter_dict,
                        desc=('Counters for step %d' % desc_num)))

        # for non-streaming steps, there are no counters.
        # pad self._counters to match number of steps
        while len(self._counters) < (last_step_num or step_num) + 1:
            self._counters.append({})

        if returncode:
            reason = str(CalledProcessError(returncode, spark_submit_args))
            raise StepFailedException(
                reason=reason, step_num=step_num, last_step_num=last_step_num,
                num_steps=self._num_steps())
Example #30
    def _cat_file(self, filename):
        # stream from HDFS
        cat_args = self.get_hadoop_bin() + ['fs', '-cat', filename]
        log.debug('> %s' % cmd_line(cat_args))

        cat_proc = Popen(cat_args, stdout=PIPE, stderr=PIPE)

        for chunk in decompress(cat_proc.stdout, filename):
            yield chunk

        # this does sometimes happen; see #1396
        for line in cat_proc.stderr:
            log.error('STDERR: ' + to_unicode(line.rstrip(b'\r\n')))

        cat_proc.stdout.close()
        cat_proc.stderr.close()

        returncode = cat_proc.wait()

        if returncode != 0:
            raise IOError("Could not stream %s" % filename)
Example #32
def _run_on_all_nodes(runner, output_dir, cmd_args, print_stderr=True):
    """Given an :py:class:`EMRJobRunner`, run the command specified by
    *cmd_args* on all nodes in the cluster and save the stdout and stderr of
    each run to subdirectories of *output_dir*.
    """
    master_addr = runner._address_of_master()
    addresses = [master_addr]

    worker_addrs = runner._ssh_worker_hosts()

    if worker_addrs:
        addresses += [
            '%s!%s' % (master_addr, worker_addr)
            for worker_addr in worker_addrs
        ]

    for addr in addresses:

        stdout, stderr = runner.fs.ssh._ssh_run(addr, cmd_args)

        if print_stderr:
            print('---')
            print('Command completed on %s.' % addr)
            print(to_unicode(stderr), end=' ')

        if '!' in addr:
            base_dir = os.path.join(output_dir, 'worker ' + addr.split('!')[1])
        else:
            base_dir = os.path.join(output_dir, 'master')

        if not os.path.exists(base_dir):
            os.makedirs(base_dir)

        with open(os.path.join(base_dir, 'stdout'), 'wb') as f:
            f.write(stdout)

        with open(os.path.join(base_dir, 'stderr'), 'wb') as f:
            f.write(stderr)
Example #33
def _run_on_all_nodes(runner, output_dir, cmd_args, print_stderr=True):
    """Given an :py:class:`EMRJobRunner`, run the command specified by
    *cmd_args* on all nodes in the cluster and save the stdout and stderr of
    each run to subdirectories of *output_dir*.
    """
    master_addr = runner._address_of_master()
    addresses = [master_addr]

    worker_addrs = runner._ssh_worker_hosts()

    if worker_addrs:
        addresses += ['%s!%s' % (master_addr, worker_addr)
                      for worker_addr in worker_addrs]

    for addr in addresses:

        stdout, stderr = runner.fs._ssh_run(addr, cmd_args)

        if print_stderr:
            print('---')
            print('Command completed on %s.' % addr)
            print(to_unicode(stderr), end=' ')

        if '!' in addr:
            base_dir = os.path.join(output_dir, 'worker ' + addr.split('!')[1])
        else:
            base_dir = os.path.join(output_dir, 'master')

        if not os.path.exists(base_dir):
            os.makedirs(base_dir)

        with open(os.path.join(base_dir, 'stdout'), 'wb') as f:
            f.write(stdout)

        with open(os.path.join(base_dir, 'stderr'), 'wb') as f:
            f.write(stderr)
Example #34
    def test_ascii_unicode(self):
        self.assertEqual(to_unicode(u'foo'), u'foo')
Example #35
    def test_utf_8_bytes(self):
        self.assertEqual(to_unicode(b'caf\xc3\xa9'), u'café')
Example #36
    def test_ascii_bytes(self):
        self.assertEqual(to_unicode(b'foo'), u'foo')
Example #37
def _parse_hadoop_log4j_records(lines, pre_filter=None):
    """Parse lines from a hadoop log into log4j records.

    Yield dictionaries with the following keys:
    caller_location -- e.g. 'YarnClientImpl.java:submitApplication(251)'
    level -- e.g. 'INFO'
    logger -- e.g. 'amazon.emr.metrics.MetricsSaver'
    message -- the actual message. If this is a multi-line message (e.g.
        for counters), the lines will be joined by '\n'
    num_lines -- how many lines made up the message
    start_line -- which line the message started on (0-indexed)
    thread -- e.g. 'main'. Defaults to ''
    timestamp -- unparsed timestamp, e.g. '15/12/07 20:49:28',
        '2015-08-22 00:46:18,411'

    Lines will be converted to unicode, and trailing \r and \n will be stripped
    from lines.

    If set, *pre_filter* will be applied to stripped lines. If it
    returns true, we'll return a fake record with message set to the line,
    num_lines and start_line set as normal, and everything else set to ''.

    Also yields fake records for leading non-log4j lines (trailing non-log4j
    lines are assumed to be part of a multiline message if not pre-filtered).
    """
    last_record = None

    for line_num, line in enumerate(lines):
        # convert from bytes to unicode, if needed, and strip trailing newlines
        line = to_unicode(line).rstrip('\r\n')

        def fake_record():
            return dict(caller_location='',
                        level='',
                        logger='',
                        message=line,
                        num_lines=1,
                        start_line=line_num,
                        thread='',
                        timestamp='')

        # had to patch this in here to get _parse_hadoop_jar_command_stderr()'s
        # record_callback to fire on the correct line. The problem is that
        # we don't emit records until we see the next line (to handle
        # multiline records), so the callback would fire in the wrong order
        if pre_filter:
            if pre_filter(line):
                if last_record:
                    last_record['num_lines'] = (line_num -
                                                last_record['start_line'])
                    yield last_record

                yield fake_record()

                last_record = None
                continue

        m = (_HADOOP_LOG4J_LINE_RE.match(line)
             or _HADOOP_LOG4J_LINE_ALTERNATE_RE.match(line))

        if m:
            if last_record:
                last_record['num_lines'] = (line_num -
                                            last_record['start_line'])
                yield last_record

            last_record = m.groupdict()
            last_record.setdefault('caller_location', '')
            last_record['thread'] = last_record['thread'] or ''
            last_record['start_line'] = line_num
        else:
            # add on to previous record
            if last_record:
                last_record['message'] += '\n' + line
            else:
                yield fake_record()

    if last_record:
        last_record['num_lines'] = (line_num + 1 - last_record['start_line'])
        yield last_record
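As a rough illustration only (the _HADOOP_LOG4J_LINE_RE and _HADOOP_LOG4J_LINE_ALTERNATE_RE regexes are defined elsewhere in mrjob, so the exact field values here are indicative, not guaranteed), feeding lines in the format described by the docstring yields one record per line:

    lines = [
        b'15/12/07 20:49:28 INFO amazon.emr.metrics.MetricsSaver: Metrics saved\n',
        b'15/12/07 20:49:29 INFO amazon.emr.metrics.MetricsSaver: Instance count: 4\n',
    ]

    for record in _parse_hadoop_log4j_records(lines):
        # each record is a dict with the keys listed in the docstring;
        # for the first line, record['start_line'] == 0 and
        # record['num_lines'] == 1
        print(record['timestamp'], record['level'], record['message'])
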
Example #39
    def test_latin_1_bytes(self):
        self.assertEqual(to_unicode(b'caf\xe9'), u'caf\xe9')
Example #40
    def test_non_ascii_unicode(self):
        self.assertEqual(to_unicode(u'café'), u'café')
Example #41
    def _log_line(line):
        log.info('  %s' % to_unicode(line).strip('\r\n'))
Example #44
    def _run_job_in_hadoop(self):
        for step_num, step in enumerate(self._get_steps()):
            self._warn_about_spark_archives(step)

            step_args = self._args_for_step(step_num)
            env = _fix_env(self._env_for_step(step_num))

            # log this *after* _args_for_step(), which can start a search
            # for the Hadoop streaming jar
            log.info('Running step %d of %d...' %
                     (step_num + 1, self._num_steps()))
            log.debug('> %s' % cmd_line(step_args))
            log.debug('  with environment: %r' % sorted(env.items()))

            log_interpretation = {}
            self._log_interpretations.append(log_interpretation)

            # try to use a PTY if it's available
            try:
                pid, master_fd = pty.fork()
            except (AttributeError, OSError):
                # no PTYs, just use Popen

                # user won't get much feedback for a while, so tell them
                # Hadoop is running
                log.debug('No PTY available, using Popen() to invoke Hadoop')

                step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE, env=env)

                step_interpretation = _interpret_hadoop_jar_command_stderr(
                    step_proc.stderr, record_callback=_log_record_from_hadoop)

                # there shouldn't be much output to STDOUT
                for line in step_proc.stdout:
                    _log_line_from_driver(to_unicode(line).strip('\r\n'))

                step_proc.stdout.close()
                step_proc.stderr.close()

                returncode = step_proc.wait()
            else:
                # we have PTYs
                if pid == 0:  # we are the child process
                    os.execvpe(step_args[0], step_args, env)
                else:
                    log.debug('Invoking Hadoop via PTY')

                    with os.fdopen(master_fd, 'rb') as master:
                        # reading from master gives us the subprocess's
                        # stderr and stdout (it's a fake terminal)
                        step_interpretation = (
                            _interpret_hadoop_jar_command_stderr(
                                master,
                                record_callback=_log_record_from_hadoop))
                        _, returncode = os.waitpid(pid, 0)

            # make sure output_dir is filled
            if 'output_dir' not in step_interpretation:
                step_interpretation['output_dir'] = (
                    self._step_output_uri(step_num))

            log_interpretation['step'] = step_interpretation

            self._log_counters(log_interpretation, step_num)

            step_type = step['type']

            if returncode:
                error = self._pick_error(log_interpretation, step_type)
                if error:
                    log.error('Probable cause of failure:\n\n%s\n' %
                              _format_error(error))

                # use CalledProcessError's well-known message format
                reason = str(CalledProcessError(returncode, step_args))
                raise StepFailedException(reason=reason,
                                          step_num=step_num,
                                          num_steps=self._num_steps())
Example #46
    def _run_job_in_hadoop(self):
        for step_num, step in enumerate(self._get_steps()):
            self._warn_about_spark_archives(step)

            step_args = self._args_for_step(step_num)
            env = _fix_env(self._env_for_step(step_num))

            # log this *after* _args_for_step(), which can start a search
            # for the Hadoop streaming jar
            log.info('Running step %d of %d...' %
                     (step_num + 1, self._num_steps()))
            log.debug('> %s' % cmd_line(step_args))
            log.debug('  with environment: %r' % sorted(env.items()))

            log_interpretation = {}
            self._log_interpretations.append(log_interpretation)

            # try to use a PTY if it's available
            try:
                pid, master_fd = pty.fork()
            except (AttributeError, OSError):
                # no PTYs, just use Popen

                # user won't get much feedback for a while, so tell them
                # Hadoop is running
                log.debug('No PTY available, using Popen() to invoke Hadoop')

                step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE, env=env)

                step_interpretation = _interpret_hadoop_jar_command_stderr(
                    step_proc.stderr,
                    record_callback=_log_record_from_hadoop)

                # there shouldn't be much output to STDOUT
                for line in step_proc.stdout:
                    _log_line_from_hadoop(to_unicode(line).strip('\r\n'))

                step_proc.stdout.close()
                step_proc.stderr.close()

                returncode = step_proc.wait()
            else:
                # we have PTYs
                if pid == 0:  # we are the child process
                    os.execvpe(step_args[0], step_args, env)
                else:
                    log.debug('Invoking Hadoop via PTY')

                    with os.fdopen(master_fd, 'rb') as master:
                        # reading from master gives us the subprocess's
                        # stderr and stdout (it's a fake terminal)
                        step_interpretation = (
                            _interpret_hadoop_jar_command_stderr(
                                master,
                                record_callback=_log_record_from_hadoop))
                        _, returncode = os.waitpid(pid, 0)

            # make sure output_dir is filled
            if 'output_dir' not in step_interpretation:
                step_interpretation['output_dir'] = (
                    self._step_output_uri(step_num))

            log_interpretation['step'] = step_interpretation

            step_type = step['type']

            if not _is_spark_step_type(step_type):
                counters = self._pick_counters(log_interpretation, step_type)
                if counters:
                    log.info(_format_counters(counters))
                else:
                    log.warning('No counters found')

            if returncode:
                error = self._pick_error(log_interpretation, step_type)
                if error:
                    log.error('Probable cause of failure:\n\n%s\n' %
                              _format_error(error))

                # use CalledProcessError's well-known message format
                reason = str(CalledProcessError(returncode, step_args))
                raise StepFailedException(
                    reason=reason, step_num=step_num,
                    num_steps=self._num_steps())
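Taken together, the unit tests above (Examples #34-#36 and #39-#40) pin down to_unicode()'s observable behavior: unicode strings pass through unchanged, bytes are decoded as UTF-8 when possible, and undecodable bytes fall back to latin-1. A minimal re-implementation sketch consistent with those tests follows; mrjob's actual to_unicode() (in mrjob.util) may differ in detail.

    def to_unicode(s):
        """Return *s* as a unicode string.

        bytes are decoded as UTF-8, falling back to latin-1 (which accepts
        any byte string), so decoding never raises; unicode strings are
        returned unchanged.
        """
        if isinstance(s, bytes):
            try:
                return s.decode('utf-8')
            except UnicodeDecodeError:
                return s.decode('latin-1')
        return s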