def invoke_hadoop(self, args, ok_returncodes=None, ok_stderr=None,
                  return_stdout=False):
    """Run the given hadoop command, raising an exception on non-zero
    return code. This only works for commands whose output we don't
    care about.

    Args:
    ok_returncodes -- a list/tuple/set of return codes we expect to
        get back from hadoop (e.g. [0,1]). By default, we only expect 0.
        If we get an unexpected return code, we raise a
        CalledProcessError.
    ok_stderr -- don't log STDERR or raise CalledProcessError if stderr
        matches a regex in this list (even if the returncode is bad)
    return_stdout -- return the stdout from the hadoop command rather
        than logging it. If this is False, we return the returncode
        instead.
    """
    args = self.get_hadoop_bin() + args

    log.debug('> %s' % cmd_line(args))

    proc = Popen(args, stdout=PIPE, stderr=PIPE)
    stdout, stderr = proc.communicate()

    log_func = log.debug if proc.returncode == 0 else log.error

    if not return_stdout:
        for line in BytesIO(stdout):
            log_func('STDOUT: ' + to_unicode(line.rstrip(b'\r\n')))

    # check if STDERR is okay
    stderr_is_ok = False
    if ok_stderr:
        for stderr_re in ok_stderr:
            if stderr_re.match(stderr):
                stderr_is_ok = True
                break

    if not stderr_is_ok:
        for line in BytesIO(stderr):
            log_func('STDERR: ' + to_unicode(line.rstrip(b'\r\n')))

    ok_returncodes = ok_returncodes or [0]

    if not stderr_is_ok and proc.returncode not in ok_returncodes:
        raise CalledProcessError(proc.returncode, args)

    if return_stdout:
        return stdout
    else:
        return proc.returncode
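A hedged usage sketch of invoke_hadoop(): ``hadoop fs -test -e`` signals
whether a path exists through its return code (0 for "exists", 1 for
"doesn't"), so both codes are expected. The helper name and path are
hypothetical, not part of the class above.

def path_exists(self, path):
    # hypothetical helper built on invoke_hadoop(); 'hadoop fs -test -e'
    # exits 0 if the path exists and 1 if it doesn't, so both return
    # codes are "ok" here and neither raises CalledProcessError
    returncode = self.invoke_hadoop(
        ['fs', '-test', '-e', path],
        ok_returncodes=[0, 1])
    return returncode == 0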
def _ssh_ls(ssh_bin, address, ec2_key_pair_file, path, keyfile=None,
            sudo=False):
    """Recursively list files under ``path`` on the specified SSH host.
    Return a list of the matching paths (as strings). Raises ``IOError``
    if the path doesn't exist or SSH access fails.

    :param ssh_bin: Path to ``ssh`` binary
    :param address: Address of your job's master node
    :param ec2_key_pair_file: Path to the key pair file (argument to ``-i``)
    :param path: Path on the remote host to list
    :param keyfile: Name of the EMR private key file on the master node in
                    case ``path`` exists on one of the slave nodes
    :param sudo: if true, run command with ``sudo``
    """
    cmd_args = ['find', '-L', path, '-type', 'f']
    if sudo:
        cmd_args = ['sudo'] + cmd_args

    out = to_unicode(
        _check_output(*_ssh_run_with_recursion(
            ssh_bin, address, ec2_key_pair_file, keyfile, cmd_args)))

    if 'No such file or directory' in out:
        raise IOError("No such file or directory: %s" % path)

    return out.split('\n')
def parse_mr_job_stderr(stderr, counters=None):
    """Parse counters and status messages out of MRJob output.

    :param stderr: a filehandle, a list of lines (bytes), or bytes
    :param counters: Counters so far, to update; a map from group (string)
                     to counter name (string) to count.

    Returns a dictionary with the keys *counters*, *statuses*, *other*:

    - *counters*: counters so far; same format as above
    - *statuses*: a list of status messages encountered
    - *other*: lines (strings) that aren't either counters or status messages
    """
    # For the corresponding code in Hadoop Streaming, see ``incrCounter()`` in
    # http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/contrib/streaming/src/java/org/apache/hadoop/streaming/PipeMapRed.java?view=markup  # noqa
    if isinstance(stderr, bytes):
        stderr = BytesIO(stderr)

    if counters is None:
        counters = {}
    statuses = []
    other = []

    for line in stderr:
        m = _COUNTER_RE.match(line.rstrip(b'\r\n'))
        if m:
            group, counter, amount_str = m.groups()

            # don't leave these as bytes on Python 3
            group = to_unicode(group)
            counter = to_unicode(counter)

            counters.setdefault(group, {})
            counters[group].setdefault(counter, 0)
            counters[group][counter] += int(amount_str)
            continue

        m = _STATUS_RE.match(line.rstrip(b'\r\n'))
        if m:
            # don't leave as bytes on Python 3
            statuses.append(to_unicode(m.group(1)))
            continue

        other.append(to_unicode(line))

    return {'counters': counters, 'statuses': statuses, 'other': other}
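A minimal usage sketch, assuming the Hadoop Streaming reporter syntax
("reporter:counter:<group>,<name>,<amount>" and "reporter:status:<message>")
that _COUNTER_RE and _STATUS_RE are built to match; the sample lines are
made up.

stderr_lines = [
    b'reporter:counter:words,total,3\n',
    b'reporter:status:processing split 1\n',
    b'warning: something unrelated\n',
]

result = parse_mr_job_stderr(stderr_lines)
# result['counters'] == {'words': {'total': 3}}
# result['statuses'] == ['processing split 1']
# result['other'] == ['warning: something unrelated\n']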
def ls(self, path_glob):
    components = urlparse(path_glob)
    hdfs_prefix = '%s://%s' % (components.scheme, components.netloc)

    version = self.get_hadoop_version()

    # use ls -R on Hadoop 2 (see #1152)
    if uses_yarn(version):
        args = ['fs', '-ls', '-R', path_glob]
    else:
        args = ['fs', '-lsr', path_glob]

    try:
        stdout = self.invoke_hadoop(args, return_stdout=True,
                                    ok_stderr=[_HADOOP_LS_NO_SUCH_FILE])
    except CalledProcessError:
        raise IOError("Could not ls %s" % path_glob)

    for line in BytesIO(stdout):
        line = line.rstrip(b'\r\n')

        # ignore total item count
        if line.startswith(b'Found '):
            continue

        fields = line.split(b' ')

        # Throw out directories
        if fields[0].startswith(b'd'):
            continue

        # Try to figure out which part of the line is the path
        # Expected lines:
        #
        # HDFS:
        # -rw-r--r--   3 dave users 3276 2010-01-13 14:00 /foo/bar
        #
        # S3:
        # -rwxrwxrwx   1 3276 2010-01-13 14:00 /foo/bar
        path_index = None
        for index, field in enumerate(fields):
            # look for time field, and pick one after that
            # (can't use field[2] because that's an int in Python 3)
            if len(field) == 5 and field[2:3] == b':':
                path_index = (index + 1)

        if not path_index:
            raise IOError("Could not locate path in string %r" % line)

        path = to_unicode(line.split(b' ', path_index)[-1])

        # handle fully qualified URIs from newer versions of Hadoop ls
        # (see Pull Request #577)
        if is_uri(path):
            yield path
        else:
            yield hdfs_prefix + path
def yield_lines():
    try:
        for line in stderr:
            yield to_unicode(line)
    except IOError as e:
        # this is just the PTY's way of saying goodbye
        if e.errno == errno.EIO:
            return
        else:
            raise
def _cat_log(fs, path):
    """fs.cat() the given log, converting lines to strings,
    and logging errors."""
    try:
        if not fs.exists(path):
            return
        for line in fs.cat(path):
            yield to_unicode(line)
    except (IOError, OSError) as e:
        log.warning("couldn't cat() %s: %r" % (path, e))
def _get_new_driver_output_lines(self, driver_output_uri):
    """Get a list of complete job driver output lines that are
    new since the last time we checked.
    """
    state = self._driver_output_state.setdefault(
        driver_output_uri,
        dict(log_uri=None, pos=0, buffer=b''))

    # driver output is in logs with names like driveroutput.000000000
    log_uris = sorted(self.fs.ls(driver_output_uri + '*'))

    for log_uri in log_uris:
        # initialize log_uri with first URI we see
        if state['log_uri'] is None:
            # log the location of job driver output just once
            log.info(
                ' Parsing job driver output from %s*' % driver_output_uri)
            state['log_uri'] = log_uri

        # skip log files already parsed
        if log_uri < state['log_uri']:
            continue

        # when parsing the next file, reset *pos*
        elif log_uri > state['log_uri']:
            state['pos'] = 0
            state['log_uri'] = log_uri

        log_blob = self.fs.gcs._get_blob(log_uri)

        try:
            new_data = log_blob.download_as_string(start=state['pos'])
        except (google.api_core.exceptions.NotFound,
                google.api_core.exceptions.RequestRangeNotSatisfiable):
            # blob was just created, or no more data is available
            break

        state['buffer'] += new_data
        state['pos'] += len(new_data)

    # convert buffer into lines, saving leftovers for next time
    stream = BytesIO(state['buffer'])
    state['buffer'] = b''

    lines = []

    for line_bytes in stream:
        if line_bytes.endswith(b'\n'):
            lines.append(to_unicode(line_bytes))
        else:
            # leave final partial line (if any) in buffer
            state['buffer'] = line_bytes

    return lines
def _cat_log_lines(fs, path):
    """Yield lines from the given log.

    Log errors rather than raising them.
    """
    try:
        if not fs.exists(path):
            return
        for line in to_lines(fs.cat(path)):
            yield to_unicode(line)
    except (IOError, OSError) as e:
        log.warning("couldn't cat() %s: %r" % (path, e))
def _yield_lines_from_pty_or_pipe(stderr):
    """Yield lines from a PTY or pipe, converting to unicode and
    gracefully handling errno.EIO"""
    try:
        for line in stderr:
            yield to_unicode(line)
    except IOError as e:
        # this is just the PTY's way of saying goodbye
        if e.errno == errno.EIO:
            return
        else:
            raise
def _run_on_all_nodes(runner, output_dir, cmd_args, print_stderr=True):
    """Given an :py:class:`EMRJobRunner`, run the command specified by
    *cmd_args* on all nodes in the cluster and save the stdout and stderr
    of each run to subdirectories of *output_dir*.

    You should probably have run :py:meth:`_enable_slave_ssh_access()`
    on the runner before calling this function.
    """
    master_addr = runner._address_of_master()
    addresses = [master_addr]

    ssh_bin = runner._opts['ssh_bin']
    ec2_key_pair_file = runner._opts['ec2_key_pair_file']

    keyfile = None
    slave_addrs = runner._ssh_worker_hosts()

    if slave_addrs:
        addresses += ['%s!%s' % (master_addr, slave_addr)
                      for slave_addr in slave_addrs]
        # copying key file like a boss (name of keyfile doesn't really matter)
        keyfile = 'mrboss-%s.pem' % random_identifier()
        _ssh_copy_key(ssh_bin, master_addr, ec2_key_pair_file, keyfile)

    for addr in addresses:
        stdout, stderr = _ssh_run_with_recursion(
            ssh_bin,
            addr,
            ec2_key_pair_file,
            keyfile,
            cmd_args,
        )

        if print_stderr:
            print('---')
            print('Command completed on %s.' % addr)
            print(to_unicode(stderr), end=' ')

        if '!' in addr:
            base_dir = os.path.join(output_dir, 'slave ' + addr.split('!')[1])
        else:
            base_dir = os.path.join(output_dir, 'master')

        if not os.path.exists(base_dir):
            os.makedirs(base_dir)

        with open(os.path.join(base_dir, 'stdout'), 'wb') as f:
            f.write(stdout)

        with open(os.path.join(base_dir, 'stderr'), 'wb') as f:
            f.write(stderr)
def ls(self, path_glob):
    m = _SSH_URI_RE.match(path_glob)
    addr = m.group('hostname')
    path_to_ls = m.group('filesystem_path')

    p = self._ssh_launch(addr, ['find', '-L', path_to_ls, '-type', 'f'])

    for line in p.stdout:
        path = to_unicode(line).rstrip('\n')
        yield 'ssh://%s%s' % (addr, path)

    self._ssh_finish_run(p)
def cleanup():
    # this does sometimes happen; see #1396
    for line in cat_proc.stderr:
        log.error('STDERR: ' + to_unicode(line.rstrip(b'\r\n')))

    cat_proc.stdout.close()
    cat_proc.stderr.close()

    returncode = cat_proc.wait()

    if returncode != 0:
        raise IOError("Could not stream %s" % filename)
def parse_doc(self, input_path, input_uri):
    """Mapper: parse documents and emit ngram information.

    Input: Text files whose name contains a unique document ID and
    category information (see :py:func:`parse_doc_filename`).

    Output:
    ``('ngram', (n, ngram)), (count, cats)`` OR
    ``('doc', doc_id), doc``

    n: ngram length
    ngram: ngram encoded as a string (e.g. "pad thai") or None to
        indicate ANY ngram.
    count: # of times an ngram appears in the document
    cats: a map from category name to a boolean indicating whether this
        document is in the category
    doc_id: (hopefully) unique document ID
    doc: the encoded document. We'll fill these fields:
        ngram_counts: map from (n, ngram) to # of times ngram appears
            in the document, using (n, None) to represent the total
            number of times ANY ngram of that size appears (essentially
            number of words)
        in_test_set: boolean indicating if this doc is in the test set
        id: SHA1 hash of doc text (if not already filled)
    """
    # fill *id* and *cats*
    doc = parse_doc_filename(input_uri)

    with open(input_path) as f:
        text = to_unicode(f.read())

    # pick test/training docs
    if self.options.no_test_set:
        doc['in_test_set'] = False
    else:
        doc_hash = hashlib.sha1(text.encode('utf-8')).hexdigest()
        doc['in_test_set'] = bool(int(doc_hash[-1], 16) % 2)

    # map from (n, ngram) to number of times it appears
    ngram_counts = count_ngrams(
        text, self.options.max_ngram_size, self.stop_words)

    # yield the number of times the ngram appears in this doc
    # and the categories for this document, so we can train the classifier
    if not doc['in_test_set']:
        for (n, ngram), count in ngram_counts.items():
            yield ('ngram', (n, ngram)), (count, doc['cats'])

    # yield the document itself, for safekeeping
    doc['ngram_counts'] = list(ngram_counts.items())
    yield ('doc', doc['id']), doc
def ls(self, uri_glob):
    m = _SSH_URI_RE.match(uri_glob)
    addr = m.group('hostname')
    path_to_ls = m.group('filesystem_path')

    p = self._ssh_launch(addr, ['find', '-L', path_to_ls, '-type', 'f'])

    for line in p.stdout:
        path = to_unicode(line).rstrip('\n')
        yield 'ssh://%s%s' % (addr, path)

    self._ssh_finish_run(p)
def _run_hadoop(self, hadoop_args, env, record_callback):
    # try to use a PTY if it's available
    try:
        pid, master_fd = pty.fork()
    except (AttributeError, OSError):
        # no PTYs, just use Popen

        # user won't get much feedback for a while, so tell them
        # Hadoop is running
        log.debug('No PTY available, using Popen() to invoke Hadoop')

        step_proc = Popen(hadoop_args, stdout=PIPE, stderr=PIPE, env=env)

        step_interpretation = _interpret_hadoop_jar_command_stderr(
            step_proc.stderr,
            record_callback=_log_record_from_hadoop)

        # there shouldn't be much output to STDOUT
        for line in step_proc.stdout:
            _log_line_from_driver(to_unicode(line).strip('\r\n'))

        step_proc.stdout.close()
        step_proc.stderr.close()

        returncode = step_proc.wait()
    else:
        # we have PTYs
        if pid == 0:  # we are the child process
            try:
                os.execvpe(hadoop_args[0], hadoop_args, env)
                # now we are no longer Python
            except OSError as ex:
                # use _exit() so we don't do cleanup, etc. that's
                # the parent process's job
                os._exit(ex.errno)
            finally:
                # if we got some other exception, still exit hard
                os._exit(-1)
        else:
            log.debug('Invoking Hadoop via PTY')

            with os.fdopen(master_fd, 'rb') as master:
                # reading from master gives us the subprocess's
                # stderr and stdout (it's a fake terminal)
                step_interpretation = (
                    _interpret_hadoop_jar_command_stderr(
                        _eio_to_eof(master),
                        record_callback=_log_record_from_hadoop))
                _, returncode = os.waitpid(pid, 0)

    return returncode, step_interpretation
def _find_python_traceback(lines):
    """Scan subprocess stderr for Python traceback."""
    # Essentially, we detect the start of the traceback, and continue
    # until we find a non-indented line, with some special rules for
    # exceptions from subprocesses.

    # Lines to pass back representing entire error found
    all_tb_lines = []

    # This is used to store a working list of lines in a single traceback
    tb_lines = []

    # This is used to store a working list of non-traceback lines between
    # the current traceback and the previous one
    non_tb_lines = []

    # Track whether or not we are in a traceback rather than consuming the
    # iterator
    in_traceback = False

    for line in lines:
        # don't return bytes in Python 3
        line = to_unicode(line)

        if in_traceback:
            tb_lines.append(line)

            # If no indentation, this is the last line of the traceback
            if line.lstrip() == line:
                in_traceback = False

                if line.startswith('subprocess.CalledProcessError'):
                    # CalledProcessError may mean that the subprocess
                    # printed errors to stderr which we can show the user
                    all_tb_lines += non_tb_lines

                all_tb_lines += tb_lines

                # Reset all working lists
                tb_lines = []
                non_tb_lines = []
        else:
            if line.startswith('Traceback (most recent call last):'):
                tb_lines.append(line)
                in_traceback = True
            else:
                non_tb_lines.append(line)

    if all_tb_lines:
        return all_tb_lines
    else:
        return None
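A short sketch of the expected behavior on hand-written stderr lines
(the input is invented; byte strings are fine since each line goes
through to_unicode()):

stderr_lines = [
    b'starting subprocess...\n',
    b'Traceback (most recent call last):\n',
    b'  File "mr_word_count.py", line 10, in <module>\n',
    b'    main()\n',
    b"ValueError: invalid literal for int() with base 10: 'oops'\n",
]

tb = _find_python_traceback(stderr_lines)
# tb holds the traceback lines (as unicode), from 'Traceback (most
# recent call last):' through the final, unindented ValueError line;
# it would be None if no traceback appeared in the input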
def _get_new_driver_output_lines(self, driver_output_uri):
    """Get a list of complete job driver output lines that are
    new since the last time we checked.
    """
    state = self._driver_output_state.setdefault(
        driver_output_uri,
        dict(log_uri=None, pos=0, buffer=b''))

    # driver output is in logs with names like driveroutput.000000000
    log_uris = sorted(self.fs.ls(driver_output_uri + '*'))

    for log_uri in log_uris:
        # initialize log_uri with first URI we see
        if state['log_uri'] is None:
            state['log_uri'] = log_uri

        # skip log files already parsed
        if log_uri < state['log_uri']:
            continue

        # when parsing the next file, reset *pos*
        elif log_uri > state['log_uri']:
            state['pos'] = 0
            state['log_uri'] = log_uri

        log_blob = self.fs._get_blob(log_uri)

        try:
            # TODO: use start= kwarg once google-cloud-storage 1.9 is out
            new_data = log_blob.download_as_string()[state['pos']:]
        except google.api_core.exceptions.NotFound:
            # handle race condition where blob was just created
            break

        state['buffer'] += new_data
        state['pos'] += len(new_data)

    # convert buffer into lines, saving leftovers for next time
    stream = BytesIO(state['buffer'])
    state['buffer'] = b''

    lines = []

    for line_bytes in stream:
        if line_bytes.endswith(b'\n'):
            lines.append(to_unicode(line_bytes))
        else:
            # leave final partial line (if any) in buffer
            state['buffer'] = line_bytes

    return lines
def _ssh_run(self, address, cmd_args, stdin=None):
    """Run the given SSH command, and raise an IOError if it fails.
    Return ``(stdout, stderr)``

    Use this for commands with a bounded amount of output.
    """
    p = self._ssh_launch(address, cmd_args, stdin=stdin)

    stdout, stderr = p.communicate()

    if p.returncode != 0:
        raise IOError(to_unicode(stderr))

    return stdout, stderr
def _ssh_run(self, address, cmd_args):
    """Run the given SSH command, and raise an IOError if it fails.
    Return ``(stdout, stderr)``

    Use this for commands with a bounded amount of output.
    """
    p = self._ssh_launch(address, cmd_args)

    stdout, stderr = p.communicate()

    if p.returncode != 0:
        raise IOError(to_unicode(stderr))

    return stdout, stderr
def _run_step_on_spark(self, step, step_num, last_step_num=None):
    if self._opts['upload_archives'] and self._spark_master() != 'yarn':
        log.warning('Spark master %r will probably ignore archives' %
                    self._spark_master())

    spark_submit_args = self._args_for_spark_step(step_num, last_step_num)

    env = dict(os.environ)
    env.update(self._spark_cmdenv(step_num))

    returncode, step_interpretation = self._run_spark_submit(
        spark_submit_args, env, record_callback=_log_log4j_record)

    counters = None
    if step['type'] == 'streaming':
        counter_file = self.fs.join(
            self._counter_output_dir(step_num), 'part-*')
        counter_json = b''.join(self.fs.cat(counter_file))
        if counter_json.strip():
            # json.loads() on Python 3.4/3.5 can't take bytes
            counters = json.loads(to_unicode(counter_json))

    if isinstance(counters, list):
        self._counters.extend(counters)

        # desc_num is 1-indexed user-readable step num
        for desc_num, counter_dict in enumerate(
                counters, start=(step_num + 1)):
            if counter_dict:
                log.info(_format_counters(
                    counter_dict,
                    desc=('Counters for step %d' % desc_num)))

    # for non-streaming steps, there are no counters.
    # pad self._counters to match number of steps
    while len(self._counters) < (last_step_num or step_num) + 1:
        self._counters.append({})

    if returncode:
        error = _pick_error(dict(step=step_interpretation))
        if error:
            _log_probable_cause_of_failure(log, error)

        reason = str(CalledProcessError(returncode, spark_submit_args))
        raise StepFailedException(
            reason=reason, step_num=step_num,
            last_step_num=last_step_num, num_steps=self._num_steps())
def get_hadoop_version(self):
    """Invoke the hadoop executable to determine its version"""
    # mkdir() needs this
    if not self._hadoop_version:
        stdout = self.invoke_hadoop(['version'], return_stdout=True)
        if stdout:
            first_line = stdout.split(b'\n')[0]
            m = _HADOOP_VERSION_RE.match(first_line)
            if m:
                self._hadoop_version = to_unicode(m.group('version'))
                log.info("Using Hadoop version %s" % self._hadoop_version)
            else:
                raise Exception('Unable to determine Hadoop version.')

    return self._hadoop_version
def _ssh_add_key(self):
    """Add ``self._ec2_key_pair_file`` to the ssh agent with ``ssh-add``.
    """
    args = self._ssh_add_bin + ['-t', '60', self._ec2_key_pair_file]

    log.debug(' > ' + cmd_line(args))

    try:
        p = Popen(args, stdout=PIPE, stderr=PIPE)
    except OSError as ex:
        raise IOError(ex.strerror)

    stdout, stderr = p.communicate()

    if p.returncode != 0:
        raise IOError(to_unicode(stderr))
def _run_step_on_spark(self, step, step_num, last_step_num=None):
    if self._opts['upload_archives'] and self._spark_master() != 'yarn':
        log.warning('Spark master %r will probably ignore archives' %
                    self._spark_master())

    spark_submit_args = self._args_for_spark_step(step_num, last_step_num)

    env = dict(os.environ)
    env.update(self._spark_cmdenv(step_num))

    returncode = self._run_spark_submit(spark_submit_args, env,
                                        record_callback=_log_log4j_record)

    counters = None
    if step['type'] == 'streaming':
        counter_file = self.fs.join(
            self._counter_output_dir(step_num), 'part-*')
        counter_json = b''.join(self.fs.cat(counter_file))
        if counter_json.strip():
            # json.loads() on Python 3.4/3.5 can't take bytes
            counters = json.loads(to_unicode(counter_json))

    if isinstance(counters, list):
        self._counters.extend(counters)

        # desc_num is 1-indexed user-readable step num
        for desc_num, counter_dict in enumerate(
                counters, start=(step_num + 1)):
            if counter_dict:
                log.info(_format_counters(
                    counter_dict,
                    desc=('Counters for step %d' % desc_num)))

    # for non-streaming steps, there are no counters.
    # pad self._counters to match number of steps
    while len(self._counters) < (last_step_num or step_num) + 1:
        self._counters.append({})

    if returncode:
        reason = str(CalledProcessError(returncode, spark_submit_args))
        raise StepFailedException(
            reason=reason, step_num=step_num,
            last_step_num=last_step_num, num_steps=self._num_steps())
def _cat_file(self, filename):
    # stream from HDFS
    cat_args = self.get_hadoop_bin() + ['fs', '-cat', filename]
    log.debug('> %s' % cmd_line(cat_args))

    cat_proc = Popen(cat_args, stdout=PIPE, stderr=PIPE)

    for chunk in decompress(cat_proc.stdout, filename):
        yield chunk

    # this does sometimes happen; see #1396
    for line in cat_proc.stderr:
        log.error('STDERR: ' + to_unicode(line.rstrip(b'\r\n')))

    cat_proc.stdout.close()
    cat_proc.stderr.close()

    returncode = cat_proc.wait()

    if returncode != 0:
        raise IOError("Could not stream %s" % filename)
def _run_on_all_nodes(runner, output_dir, cmd_args, print_stderr=True):
    """Given an :py:class:`EMRJobRunner`, run the command specified by
    *cmd_args* on all nodes in the cluster and save the stdout and stderr
    of each run to subdirectories of *output_dir*.
    """
    master_addr = runner._address_of_master()
    addresses = [master_addr]

    worker_addrs = runner._ssh_worker_hosts()

    if worker_addrs:
        addresses += ['%s!%s' % (master_addr, worker_addr)
                      for worker_addr in worker_addrs]

    for addr in addresses:
        stdout, stderr = runner.fs.ssh._ssh_run(addr, cmd_args)

        if print_stderr:
            print('---')
            print('Command completed on %s.' % addr)
            print(to_unicode(stderr), end=' ')

        if '!' in addr:
            base_dir = os.path.join(
                output_dir, 'worker ' + addr.split('!')[1])
        else:
            base_dir = os.path.join(output_dir, 'master')

        if not os.path.exists(base_dir):
            os.makedirs(base_dir)

        with open(os.path.join(base_dir, 'stdout'), 'wb') as f:
            f.write(stdout)

        with open(os.path.join(base_dir, 'stderr'), 'wb') as f:
            f.write(stderr)
def _run_on_all_nodes(runner, output_dir, cmd_args, print_stderr=True):
    """Given an :py:class:`EMRJobRunner`, run the command specified by
    *cmd_args* on all nodes in the cluster and save the stdout and stderr
    of each run to subdirectories of *output_dir*.
    """
    master_addr = runner._address_of_master()
    addresses = [master_addr]

    worker_addrs = runner._ssh_worker_hosts()

    if worker_addrs:
        addresses += ['%s!%s' % (master_addr, worker_addr)
                      for worker_addr in worker_addrs]

    for addr in addresses:
        stdout, stderr = runner.fs._ssh_run(addr, cmd_args)

        if print_stderr:
            print('---')
            print('Command completed on %s.' % addr)
            print(to_unicode(stderr), end=' ')

        if '!' in addr:
            base_dir = os.path.join(
                output_dir, 'worker ' + addr.split('!')[1])
        else:
            base_dir = os.path.join(output_dir, 'master')

        if not os.path.exists(base_dir):
            os.makedirs(base_dir)

        with open(os.path.join(base_dir, 'stdout'), 'wb') as f:
            f.write(stdout)

        with open(os.path.join(base_dir, 'stderr'), 'wb') as f:
            f.write(stderr)
def test_ascii_unicode(self):
    self.assertEqual(to_unicode(u'foo'), u'foo')
def test_utf_8_bytes(self):
    self.assertEqual(to_unicode(b'caf\xc3\xa9'), u'café')
def test_ascii_bytes(self):
    self.assertEqual(to_unicode(b'foo'), u'foo')
def _parse_hadoop_log4j_records(lines, pre_filter=None):
    """Parse lines from a hadoop log into log4j records.

    Yield dictionaries with the following keys:

    caller_location -- e.g. 'YarnClientImpl.java:submitApplication(251)'
    level -- e.g. 'INFO'
    logger -- e.g. 'amazon.emr.metrics.MetricsSaver'
    message -- the actual message. If this is a multi-line message (e.g.
        for counters), the lines will be joined by '\n'
    num_lines -- how many lines made up the message
    start_line -- which line the message started on (0-indexed)
    thread -- e.g. 'main'. Defaults to ''
    timestamp -- unparsed timestamp, e.g. '15/12/07 20:49:28',
        '2015-08-22 00:46:18,411'

    Lines will be converted to unicode, and trailing \r and \n will be
    stripped from lines.

    If set, *pre_filter* will be applied to stripped lines. If it returns
    true, we'll return a fake record with message set to the line,
    num_lines and start_line set as normal, and everything else set to ''.

    Also yields fake records for leading non-log4j lines (trailing
    non-log4j lines are assumed to be part of a multiline message if not
    pre-filtered).
    """
    last_record = None

    for line_num, line in enumerate(lines):
        # convert from bytes to unicode, if needed, and strip trailing
        # newlines
        line = to_unicode(line).rstrip('\r\n')

        def fake_record():
            return dict(caller_location='', level='', logger='',
                        message=line, num_lines=1, start_line=line_num,
                        thread='', timestamp='')

        # had to patch this in here to get
        # _parse_hadoop_jar_command_stderr()'s record_callback to fire
        # on the correct line. The problem is that we don't emit records
        # until we see the next line (to handle multiline records), so
        # the callback would fire in the wrong order
        if pre_filter:
            if pre_filter(line):
                if last_record:
                    last_record['num_lines'] = (
                        line_num - last_record['start_line'])
                    yield last_record

                yield fake_record()

                last_record = None
                continue

        m = (_HADOOP_LOG4J_LINE_RE.match(line) or
             _HADOOP_LOG4J_LINE_ALTERNATE_RE.match(line))

        if m:
            if last_record:
                last_record['num_lines'] = (
                    line_num - last_record['start_line'])
                yield last_record

            last_record = m.groupdict()
            last_record.setdefault('caller_location', '')
            last_record['thread'] = last_record['thread'] or ''
            last_record['start_line'] = line_num
        else:
            # add on to previous record
            if last_record:
                last_record['message'] += '\n' + line
            else:
                yield fake_record()

    if last_record:
        last_record['num_lines'] = (
            line_num + 1 - last_record['start_line'])
        yield last_record
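A hedged illustration of the record shape, using a single log4j-style
line modeled on the timestamp format in the docstring above (whether
it actually matches depends on _HADOOP_LOG4J_LINE_RE):

records = list(_parse_hadoop_log4j_records(
    [b'15/12/07 20:49:28 INFO mapreduce.Job: Running job: job_0001\n']))
# if the line matches, the single record would look roughly like:
# {'timestamp': '15/12/07 20:49:28', 'level': 'INFO',
#  'logger': 'mapreduce.Job', 'message': 'Running job: job_0001',
#  'thread': '', 'caller_location': '', 'num_lines': 1, 'start_line': 0}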
def test_latin_1_bytes(self):
    self.assertEqual(to_unicode(b'caf\xe9'), u'caf\xe9')
def test_non_ascii_unicode(self):
    self.assertEqual(to_unicode(u'café'), u'café')
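The tests above pin down to_unicode()'s contract. This is not the
actual mrjob implementation, just a minimal sketch consistent with
them: unicode passes through, bytes are decoded as UTF-8, falling back
to latin-1 so arbitrary byte strings never raise.

def to_unicode(s):
    """Sketch only: decode bytes to unicode the way the tests expect."""
    if isinstance(s, bytes):
        try:
            return s.decode('utf_8')
        except UnicodeDecodeError:
            return s.decode('latin_1')  # latin-1 can decode any byte string
    return s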
def _log_line(line):
    log.info(' %s' % to_unicode(line).strip('\r\n'))
def _run_job_in_hadoop(self):
    for step_num, step in enumerate(self._get_steps()):
        self._warn_about_spark_archives(step)

        step_args = self._args_for_step(step_num)
        env = _fix_env(self._env_for_step(step_num))

        # log this *after* _args_for_step(), which can start a search
        # for the Hadoop streaming jar
        log.info('Running step %d of %d...' %
                 (step_num + 1, self._num_steps()))
        log.debug('> %s' % cmd_line(step_args))
        log.debug(' with environment: %r' % sorted(env.items()))

        log_interpretation = {}
        self._log_interpretations.append(log_interpretation)

        # try to use a PTY if it's available
        try:
            pid, master_fd = pty.fork()
        except (AttributeError, OSError):
            # no PTYs, just use Popen

            # user won't get much feedback for a while, so tell them
            # Hadoop is running
            log.debug('No PTY available, using Popen() to invoke Hadoop')

            step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE, env=env)

            step_interpretation = _interpret_hadoop_jar_command_stderr(
                step_proc.stderr,
                record_callback=_log_record_from_hadoop)

            # there shouldn't be much output to STDOUT
            for line in step_proc.stdout:
                _log_line_from_driver(to_unicode(line).strip('\r\n'))

            step_proc.stdout.close()
            step_proc.stderr.close()

            returncode = step_proc.wait()
        else:
            # we have PTYs
            if pid == 0:  # we are the child process
                os.execvpe(step_args[0], step_args, env)
            else:
                log.debug('Invoking Hadoop via PTY')

                with os.fdopen(master_fd, 'rb') as master:
                    # reading from master gives us the subprocess's
                    # stderr and stdout (it's a fake terminal)
                    step_interpretation = (
                        _interpret_hadoop_jar_command_stderr(
                            master,
                            record_callback=_log_record_from_hadoop))
                    _, returncode = os.waitpid(pid, 0)

        # make sure output_dir is filled
        if 'output_dir' not in step_interpretation:
            step_interpretation['output_dir'] = (
                self._step_output_uri(step_num))

        log_interpretation['step'] = step_interpretation

        self._log_counters(log_interpretation, step_num)

        step_type = step['type']

        if returncode:
            error = self._pick_error(log_interpretation, step_type)
            if error:
                log.error('Probable cause of failure:\n\n%s\n' %
                          _format_error(error))

            # use CalledProcessError's well-known message format
            reason = str(CalledProcessError(returncode, step_args))
            raise StepFailedException(
                reason=reason, step_num=step_num,
                num_steps=self._num_steps())
def _run_job_in_hadoop(self):
    for step_num, step in enumerate(self._get_steps()):
        self._warn_about_spark_archives(step)

        step_args = self._args_for_step(step_num)
        env = _fix_env(self._env_for_step(step_num))

        # log this *after* _args_for_step(), which can start a search
        # for the Hadoop streaming jar
        log.info('Running step %d of %d...' %
                 (step_num + 1, self._num_steps()))
        log.debug('> %s' % cmd_line(step_args))
        log.debug(' with environment: %r' % sorted(env.items()))

        log_interpretation = {}
        self._log_interpretations.append(log_interpretation)

        # try to use a PTY if it's available
        try:
            pid, master_fd = pty.fork()
        except (AttributeError, OSError):
            # no PTYs, just use Popen

            # user won't get much feedback for a while, so tell them
            # Hadoop is running
            log.debug('No PTY available, using Popen() to invoke Hadoop')

            step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE, env=env)

            step_interpretation = _interpret_hadoop_jar_command_stderr(
                step_proc.stderr,
                record_callback=_log_record_from_hadoop)

            # there shouldn't be much output to STDOUT
            for line in step_proc.stdout:
                _log_line_from_hadoop(to_unicode(line).strip('\r\n'))

            step_proc.stdout.close()
            step_proc.stderr.close()

            returncode = step_proc.wait()
        else:
            # we have PTYs
            if pid == 0:  # we are the child process
                os.execvpe(step_args[0], step_args, env)
            else:
                log.debug('Invoking Hadoop via PTY')

                with os.fdopen(master_fd, 'rb') as master:
                    # reading from master gives us the subprocess's
                    # stderr and stdout (it's a fake terminal)
                    step_interpretation = (
                        _interpret_hadoop_jar_command_stderr(
                            master,
                            record_callback=_log_record_from_hadoop))
                    _, returncode = os.waitpid(pid, 0)

        # make sure output_dir is filled
        if 'output_dir' not in step_interpretation:
            step_interpretation['output_dir'] = (
                self._step_output_uri(step_num))

        log_interpretation['step'] = step_interpretation

        step_type = step['type']

        if not _is_spark_step_type(step_type):
            counters = self._pick_counters(log_interpretation, step_type)
            if counters:
                log.info(_format_counters(counters))
            else:
                log.warning('No counters found')

        if returncode:
            error = self._pick_error(log_interpretation, step_type)
            if error:
                log.error('Probable cause of failure:\n\n%s\n' %
                          _format_error(error))

            # use CalledProcessError's well-known message format
            reason = str(CalledProcessError(returncode, step_args))
            raise StepFailedException(
                reason=reason, step_num=step_num,
                num_steps=self._num_steps())