def _parse_counters_0_18(counter_string):
    # 0.18 counters look like this:
    # GroupName.CounterName:Value,Group1.Crackers:3,Group2.Nerf:243,...

    # finditer() never returns None, so materialize the matches to detect
    # a counter string we couldn't parse at all
    groups = list(_COUNTER_RE_0_18.finditer(counter_string))
    if not groups:
        log.warning("Cannot parse Hadoop counter string: %s" % counter_string)

    for m in groups:
        yield (to_string(m.group("group")),
               to_string(m.group("name")),
               int(m.group("value")))
def _parse_counters_0_18(counter_string):
    # 0.18 counters look like this:
    # GroupName.CounterName:Value,Group1.Crackers:3,Group2.Nerf:243,...

    # finditer() never returns None, so materialize the matches to detect
    # a counter string we couldn't parse at all
    groups = list(_COUNTER_RE_0_18.finditer(counter_string))
    if not groups:
        log.warning('Cannot parse Hadoop counter string: %s' % counter_string)

    for m in groups:
        yield (to_string(m.group('group')),
               to_string(m.group('name')),
               int(m.group('value')))
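# Hedged usage sketch for _parse_counters_0_18() above. The regex here is an
# illustrative stand-in (the real _COUNTER_RE_0_18 is defined elsewhere and
# presumably also handles bytes); it only needs to capture the
# Group.Counter:Value,... format described in the comment above.
import re

_COUNTER_RE_0_18_EXAMPLE = re.compile(
    r'(?P<group>[^,]+?)\.(?P<name>[^,]+):(?P<value>\d+),?')


def _demo_parse_counters_0_18(counter_string):
    for m in _COUNTER_RE_0_18_EXAMPLE.finditer(counter_string):
        yield m.group('group'), m.group('name'), int(m.group('value'))

# list(_demo_parse_counters_0_18('Group1.Crackers:3,Group2.Nerf:243'))
# -> [('Group1', 'Crackers', 3), ('Group2', 'Nerf', 243)]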
def invoke_hadoop(self, args, ok_returncodes=None, ok_stderr=None,
                  return_stdout=False):
    """Run the given hadoop command, raising an exception on non-zero
    return code. This only works for commands whose output we don't
    care about.

    Args:
    ok_returncodes -- a list/tuple/set of return codes we expect to get back
        from hadoop (e.g. [0,1]). By default, we only expect 0. If we get an
        unexpected return code, we raise a CalledProcessError.
    ok_stderr -- don't log STDERR or raise CalledProcessError if stderr
        matches a regex in this list (even if the returncode is bad)
    return_stdout -- return the stdout from the hadoop command rather than
        logging it. If this is False, we return the returncode instead.
    """
    args = self._hadoop_bin + args

    log.debug('> %s' % cmd_line(args))

    proc = Popen(args, stdout=PIPE, stderr=PIPE)
    stdout, stderr = proc.communicate()

    log_func = log.debug if proc.returncode == 0 else log.error
    if not return_stdout:
        for line in BytesIO(stdout):
            log_func('STDOUT: ' + to_string(line.rstrip(b'\r\n')))

    # check if STDERR is okay
    stderr_is_ok = False
    if ok_stderr:
        for stderr_re in ok_stderr:
            if stderr_re.match(stderr):
                stderr_is_ok = True
                break

    if not stderr_is_ok:
        for line in BytesIO(stderr):
            log_func('STDERR: ' + to_string(line.rstrip(b'\r\n')))

    ok_returncodes = ok_returncodes or [0]

    if not stderr_is_ok and proc.returncode not in ok_returncodes:
        raise CalledProcessError(proc.returncode, args)

    if return_stdout:
        return stdout
    else:
        return proc.returncode
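# Hedged usage notes for invoke_hadoop() above; `fs` stands in for whatever
# instance exposes the method, and the argument lists are only illustrative:
#
#   fs.invoke_hadoop(['fs', '-mkdir', '-p', 'hdfs:///tmp/mrjob'])
#   version_out = fs.invoke_hadoop(['version'], return_stdout=True)
#   fs.invoke_hadoop(['fs', '-rm', '-r', path], ok_returncodes=[0, 1])
#
# Passing ok_returncodes lets a "nothing to delete" result count as success;
# return_stdout is how callers like get_hadoop_version() (below) read output.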
def _process_stderr_from_streaming(self, stderr):

    def treat_eio_as_eof(iter):
        # on Linux, the PTY gives us a specific IOError when the
        # child process exits, rather than EOF
        while True:
            try:
                yield next(iter)  # okay for StopIteration to bubble up
            except IOError as e:
                if e.errno == errno.EIO:
                    return
                else:
                    raise

    for line in treat_eio_as_eof(stderr):
        line = HADOOP_STREAMING_OUTPUT_RE.match(line).group(2)
        log.info('HADOOP: ' + to_string(line))

        if b'Streaming Job Failed!' in line:
            raise Exception(line)

        # The job identifier is printed to stderr. We only want to parse it
        # once because we know how many steps we have and just want to know
        # what Hadoop thinks the first step's number is.
        m = HADOOP_JOB_TIMESTAMP_RE.match(line)
        if m and self._job_timestamp is None:
            self._job_timestamp = m.group('timestamp')
            self._start_step_num = int(m.group('step_num'))
def find_hadoop_java_stack_trace(lines):
    """Scan a log file or other iterable for a java stack trace from Hadoop,
    and return it as a list of lines (as strings).

    In logs from EMR, we find java stack traces in ``task-attempts/*/syslog``

    Sample stack trace::

        2010-07-27 18:25:48,397 WARN org.apache.hadoop.mapred.TaskTracker (main): Error running child
        java.lang.OutOfMemoryError: Java heap space
                at org.apache.hadoop.mapred.IFile$Reader.readNextBlock(IFile.java:270)
                at org.apache.hadoop.mapred.IFile$Reader.next(IFile.java:332)
                at org.apache.hadoop.mapred.Merger$Segment.next(Merger.java:147)
                at org.apache.hadoop.mapred.Merger$MergeQueue.adjustPriorityQueue(Merger.java:238)
                at org.apache.hadoop.mapred.Merger$MergeQueue.next(Merger.java:255)
                at org.apache.hadoop.mapred.Merger.writeFile(Merger.java:86)
                at org.apache.hadoop.mapred.Merger$MergeQueue.merge(Merger.java:377)
                at org.apache.hadoop.mapred.Merger.merge(Merger.java:58)
                at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:277)
                at org.apache.hadoop.mapred.TaskTracker$Child.main(TaskTracker.java:2216)

    (We omit the "Error running child" line from the results)
    """
    for line in lines:
        if line.rstrip(b'\r\n').endswith(b"Error running child"):
            st_lines = []
            for line in lines:
                st_lines.append(line)
                for line in lines:
                    if not line.startswith(b'        at '):
                        break
                    st_lines.append(line)
                return [to_string(line) for line in st_lines]
    else:
        return None
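# Hedged usage sketch for find_hadoop_java_stack_trace(). Note that the
# nested `for line in lines` loops above share one iterator, so callers
# should pass a file object or an explicit iterator, not a plain list.
_sample_syslog = iter([
    b'2010-07-27 18:25:48,397 WARN org.apache.hadoop.mapred.TaskTracker'
    b' (main): Error running child\n',
    b'java.lang.OutOfMemoryError: Java heap space\n',
    b'        at org.apache.hadoop.mapred.IFile$Reader.readNextBlock(IFile.java:270)\n',
    b'2010-07-27 18:25:50,000 INFO some unrelated line\n',
])
# find_hadoop_java_stack_trace(_sample_syslog) would return the exception
# line plus the indented "at ..." frame, decoded to strings.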
def _ssh_ls(ssh_bin, address, ec2_key_pair_file, path, keyfile=None,
            sudo=False):
    """Recursively list files under ``path`` on the specified SSH host.
    Return a list of the paths found. Raises ``IOError`` if the path
    doesn't exist or SSH access fails.

    :param ssh_bin: Path to ``ssh`` binary
    :param address: Address of your job's master node
    :param ec2_key_pair_file: Path to the key pair file (argument to ``-i``)
    :param path: Path on the remote host to list
    :param keyfile: Name of the EMR private key file on the master node in
                    case ``path`` exists on one of the slave nodes
    :param sudo: if true, run command with ``sudo``
    """
    cmd_args = ['find', '-L', path, '-type', 'f']
    if sudo:
        cmd_args = ['sudo'] + cmd_args

    out = to_string(
        _check_output(*_ssh_run_with_recursion(
            ssh_bin, address, ec2_key_pair_file, keyfile, cmd_args)))
    if 'No such file or directory' in out:
        raise IOError("No such file or directory: %s" % path)
    return out.split('\n')
def _run_job_in_hadoop(self):
    self._counters = []

    for step_num in range(self._num_steps()):
        log.debug("running step %d of %d" %
                  (step_num + 1, self._num_steps()))

        step_args = self._args_for_step(step_num)

        log.debug("> %s" % cmd_line(step_args))

        # try to use a PTY if it's available
        try:
            pid, master_fd = pty.fork()
        except (AttributeError, OSError):
            # no PTYs, just use Popen
            step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE)

            self._process_stderr_from_streaming(step_proc.stderr)

            # there shouldn't be much output to STDOUT
            for line in step_proc.stdout:
                log.error("STDOUT: " + to_string(line.strip(b"\n")))

            returncode = step_proc.wait()
        else:
            # we have PTYs
            if pid == 0:
                # we are the child process
                os.execvp(step_args[0], step_args)
            else:
                with os.fdopen(master_fd, "rb") as master:
                    # reading from master gives us the subprocess's
                    # stderr and stdout (it's a fake terminal)
                    self._process_stderr_from_streaming(master)
                    _, returncode = os.waitpid(pid, 0)

        if returncode == 0:
            # parsing needs step number for whole job
            self._fetch_counters([step_num + self._start_step_num])
            # printing needs step number relevant to this run of mrjob
            self.print_counters([step_num + 1])
        else:
            msg = ("Job failed with return code %d: %s" %
                   (returncode, step_args))
            log.error(msg)
            # look for a Python traceback
            cause = self._find_probable_cause_of_failure(
                [step_num + self._start_step_num])
            if cause:
                # log cause, and put it in exception
                cause_msg = []  # lines to log and put in exception
                cause_msg.append("Probable cause of failure (from %s):" %
                                 cause["log_file_uri"])
                cause_msg.extend(line.strip("\n") for line in cause["lines"])
                if cause["input_uri"]:
                    cause_msg.append("(while reading from %s)" %
                                     cause["input_uri"])

                for line in cause_msg:
                    log.error(line)

                # add cause_msg to exception message
                msg += "\n" + "\n".join(cause_msg) + "\n"

            raise CalledProcessError(returncode, step_args)
def _process_stderr_from_streaming(self, stderr):

    def treat_eio_as_eof(iter):
        # on Linux, the PTY gives us a specific IOError when the
        # child process exits, rather than EOF
        while True:
            try:
                yield next(iter)  # okay for StopIteration to bubble up
            except IOError as e:
                if e.errno == errno.EIO:
                    return
                else:
                    raise

    for line in treat_eio_as_eof(stderr):
        line = HADOOP_STREAMING_OUTPUT_RE.match(line).group(2)
        log.info("HADOOP: " + to_string(line))

        if b"Streaming Job Failed!" in line:
            raise Exception(line)

        # The job identifier is printed to stderr. We only want to parse it
        # once because we know how many steps we have and just want to know
        # what Hadoop thinks the first step's number is.
        m = HADOOP_JOB_TIMESTAMP_RE.match(line)
        if m and self._job_timestamp is None:
            self._job_timestamp = m.group("timestamp")
            self._start_step_num = int(m.group("step_num"))
def stderr_to_log(lines):
    for line in lines:
        line = to_string(line)
        if _HADOOP_NON_LOG_LINE_RE.match(line):
            # use error because this is usually "Streaming Command Failed!"
            _log_line_from_hadoop(line, level=logging.ERROR)
        else:
            yield line
def parse_mr_job_stderr(stderr, counters=None):
    """Parse counters and status messages out of MRJob output.

    :param stderr: a filehandle, a list of lines (bytes), or bytes
    :param counters: Counters so far, to update; a map from group (string)
                     to counter name (string) to count.

    Returns a dictionary with the keys *counters*, *statuses*, *other*:

    - *counters*: counters so far; same format as above
    - *statuses*: a list of status messages encountered
    - *other*: lines (strings) that aren't either counters or status messages
    """
    # For the corresponding code in Hadoop Streaming, see ``incrCounter()`` in
    # http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/contrib/streaming/src/java/org/apache/hadoop/streaming/PipeMapRed.java?view=markup  # noqa
    if isinstance(stderr, bytes):
        stderr = BytesIO(stderr)

    if counters is None:
        counters = {}
    statuses = []
    other = []

    for line in stderr:
        m = _COUNTER_RE.match(line.rstrip(b'\r\n'))
        if m:
            group, counter, amount_str = m.groups()

            # don't leave these as bytes on Python 3
            group = to_string(group)
            counter = to_string(counter)

            counters.setdefault(group, {})
            counters[group].setdefault(counter, 0)
            counters[group][counter] += int(amount_str)
            continue

        m = _STATUS_RE.match(line.rstrip(b'\r\n'))
        if m:
            # don't leave as bytes on Python 3
            statuses.append(to_string(m.group(1)))
            continue

        other.append(to_string(line))

    return {'counters': counters, 'statuses': statuses, 'other': other}
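# Hedged usage sketch for parse_mr_job_stderr(). The reporter: lines follow
# the Hadoop Streaming convention the docstring links to
# (reporter:counter:<group>,<name>,<amount> and reporter:status:<message>),
# assuming _COUNTER_RE/_STATUS_RE match that format.
_sample_mr_stderr = (
    b'reporter:counter:commas,collected,1\n'
    b'reporter:status:sorting\n'
    b'some other line\n')
# parse_mr_job_stderr(_sample_mr_stderr) would return roughly:
# {'counters': {'commas': {'collected': 1}},
#  'statuses': ['sorting'],
#  'other': ['some other line\n']}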
def _cat_log(fs, path):
    """fs.cat() the given log, converting lines to strings, and logging
    errors."""
    try:
        for line in fs.cat(path):
            yield to_string(line)
    except IOError as e:
        log.warning("couldn't cat() %s: %r" % (path, e))
def ls(self, path_glob):
    components = urlparse(path_glob)
    hdfs_prefix = '%s://%s' % (components.scheme, components.netloc)

    version = self.get_hadoop_version()

    # use ls -R on Hadoop 2 (see #1152)
    if uses_yarn(version):
        args = ['fs', '-ls', '-R', path_glob]
    else:
        args = ['fs', '-lsr', path_glob]

    try:
        stdout = self.invoke_hadoop(args, return_stdout=True,
                                    ok_stderr=[_HADOOP_LS_NO_SUCH_FILE])
    except CalledProcessError:
        raise IOError("Could not ls %s" % path_glob)

    for line in BytesIO(stdout):
        line = line.rstrip(b'\r\n')

        # ignore total item count
        if line.startswith(b'Found '):
            continue

        fields = line.split(b' ')

        # Throw out directories
        if fields[0].startswith(b'd'):
            continue

        # Try to figure out which part of the line is the path
        # Expected lines:
        #
        # HDFS:
        # -rw-r--r--   3 dave users       3276 2010-01-13 14:00 /foo/bar
        #
        # S3:
        # -rwxrwxrwx   1       3276 2010-01-13 14:00 /foo/bar
        path_index = None
        for index, field in enumerate(fields):
            # look for time field, and pick one after that
            # (can't use field[2] because that's an int in Python 3)
            if len(field) == 5 and field[2:3] == b':':
                path_index = (index + 1)

        if not path_index:
            raise IOError("Could not locate path in string %r" % line)

        path = to_string(line.split(b' ', path_index)[-1])
        # handle fully qualified URIs from newer versions of Hadoop ls
        # (see Pull Request #577)
        if is_uri(path):
            yield path
        else:
            yield hdfs_prefix + path
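# Hedged, condensed illustration of the path-locating heuristic used in ls()
# above: split an `hadoop fs -ls(r)` line on single spaces, find the HH:MM
# time field, and treat everything after it as the path (which may itself
# contain spaces). The sample line is made up.
_sample_ls_line = b'-rw-r--r--   3 dave users       3276 2010-01-13 14:00 /foo/bar baz'
_fields = _sample_ls_line.split(b' ')
_path_index = next(i + 1 for i, f in enumerate(_fields)
                   if len(f) == 5 and f[2:3] == b':')
# _sample_ls_line.split(b' ', _path_index)[-1] -> b'/foo/bar baz'
# (ls() itself keeps the *last* field passing the time test, then decodes
# the path with to_string() and prepends the hdfs:// prefix if needed.)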
def _cat_log(fs, path):
    """fs.cat() the given log, converting lines to strings, and logging
    errors."""
    try:
        if not fs.exists(path):
            return
        for line in fs.cat(path):
            yield to_string(line)
    except (IOError, OSError) as e:
        log.warning("couldn't cat() %s: %r" % (path, e))
def yield_lines():
    try:
        for line in stderr:
            yield to_string(line)
    except IOError as e:
        # this is just the PTY's way of saying goodbye
        if e.errno == errno.EIO:
            return
        else:
            raise
def find_python_traceback(lines):
    """Scan a log file or other iterable for a Python traceback,
    and return it as a list of lines (as strings).

    In logs from EMR, we find python tracebacks in
    ``task-attempts/*/stderr``
    """
    # Essentially, we detect the start of the traceback, and continue
    # until we find a non-indented line, with some special rules for
    # exceptions from subprocesses.

    # Lines to pass back representing entire error found
    all_tb_lines = []

    # This is used to store a working list of lines in a single traceback
    tb_lines = []

    # This is used to store a working list of non-traceback lines between
    # the current traceback and the previous one
    non_tb_lines = []

    # Track whether or not we are in a traceback rather than consuming the
    # iterator
    in_traceback = False

    for line in lines:
        # don't return bytes in Python 3
        line = to_string(line)

        if in_traceback:
            tb_lines.append(line)

            # If no indentation, this is the last line of the traceback
            if line.lstrip() == line:
                in_traceback = False

                if line.startswith('subprocess.CalledProcessError'):
                    # CalledProcessError may mean that the subprocess
                    # printed errors to stderr which we can show the user
                    all_tb_lines += non_tb_lines

                all_tb_lines += tb_lines

                # Reset all working lists
                tb_lines = []
                non_tb_lines = []
        else:
            if line.startswith('Traceback (most recent call last):'):
                tb_lines.append(line)
                in_traceback = True
            else:
                non_tb_lines.append(line)

    if all_tb_lines:
        return all_tb_lines
    else:
        return None
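# Hedged usage sketch for find_python_traceback(); the stderr lines are
# illustrative.
_sample_task_stderr = [
    b'Traceback (most recent call last):\n',
    b'  File "mr_your_job.py", line 10, in <module>\n',
    b'    MRYourJob.run()\n',
    b"ValueError: invalid literal for int() with base 10: 'foo'\n",
]
# find_python_traceback(_sample_task_stderr) returns those four lines decoded
# to strings; surrounding non-traceback lines are only included when the
# traceback ends in subprocess.CalledProcessError, per the branch above.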
def ssh_terminate_single_job(ssh_bin, address, ec2_key_pair_file):
    """Terminate the only job running on the Hadoop cluster with master node
    *address* using 'hadoop job -kill JOB_ID'. Return string output of the
    kill command or None if there was no job to terminate. Raise
    :py:class:`IOError` if some other error occurred.

    :param ssh_bin: Path to ``ssh`` binary
    :param address: Address of your job's master node (obtained via
                    :py:meth:`boto.emr.EmrConnection.describe_jobflow`)
    :param ec2_key_pair_file: Path to the key pair file (argument to ``-i``)

    :return: output of the kill command as a string, or ``None`` if no job
             was running
    """
    job_list_out = to_string(
        check_output(*ssh_run(ssh_bin, address, ec2_key_pair_file,
                              ['hadoop', 'job', '-list'])))
    job_list_lines = job_list_out.splitlines()

    def job_list_output_error():
        raise IOError('Could not read results of "hadoop job -list" and so'
                      ' could not terminate job:\n%s' % job_list_out)

    num_jobs_match = HADOOP_JOB_LIST_NUM_RE.match(job_list_lines[0])
    if not num_jobs_match:
        job_list_output_error()
    if int(num_jobs_match.group(1)) > 1:
        raise IOError('More than one job is running; unclear which one to'
                      ' terminate, so not terminating any jobs')
    if int(num_jobs_match.group(1)) == 0:
        return None

    job_info_match = HADOOP_JOB_LIST_INFO_RE.match(job_list_lines[2])
    if not job_info_match:
        job_list_output_error()
    job_id = to_string(job_info_match.group(1))

    job_kill_out = to_string(
        check_output(*ssh_run(ssh_bin, address, ec2_key_pair_file,
                              ['hadoop', 'job', '-kill', job_id])))

    return job_kill_out
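# Hedged sketch of the `hadoop job -list` output assumed above; the real
# HADOOP_JOB_LIST_NUM_RE / HADOOP_JOB_LIST_INFO_RE live elsewhere, so the
# regexes below are only illustrative stand-ins.
import re

_JOB_LIST_NUM_RE_EXAMPLE = re.compile(r'(\d+) jobs? currently running')
_JOB_LIST_INFO_RE_EXAMPLE = re.compile(r'(\S+)')

_sample_job_list_lines = [
    '1 jobs currently running',
    'JobId\tState\tStartTime\tUserName\tPriority\tSchedulingInfo',
    'job_201112300655_0001\t1\t1325244510775\thadoop\tNORMAL\tNA',
]
# _JOB_LIST_NUM_RE_EXAMPLE.match(_sample_job_list_lines[0]).group(1) -> '1'
# _JOB_LIST_INFO_RE_EXAMPLE.match(_sample_job_list_lines[2]).group(1)
# -> 'job_201112300655_0001'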
def cleanup():
    # this does sometimes happen; see #1396
    for line in cat_proc.stderr:
        log.error('STDERR: ' + to_string(line.rstrip(b'\r\n')))

    cat_proc.stdout.close()
    cat_proc.stderr.close()

    returncode = cat_proc.wait()

    if returncode != 0:
        raise IOError("Could not stream %s" % filename)
def _run_on_all_nodes(runner, output_dir, cmd_args, print_stderr=True):
    """Given an :py:class:`EMRJobRunner`, run the command specified by
    *cmd_args* on all nodes in the cluster and save the stdout and stderr of
    each run to subdirectories of *output_dir*.

    You should probably have run :py:meth:`_enable_slave_ssh_access()`
    on the runner before calling this function.
    """
    master_addr = runner._address_of_master()
    addresses = [master_addr]

    ssh_bin = runner._opts['ssh_bin']
    ec2_key_pair_file = runner._opts['ec2_key_pair_file']

    keyfile = None
    slave_addrs = runner.fs.ssh_slave_hosts(master_addr)

    if slave_addrs:
        addresses += [
            '%s!%s' % (master_addr, slave_addr)
            for slave_addr in slave_addrs
        ]
        # copying key file like a boss (name of keyfile doesn't really matter)
        keyfile = 'mrboss-%s.pem' % random_identifier()
        _ssh_copy_key(ssh_bin, master_addr, ec2_key_pair_file, keyfile)

    for addr in addresses:

        stdout, stderr = _ssh_run_with_recursion(
            ssh_bin,
            addr,
            ec2_key_pair_file,
            keyfile,
            cmd_args,
        )

        if print_stderr:
            print('---')
            print('Command completed on %s.' % addr)
            print(to_string(stderr), end=' ')

        if '!' in addr:
            base_dir = os.path.join(output_dir, 'slave ' + addr.split('!')[1])
        else:
            base_dir = os.path.join(output_dir, 'master')

        if not os.path.exists(base_dir):
            os.makedirs(base_dir)

        with open(os.path.join(base_dir, 'stdout'), 'wb') as f:
            f.write(stdout)

        with open(os.path.join(base_dir, 'stderr'), 'wb') as f:
            f.write(stderr)
def ssh_terminate_single_job(ssh_bin, address, ec2_key_pair_file):
    """Terminate the only job running on the Hadoop cluster with master node
    *address* using 'hadoop job -kill JOB_ID'. Return string output of the
    kill command or None if there was no job to terminate. Raise
    :py:class:`IOError` if some other error occurred.

    :param ssh_bin: Path to ``ssh`` binary
    :param address: Address of your job's master node (obtained via
                    :py:meth:`boto.emr.EmrConnection.describe_jobflow`)
    :param ec2_key_pair_file: Path to the key pair file (argument to ``-i``)

    :return: output of the kill command as a string, or ``None`` if no job
             was running
    """
    job_list_out = to_string(check_output(*ssh_run(
        ssh_bin, address, ec2_key_pair_file, ['hadoop', 'job', '-list'])))
    job_list_lines = job_list_out.splitlines()

    def job_list_output_error():
        raise IOError('Could not read results of "hadoop job -list" and so'
                      ' could not terminate job:\n%s' % job_list_out)

    num_jobs_match = HADOOP_JOB_LIST_NUM_RE.match(job_list_lines[0])
    if not num_jobs_match:
        job_list_output_error()
    if int(num_jobs_match.group(1)) > 1:
        raise IOError('More than one job is running; unclear which one to'
                      ' terminate, so not terminating any jobs')
    if int(num_jobs_match.group(1)) == 0:
        return None

    job_info_match = HADOOP_JOB_LIST_INFO_RE.match(job_list_lines[2])
    if not job_info_match:
        job_list_output_error()
    job_id = to_string(job_info_match.group(1))

    job_kill_out = to_string(check_output(*ssh_run(
        ssh_bin, address, ec2_key_pair_file,
        ['hadoop', 'job', '-kill', job_id])))

    return job_kill_out
def _run_on_all_nodes(runner, output_dir, cmd_args, print_stderr=True):
    """Given an :py:class:`EMRJobRunner`, run the command specified by
    *cmd_args* on all nodes in the cluster and save the stdout and stderr of
    each run to subdirectories of *output_dir*.

    You should probably have run :py:meth:`_enable_slave_ssh_access()`
    on the runner before calling this function.
    """
    master_addr = runner._address_of_master()
    addresses = [master_addr]

    ssh_bin = runner._opts['ssh_bin']
    ec2_key_pair_file = runner._opts['ec2_key_pair_file']

    keyfile = None
    slave_addrs = runner.fs.ssh_slave_hosts(master_addr)

    if slave_addrs:
        addresses += ['%s!%s' % (master_addr, slave_addr)
                      for slave_addr in slave_addrs]
        # copying key file like a boss (name of keyfile doesn't really matter)
        keyfile = 'mrboss-%s.pem' % random_identifier()
        _ssh_copy_key(ssh_bin, master_addr, ec2_key_pair_file, keyfile)

    for addr in addresses:

        stdout, stderr = _ssh_run_with_recursion(
            ssh_bin,
            addr,
            ec2_key_pair_file,
            keyfile,
            cmd_args,
        )

        if print_stderr:
            print('---')
            print('Command completed on %s.' % addr)
            print(to_string(stderr), end=' ')

        if '!' in addr:
            base_dir = os.path.join(output_dir, 'slave ' + addr.split('!')[1])
        else:
            base_dir = os.path.join(output_dir, 'master')

        if not os.path.exists(base_dir):
            os.makedirs(base_dir)

        with open(os.path.join(base_dir, 'stdout'), 'wb') as f:
            f.write(stdout)

        with open(os.path.join(base_dir, 'stderr'), 'wb') as f:
            f.write(stderr)
def _ssh_slave_addresses(ssh_bin, master_address, ec2_key_pair_file):
    """Get the IP addresses of the slave nodes. Fails silently because it
    makes testing easier and if things are broken they will fail before this
    function is called.
    """
    if not ec2_key_pair_file or not os.path.exists(ec2_key_pair_file):
        return []  # this is a testing environment

    cmd = "hadoop dfsadmin -report | grep ^Name | cut -f2 -d: | cut -f2 -d' '"
    args = ['bash -c "%s"' % cmd]
    ips = to_string(_check_output(
        *_ssh_run(ssh_bin, master_address, ec2_key_pair_file, args)))
    return [ip for ip in ips.split('\n') if ip]
def ssh_slave_addresses(ssh_bin, master_address, ec2_key_pair_file):
    """Get the IP addresses of the slave nodes. Fails silently because it
    makes testing easier and if things are broken they will fail before this
    function is called.
    """
    if not ec2_key_pair_file or not os.path.exists(ec2_key_pair_file):
        return []  # this is a testing environment

    cmd = "hadoop dfsadmin -report | grep ^Name | cut -f2 -d: | cut -f2 -d' '"
    args = ['bash -c "%s"' % cmd]
    ips = to_string(check_output(
        *ssh_run(ssh_bin, master_address, ec2_key_pair_file, args)))
    return [ip for ip in ips.split('\n') if ip]
def get_hadoop_version(self):
    """Invoke the hadoop executable to determine its version"""
    # mkdir() needs this
    if not self._hadoop_version:
        stdout = self.invoke_hadoop(['version'], return_stdout=True)
        if stdout:
            first_line = stdout.split(b'\n')[0]
            m = _HADOOP_VERSION_RE.match(first_line)
            if m:
                self._hadoop_version = to_string(m.group('version'))
                log.info("Using Hadoop version %s" % self._hadoop_version)
            else:
                raise Exception('Unable to determine Hadoop version.')

    return self._hadoop_version
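# Hedged sketch of the version regex assumed by get_hadoop_version(); the
# real _HADOOP_VERSION_RE is defined elsewhere.
import re

_HADOOP_VERSION_RE_EXAMPLE = re.compile(rb'^Hadoop (?P<version>\S+)')
# _HADOOP_VERSION_RE_EXAMPLE.match(b'Hadoop 2.7.3').group('version')
# -> b'2.7.3' (which the method above decodes with to_string())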
def _parse_counters_0_20(counter_string):
    # 0.20 counters look like this:
    # {(groupid)(groupname)[(counterid)(countername)(countervalue)][...]...}
    groups = _GROUP_RE_0_20.findall(counter_string)
    if not groups:
        log.warning('Cannot parse Hadoop counter string: %s' % counter_string)

    for group_id, group_name, counter_str in groups:
        matches = _COUNTER_RE_0_20.findall(counter_str)

        try:
            group_name = counter_unescape(group_name)
        except ValueError:
            log.warning("Could not decode group name %r" % group_name)
            group_name = to_string(group_name)

        for counter_id, counter_name, counter_value in matches:
            try:
                counter_name = counter_unescape(counter_name)
            except ValueError:
                log.warning("Could not decode counter name %r" % counter_name)
                counter_name = to_string(counter_name)

            yield group_name, counter_name, int(counter_value)
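# Hedged illustration of the 0.20 counter format parsed above, with stand-in
# regexes; the real _GROUP_RE_0_20/_COUNTER_RE_0_20 (and counter_unescape())
# also handle escaped parens and brackets.
import re

_GROUP_RE_0_20_EXAMPLE = re.compile(r'{\(([^)]+)\)\(([^)]+)\)(.+?)}')
_COUNTER_RE_0_20_EXAMPLE = re.compile(r'\[\(([^)]+)\)\(([^)]+)\)\((\d+)\)\]')

_sample_counters_0_20 = (
    '{(org.apache.hadoop.mapred.JobInProgress$Counter)(Job Counters )'
    '[(TOTAL_LAUNCHED_REDUCES)(Launched reduce tasks)(1)]}')
# _GROUP_RE_0_20_EXAMPLE.findall(...) pulls out the group id, group name and
# the counter substring; _COUNTER_RE_0_20_EXAMPLE.findall() on that substring
# gives [('TOTAL_LAUNCHED_REDUCES', 'Launched reduce tasks', '1')], so the
# function above would yield ('Job Counters ', 'Launched reduce tasks', 1).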
def find_input_uri_for_mapper(lines):
    """Scan a log file or other iterable for the path of an input file
    for the first mapper on Hadoop. Just returns the path, or None if
    no match.

    In logs from EMR, we find these lines in ``task-attempts/*/syslog``

    Matching log lines look like::

        2010-07-27 17:54:54,344 INFO org.apache.hadoop.fs.s3native.NativeS3FileSystem (main): Opening 's3://yourbucket/logs/2010/07/23/log2-00077.gz' for reading
    """
    val = None
    for line in lines:
        match = _OPENING_FOR_READING_RE.match(line)
        if match:
            val = to_string(match.group(1))
    return val
def find_job_log_multiline_error(lines):
    """Scan a log file for an arbitrary multi-line error. Return it as a
    list of lines, or None if nothing was found.

    Here is an example error::

        MapAttempt TASK_TYPE="MAP" TASKID="task_201106280040_0001_m_000218" TASK_ATTEMPT_ID="attempt_201106280040_0001_m_000218_5" TASK_STATUS="FAILED" FINISH_TIME="1309246900665" HOSTNAME="/default-rack/ip-10-166-239-133.us-west-1.compute.internal" ERROR="Error initializing attempt_201106280040_0001_m_000218_5:
        java.io.IOException: Cannot run program "bash": java.io.IOException: error=12, Cannot allocate memory
                at java.lang.ProcessBuilder.start(ProcessBuilder.java:460)
                at org.apache.hadoop.util.Shell.runCommand(Shell.java:149)
                at org.apache.hadoop.util.Shell.run(Shell.java:134)
                at org.apache.hadoop.fs.DF.getAvailable(DF.java:73)
                at org.apache.hadoop.fs.LocalDirAllocator$AllocatorPerContext.getLocalPathForWrite(LocalDirAllocator.java:296)
                at org.apache.hadoop.fs.LocalDirAllocator.getLocalPathForWrite(LocalDirAllocator.java:124)
                at org.apache.hadoop.mapred.TaskTracker.localizeJob(TaskTracker.java:648)
                at org.apache.hadoop.mapred.TaskTracker.startNewTask(TaskTracker.java:1320)
                at org.apache.hadoop.mapred.TaskTracker.offerService(TaskTracker.java:956)
                at org.apache.hadoop.mapred.TaskTracker.run(TaskTracker.java:1357)
                at org.apache.hadoop.mapred.TaskTracker.main(TaskTracker.java:2361)
        Caused by: java.io.IOException: java.io.IOException: error=12, Cannot allocate memory
                at java.lang.UNIXProcess.<init>(UNIXProcess.java:148)
                at java.lang.ProcessImpl.start(ProcessImpl.java:65)
                at java.lang.ProcessBuilder.start(ProcessBuilder.java:453)
                ... 10 more
        "

    The first line returned will only include the text after ``ERROR="``,
    and discard the final line with just ``"``.

    These errors are parsed from jobs/\*.jar.
    """
    for line in lines:
        m = _MULTILINE_JOB_LOG_ERROR_RE.match(line)
        if m:
            st_lines = []
            if m.group('first_line'):
                st_lines.append(m.group('first_line'))
            for line in lines:
                st_lines.append(line)
                for line in lines:
                    if line.strip() == b'"':
                        break
                    st_lines.append(line)
                return [to_string(line) for line in st_lines]
    return None
def _wrap_streaming_pty_output(lines):
    """Make output from PTY running hadoop streaming behave like stderr.

    This screens out and logs lines that look like they came from stdout,
    and treats the EIO error as EOF.
    """
    while True:
        try:
            line = next(lines)  # okay to get StopIteration
            if _HADOOP_STDOUT_RE.match(line):
                _log_line_from_hadoop(to_string(line).rstrip('\r\n'))
            else:
                yield line
        except IOError as e:
            if e.errno == errno.EIO:
                return
            else:
                raise
def find_interesting_hadoop_streaming_error(lines):
    """Scan a log file or other iterable for a hadoop streaming error
    other than "Job not Successful!". Return the error as a string, or None
    if nothing found.

    In logs from EMR, we find these errors in ``steps/*/syslog``

    Example line::

        2010-07-27 19:53:35,451 ERROR org.apache.hadoop.streaming.StreamJob (main): Error launching job , Output path already exists : Output directory s3://yourbucket/logs/2010/07/23/ already exists and is not empty
    """
    for line in lines:
        match = (_HADOOP_STREAMING_ERROR_RE.match(line) or
                 _HADOOP_STREAMING_ERROR_RE_2.match(line))
        if match:
            msg = to_string(match.group(1))
            if msg != "Job not Successful!":
                return msg
    return None
def _ssh_ls(ssh_bin, address, ec2_key_pair_file, path, keyfile=None):
    """Recursively list files under ``path`` on the specified SSH host.
    Return a list of the paths found. Raises ``IOError`` if the path
    doesn't exist or SSH access fails.

    :param ssh_bin: Path to ``ssh`` binary
    :param address: Address of your job's master node
    :param ec2_key_pair_file: Path to the key pair file (argument to ``-i``)
    :param path: Path on the remote host to list
    :param keyfile: Name of the EMR private key file on the master node in
                    case ``path`` exists on one of the slave nodes
    """
    out = to_string(_check_output(*_ssh_run_with_recursion(
        ssh_bin, address, ec2_key_pair_file, keyfile,
        ['find', '-L', path, '-type', 'f'])))
    if 'No such file or directory' in out:
        raise IOError("No such file or directory: %s" % path)
    return out.split('\n')
def find_interesting_hadoop_streaming_error(lines):
    """Scan a log file or other iterable for a hadoop streaming error
    other than "Job not Successful!". Return the error as a string, or None
    if nothing found.

    In logs from EMR, we find these errors in ``steps/*/syslog``

    Example line::

        2010-07-27 19:53:35,451 ERROR org.apache.hadoop.streaming.StreamJob (main): Error launching job , Output path already exists : Output directory s3://yourbucket/logs/2010/07/23/ already exists and is not empty
    """
    for line in lines:
        match = (_HADOOP_STREAMING_ERROR_RE.match(line) or
                 _HADOOP_STREAMING_ERROR_RE_2.match(line))
        if match:
            msg = to_string(match.group(1))
            if msg != 'Job not Successful!':
                return msg
    return None
def ssh_ls(ssh_bin, address, ec2_key_pair_file, path, keyfile=None):
    """Recursively list files under ``path`` on the specified SSH host.
    Return a list of the paths found. Raises ``IOError`` if the path
    doesn't exist or SSH access fails.

    :param ssh_bin: Path to ``ssh`` binary
    :param address: Address of your job's master node (obtained via
                    :py:meth:`boto.emr.EmrConnection.describe_jobflow`)
    :param ec2_key_pair_file: Path to the key pair file (argument to ``-i``)
    :param path: Path on the remote host to list
    :param keyfile: Name of the EMR private key file on the master node in
                    case ``path`` exists on one of the slave nodes
    """
    out = to_string(
        check_output(*ssh_run_with_recursion(
            ssh_bin, address, ec2_key_pair_file, keyfile,
            ['find', '-L', path, '-type', 'f'])))
    if 'No such file or directory' in out:
        raise IOError("No such file or directory: %s" % path)
    return out.split('\n')
def test_non_ascii_unicode(self):
    self.assertEqual(to_string(u'café'), u'café')
def test_ascii_unicode(self):
    self.assertEqual(to_string(u'foo'), u'foo')
def test_latin_1_bytes(self):
    self.assertEqual(to_string(b'caf\xe9'), 'caf\xe9')
def _run_job_in_hadoop(self):
    for step_num in range(self._num_steps()):
        step_args = self._args_for_step(step_num)

        # log this *after* _args_for_step(), which can start a search
        # for the Hadoop streaming jar
        log.info('Running step %d of %d' %
                 (step_num + 1, self._num_steps()))
        log.debug('> %s' % cmd_line(step_args))

        # try to use a PTY if it's available
        try:
            pid, master_fd = pty.fork()
        except (AttributeError, OSError):
            # no PTYs, just use Popen

            # user won't get much feedback for a while, so tell them
            # Hadoop is running
            log.debug('No PTY available, using Popen() to invoke Hadoop')

            step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE)

            step_info = _process_stderr_from_streaming(step_proc.stderr)

            # there shouldn't be much output to STDOUT
            for line in step_proc.stdout:
                _log_line_from_hadoop(to_string(line).strip('\r\n'))

            step_proc.stdout.close()
            step_proc.stderr.close()

            returncode = step_proc.wait()
        else:
            # we have PTYs
            if pid == 0:
                # we are the child process
                os.execvp(step_args[0], step_args)
            else:
                log.debug('Invoking Hadoop via PTY')

                with os.fdopen(master_fd, 'rb') as master:
                    # reading from master gives us the subprocess's
                    # stderr and stdout (it's a fake terminal)
                    step_info = _process_stderr_from_streaming(
                        _wrap_streaming_pty_output(master))
                    _, returncode = os.waitpid(pid, 0)

        # make sure output_dir is filled
        if not step_info['output_dir']:
            step_info['output_dir'] = self._hdfs_step_output_dir(step_num)

        if not step_info['counters']:
            pass  # TODO: fetch counters; see _fetch_counters()

        self._steps_info.append(step_info)

        # just print counters for this one step
        self._print_counters(step_nums=[step_num])

        if returncode:
            err_lines = [
                'Job failed with return code %d: %s' %
                (returncode, cmd_line(step_args))]

            cause = self._find_probable_cause_of_failure(**step_info)

            if cause:
                err_lines.append('')  # pad with empty line
                err_lines.extend(_format_cause_of_failure(cause))

            for err_line in err_lines:
                log.error(err_line)
            raise Exception('\n'.join(err_lines) + '\n')
def _run_job_in_hadoop(self):
    for step_num in range(self._num_steps()):
        step_args = self._args_for_step(step_num)

        # log this *after* _args_for_step(), which can start a search
        # for the Hadoop streaming jar
        log.info('Running step %d of %d' %
                 (step_num + 1, self._num_steps()))
        log.debug('> %s' % cmd_line(step_args))

        log_interpretation = {}
        self._log_interpretations.append(log_interpretation)

        # try to use a PTY if it's available
        try:
            pid, master_fd = pty.fork()
        except (AttributeError, OSError):
            # no PTYs, just use Popen

            # user won't get much feedback for a while, so tell them
            # Hadoop is running
            log.debug('No PTY available, using Popen() to invoke Hadoop')

            step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE)

            step_interpretation = _interpret_hadoop_jar_command_stderr(
                step_proc.stderr,
                record_callback=_log_record_from_hadoop)

            # there shouldn't be much output to STDOUT
            for line in step_proc.stdout:
                _log_line_from_hadoop(to_string(line).strip('\r\n'))

            step_proc.stdout.close()
            step_proc.stderr.close()

            returncode = step_proc.wait()
        else:
            # we have PTYs
            if pid == 0:
                # we are the child process
                os.execvp(step_args[0], step_args)
            else:
                log.debug('Invoking Hadoop via PTY')

                with os.fdopen(master_fd, 'rb') as master:
                    # reading from master gives us the subprocess's
                    # stderr and stdout (it's a fake terminal)
                    step_interpretation = (
                        _interpret_hadoop_jar_command_stderr(
                            master,
                            record_callback=_log_record_from_hadoop))
                    _, returncode = os.waitpid(pid, 0)

        # make sure output_dir is filled
        if 'output_dir' not in step_interpretation:
            step_interpretation['output_dir'] = (
                self._hdfs_step_output_dir(step_num))

        log_interpretation['step'] = step_interpretation

        counters = self._pick_counters(log_interpretation)
        if counters:
            log.info(_format_counters(counters))
        else:
            log.warning('No counters found')

        if returncode:
            error = self._pick_error(log_interpretation)
            if error:
                log.error('Probable cause of failure:\n\n%s\n' %
                          _format_error(error))

            # use CalledProcessError's well-known message format
            reason = str(CalledProcessError(returncode, step_args))
            raise StepFailedException(
                reason=reason, step_num=step_num,
                num_steps=self._num_steps())
def _process_streaming_stderr_lines(self, lines):
    """Process lines (with \r and \n stripped) from the Hadoop binary's
    stderr. Return a dict of counters for the step.

    This handles counter parsing, debug printouts, and the job timestamp.
    """
    # counter-parsing state
    step_counters = {}
    parsing_counters = False
    counter_group = None
    counter_group_indent = None

    for line in lines:
        line = _HADOOP_STREAMING_OUTPUT_RE.match(line).group(2)

        # don't print HADOOP: <counter stuff>, since we print
        # counters later anyway

        # start of counters
        if not (parsing_counters or step_counters):
            m = _HADOOP_COUNTERS_START_RE.match(line)
            if m:
                parsing_counters = True
                log.info('Parsing counters from hadoop output')
                continue

        if parsing_counters:
            if not (counter_group is None or counter_group_indent is None):
                m = _HADOOP_COUNTER_RE.match(line)
                if m and len(m.group('indent')) > counter_group_indent:
                    counter = to_string(m.group('counter'))
                    amount = int(m.group('amount'))
                    log.debug(' counter: %s=%d' % (counter, amount))
                    step_counters.setdefault(counter_group, {})
                    step_counters[counter_group][counter] = amount
                    continue

            m = _HADOOP_COUNTER_GROUP_RE.match(line)
            if m:
                counter_group = to_string(m.group('group'))
                counter_group_indent = len(m.group('indent'))
                log.debug(' counter group: %s' % counter_group)
                continue
            else:
                parsing_counters = False

        log.info('HADOOP: ' + to_string(line))

        if b'Streaming Job Failed!' in line:
            raise Exception(line)

        # The job identifier is printed to stderr. We only want to parse it
        # once because we know how many steps we have and just want to know
        # what Hadoop thinks the first step's number is.
        if self._job_timestamp is None:
            m = _HADOOP_JOB_TIMESTAMP_RE.match(line)
            if m:
                self._job_timestamp = m.group('timestamp')
                self._start_step_num = int(m.group('step_num'))

    return step_counters
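# Hedged sketch of the counter-block regexes assumed by
# _process_streaming_stderr_lines() above; the real _HADOOP_COUNTER_GROUP_RE
# and _HADOOP_COUNTER_RE are defined elsewhere. After the log prefix is
# stripped, the counter block looks roughly like:
#
#   Counters: 2
#           Job Counters
#                   Launched map tasks=2
#
# and indentation depth is what distinguishes counters from their group.
import re

_COUNTER_GROUP_RE_EXAMPLE = re.compile(r'^(?P<indent>\s+)(?P<group>\S.*)$')
_COUNTER_RE_EXAMPLE = re.compile(
    r'^(?P<indent>\s+)(?P<counter>.+)=(?P<amount>\d+)\s*$')

# _COUNTER_RE_EXAMPLE.match('                Launched map tasks=2') captures
# a 16-space indent, counter 'Launched map tasks' and amount '2', while a
# group line like '        Job Counters' matches only the group regex.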
def _run_job_in_hadoop(self):
    for step_num in range(self._num_steps()):
        step_args = self._args_for_step(step_num)

        # log this *after* _args_for_step(), which can start a search
        # for the Hadoop streaming jar
        log.info('Running step %d of %d' %
                 (step_num + 1, self._num_steps()))
        log.debug('> %s' % cmd_line(step_args))

        # try to use a PTY if it's available
        try:
            pid, master_fd = pty.fork()
        except (AttributeError, OSError):
            # no PTYs, just use Popen

            # user won't get much feedback for a while, so tell them
            # Hadoop is running
            log.debug('No PTY available, using Popen() to invoke Hadoop')

            step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE)

            step_info = _process_stderr_from_streaming(step_proc.stderr)

            # there shouldn't be much output to STDOUT
            for line in step_proc.stdout:
                _log_line_from_hadoop(to_string(line).strip('\r\n'))

            step_proc.stdout.close()
            step_proc.stderr.close()

            returncode = step_proc.wait()
        else:
            # we have PTYs
            if pid == 0:
                # we are the child process
                os.execvp(step_args[0], step_args)
            else:
                log.debug('Invoking Hadoop via PTY')

                with os.fdopen(master_fd, 'rb') as master:
                    # reading from master gives us the subprocess's
                    # stderr and stdout (it's a fake terminal)
                    step_info = _process_stderr_from_streaming(
                        _wrap_streaming_pty_output(master))
                    _, returncode = os.waitpid(pid, 0)

        # make sure output_dir is filled
        if not step_info['output_dir']:
            step_info['output_dir'] = self._hdfs_step_output_dir(step_num)

        if not step_info['counters']:
            log.info('Attempting to read counters from history log')

            history = self._interpret_history_log(step_info)
            if history:
                step_info['counters'] = history['counters']

        self._steps_info.append(step_info)

        # just print counters for this one step
        self._print_counters(step_nums=[step_num])

        if returncode:
            err_lines = [
                'Job failed with return code %d: %s' %
                (returncode, cmd_line(step_args))]

            cause = self._find_probable_cause_of_failure(**step_info)

            if cause:
                err_lines.append('')  # pad with empty line
                err_lines.extend(_format_cause_of_failure(cause))

            for err_line in err_lines:
                log.error(err_line)
            raise Exception('\n'.join(err_lines) + '\n')
def _run_job_in_hadoop(self):
    self._counters = []

    for step_num in range(self._num_steps()):
        log.debug('running step %d of %d' %
                  (step_num + 1, self._num_steps()))

        step_args = self._args_for_step(step_num)

        log.debug('> %s' % cmd_line(step_args))

        # try to use a PTY if it's available
        try:
            pid, master_fd = pty.fork()
        except (AttributeError, OSError):
            # no PTYs, just use Popen
            step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE)

            step_counters = self._process_stderr_from_streaming(
                step_proc.stderr)

            # there shouldn't be much output to STDOUT
            for line in step_proc.stdout:
                log.error('STDOUT: ' + to_string(line.strip(b'\n')))

            step_proc.stdout.close()
            step_proc.stderr.close()

            returncode = step_proc.wait()
        else:
            # we have PTYs
            if pid == 0:
                # we are the child process
                os.execvp(step_args[0], step_args)
            else:
                with os.fdopen(master_fd, 'rb') as master:
                    # reading from master gives us the subprocess's
                    # stderr and stdout (it's a fake terminal)
                    step_counters = self._process_stderr_from_streaming(
                        master)
                    _, returncode = os.waitpid(pid, 0)

        # TODO: looks like we are parsing counters
        # but they don't get printed out.
        if step_counters:
            self._counters.append(step_counters)
        else:
            self._fetch_counters([step_num + self._start_step_num])

        # just print counters for this one step
        self._print_counters(step_nums=[step_num])

        if returncode:
            msg = ('Job failed with return code %d: %s' %
                   (returncode, step_args))
            log.error(msg)
            # look for a Python traceback
            cause = self._find_probable_cause_of_failure(
                [step_num + self._start_step_num])
            if cause:
                # log cause, and put it in exception
                cause_msg = []  # lines to log and put in exception
                cause_msg.append('Probable cause of failure (from %s):' %
                                 cause['log_file_uri'])
                cause_msg.extend(line.strip('\n') for line in cause['lines'])
                if cause['input_uri']:
                    cause_msg.append('(while reading from %s)' %
                                     cause['input_uri'])

                for line in cause_msg:
                    log.error(line)

                # add cause_msg to exception message
                msg += '\n' + '\n'.join(cause_msg) + '\n'

            raise CalledProcessError(returncode, step_args)
def _run_job_in_hadoop(self):
    for step_num, step in enumerate(self._get_steps()):
        self._warn_about_spark_archives(step)

        step_args = self._args_for_step(step_num)
        env = self._env_for_step(step_num)

        # log this *after* _args_for_step(), which can start a search
        # for the Hadoop streaming jar
        log.info('Running step %d of %d...' %
                 (step_num + 1, self._num_steps()))
        log.debug('> %s' % cmd_line(step_args))
        log.debug('  with environment: %r' % sorted(env.items()))

        log_interpretation = {}
        self._log_interpretations.append(log_interpretation)

        # try to use a PTY if it's available
        try:
            pid, master_fd = pty.fork()
        except (AttributeError, OSError):
            # no PTYs, just use Popen

            # user won't get much feedback for a while, so tell them
            # Hadoop is running
            log.debug('No PTY available, using Popen() to invoke Hadoop')

            step_proc = Popen(
                step_args, stdout=PIPE, stderr=PIPE, env=env)

            step_interpretation = _interpret_hadoop_jar_command_stderr(
                step_proc.stderr,
                record_callback=_log_record_from_hadoop)

            # there shouldn't be much output to STDOUT
            for line in step_proc.stdout:
                _log_line_from_hadoop(to_string(line).strip('\r\n'))

            step_proc.stdout.close()
            step_proc.stderr.close()

            returncode = step_proc.wait()
        else:
            # we have PTYs
            if pid == 0:
                # we are the child process
                os.execvpe(step_args[0], step_args, env)
            else:
                log.debug('Invoking Hadoop via PTY')

                with os.fdopen(master_fd, 'rb') as master:
                    # reading from master gives us the subprocess's
                    # stderr and stdout (it's a fake terminal)
                    step_interpretation = (
                        _interpret_hadoop_jar_command_stderr(
                            master,
                            record_callback=_log_record_from_hadoop))
                    _, returncode = os.waitpid(pid, 0)

        # make sure output_dir is filled
        if 'output_dir' not in step_interpretation:
            step_interpretation['output_dir'] = (
                self._step_output_uri(step_num))

        log_interpretation['step'] = step_interpretation

        step_type = step['type']

        if not _is_spark_step_type(step_type):
            counters = self._pick_counters(log_interpretation, step_type)
            if counters:
                log.info(_format_counters(counters))
            else:
                log.warning('No counters found')

        if returncode:
            error = self._pick_error(log_interpretation, step_type)
            if error:
                log.error('Probable cause of failure:\n\n%s\n' %
                          _format_error(error))

            # use CalledProcessError's well-known message format
            reason = str(CalledProcessError(returncode, step_args))
            raise StepFailedException(
                reason=reason, step_num=step_num,
                num_steps=self._num_steps())
def _run_job_in_hadoop(self):
    self._counters = []

    for step_num in range(self._num_steps()):
        log.debug('running step %d of %d' %
                  (step_num + 1, self._num_steps()))

        step_args = self._args_for_step(step_num)

        log.debug('> %s' % cmd_line(step_args))

        # try to use a PTY if it's available
        try:
            pid, master_fd = pty.fork()
        except (AttributeError, OSError):
            # no PTYs, just use Popen
            step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE)

            self._process_stderr_from_streaming(step_proc.stderr)

            # there shouldn't be much output to STDOUT
            for line in step_proc.stdout:
                log.error('STDOUT: ' + to_string(line.strip(b'\n')))

            returncode = step_proc.wait()
        else:
            # we have PTYs
            if pid == 0:
                # we are the child process
                os.execvp(step_args[0], step_args)
            else:
                with os.fdopen(master_fd, 'rb') as master:
                    # reading from master gives us the subprocess's
                    # stderr and stdout (it's a fake terminal)
                    self._process_stderr_from_streaming(master)
                    _, returncode = os.waitpid(pid, 0)

        if returncode == 0:
            # parsing needs step number for whole job
            self._fetch_counters([step_num + self._start_step_num])
            # printing needs step number relevant to this run of mrjob
            self.print_counters([step_num + 1])
        else:
            msg = ('Job failed with return code %d: %s' %
                   (returncode, step_args))
            log.error(msg)
            # look for a Python traceback
            cause = self._find_probable_cause_of_failure(
                [step_num + self._start_step_num])
            if cause:
                # log cause, and put it in exception
                cause_msg = []  # lines to log and put in exception
                cause_msg.append('Probable cause of failure (from %s):' %
                                 cause['log_file_uri'])
                cause_msg.extend(
                    line.strip('\n') for line in cause['lines'])
                if cause['input_uri']:
                    cause_msg.append('(while reading from %s)' %
                                     cause['input_uri'])

                for line in cause_msg:
                    log.error(line)

                # add cause_msg to exception message
                msg += '\n' + '\n'.join(cause_msg) + '\n'

            raise CalledProcessError(returncode, step_args)
def _run_job_in_hadoop(self):
    for step_num in range(self._num_steps()):
        step_args = self._args_for_step(step_num)

        # log this *after* _args_for_step(), which can start a search
        # for the Hadoop streaming jar
        log.info('Running step %d of %d' %
                 (step_num + 1, self._num_steps()))
        log.debug('> %s' % cmd_line(step_args))

        log_interpretation = {}
        self._log_interpretations.append(log_interpretation)

        # try to use a PTY if it's available
        try:
            pid, master_fd = pty.fork()
        except (AttributeError, OSError):
            # no PTYs, just use Popen

            # user won't get much feedback for a while, so tell them
            # Hadoop is running
            log.debug('No PTY available, using Popen() to invoke Hadoop')

            step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE)

            step_interpretation = _interpret_hadoop_jar_command_stderr(
                step_proc.stderr,
                record_callback=_log_record_from_hadoop)

            # there shouldn't be much output to STDOUT
            for line in step_proc.stdout:
                _log_line_from_hadoop(to_string(line).strip('\r\n'))

            step_proc.stdout.close()
            step_proc.stderr.close()

            returncode = step_proc.wait()
        else:
            # we have PTYs
            if pid == 0:
                # we are the child process
                os.execvp(step_args[0], step_args)
            else:
                log.debug('Invoking Hadoop via PTY')

                with os.fdopen(master_fd, 'rb') as master:
                    # reading from master gives us the subprocess's
                    # stderr and stdout (it's a fake terminal)
                    step_interpretation = (
                        _interpret_hadoop_jar_command_stderr(
                            master,
                            record_callback=_log_record_from_hadoop))
                    _, returncode = os.waitpid(pid, 0)

        # make sure output_dir is filled
        if 'output_dir' not in step_interpretation:
            step_interpretation['output_dir'] = (
                self._hdfs_step_output_dir(step_num))

        log_interpretation['step'] = step_interpretation

        if 'counters' not in step_interpretation:
            log.info('Attempting to read counters from history log')
            self._interpret_history_log(log_interpretation)

        # just print counters for this one step
        self._print_counters(step_nums=[step_num])

        if returncode:
            error = self._pick_error(log_interpretation)
            if error:
                log.error('Probable cause of failure:\n\n%s\n' %
                          _format_error(error))

            raise CalledProcessError(returncode, step_args)