def _wait_for_process(self, proc_dict, step_num):
    # handle counters, status msgs, and other stuff on stderr
    proc = proc_dict['proc']

    stderr_lines = self._process_stderr_from_script(
        proc.stderr, step_num=step_num)
    tb_lines = _find_python_traceback(stderr_lines)

    # proc.stdout isn't always defined
    if proc.stdout:
        proc.stdout.close()
    proc.stderr.close()

    returncode = proc.wait()

    if returncode != 0:
        # show counters before raising exception
        counters = self._counters[step_num]
        if counters:
            log.info(_format_counters(counters))

        # try to throw a useful exception
        if tb_lines:
            for line in tb_lines:
                log.error(line.rstrip('\r\n'))

        reason = str(CalledProcessError(returncode, proc_dict['args']))
        raise StepFailedException(
            reason=reason, step_num=step_num,
            num_steps=len(self._get_steps()))
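# A minimal sketch of the _find_python_traceback() helper used above; its
# real definition isn't part of this excerpt. The assumption: it scans
# stderr for a "Traceback (most recent call last):" header, collects the
# indented frame lines that follow, plus the final unindented exception
# line, and returns None if no traceback was found.
def _find_python_traceback(lines):
    tb_lines = []
    in_traceback = False

    for line in lines:
        if line.startswith('Traceback (most recent call last):'):
            tb_lines = [line]
            in_traceback = True
        elif in_traceback:
            tb_lines.append(line)
            # frame/source lines are indented; the final "SomeError: ..."
            # line is not, and ends the traceback
            if not line.startswith(' '):
                in_traceback = False

    return tb_lines or None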
def _wait_for_step_to_complete(self, job_id, step_num=None, num_steps=None):
    """Wait for the step with the given job ID to complete, and fetch
    counters. If it fails, attempt to diagnose the error, and raise an
    exception.

    This also adds an item to self._log_interpretations
    """
    log_interpretation = dict(job_id=job_id)
    self._log_interpretations.append(log_interpretation)

    while True:
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#JobStatus  # noqa
        job_result = self._api_job_get(job_id)
        job_state = job_result['status']['state']

        log.info('%s => %s' % (job_id, job_state))

        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#State  # noqa
        if job_state in _DATAPROC_JOB_STATES_ACTIVE:
            self._wait_for_api('job completion')
            continue

        # we're done, will return at the end of this
        elif job_state == 'DONE':
            break

        raise StepFailedException(step_num=step_num, num_steps=num_steps)
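# _DATAPROC_JOB_STATES_ACTIVE isn't defined in this excerpt. Judging from
# the later versions of this method below, which spell the states out
# inline, it presumably covers the states matched by Dataproc's ACTIVE job
# state matcher, plus SETUP_DONE:
_DATAPROC_JOB_STATES_ACTIVE = frozenset(
    ['PENDING', 'SETUP_DONE', 'RUNNING', 'CANCEL_PENDING'])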
def _wait_for_step_to_complete(self, job_id, step_num=None, num_steps=None):
    """Wait for the step with the given job ID to complete, and fetch
    counters. If it fails, attempt to diagnose the error, and raise an
    exception.

    This also adds an item to self._log_interpretations
    """
    log_interpretation = dict(job_id=job_id)
    self._log_interpretations.append(log_interpretation)

    while True:
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#JobStatus  # noqa
        job = self._get_job(job_id)

        job_state = job.status.State.Name(job.status.state)

        log.info('%s => %s' % (job_id, job_state))

        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#State  # noqa
        # these are the states covered by the ACTIVE job state matcher,
        # plus SETUP_DONE
        if job_state in (
                'PENDING', 'RUNNING', 'CANCEL_PENDING', 'SETUP_DONE'):
            self._wait_for_api('job completion')
            continue

        # we're done, will return at the end of this
        elif job_state == 'DONE':
            break

        raise StepFailedException(step_num=step_num, num_steps=num_steps)
def _run_step_on_spark(self, step, step_num):
    if self._opts['upload_archives']:
        log.warning('Spark master %r will probably ignore archives' %
                    self._spark_master())

    step_args = self._args_for_spark_step(step_num)

    env = dict(os.environ)
    env.update(self._spark_cmdenv(step_num))

    log.debug('> %s' % cmd_line(step_args))
    log.debug('  with environment: %r' % sorted(env.items()))

    def _log_line(line):
        log.info('  %s' % to_unicode(line).strip('\r\n'))

    # try to use a PTY if it's available
    try:
        pid, master_fd = pty.fork()
    except (AttributeError, OSError):
        # no PTYs, just use Popen

        # user won't get much feedback for a while, so tell them
        # spark-submit is running
        log.debug('No PTY available, using Popen() to invoke spark-submit')

        step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE, env=env)

        for record in _parse_hadoop_log4j_records(
                _yield_lines_from_pty_or_pipe(step_proc.stderr)):
            _log_log4j_record(record)

        # there shouldn't be much output on STDOUT
        for line in step_proc.stdout:
            _log_line_from_driver(line)

        step_proc.stdout.close()
        step_proc.stderr.close()

        returncode = step_proc.wait()
    else:
        # we have PTYs
        if pid == 0:  # we are the child process
            os.execvpe(step_args[0], step_args, env)
        else:
            log.debug('Invoking spark-submit via PTY')

            with os.fdopen(master_fd, 'rb') as master:
                for record in _parse_hadoop_log4j_records(
                        _yield_lines_from_pty_or_pipe(master)):
                    _log_log4j_record(record)
                _, returncode = os.waitpid(pid, 0)

    if returncode:
        reason = str(CalledProcessError(returncode, step_args))
        raise StepFailedException(
            reason=reason, step_num=step_num,
            num_steps=self._num_steps())
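# A sketch of the _yield_lines_from_pty_or_pipe() helper, which isn't shown
# in this excerpt. The assumption: on Linux, reading from a PTY master
# raises IOError/OSError with errno EIO when the child exits (rather than
# returning ''), so the helper swallows that specific error to make PTY and
# pipe sources look alike to the log4j parser.
import errno


def _yield_lines_from_pty_or_pipe(fileobj):
    try:
        for line in fileobj:
            yield line
    except (IOError, OSError) as e:
        # PTYs signal EOF with EIO; anything else is a real error
        if e.errno != errno.EIO:
            raise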
def _wait_for_step_to_complete(self, job_id, step_num, num_steps):
    """Wait for the step with the given job ID to complete, and fetch
    counters. If it fails, attempt to diagnose the error, and raise an
    exception.

    This also adds an item to self._log_interpretations
    """
    log_interpretation = dict(job_id=job_id)
    self._log_interpretations.append(log_interpretation)

    log_interpretation['step'] = {}
    step_type = self._get_step(step_num)['type']

    while True:
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#JobStatus  # noqa
        job = self._get_job(job_id)

        job_state = job.status.State.Name(job.status.state)

        log.info('%s => %s' % (job_id, job_state))

        log_interpretation['step']['driver_output_uri'] = (
            job.driver_output_resource_uri)

        self._interpret_step_logs(log_interpretation, step_type)

        progress = log_interpretation['step'].get('progress')
        if progress:
            log.info(' ' + progress['message'])

        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#State  # noqa
        # these are the states covered by the ACTIVE job state matcher,
        # plus SETUP_DONE
        if job_state in (
                'PENDING', 'RUNNING', 'CANCEL_PENDING', 'SETUP_DONE'):
            self._wait_for_api('job completion')
            continue

        # print counters if job wasn't CANCELLED
        if job_state != 'CANCELLED':
            self._log_counters(log_interpretation, step_num)

        if job_state == 'ERROR':
            error = self._pick_error(log_interpretation, step_type)
            if error:
                log.error('Probable cause of failure:\n\n%s\n\n' %
                          _format_error(error))

        # we're done, will return at the end of this
        if job_state == 'DONE':
            break
        else:
            raise StepFailedException(
                step_num=step_num, num_steps=num_steps)
def _wait_for_step_to_complete(self, job_id, step_num=None, num_steps=None):
    """Wait for the step with the given job ID to complete, and fetch
    counters. If it fails, attempt to diagnose the error, and raise an
    exception.

    This also adds an item to self._log_interpretations
    """
    log_interpretation = dict(job_id=job_id)
    self._log_interpretations.append(log_interpretation)

    step_interpretation = {}
    log_interpretation['step'] = step_interpretation

    while True:
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#JobStatus  # noqa
        job = self._get_job(job_id)

        job_state = job.status.State.Name(job.status.state)

        driver_output_uri = job.driver_output_resource_uri

        log.info('%s => %s' % (job_id, job_state))

        # interpret driver output so far
        if driver_output_uri:
            self._update_step_interpretation(
                step_interpretation, driver_output_uri)

        if step_interpretation.get('progress'):
            log.info(' ' + step_interpretation['progress']['message'])

        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#State  # noqa
        # these are the states covered by the ACTIVE job state matcher,
        # plus SETUP_DONE
        if job_state in (
                'PENDING', 'RUNNING', 'CANCEL_PENDING', 'SETUP_DONE'):
            self._wait_for_api('job completion')
            continue

        # print counters if job wasn't CANCELLED
        if job_state != 'CANCELLED':
            self._log_counters(log_interpretation, step_num)

        # we're done, will return at the end of this
        if job_state == 'DONE':
            break
        else:
            raise StepFailedException(
                step_num=step_num, num_steps=num_steps)
def _run_step_on_spark(self, step, step_num, last_step_num=None):
    if self._opts['upload_archives'] and self._spark_master() != 'yarn':
        log.warning('Spark master %r will probably ignore archives' %
                    self._spark_master())

    spark_submit_args = self._args_for_spark_step(step_num, last_step_num)

    env = dict(os.environ)
    env.update(self._spark_cmdenv(step_num))

    returncode, step_interpretation = self._run_spark_submit(
        spark_submit_args, env, record_callback=_log_log4j_record)

    counters = None
    if step['type'] == 'streaming':
        counter_file = self.fs.join(
            self._counter_output_dir(step_num), 'part-*')
        counter_json = b''.join(self.fs.cat(counter_file))
        if counter_json.strip():
            # json.loads() on Python 3.4/3.5 can't take bytes
            counters = json.loads(to_unicode(counter_json))

    if isinstance(counters, list):
        self._counters.extend(counters)

        # desc_num is 1-indexed user-readable step num
        for desc_num, counter_dict in enumerate(
                counters, start=(step_num + 1)):
            if counter_dict:
                log.info(_format_counters(
                    counter_dict,
                    desc=('Counters for step %d' % desc_num)))

    # for non-streaming steps, there are no counters.
    # pad self._counters to match number of steps
    while len(self._counters) < (last_step_num or step_num) + 1:
        self._counters.append({})

    if returncode:
        error = _pick_error(dict(step=step_interpretation))
        if error:
            _log_probable_cause_of_failure(log, error)

        reason = str(CalledProcessError(returncode, spark_submit_args))
        raise StepFailedException(
            reason=reason, step_num=step_num,
            last_step_num=last_step_num, num_steps=self._num_steps())
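# An illustrative (hypothetical) example of the counter JSON a streaming
# step is assumed to write to its counter output dir: a list with one
# group -> counter -> amount dict per substep, which is the shape the
# isinstance(counters, list) check and _format_counters() call above expect:
#
#   [
#       {'File System Counters': {'FILE: Number of bytes read': 61}},
#       {},  # a substep that incremented no counters
#   ]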
def _run_step_on_spark(self, step, step_num):
    if self._opts['upload_archives'] and self._spark_master() != 'yarn':
        log.warning('Spark master %r will probably ignore archives' %
                    self._spark_master())

    spark_submit_args = self._args_for_spark_step(step_num)

    env = dict(os.environ)
    env.update(self._spark_cmdenv(step_num))

    returncode = self._run_spark_submit(
        spark_submit_args, env, record_callback=_log_log4j_record)

    if returncode:
        reason = str(CalledProcessError(returncode, spark_submit_args))
        raise StepFailedException(
            reason=reason, step_num=step_num,
            num_steps=self._num_steps())
def _run_job_in_hadoop(self):
    for step_num, step in enumerate(self._get_steps()):
        self._warn_about_spark_archives(step)

        step_type = step['type']
        step_args = self._args_for_step(step_num)
        env = _fix_env(self._env_for_step(step_num))

        # log this *after* _args_for_step(), which can start a search
        # for the Hadoop streaming jar
        log.info('Running step %d of %d...' %
                 (step_num + 1, self._num_steps()))
        log.debug('> %s' % cmd_line(step_args))
        log.debug('  with environment: %r' % sorted(env.items()))

        log_interpretation = {}
        self._log_interpretations.append(log_interpretation)

        if self._step_type_uses_spark(step_type):
            returncode, step_interpretation = self._run_spark_submit(
                step_args, env, record_callback=_log_log4j_record)
        else:
            returncode, step_interpretation = self._run_hadoop(
                step_args, env, record_callback=_log_record_from_hadoop)

        # make sure output_dir is filled (used for history log)
        if 'output_dir' not in step_interpretation:
            step_interpretation['output_dir'] = (
                self._step_output_uri(step_num))

        log_interpretation['step'] = step_interpretation

        self._log_counters(log_interpretation, step_num)

        if returncode:
            error = self._pick_error(log_interpretation, step_type)
            if error:
                _log_probable_cause_of_failure(log, error)

            # use CalledProcessError's well-known message format
            reason = str(CalledProcessError(returncode, step_args))
            raise StepFailedException(
                reason=reason, step_num=step_num,
                num_steps=self._num_steps())
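# A minimal sketch of the _fix_env() helper, whose definition isn't in this
# excerpt. The assumption: the subprocess environment must contain native
# strings, so _fix_env() presumably coerces any bytes or non-string values
# before os.execvpe()/Popen() sees them.
def _fix_env(env):
    def _to_str(s):
        # decode bytes, stringify anything else (assumed behavior)
        return s.decode('utf_8') if isinstance(s, bytes) else str(s)

    return {_to_str(k): _to_str(v) for k, v in env.items()}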
def _run_step_on_spark(self, step, step_num):
    if self._opts['upload_archives']:
        log.warning('Spark master %r will probably ignore archives' %
                    self._spark_master())

    spark_submit_args = self._args_for_spark_step(step_num)

    env = dict(os.environ)
    env.update(self._spark_cmdenv(step_num))

    returncode, step_interpretation = self._run_spark_submit(
        spark_submit_args, env, record_callback=_log_log4j_record)

    if returncode:
        error = _pick_error(dict(step=step_interpretation))
        if error:
            _log_probable_cause_of_failure(log, error)

        reason = str(CalledProcessError(returncode, spark_submit_args))
        raise StepFailedException(
            reason=reason, step_num=step_num,
            num_steps=self._num_steps())
def test_step_desc(self):
    ex = StepFailedException(step_desc='Master node setup step')
    self.assertEqual(str(ex), 'Master node setup step failed')
    self.assertEqual(
        repr(ex),
        "StepFailedException(step_desc='Master node setup step')")
def test_num_steps_with_no_step_num(self):
    ex = StepFailedException(num_steps=4)
    self.assertEqual(str(ex), 'Step failed')
    self.assertEqual(repr(ex), 'StepFailedException(num_steps=4)')
def test_step_num_with_num_steps(self):
    ex = StepFailedException(step_num=0, num_steps=4)
    self.assertEqual(str(ex), 'Step 1 of 4 failed')
    self.assertEqual(
        repr(ex), 'StepFailedException(step_num=0, num_steps=4)')
def test_reason(self):
    ex = StepFailedException('Hadoop is feeling sad today')
    self.assertEqual(str(ex), 'Step failed: Hadoop is feeling sad today')
    self.assertEqual(
        repr(ex),
        "StepFailedException(reason='Hadoop is feeling sad today')")
def test_empty(self):
    ex = StepFailedException()
    self.assertEqual(str(ex), 'Step failed')
    self.assertEqual(repr(ex), 'StepFailedException()')
def _run_job_in_hadoop(self):
    for step_num, step in enumerate(self._get_steps()):
        self._warn_about_spark_archives(step)

        step_args = self._args_for_step(step_num)
        env = self._env_for_step(step_num)

        # log this *after* _args_for_step(), which can start a search
        # for the Hadoop streaming jar
        log.info('Running step %d of %d...' %
                 (step_num + 1, self._num_steps()))
        log.debug('> %s' % cmd_line(step_args))
        log.debug('  with environment: %r' % sorted(env.items()))

        log_interpretation = {}
        self._log_interpretations.append(log_interpretation)

        # try to use a PTY if it's available
        try:
            pid, master_fd = pty.fork()
        except (AttributeError, OSError):
            # no PTYs, just use Popen

            # user won't get much feedback for a while, so tell them
            # Hadoop is running
            log.debug('No PTY available, using Popen() to invoke Hadoop')

            step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE, env=env)

            step_interpretation = _interpret_hadoop_jar_command_stderr(
                step_proc.stderr,
                record_callback=_log_record_from_hadoop)

            # there shouldn't be much output to STDOUT
            for line in step_proc.stdout:
                _log_line_from_hadoop(to_string(line).strip('\r\n'))

            step_proc.stdout.close()
            step_proc.stderr.close()

            returncode = step_proc.wait()
        else:
            # we have PTYs
            if pid == 0:  # we are the child process
                os.execvpe(step_args[0], step_args, env)
            else:
                log.debug('Invoking Hadoop via PTY')

                with os.fdopen(master_fd, 'rb') as master:
                    # reading from master gives us the subprocess's
                    # stderr and stdout (it's a fake terminal)
                    step_interpretation = (
                        _interpret_hadoop_jar_command_stderr(
                            master,
                            record_callback=_log_record_from_hadoop))
                    _, returncode = os.waitpid(pid, 0)

        # make sure output_dir is filled
        if 'output_dir' not in step_interpretation:
            step_interpretation['output_dir'] = (
                self._step_output_uri(step_num))

        log_interpretation['step'] = step_interpretation

        step_type = step['type']

        if not _is_spark_step_type(step_type):
            counters = self._pick_counters(log_interpretation, step_type)
            if counters:
                log.info(_format_counters(counters))
            else:
                log.warning('No counters found')

        if returncode:
            error = self._pick_error(log_interpretation, step_type)
            if error:
                log.error('Probable cause of failure:\n\n%s\n' %
                          _format_error(error))

            # use CalledProcessError's well-known message format
            reason = str(CalledProcessError(returncode, step_args))
            raise StepFailedException(
                reason=reason, step_num=step_num,
                num_steps=self._num_steps())
def test_step_num(self):
    ex = StepFailedException(step_num=0)
    self.assertEqual(str(ex), 'Step 1 failed')
    self.assertEqual(repr(ex), 'StepFailedException(step_num=0)')
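# A sketch of the __str__() behavior these tests pin down. The real class
# lives elsewhere in mrjob; _step_failed_message is a hypothetical stand-in
# that just restates the contract: step_desc overrides the default
# 'Step N [of M]' description, num_steps only shows when step_num is known
# (internally 0-indexed, displayed 1-indexed), and reason, if any, is
# appended after a colon.
def _step_failed_message(reason=None, step_num=None, num_steps=None,
                         step_desc=None):
    if step_desc is None:
        if step_num is None:
            step_desc = 'Step'
        elif num_steps is None:
            step_desc = 'Step %d' % (step_num + 1)
        else:
            step_desc = 'Step %d of %d' % (step_num + 1, num_steps)

    if reason:
        return '%s failed: %s' % (step_desc, reason)
    else:
        return '%s failed' % step_desc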