def _GetResult(self, batch):
  """Reports the outcome of a batch that has finished polling.

  Called once the batch has reached one of its complete states (or the
  waiter gave up). Overrides.

  Args:
    batch: The batch resource, or a falsy value when the waiter timed out
      before a resource was obtained.

  Returns:
    None. The result is directly output to log.err.

  Raises:
    JobTimeoutError: When the waiter timed out.
    JobError: When the batch ended in a state other than SUCCEEDED or
      CANCELLED.
  """
  # Polling is considered done but there is no batch resource: this only
  # happens when the waiter timed out.
  if not batch:
    raise exceptions.JobTimeoutError(
        'Timed out while waiting for batch job.')

  state_enum = self.dataproc.messages.Batch.StateValueValuesEnum

  if batch.state == state_enum.SUCCEEDED:
    streamer = self.driver_log_streamer
    if not streamer:
      log.warning('Expected batch job output not found.')
    elif streamer.open:
      # Remote output didn't end correctly.
      log.warning('Batch job terminated, but output did not finish '
                  'streaming.')
    # Nothing to return, since the result is directly output to users.
    return None

  if batch.state == state_enum.CANCELLED:
    log.warning('Batch job is CANCELLED.')
    return None

  # Every other state is reported as a failure.
  failure_text = 'Batch job is FAILED.'
  if batch.stateMessage:
    failure_text = '{} Detail: {}'.format(failure_text, batch.stateMessage)
    # Keep the first sentence terminated before appending the hint below.
    if not failure_text.endswith('.'):
      failure_text += '.'
  diagnostics_hint = (
      'Running auto diagnostics on the batch. It may take few '
      'minutes before diagnostics output is available. Please '
      "check diagnostics output by running 'gcloud dataproc "
      "batches describe' command.")
  raise exceptions.JobError(failure_text + '\n' + diagnostics_hint)
def WaitForJobTermination(job, context, message, goal_state,
                          stream_driver_log=False, log_poll_period_s=1,
                          dataproc_poll_period_s=10, timeout_s=None):
  """Poll dataproc Job until its status is terminal or timeout reached.

  Args:
    job: The job to wait to finish.
    context: dict, dataproc Command context.
    message: str, message to display to user while polling.
    goal_state: JobStatus.StateValueValuesEnum, the state to define success
    stream_driver_log: bool, Whether to show the Job's driver's output.
    log_poll_period_s: number, delay in seconds between checking on the log.
    dataproc_poll_period_s: number, delay in seconds between requests to
        the Dataproc API.
    timeout_s: number, time out for job completion. None means no timeout.
        Once the job is seen in a terminal state this is replaced by a
        10 second allowance for trailing driver output.

  Returns:
    Job: the return value of the last successful jobs.get request (or the
    job passed in, if no request succeeded).

  Raises:
    JobError: if the job reaches a terminal state other than goal_state.
    JobTimeoutError: if polling stops while the job is not in a terminal
        state.
  """
  client = context['dataproc_client']
  job_ref = ParseJob(job.reference.jobId, context)
  request = client.MESSAGES_MODULE.DataprocProjectsRegionsJobsGetRequest(
      projectId=job_ref.projectId,
      region=job_ref.region,
      jobId=job_ref.jobId)
  driver_log_stream = None
  last_job_poll_time = 0
  job_complete = False
  wait_display = None
  driver_output_uri = None

  def ReadDriverLogIfPresent():
    if driver_log_stream and driver_log_stream.open:
      # TODO(b/36049794): Don't read all output.
      driver_log_stream.ReadIntoWritable(log.err)

  def PrintEqualsLine():
    # Separator as wide as the terminal, printed between job attempts.
    attr = console_attr.GetConsoleAttr()
    log.err.Print('=' * attr.GetTermSize()[0])

  if stream_driver_log:
    log.status.Print('Waiting for job output...')
    wait_display = NoOpProgressDisplay()
  else:
    wait_display = progress_tracker.ProgressTracker(message, autotick=True)
  start_time = now = time.time()
  with wait_display:
    # A falsy timeout_s (None or 0) disables the timeout check.
    while not timeout_s or timeout_s > (now - start_time):
      # Poll logs first to see if it closed.
      ReadDriverLogIfPresent()
      log_stream_closed = driver_log_stream and not driver_log_stream.open
      if not job_complete and job.status.state in constants.TERMINAL_JOB_STATES:
        job_complete = True
        # Wait another 10s to get trailing output.
        timeout_s = now - start_time + 10

      if job_complete and (not stream_driver_log or log_stream_closed):
        # Nothing left to wait for
        break

      regular_job_poll = (
          not job_complete
          # Poll less frequently on dataproc API
          and now >= last_job_poll_time + dataproc_poll_period_s)
      # Poll at regular frequency before output has streamed and after it has
      # finished.
      expecting_output_stream = stream_driver_log and not driver_log_stream
      expecting_job_done = not job_complete and log_stream_closed
      if regular_job_poll or expecting_output_stream or expecting_job_done:
        last_job_poll_time = now
        try:
          job = client.projects_regions_jobs.Get(request)
        except apitools_exceptions.HttpError as error:
          # Format the message up front rather than handing `error` to the
          # logger as an argument, so the error text is always rendered.
          log.warn('GetJob failed:\n{0}'.format(error))
          # Keep trying until we timeout in case error is transient.
        if (stream_driver_log and job.driverOutputResourceUri and
            job.driverOutputResourceUri != driver_output_uri):
          # A changed output URI after one was already being streamed means
          # a new attempt started; visually separate its output.
          if driver_output_uri:
            PrintEqualsLine()
            log.warn("Job attempt failed. Streaming new attempt's output.")
            PrintEqualsLine()
          driver_output_uri = job.driverOutputResourceUri
          driver_log_stream = storage_helpers.StorageObjectSeriesStream(
              job.driverOutputResourceUri)
      time.sleep(log_poll_period_s)
      now = time.time()

  # TODO(b/34836493): Get better test coverage of the next 20 lines.
  state = job.status.state
  if state is not goal_state and job.status.details:
    # Just log details, because the state will be in the error message.
    log.info(job.status.details)

  if state in constants.TERMINAL_JOB_STATES:
    if stream_driver_log:
      if not driver_log_stream:
        log.warn('Expected job output not found.')
      elif driver_log_stream.open:
        log.warn('Job terminated, but output did not finish streaming.')
    if state is goal_state:
      return job
    raise exceptions.JobError(
        'Job [{0}] entered state [{1}] while waiting for [{2}].'.format(
            job_ref.jobId, state, goal_state))
  raise exceptions.JobTimeoutError(
      'Job [{0}] timed out while in state [{1}].'.format(
          job_ref.jobId, state))
def WaitForJobTermination(dataproc, job, job_ref, message, goal_state,
                          error_state=None, stream_driver_log=False,
                          log_poll_period_s=1, dataproc_poll_period_s=10,
                          timeout_s=None):
  """Wait for a dataproc Job to reach a terminal state, or time out.

  Args:
    dataproc: wrapper for dataproc resources, client and messages
    job: The job to wait to finish.
    job_ref: Parsed dataproc.projects.regions.jobs resource containing a
        projectId, region, and jobId.
    message: str, message to display to user while polling.
    goal_state: JobStatus.StateValueValuesEnum, the state to define success
    error_state: JobStatus.StateValueValuesEnum, the state to define failure
    stream_driver_log: bool, Whether to show the Job's driver's output.
    log_poll_period_s: number, delay in seconds between checking on the log.
    dataproc_poll_period_s: number, delay in seconds between requests to
        the Dataproc API.
    timeout_s: number, time out for job completion. None means no timeout.

  Returns:
    Job: the return value of the last successful jobs.get request.

  Raises:
    JobError: if the job ends in a terminal state other than goal_state.
    JobTimeoutError: if polling stops while the job is not in a terminal
        state.
    apitools_exceptions.HttpError: re-raised from jobs.get when
        IsClientHttpException flags the error (4xx errors are not retried).
  """
  get_request = dataproc.messages.DataprocProjectsRegionsJobsGetRequest(
      projectId=job_ref.projectId,
      region=job_ref.region,
      jobId=job_ref.jobId)

  def _PrintSeparator():
    # One row of '=' spanning the terminal width.
    term_width = console_attr.GetConsoleAttr().GetTermSize()[0]
    log.err.Print('=' * term_width)

  if stream_driver_log:
    log.status.Print('Waiting for job output...')
    wait_display = NoOpProgressDisplay()
  else:
    wait_display = progress_tracker.ProgressTracker(message, autotick=True)

  log_stream = None
  streamed_uri = None
  finished = False
  last_poll = 0
  # Extra seconds granted after the job finishes, to get trailing output.
  trailing_output_s = 10

  started_at = now = time.time()
  with wait_display:
    while True:
      # A falsy timeout (None or 0) never expires.
      if timeout_s and now - started_at >= timeout_s:
        break

      # Poll logs first to see if it closed.
      if log_stream and log_stream.open:
        # TODO(b/36049794): Don't read all output.
        log_stream.ReadIntoWritable(log.err)
      stream_closed = log_stream and not log_stream.open

      if not finished and job.status.state in dataproc.terminal_job_states:
        finished = True
        timeout_s = now - started_at + trailing_output_s

      if finished and (not stream_driver_log or stream_closed):
        # Nothing left to wait for
        break

      # Poll less frequently on dataproc API, except before output has
      # streamed and after it has finished.
      poll_due = (
          not finished and now >= last_poll + dataproc_poll_period_s)
      awaiting_stream = stream_driver_log and not log_stream
      awaiting_completion = not finished and stream_closed
      if poll_due or awaiting_stream or awaiting_completion:
        last_poll = now
        try:
          job = dataproc.client.projects_regions_jobs.Get(get_request)
        except apitools_exceptions.HttpError as error:
          log.warning('GetJob failed:\n{}'.format(six.text_type(error)))
          # Do not retry on 4xx errors.
          if IsClientHttpException(error):
            raise

        if stream_driver_log:
          latest_uri = job.driverOutputResourceUri
          if latest_uri and latest_uri != streamed_uri:
            # An earlier URI was already being streamed, so this is a new
            # attempt; set its output apart.
            if streamed_uri:
              _PrintSeparator()
              log.warning(
                  "Job attempt failed. Streaming new attempt's output.")
              _PrintSeparator()
            streamed_uri = latest_uri
            log_stream = storage_helpers.StorageObjectSeriesStream(latest_uri)

      time.sleep(log_poll_period_s)
      now = time.time()

  # TODO(b/34836493): Get better test coverage of the next 20 lines.
  final_state = job.status.state

  # goal_state and error_state will always be terminal
  if final_state not in dataproc.terminal_job_states:
    raise exceptions.JobTimeoutError(
        'Job [{0}] timed out while in state [{1}].'.format(
            job_ref.jobId, final_state))

  if stream_driver_log:
    if not log_stream:
      log.warning('Expected job output not found.')
    elif log_stream.open:
      log.warning('Job terminated, but output did not finish streaming.')

  if final_state is goal_state:
    return job

  if error_state and final_state is error_state:
    if job.status.details:
      raise exceptions.JobError(
          'Job [{0}] failed with error:\n{1}'.format(
              job_ref.jobId, job.status.details))
    raise exceptions.JobError('Job [{0}] failed.'.format(job_ref.jobId))

  if job.status.details:
    log.info('Details:\n' + job.status.details)
  raise exceptions.JobError(
      'Job [{0}] entered state [{1}] while waiting for [{2}].'.format(
          job_ref.jobId, final_state, goal_state))