def Run(self, args):
  """Runs diagnostics on a cluster and streams the diagnostic output.

  Sends a Diagnose request for the cluster parsed from
  `args.CONCEPTS.cluster`, waits for the resulting operation, and then copies
  the objects found under the operation's `outputUri` to `log.err`.

  Args:
    args: The parsed command arguments. Reads `CONCEPTS.cluster`,
      `tarball_access` and `timeout`.

  Returns:
    The `outputUri` string reported by the finished diagnose operation.

  Raises:
    exceptions.OperationError: If the operation has no response, or the
      response carries no (or an empty) `outputUri`.
  """
  dataproc = dp.Dataproc(self.ReleaseTrack())
  cluster_ref = args.CONCEPTS.cluster.Parse()

  # The cluster coordinates are always sent; the request body is attached only
  # when the user asked for a specific tarball access level.
  request_kwargs = {
      'clusterName': cluster_ref.clusterName,
      'region': cluster_ref.region,
      'projectId': cluster_ref.projectId,
  }
  if args.tarball_access is not None:
    tarball_access = arg_utils.ChoiceToEnum(
        args.tarball_access,
        dataproc.messages.DiagnoseClusterRequest.TarballAccessValueValuesEnum)
    request_kwargs['diagnoseClusterRequest'] = (
        dataproc.messages.DiagnoseClusterRequest(
            tarballAccess=tarball_access))
  request = dataproc.messages.DataprocProjectsRegionsClustersDiagnoseRequest(
      **request_kwargs)

  operation = dataproc.client.projects_regions_clusters.Diagnose(request)
  # TODO(b/36052522): Stream output during polling.
  operation = util.WaitForOperation(
      dataproc,
      operation,
      message='Waiting for cluster diagnose operation',
      timeout_s=args.timeout)

  if not operation.response:
    raise exceptions.OperationError('Operation is missing response')

  properties = encoding.MessageToDict(operation.response)
  # Use .get() so an absent key is reported through the OperationError below
  # instead of escaping as a bare KeyError.
  output_uri = properties.get('outputUri')
  if not output_uri:
    raise exceptions.OperationError('Response is missing outputUri')

  log.err.Print('Output from diagnostic:')
  log.err.Print('-----------------------------------------------')
  driver_log_stream = storage_helpers.StorageObjectSeriesStream(output_uri)
  # A single read might not read whole stream. Try a few times.
  read_retrier = retry.Retryer(max_retrials=4, jitter_ms=None)
  try:
    read_retrier.RetryOnResult(
        lambda: driver_log_stream.ReadIntoWritable(log.err),
        sleep_ms=100,
        # Keep retrying for as long as the stream still reports itself open.
        should_retry_if=lambda *_: driver_log_stream.open)
  except retry.MaxRetrialsException:
    log.warning('Diagnostic finished successfully, '
                'but output did not finish streaming.')
  log.err.Print('-----------------------------------------------')
  return output_uri
def Run(self, args):
  """Runs diagnostics on a cluster and streams the diagnostic output.

  Sends a Diagnose request for the cluster named by `args.name`, waits for the
  resulting operation, and then copies the objects found under the
  operation's `outputUri` to `log.err`.

  Args:
    args: The parsed command arguments. Reads `name`.

  Returns:
    The `outputUri` string reported by the finished diagnose operation.

  Raises:
    exceptions.HttpException: If the Diagnose call or the wait raises an
      apitools HttpError.
    exceptions.ToolException: If the operation has no response, or the
      response carries no (or an empty) `outputUri`.
  """
  client = self.context['dataproc_client']
  messages = self.context['dataproc_messages']

  cluster_ref = util.ParseCluster(args.name, self.context)

  request = messages.DataprocProjectsRegionsClustersDiagnoseRequest(
      clusterName=cluster_ref.clusterName,
      region=cluster_ref.region,
      projectId=cluster_ref.projectId)

  try:
    operation = client.projects_regions_clusters.Diagnose(request)
    # TODO(user): Stream output during polling.
    operation = util.WaitForOperation(
        operation,
        self.context,
        message='Waiting for cluster diagnose operation')
    response = operation.response
  except apitools_exceptions.HttpError as error:
    raise exceptions.HttpException(util.FormatHttpError(error))

  if not response:
    raise exceptions.ToolException('Operation is missing response')

  properties = encoding.MessageToDict(response)
  # Use .get() so an absent key is reported through the ToolException below
  # instead of escaping as a bare KeyError.
  output_uri = properties.get('outputUri')
  if not output_uri:
    raise exceptions.ToolException('Response is missing outputUri')

  log.err.Print('Output from diagnostic:')
  log.err.Print('-----------------------------------------------')
  driver_log_stream = storage_helpers.StorageObjectSeriesStream(output_uri)
  # A single read might not read whole stream. Try a few times.
  read_retrier = retry.Retryer(max_retrials=4, jitter_ms=None)
  try:
    read_retrier.RetryOnResult(
        lambda: driver_log_stream.ReadIntoWritable(log.err),
        sleep_ms=100,
        # Keep retrying for as long as the stream still reports itself open.
        should_retry_if=lambda *_: driver_log_stream.open)
  except retry.MaxRetrialsException:
    log.warn(
        'Diagnostic finished successfully, '
        'but output did not finish streaming.')
  log.err.Print('-----------------------------------------------')
  return output_uri
def SetUp(self):
  """Installs the GCS mocks and patches and builds the storage fixtures."""
  # Mocked apitools storage client; unmocked again during test cleanup.
  storage_client_class = core_apis.GetClientClass('storage', 'v1')
  real_storage_client = core_apis.GetClientInstance(
      'storage', 'v1', no_http=True)
  self.mock_gcs_client = apitools_mock.Client(
      storage_client_class, real_client=real_storage_client)
  self.mock_gcs_client.Mock()
  self.addCleanup(self.mock_gcs_client.Unmock)

  # Patch process execution and the SDK bin path lookup.
  self.mock_exec = self.StartPatch('googlecloudsdk.core.execution_utils.Exec')
  self.mock_config_bin_path = self.StartPropertyPatch(
      config.Paths, 'sdk_bin_path')
  self.mock_config_bin_path.return_value = 'bin'

  # Helper-level client and the stream under test, rooted at BASE_GCS_PATH.
  self.storage_client = storage_helpers.StorageClient()
  self.storage_stream = storage_helpers.StorageObjectSeriesStream(
      self.BASE_GCS_PATH, self.storage_client)
  self.storage_messages = core_apis.GetMessagesModule('storage', 'v1')

  # API-level client with its CopyFileToGCS method patched out.
  self.storage_api_client = storage_api.StorageClient()
  self.copy_file_mock = self.StartObjectPatch(
      self.storage_api_client, 'CopyFileToGCS')
  self.storage_bucket = 'foo'
def _CheckStreamer(self, poll_result):
  """Ensures the output streamer matches the current remote output uri.

  If the output uri reported for `poll_result` differs from the one the
  current streamer was created for, the streamer is dropped and a warning is
  printed. A new streamer is then created whenever none is active.

  Args:
    poll_result: Poll result returned from Poll.
  """

  def _PrintSeparator():
    # Mimic current job waiting behavior to print equal signs across the
    # screen.
    terminal_width = console_attr.GetConsoleAttr().GetTermSize()[0]
    log.err.Print('=' * terminal_width)

  # pylint: disable=assignment-from-none
  current_uri = self._GetOutputUri(poll_result)
  # pylint: enable=assignment-from-none

  if not current_uri:
    # Remote resource not ready, nothing to check.
    return

  # A different uri than the saved one means the streamer is stale.
  if self.saved_stream_uri and self.saved_stream_uri != current_uri:
    self.driver_log_streamer = self.saved_stream_uri = None
    _PrintSeparator()
    log.warning("Attempt failed. Streaming new attempt's output.")
    _PrintSeparator()

  if self.driver_log_streamer:
    # An active streamer already exists; keep using it.
    return

  self.saved_stream_uri = current_uri
  self.driver_log_streamer = storage_helpers.StorageObjectSeriesStream(
      current_uri)
def Run(self, args):
  """Runs diagnostics on a cluster and prints the diagnostic output.

  Sends a Diagnose request for the cluster named by `args.name`, waits for the
  resulting operation, and then copies the objects found under the
  operation's `outputUri` to `log.err` with a single read.

  Args:
    args: The parsed command arguments. Reads `name`.

  Returns:
    The `outputUri` string reported by the finished diagnose operation.

  Raises:
    exceptions.HttpException: If the Diagnose call or the wait raises an
      apitools HttpError.
    exceptions.ToolException: If the operation has no response, or the
      response carries no (or an empty) `outputUri`.
  """
  client = self.context['dataproc_client']
  messages = self.context['dataproc_messages']

  cluster_ref = util.ParseCluster(args.name, self.context)

  request = messages.DataprocProjectsClustersDiagnoseRequest(
      clusterName=cluster_ref.clusterName,
      projectId=cluster_ref.projectId)

  try:
    operation = client.projects_clusters.Diagnose(request)
    operation = util.WaitForOperation(
        operation,
        self.context,
        message='Waiting for cluster diagnose operation')
    response = operation.response
  except apitools_base.HttpError as error:
    raise exceptions.HttpException(util.FormatHttpError(error))

  if not response:
    raise exceptions.ToolException('Operation is missing response')

  properties = apitools_base.MessageToDict(response)
  # Use .get() so an absent key is reported through the ToolException below
  # instead of escaping as a bare KeyError.
  output_uri = properties.get('outputUri')
  if not output_uri:
    raise exceptions.ToolException('Response is missing outputUri')

  log.err.Print('Output from diagnostic:')
  log.err.Print('-----------------------------------------------')
  driver_log_stream = storage_helpers.StorageObjectSeriesStream(output_uri)
  driver_log_stream.ReadIntoWritable(log.err)
  log.err.Print('-----------------------------------------------')
  return output_uri
def WaitForJobTermination(job,
                          context,
                          message,
                          goal_state,
                          stream_driver_log=False,
                          log_poll_period_s=1,
                          dataproc_poll_period_s=10,
                          timeout_s=None):
  """Poll dataproc Job until its status is terminal or timeout reached.

  Args:
    job: The job to wait to finish.
    context: dict, dataproc Command context.
    message: str, message to display to user while polling.
    goal_state: JobStatus.StateValueValuesEnum, the state to define success
    stream_driver_log: bool, Whether to show the Job's driver's output.
    log_poll_period_s: number, delay in seconds between checking on the log.
    dataproc_poll_period_s: number, delay in seconds between requests to
        the Dataproc API.
    timeout_s: number, time out for job completion. None means no timeout.

  Returns:
    Job: the return value of the last successful jobs.get request.

  Raises:
    JobError: if the job reaches a terminal state other than goal_state.
    JobTimeoutError: if the wait ends while the job is not in a terminal
        state.
  """
  client = context['dataproc_client']
  job_ref = ParseJob(job.reference.jobId, context)
  request = client.MESSAGES_MODULE.DataprocProjectsRegionsJobsGetRequest(
      projectId=job_ref.projectId,
      region=job_ref.region,
      jobId=job_ref.jobId)
  driver_log_stream = None
  last_job_poll_time = 0
  job_complete = False
  wait_display = None
  driver_output_uri = None

  def ReadDriverLogIfPresent():
    if driver_log_stream and driver_log_stream.open:
      # TODO(b/36049794): Don't read all output.
      driver_log_stream.ReadIntoWritable(log.err)

  def PrintEqualsLine():
    attr = console_attr.GetConsoleAttr()
    log.err.Print('=' * attr.GetTermSize()[0])

  if stream_driver_log:
    log.status.Print('Waiting for job output...')
    wait_display = NoOpProgressDisplay()
  else:
    wait_display = progress_tracker.ProgressTracker(message, autotick=True)
  start_time = now = time.time()
  with wait_display:
    while not timeout_s or timeout_s > (now - start_time):
      # Poll logs first to see if it closed.
      ReadDriverLogIfPresent()
      log_stream_closed = driver_log_stream and not driver_log_stream.open
      if not job_complete and job.status.state in constants.TERMINAL_JOB_STATES:
        job_complete = True
        # Wait an extra 10s to get trailing output.
        timeout_s = now - start_time + 10

      if job_complete and (not stream_driver_log or log_stream_closed):
        # Nothing left to wait for
        break

      regular_job_poll = (
          not job_complete
          # Poll less frequently on dataproc API
          and now >= last_job_poll_time + dataproc_poll_period_s)
      # Poll at regular frequency before output has streamed and after it has
      # finished.
      expecting_output_stream = stream_driver_log and not driver_log_stream
      expecting_job_done = not job_complete and log_stream_closed
      if regular_job_poll or expecting_output_stream or expecting_job_done:
        last_job_poll_time = now
        try:
          job = client.projects_regions_jobs.Get(request)
        except apitools_exceptions.HttpError as error:
          # Render the error into the message up front: the previous call
          # passed a str.format placeholder ('{1}') together with a %-style
          # logging argument, so the error text was never shown.
          log.warn('GetJob failed:\n{0}'.format(error))
          # Keep trying until we timeout in case error is transient.
        if (stream_driver_log and job.driverOutputResourceUri and
            job.driverOutputResourceUri != driver_output_uri):
          if driver_output_uri:
            # A previous uri existed, so this is a new attempt of the job.
            PrintEqualsLine()
            log.warn("Job attempt failed. Streaming new attempt's output.")
            PrintEqualsLine()
          driver_output_uri = job.driverOutputResourceUri
          driver_log_stream = storage_helpers.StorageObjectSeriesStream(
              job.driverOutputResourceUri)
      time.sleep(log_poll_period_s)
      now = time.time()

  # TODO(b/34836493): Get better test coverage of the next 20 lines.

  state = job.status.state
  if state is not goal_state and job.status.details:
    # Just log details, because the state will be in the error message.
    log.info(job.status.details)

  if state in constants.TERMINAL_JOB_STATES:
    if stream_driver_log:
      if not driver_log_stream:
        log.warn('Expected job output not found.')
      elif driver_log_stream.open:
        log.warn('Job terminated, but output did not finish streaming.')
    if state is goal_state:
      return job
    raise exceptions.JobError(
        'Job [{0}] entered state [{1}] while waiting for [{2}].'.format(
            job_ref.jobId, state, goal_state))
  raise exceptions.JobTimeoutError(
      'Job [{0}] timed out while in state [{1}].'.format(
          job_ref.jobId, state))
def WaitForJobTermination(dataproc,
                          job,
                          job_ref,
                          message,
                          goal_state,
                          error_state=None,
                          stream_driver_log=False,
                          log_poll_period_s=1,
                          dataproc_poll_period_s=10,
                          timeout_s=None):
  """Poll dataproc Job until its status is terminal or timeout reached.

  Args:
    dataproc: wrapper for dataproc resources, client and messages
    job: The job to wait to finish.
    job_ref: Parsed dataproc.projects.regions.jobs resource containing a
        projectId, region, and jobId.
    message: str, message to display to user while polling.
    goal_state: JobStatus.StateValueValuesEnum, the state to define success
    error_state: JobStatus.StateValueValuesEnum, the state to define failure
    stream_driver_log: bool, Whether to show the Job's driver's output.
    log_poll_period_s: number, delay in seconds between checking on the log.
    dataproc_poll_period_s: number, delay in seconds between requests to
        the Dataproc API.
    timeout_s: number, time out for job completion. None means no timeout.

  Returns:
    Job: the return value of the last successful jobs.get request.

  Raises:
    JobError: if the job reaches a terminal state other than goal_state.
    JobTimeoutError: if the wait ends while the job is not in a terminal
        state.
    apitools_exceptions.HttpError: re-raised from jobs.get when
        IsClientHttpException reports the error as a client error.
  """
  request = dataproc.messages.DataprocProjectsRegionsJobsGetRequest(
      projectId=job_ref.projectId, region=job_ref.region, jobId=job_ref.jobId)
  driver_log_stream = None
  last_job_poll_time = 0
  job_complete = False
  wait_display = None
  driver_output_uri = None

  def ReadDriverLogIfPresent():
    if driver_log_stream and driver_log_stream.open:
      # TODO(b/36049794): Don't read all output.
      driver_log_stream.ReadIntoWritable(log.err)

  def PrintEqualsLine():
    # Print a row of '=' as wide as the terminal.
    attr = console_attr.GetConsoleAttr()
    log.err.Print('=' * attr.GetTermSize()[0])

  if stream_driver_log:
    log.status.Print('Waiting for job output...')
    wait_display = NoOpProgressDisplay()
  else:
    wait_display = progress_tracker.ProgressTracker(message, autotick=True)
  start_time = now = time.time()
  with wait_display:
    # A falsy timeout_s (None or 0) means loop until an explicit break.
    while not timeout_s or timeout_s > (now - start_time):
      # Poll logs first to see if it closed.
      ReadDriverLogIfPresent()
      log_stream_closed = driver_log_stream and not driver_log_stream.open
      if (not job_complete and
          job.status.state in dataproc.terminal_job_states):
        job_complete = True
        # Wait an extra 10s to get trailing output: the timeout is replaced by
        # the elapsed time plus 10 seconds.
        timeout_s = now - start_time + 10

      if job_complete and (not stream_driver_log or log_stream_closed):
        # Nothing left to wait for
        break

      regular_job_poll = (
          not job_complete
          # Poll less frequently on dataproc API
          and now >= last_job_poll_time + dataproc_poll_period_s)
      # Poll at regular frequency before output has streamed and after it has
      # finished.
      expecting_output_stream = stream_driver_log and not driver_log_stream
      expecting_job_done = not job_complete and log_stream_closed
      if regular_job_poll or expecting_output_stream or expecting_job_done:
        last_job_poll_time = now
        try:
          job = dataproc.client.projects_regions_jobs.Get(request)
        except apitools_exceptions.HttpError as error:
          log.warning('GetJob failed:\n{}'.format(six.text_type(error)))
          # Do not retry on 4xx errors.
          if IsClientHttpException(error):
            raise
          # Any other HTTP error falls through; `job` keeps its previous value.
        if (stream_driver_log and job.driverOutputResourceUri and
            job.driverOutputResourceUri != driver_output_uri):
          if driver_output_uri:
            # A previous uri existed, so the output location has changed.
            PrintEqualsLine()
            log.warning(
                "Job attempt failed. Streaming new attempt's output."
            )
            PrintEqualsLine()
          driver_output_uri = job.driverOutputResourceUri
          driver_log_stream = storage_helpers.StorageObjectSeriesStream(
              job.driverOutputResourceUri)
      time.sleep(log_poll_period_s)
      now = time.time()

  # TODO(b/34836493): Get better test coverage of the next 20 lines.

  state = job.status.state

  # goal_state and error_state will always be terminal
  if state in dataproc.terminal_job_states:
    if stream_driver_log:
      if not driver_log_stream:
        log.warning('Expected job output not found.')
      elif driver_log_stream.open:
        log.warning(
            'Job terminated, but output did not finish streaming.')
    if state is goal_state:
      return job
    if error_state and state is error_state:
      # Include the job's status details in the error when they are present.
      if job.status.details:
        raise exceptions.JobError(
            'Job [{0}] failed with error:\n{1}'.format(
                job_ref.jobId, job.status.details))
      raise exceptions.JobError('Job [{0}] failed.'.format(
          job_ref.jobId))
    # Terminal, but neither the goal state nor the declared error state.
    if job.status.details:
      log.info('Details:\n' + job.status.details)
    raise exceptions.JobError(
        'Job [{0}] entered state [{1}] while waiting for [{2}].'.format(
            job_ref.jobId, state, goal_state))
  # The loop ended without the job reaching a terminal state.
  raise exceptions.JobTimeoutError(
      'Job [{0}] timed out while in state [{1}].'.format(
          job_ref.jobId, state))