def test_workload_link(self):
  """workload_link builds the expected console URL for a known pod."""
  job = 'pt-1.5-resnet50-functional-v3-8-1584453600'
  ns = 'automated'
  gcp_zone = 'us-central1-b'
  gke_cluster = 'xl-ml-test'
  gcp_project = 'xl-ml-test'
  link = util.workload_link(job, ns, gcp_zone, gke_cluster, gcp_project)
  self.assertEqual(link, VALID_WORKLOAD_LINK)
def workload_link(self, job_name, job_namespace):
  """Build a link to the Kubernetes workload for a specific test run.

  The GCP location, cluster name, and project id are read from this
  instance (`self.location`, `self.cluster_name`, `self.project_id`);
  `_init_k8s_client()` is called lazily to populate them if `location`
  is not yet set.

  Args:
    job_name (string): Name of the Kubernetes job. Should include the
      timestamp, e.g. 'pt-1.5-resnet-func-v3-8-1584453600'.
    job_namespace (string): Name of the Kubernetes namespace.

  Returns:
    link (string): A link to the Kubernetes workload page for this job.
  """
  if not self.location:
    self._init_k8s_client()
  return util.workload_link(job_name, job_namespace, self.location, self.cluster_name, self.project_id)
def _process_pubsub_message(msg, status_handler, logger):
  """Process one Pub/Sub message describing a test run and record metrics.

  Args:
    msg (dict): Decoded Pub/Sub message payload. Must contain
      `publish_time`, `model_dir`, `test_name`, `logs_link`, `job_name`,
      `zone`, and `cluster_name`; may contain `job_namespace`,
      `test_type`, `accelerator`, `framework_version`,
      `metric_collection_config`, and `regression_test_config`.
    status_handler: Handler used to look up the Kubernetes job status
      via `get_job_status(job_name, job_namespace)`.
    logger: Logger used for warnings and failure alerts.

  Returns:
    bool: True if the message should be acknowledged, False if it should
      be left on the queue and retried on a later run.

  Raises:
    ValueError: If any required field is missing, or if neither
      `metric_collection_config` nor `regression_test_config` is present.
  """
  publish_time = msg['publish_time']
  msg_age_sec = time.time() - publish_time
  if msg_age_sec < MIN_MSG_AGE_SEC:
    logger.warning('Message was {} seconds old, which is less than the '
                   'minimum of {}. Skipping for now but will retry on '
                   'the next run.'.format(msg_age_sec, MIN_MSG_AGE_SEC))
    return False  # Do not ack the message.

  events_dir = msg.get('model_dir')
  test_name = msg.get('test_name')
  logs_link = util.add_unbound_time_to_logs_link(msg.get('logs_link', ''))
  metric_collection_config = msg.get('metric_collection_config')
  regression_test_config = msg.get('regression_test_config')
  job_name = msg.get('job_name')
  job_namespace = msg.get('job_namespace')
  test_type = msg.get('test_type')
  accelerator = msg.get('accelerator')
  framework_version = msg.get('framework_version')
  zone = msg.get('zone')
  cluster = msg.get('cluster_name')
  project = google.auth.default()[1]

  # Validate required fields before they are used to build commands/links.
  # BUGFIX: the error message previously formatted the undefined name
  # `event`, which raised NameError instead of the intended ValueError.
  if not (events_dir and test_name and logs_link and job_name and zone
          and cluster and project):
    raise ValueError('Pubsub message must contain 7 required fields: '
                     'events_dir, test_name, logs_link, job_name, '
                     'zone, cluster, project. Message was: {}'.format(msg))
  if not regression_test_config and not metric_collection_config:
    raise ValueError('metric_collection_config and regression_test_config '
                     'were both null; stopping early. See README for '
                     'documentation on writing these configs.')

  download_command = util.download_command(
      job_name, job_namespace, zone, cluster, project)
  workload_link = util.workload_link(
      job_name, job_namespace, zone, cluster, project)
  debug_info = alert_handler.DebugInfo(
      job_name, logs_link, download_command, workload_link)

  status, stop_time, num_failures = status_handler.get_job_status(
      job_name, job_namespace)
  if status == job_status_handler.UNKNOWN_STATUS:
    logger.warning(
        'Unknown status for job_name: {}. Message will be '
        'retried later.'.format(job_name))
    return False  # Do not ack the message.
  elif status == job_status_handler.DOES_NOT_EXIST:
    # Kubernetes garbage-collects old jobs; give up after 24 hours.
    if msg_age_sec >= 60 * 60 * 24:
      logger.warning(
          'Job with job_name: {} no longer exists in Kubernetes. Message '
          'will be acknowledged.'.format(job_name))
      return True  # Ack the message.
    else:
      logger.warning(
          'Job with job_name: {} not found in Kubernetes. Message '
          'will be retried later.'.format(job_name))
      return False  # Do not ack the message.

  job_status = {
      'final_status': status,
      'start_time': publish_time,
      'publish_time': publish_time,
      'stop_time': stop_time,
      'num_failures': num_failures,
  }

  # TODO: pass these in the pubsub message and remove this block.
  if not test_type:
    test_type = 'func' if 'func' in test_name else 'conv'
  if not accelerator:
    accelerator = 'tpu-v2-8' if 'v2-8' in test_name else 'tpu-v3-8'
  if not framework_version:
    framework_version = ('pt-nightly' if 'pt-nightly' in test_name
                         else 'tf-nightly')

  handler = CloudMetricsHandler(
      test_name, events_dir, debug_info, metric_collection_config,
      regression_test_config, test_type, accelerator, framework_version,
      logger)

  # Sometimes pubsub messages get delayed. If we've already processed metrics
  # for a different attempt of this test, we need to see if that attempt came
  # before or after the current attempt.
  _, existing_row_publish_time = handler.get_existing_row()
  if existing_row_publish_time:
    # If the current message is for an earlier attempt than the existing row,
    # we can stop early since we want to write metrics for the latest attempt.
    # Otherwise, proceed with processing the current message.
    if publish_time <= existing_row_publish_time:
      return True  # Ack the message.

  # Alert for failing jobs unless the user has explicitly added a config
  # that disables alerts for this test.
  if job_status['final_status'] != job_status_handler.SUCCESS and (
      not regression_test_config or regression_test_config.get(
          'alert_for_failed_jobs', True)):
    logger.error(
        'job_status was `{}` for test `{}`'.format(
            job_status['final_status'], test_name),
        debug_info=debug_info)

  raw_metrics, aggregated_metrics = handler.get_metrics_from_events_dir()
  # BUGFIX: metric_collection_config may be None when only
  # regression_test_config is supplied; guard before calling .get().
  tta_config = (metric_collection_config.get('time_to_accuracy')
                if metric_collection_config else None)
  computed_metrics = metrics.get_computed_metrics(
      raw_metrics, job_status, project, job_name, tta_config=tta_config)
  aggregated_metrics.update(computed_metrics)

  if regression_test_config:
    metrics_history = handler.get_metrics_history_from_bigquery()
    metric_name_to_visual_bounds = handler.compute_bounds_and_report_errors(
        metrics_history, aggregated_metrics, job_status['final_status'])
  else:
    metric_name_to_visual_bounds = None

  handler.add_status_and_metrics_to_bigquery(
      job_status, aggregated_metrics, metric_name_to_visual_bounds)
  return True  # Ack the message.