Example no. 1
  def test_download_command(self):
    project = 'xl-ml-test'
    cluster = 'xl-ml-test'
    namespace = 'automated'
    pod_name = 'pt-1.5-resnet50-functional-v3-8-1584453600'
    zone = 'us-central1-b'
    self.assertEqual(
        util.download_command(pod_name, namespace, zone, cluster, project),
        VALID_DOWNLOAD_COMMAND)
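The constant VALID_DOWNLOAD_COMMAND is defined elsewhere in the test module, so the assertion above only pins down the helper's call signature. As a hedged illustration only, not the repository's actual implementation, a download_command helper with that signature could assemble a single shell command for fetching the pod's logs, for example:

# Hypothetical sketch; the real util.download_command may build a different
# command. It only shows how the five arguments used in the test above
# could be combined into one shell string.
def download_command(pod_name, namespace, zone, cluster, project):
  return ('gcloud container clusters get-credentials {cluster} '
          '--zone {zone} --project {project} && '
          'kubectl logs {pod} --namespace={namespace}'.format(
              cluster=cluster, zone=zone, project=project,
              pod=pod_name, namespace=namespace))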
Example no. 2
# Imports needed by this excerpt; the other names used below (util, metrics,
# alert_handler, job_status_handler, CloudMetricsHandler, MIN_MSG_AGE_SEC)
# are defined elsewhere in the handler's package.
import time

import google.auth


def _process_pubsub_message(msg, status_handler, logger):
  publish_time = msg['publish_time']
  msg_age_sec = time.time() - publish_time
  if msg_age_sec < MIN_MSG_AGE_SEC:
    logger.warning('Message was {} seconds old, which is less than the '
                   'minimum of {}. Skipping for now but will retry on '
                   'the next run.'.format(msg_age_sec, MIN_MSG_AGE_SEC))
    return False  # Do not ack the message.
  events_dir = msg.get('model_dir')
  test_name = msg.get('test_name')
  logs_link = util.add_unbound_time_to_logs_link(msg.get('logs_link', ''))
  metric_collection_config = msg.get('metric_collection_config')
  regression_test_config = msg.get('regression_test_config')
  job_name = msg.get('job_name')
  job_namespace = msg.get('job_namespace')
  test_type = msg.get('test_type')
  accelerator = msg.get('accelerator')
  framework_version = msg.get('framework_version')
  zone = msg.get('zone')
  cluster = msg.get('cluster_name')
  project = google.auth.default()[1]
  download_command = util.download_command(
      job_name, job_namespace, zone, cluster, project)
  workload_link = util.workload_link(
      job_name, job_namespace, zone, cluster, project)
  debug_info = alert_handler.DebugInfo(
      job_name, logs_link, download_command, workload_link)

  if not (events_dir and test_name and logs_link and job_name and zone
          and cluster and project):
    raise ValueError('Pubsub message must contain 7 required fields: '
                     'events_dir, test_name, logs_link, job_name, '
                     'zone, cluster, project. Message was: {}'.format(msg))
  if not regression_test_config and not metric_collection_config:
    raise ValueError('metric_collection_config and regression_test_config '
                     'were both null; stopping early. See README for '
                     'documentation on writing these configs.')

  status, stop_time, num_failures = status_handler.get_job_status(
      job_name, job_namespace)
  if status == job_status_handler.UNKNOWN_STATUS:
    logger.warning(
        'Unknown status for job_name: {}. Message will be '
        'retried later.'.format(job_name))
    return False  # Do not ack the message.
  elif status == job_status_handler.DOES_NOT_EXIST:
    if msg_age_sec >= 60 * 60 * 24:  # Message is more than one day old.
      logger.warning(
          'Job with job_name: {} no longer exists in Kubernetes. Message '
          'will be acknowledged.'.format(job_name))
      return True  # Ack the message.
    else:
      logger.warning(
          'Job with job_name: {} not found in Kubernetes. Message '
          'will be retried later.'.format(job_name))
      return False  # Do not ack the message.
  job_status = {
      'final_status': status,
      'start_time': publish_time,
      'publish_time': publish_time,
      'stop_time': stop_time,
      'num_failures': num_failures,
  }

  # TODO: pass these in the pubsub message and remove this block.
  if not test_type:
    test_type = 'func' if 'func' in test_name else 'conv'
  if not accelerator:
    accelerator = 'tpu-v2-8' if 'v2-8' in test_name else 'tpu-v3-8'
  if not framework_version:
    framework_version = 'pt-nightly' if 'pt-nightly' in test_name \
        else 'tf-nightly'

  handler = CloudMetricsHandler(
      test_name, events_dir, debug_info, metric_collection_config,
      regression_test_config, test_type, accelerator, framework_version, logger)

  # Sometimes pubsub messages get delayed. If we've already processed metrics
  # for a different attempt of this test, we need to see if that attempt came
  # before or after the current attempt.
  existing_row_uuid, existing_row_publish_time = handler.get_existing_row()
  if existing_row_publish_time:
    # If the current message is for an earlier attempt than the existing row,
    # we can stop early since we want to write metrics for the latest attempt.
    # Otherwise, proceed with processing the current message.
    if publish_time <= existing_row_publish_time:
      return True  # Ack the message.

  # Alert for failing jobs unless the user has explicitly added a config
  # that disables alerts for this test.
  if job_status['final_status'] != job_status_handler.SUCCESS and (
      not regression_test_config or regression_test_config.get(
          'alert_for_failed_jobs', True)):
    logger.error(
        'job_status was `{}` for test `{}`'.format(
            job_status['final_status'], test_name),
        debug_info=debug_info)

  raw_metrics, aggregated_metrics = handler.get_metrics_from_events_dir()
  computed_metrics = metrics.get_computed_metrics(
      raw_metrics, job_status, project, job_name,
      # metric_collection_config may be None when only a regression config
      # was supplied; guard the lookup.
      tta_config=(metric_collection_config or {}).get('time_to_accuracy'))
  aggregated_metrics.update(computed_metrics)
  if regression_test_config:
    metrics_history = handler.get_metrics_history_from_bigquery()
    metric_name_to_visual_bounds = handler.compute_bounds_and_report_errors(
        metrics_history, aggregated_metrics, job_status['final_status'])
  else:
    metric_name_to_visual_bounds = None

  handler.add_status_and_metrics_to_bigquery(
      job_status, aggregated_metrics, metric_name_to_visual_bounds)
  return True  # Ack the message.
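The return value maps directly onto Pub/Sub acknowledgement: True means the message is finished (or obsolete) and can be acked, False means it should be redelivered on a later pull. Below is a hedged sketch of a driver built on that contract, assuming the payload is a JSON object carrying the fields read above and that logger is the project's alert-aware logger (it must accept the debug_info keyword used earlier); the subscription path and status_handler are supplied by the caller and are not defined here.

import json

from google.cloud import pubsub_v1


def listen(subscription_path, status_handler, logger):
  # Hypothetical driver, not the repository's actual entry point.
  subscriber = pubsub_v1.SubscriberClient()

  def callback(message):
    msg = json.loads(message.data.decode('utf-8'))
    # _process_pubsub_message expects a unix timestamp; fall back to the
    # Pub/Sub publish time if the payload does not carry one.
    msg.setdefault('publish_time', message.publish_time.timestamp())
    if _process_pubsub_message(msg, status_handler, logger):
      message.ack()   # Done or obsolete.
    else:
      message.nack()  # Too new or job status unknown: retry later.

  # subscribe() returns a StreamingPullFuture; block until it terminates.
  subscriber.subscribe(subscription_path, callback=callback).result()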