def setUp(self):
  self.logger = alert_handler.AlertHandler(
      project_id=None,
      write_to_logging=True,
      write_to_error_reporting=False,
      write_to_email=False)
  # The three placeholder strings stand in for the project_id, zone, and
  # cluster_name arguments (see Example #4), which this test never exercises.
  self.handler = job_status_handler.JobStatusHandler(
      "unused", "unused", "unused", self.logger)
Example #2
def setUp(self):
  self.handler = alert_handler.AlertHandler(
      project_id='my-project-id',
      write_to_logging=True,
      write_to_error_reporting=False,
      write_to_email=False  # Skip the init of the real email client.
  )
  self.handler.write_to_email = True  # Enable writing to draft email.
  # The step below normally would have happened while initializing the
  # real email client.
  self.handler.messages_to_email = collections.defaultdict(list)
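A hedged sketch of exercising this fixture; it assumes AlertHandler.error appends to messages_to_email once write_to_email is re-enabled (the defaultdict's key scheme and message shape are assumptions, not shown on this page):

def test_error_is_drafted_for_email(self):
  self.handler.error('something went wrong')
  # Flatten all buckets since the grouping key is unknown here.
  drafted = [m for msgs in self.handler.messages_to_email.values()
             for m in msgs]
  self.assertTrue(any('something went wrong' in str(m) for m in drafted))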
Example #3
def setUp(self):
  self.logger = alert_handler.AlertHandler(
      project_id=None,
      write_to_logging=True,
      write_to_error_reporting=False,
      write_to_email=False)

  self.temp_dir = self.create_tempdir().full_path
  self.summary_writer = tf.summary.create_file_writer(self.temp_dir)

  # Write two scalar series at steps 0 and 100 so the test has events to read.
  with self.summary_writer.as_default():
    tf.summary.scalar("foo", 1, step=0)
    tf.summary.scalar("bar", tf.convert_to_tensor(1), step=0)

    tf.summary.scalar("foo", 2, step=100)
    tf.summary.scalar("bar", tf.convert_to_tensor(2), step=100)

  self.summary_writer.close()

  self.job_status_dict = {
      'job_status': 'SUCCESS',
      'start_time': 1000,
      'stop_time': 2000,
  }
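The events written above can be read back for assertions. A minimal sketch using TensorFlow's summary_iterator (a real TF API; the test name and assertions are illustrative only, and it assumes `import glob` and `import os` at module level):

def test_setup_wrote_expected_tags(self):
  event_file = glob.glob(
      os.path.join(self.temp_dir, 'events.out.tfevents.*'))[0]
  tags = set()
  for event in tf.compat.v1.train.summary_iterator(event_file):
    for value in event.summary.value:
      tags.add(value.tag)
  # "foo" and "bar" were each written at steps 0 and 100 in setUp.
  self.assertLessEqual({'foo', 'bar'}, tags)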
Example #4
import collections
import json
import traceback

import google.api_core.exceptions
import google.auth

from google.cloud import pubsub_v1

# alert_handler, job_status_handler, METRICS_WRITTEN_TOPIC, and
# _process_pubsub_message are defined elsewhere in this module.


def run_main(event, context):
  project_id = google.auth.default()[1]
  logger = alert_handler.AlertHandler(project_id)

  # Retrieve pubsub messages for all the tests that have been kicked off by
  # the test runner.
  subscriber = pubsub_v1.SubscriberClient()
  project = subscriber.project_path(project_id)
  subscription = None
  for s in subscriber.list_subscriptions(project):
    if s.topic.split('/')[-1] == METRICS_WRITTEN_TOPIC:
      subscription = s.name
      break
  if not subscription:
    subscription_path = subscriber.subscription_path(
        project_id, 'metrics-handler-subscription')
    topic = subscriber.topic_path(project_id, METRICS_WRITTEN_TOPIC)
    subscription = subscriber.create_subscription(
        subscription_path, topic, ack_deadline_seconds=300).name
  try:
    all_msgs = subscriber.pull(subscription, 100).received_messages
  except google.api_core.exceptions.DeadlineExceeded:
    logger.info(
        'No messages found for subscription: {}'.format(subscription))
    return

  # Group messages by test. Each test might have made multiple attempts and
  # therefore could have multiple messages.
  test_name_to_msgs = collections.defaultdict(list)
  ids_to_ack = []
  for msg in all_msgs:
    data_str = msg.message.data
    try:
      message_id = msg.message.message_id
      logger.info('Found message_id: {}'.format(message_id))
      data = json.loads(data_str)
      data['publish_time'] = msg.message.publish_time.seconds
      data['ack_id'] = msg.ack_id
      data['message_id'] = message_id
      test_name_to_msgs[data['test_name']].append(data)
    except Exception as e:
      logger.error(
          'Metrics handler encountered an invalid message in pubsub queue '
          'for topic `{}` which led to Exception: {}. This message will '
          'be acknowledged and ignored. The message was: {}'.format(
              METRICS_WRITTEN_TOPIC, e, msg))
      ids_to_ack.append(msg.ack_id)

  # Grab the latest message for each test. We will process only that message
  # and all other messages for that test will be ack'ed without being processed.
  msgs_to_process = []
  for test_name, msgs in test_name_to_msgs.items():
    sorted_msgs = sorted(msgs, key=lambda x: x['publish_time'])
    # Ack every message except the newest; only the newest is processed.
    ids_to_ack.extend([msg['ack_id'] for msg in sorted_msgs[:-1]])
    msgs_to_process.append(sorted_msgs[-1])
  logger.info('Finished deduplicating messages from test runs.')

  # Note: it's good to ack early and often since pubsub will resend messages
  # that are not ack'ed within the queue's deadline.
  if ids_to_ack:
    logger.info('Will ack these ids: {}'.format(ids_to_ack))
    subscriber.acknowledge(subscription, ids_to_ack)
    logger.info('Successful ack for ids: {}'.format(ids_to_ack))

  if not msgs_to_process:
    logger.info('No messages to process. Stopping early.')
    return

  # TODO: Add support for multi-zone and/or multi-cluster setups.
  zone = msgs_to_process[0].get('zone')
  cluster = msgs_to_process[0].get('cluster_name')
  status_handler = job_status_handler.JobStatusHandler(
      project_id, zone, cluster, logger)

  # Handle the metrics for each test. Ack if the process was successful or if
  # the message is permanently invalid. Do not ack if the test is still running
  # so that we will retry again later once that test has finished running.
  for msg in msgs_to_process:
    try:
      logger.info('Pubsub message to process: {}'.format(msg))
      should_ack = _process_pubsub_message(msg, status_handler, logger)
    except Exception:
      logger.error(
          'Encountered exception while attempting to process message {}. '
          'The message will be acknowledged to prevent more crashes. '
          'Exception: {}'.format(msg, traceback.format_exc()))
      should_ack = True
    if should_ack:
      logger.info('Finished processing message. Will ack')
      subscriber.acknowledge(subscription, [msg['ack_id']])
      logger.info('Acknowledged ack_id: {}'.format(msg['ack_id']))
    else:
      logger.info('Finished processing message. Will not ack')
  logger.info('Processed a message for each of the following tests: '
              '{}'.format([x['test_name'] for x in msgs_to_process]))
  logger.send_email()
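_process_pubsub_message is defined elsewhere; from its call site above, its contract is to take one decoded message dict, the JobStatusHandler, and the logger, and return True when the message should be ack'ed. A hypothetical stand-in, for illustration only:

def _process_pubsub_message(msg, status_handler, logger):
  """Hypothetical stub; the real implementation lives elsewhere.

  Returns True if `msg` should be ack'ed (test finished, or the message is
  permanently invalid) and False while the test is still running, so that
  run_main retries on a later invocation.
  """
  logger.info('Processing metrics for test: {}'.format(msg.get('test_name')))
  # The real handler would look up job status via `status_handler` and write
  # metrics; returning True here simply acks every message.
  return True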