Example #1
def _HandleInactiveCommand(command, request_id, command_id):
    """Handle an inactive (timed out) command."""
    command_manager.Cancel(request_id,
                           command_id,
                           cancel_reason=common.CancelReason.QUEUE_TIMEOUT)
    request_manager.EvaluateState(request_id)
    metric.RecordCommandTimingMetric(
        cluster_id=command.cluster,
        run_target=command.run_target,
        create_timestamp=command.create_time,
        command_action=metric.CommandAction.CANCEL,
        count=True)
Example #2
def testRecordCommandTimingMetric_givenLatency(self, record, increment):
    metric.RecordCommandTimingMetric(
        cluster_id='cluster',
        run_target='target',
        command_action=metric.CommandAction.INVOCATION_FETCH_BUILD,
        latency_secs=100)
    expected_metric_fields = {
        metric.METRIC_FIELD_CLUSTER: 'cluster',
        metric.METRIC_FIELD_RUN_TARGET: 'target',
        metric.METRIC_FIELD_ACTION: 'INVOCATION_FETCH_BUILD',
        metric.METRIC_FIELD_HOSTNAME: None
    }
    record.assert_called_once_with(100, expected_metric_fields)
    increment.assert_not_called()
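The record and increment parameters imply mock.patch decorators that were stripped along with the test class. A hedged reconstruction of that missing context, assuming the patched objects are a timing distribution and an action counter on the metric module (the attribute names are guesses, not taken from the source; note that the bottom-most patch binds to the first mock argument):

import unittest
from unittest import mock


class RecordCommandTimingMetricTest(unittest.TestCase):

  @mock.patch.object(metric.command_action_count, 'Increment')
  @mock.patch.object(metric.command_timing_distribution, 'Record')
  def testRecordCommandTimingMetric_givenLatency(self, record, increment):
    ...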
Example #3
def LogCommandEventMetrics(command, event):
  """Log metrics related to command events.

  Args:
    command: a command_manager.Command
    event: a command event
  """
  command_event_metric_fields = {
      metric.METRIC_FIELD_HOSTNAME: event.hostname or "",
      metric.METRIC_FIELD_TYPE: event.type or ""
  }
  metric.command_event_type_count.Increment(command_event_metric_fields)
  if (event.type != common.InvocationEventType.INVOCATION_COMPLETED or
      not command):
    return
  if command.start_time:
    metric.RecordCommandTimingMetric(
        cluster_id=command.cluster,
        run_target=command.run_target,
        create_timestamp=command.start_time,
        hostname=event.hostname,
        command_action=metric.CommandAction.INVOCATION_COMPLETED)
  # Invocation timing is only reported on InvocationCompleted events
  try:
    for key in TIMING_DATA_FIELDS_TO_COMMAND_ACTIONS:
      if key not in event.data:
        continue
      latency_secs = float(event.data.get(key)) / 1000
      metric.RecordCommandTimingMetric(
          cluster_id=command.cluster,
          run_target=command.run_target,
          command_action=TIMING_DATA_FIELDS_TO_COMMAND_ACTIONS[key],
          hostname=event.hostname,
          latency_secs=latency_secs)
  except Exception:  # Protect against bad or malformed timing data.
    logging.warning("Failed to report timing metrics.", exc_info=True)
Example #4
def testRecordCommandTimingMetric_withCount(self, record, increment):
    metric.RecordCommandTimingMetric(
        cluster_id='cluster',
        run_target='target',
        command_action=metric.CommandAction.INVOCATION_FETCH_BUILD,
        latency_secs=100,
        hostname='example.mtv',
        count=True)
    expected_metric_fields = {
        metric.METRIC_FIELD_CLUSTER: 'cluster',
        metric.METRIC_FIELD_RUN_TARGET: 'target',
        metric.METRIC_FIELD_ACTION: 'INVOCATION_FETCH_BUILD',
        metric.METRIC_FIELD_HOSTNAME: 'example.mtv'
    }
    record.assert_called_once_with(100, expected_metric_fields)
    increment.assert_called_once_with(expected_metric_fields)
Example #5
def testRecordCommandTimingMetric_awareDatetime(self, record,
                                                mock_datetime):
    mock_datetime.datetime.utcnow.return_value = UTC_NOW
    metric.RecordCommandTimingMetric(
        cluster_id='cluster',
        run_target='target',
        create_timestamp=TIMESTAMP_AWARE,
        command_action=metric.CommandAction.RESCHEDULE)
    expected_latency = 4 * 60 * 60  # 4 hours in seconds
    expected_metric_fields = {
        metric.METRIC_FIELD_CLUSTER: 'cluster',
        metric.METRIC_FIELD_RUN_TARGET: 'target',
        metric.METRIC_FIELD_ACTION: 'RESCHEDULE',
        metric.METRIC_FIELD_HOSTNAME: None
    }
    record.assert_called_once_with(expected_latency,
                                   expected_metric_fields)
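Taken together, the tests pin down the helper's behavior: it records latency_secs into a timing distribution keyed by cluster, run target, action, and hostname; derives the latency from create_timestamp against utcnow() when latency_secs is absent (normalizing timezone-aware datetimes, per Example #5); and bumps a counter only when count=True. A minimal sketch under those assumptions; the field-name constants and metric objects below are stand-ins, not the module's real definitions:

import datetime

# Stand-ins for the metric module's field names and metric objects.
METRIC_FIELD_CLUSTER = 'cluster'
METRIC_FIELD_RUN_TARGET = 'run_target'
METRIC_FIELD_ACTION = 'action'
METRIC_FIELD_HOSTNAME = 'hostname'


class _Distribution(object):
  """Stand-in for the real timing distribution metric."""

  def Record(self, value, fields):
    print('Record(%r, %r)' % (value, fields))


class _Counter(object):
  """Stand-in for the real action counter metric."""

  def Increment(self, fields):
    print('Increment(%r)' % fields)


command_timing_distribution = _Distribution()
command_action_count = _Counter()


def RecordCommandTimingMetric(cluster_id, run_target, command_action,
                              create_timestamp=None, latency_secs=None,
                              hostname=None, count=False):
  """Sketch: record a command timing metric and optionally count it."""
  if latency_secs is None and create_timestamp is not None:
    if create_timestamp.tzinfo:
      # Example #5 passes an aware datetime but expects a plain delta
      # against utcnow(), so normalize to naive UTC first.
      create_timestamp = create_timestamp.astimezone(
          datetime.timezone.utc).replace(tzinfo=None)
    latency_secs = (
        datetime.datetime.utcnow() - create_timestamp).total_seconds()
  fields = {
      METRIC_FIELD_CLUSTER: cluster_id,
      METRIC_FIELD_RUN_TARGET: run_target,
      METRIC_FIELD_ACTION: getattr(command_action, 'name', command_action),
      METRIC_FIELD_HOSTNAME: hostname,
  }
  command_timing_distribution.Record(latency_secs, fields)
  if count:
    command_action_count.Increment(fields)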
Example #6
  def _LeaseHostTasksForCluster(
      self, matcher, cluster, host, num_tasks=None):
    leased_tasks = []
    leasable_tasks = command_task_store.GetLeasableTasks(
        cluster, matcher.GetRunTargets())
    for task in leasable_tasks:
      matched_devices = matcher.Match(task)
      if not matched_devices:
        continue
      try:
        # b/27136167: Touch command to prevent coordinator from cancelling
        # during task lease.
        command = command_manager.Touch(task.request_id, task.command_id)
        # Tasks from GetLeasableTasks are eventually consistent.
        # LeaseTask returns a strongly consistent task, so its data is
        # more accurate.
        leased_task = command_task_store.LeaseTask(task.task_id)
        if not leased_task:
          continue
        data_consistent = self._EnsureCommandConsistency(
            task.request_id, task.command_id, task.task_id)
      except grpc.RpcError as e:
        # Datastore entities can only be written to once per second.  If we fail
        # to update the command or task, log the error, and try leasing other
        # tasks.
        logging.warning("Error leasing task %s: %s", task.task_id, e)
        continue
      if not data_consistent:
        continue

      matcher.RemoveDeviceGroups(matched_devices)

      logging.debug("lease task %s to run on %s",
                    str(leased_task.task_id),
                    tuple(m.device_serial for m in matched_devices))
      plugin_data_dict = copy.copy(leased_task.plugin_data)
      plugin_data_dict[_HOSTNAME_KEY] = host.hostname
      plugin_data_dict[_LAB_NAME_KEY] = host.lab_name or common.UNKNOWN_LAB_NAME
      plugin_data_dict[_HOST_GROUP_KEY] = host.host_group
      if leased_task.schedule_timestamp:
        plugin_data_dict[_TFC_COMMAND_ATTEMPT_QUEUE_START_TIMESTAMP_KEY] = (
            common.DatetimeToAntsTimestampProperty(
                leased_task.schedule_timestamp))
      plugin_data_dict[_TFC_COMMAND_ATTEMPT_QUEUE_END_TIMESTAMP_KEY] = (
          common.DatetimeToAntsTimestampProperty(leased_task.lease_timestamp or
                                                 common.Now()))

      plugin_data_ = api_messages.MapToKeyValuePairMessages(plugin_data_dict)
      leased_tasks.append(
          CommandTask(
              request_id=leased_task.request_id,
              command_id=leased_task.command_id,
              task_id=leased_task.task_id,
              command_line=leased_task.command_line,
              request_type=leased_task.request_type,
              device_serials=[match.device_serial for match in matched_devices],
              shard_count=leased_task.shard_count,
              shard_index=leased_task.shard_index,
              run_index=leased_task.run_index,
              attempt_index=leased_task.attempt_index,
              plugin_data=plugin_data_))
      for run_target in leased_task.run_targets:
        metric.RecordCommandTimingMetric(
            cluster_id=cluster,
            run_target=run_target,
            create_timestamp=command.create_time,
            command_action=metric.CommandAction.LEASE,
            count=True)
      if matcher.IsEmpty():
        break
      if num_tasks is not None and len(leased_tasks) >= num_tasks:
        break
    return leased_tasks
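The leasing loop relies on a matcher with four operations: GetRunTargets() to prefilter leasable tasks, Match(task) to find the device groups a task would occupy, RemoveDeviceGroups() to mark them consumed, and IsEmpty() to stop once the host has nothing left to offer. A hypothetical minimal interface capturing that assumed contract (illustrative only, not the real matcher):

class DeviceMatcher(object):
  """Assumed contract of the matcher passed to _LeaseHostTasksForCluster."""

  def GetRunTargets(self):
    """Returns run target names this host can still serve."""
    raise NotImplementedError

  def Match(self, task):
    """Returns matched device groups for task, or None when nothing fits."""
    raise NotImplementedError

  def RemoveDeviceGroups(self, matched_devices):
    """Marks the given device groups as consumed by a leased task."""
    raise NotImplementedError

  def IsEmpty(self):
    """Returns True when no device groups remain to lease against."""
    raise NotImplementedError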