Code Example #1
    def _scrubDataSamples(self, data, metricID, conn, lastDataRowID):
        """ Filter out metric data samples that are out of order or have duplicate
    timestamps.

    :param data: A sequence of data samples; each data sample is a pair:
                  (datetime.datetime, float)
    :param metricID: unique metric id
    :param sqlalchemy.engine.Connection conn: A sqlalchemy connection object
    :param lastDataRowID: last metric data row identifier for metric with given
      metric id

    :returns: a (possibly empty) sequence of metric data samples that passed
      the scrubbing.
    :rtype: sequence of pairs: (datetime.datetime, float)
    """
        passingSamples = []
        rejectedDataTimestamps = []
        prevSampleTimestamp = self._getTailMetricRowTimestamp(
            conn, metricID, lastDataRowID)
        for sample in data:
            timestamp, metricValue = sample
            # Filter out those whose timestamp is not newer than previous sample's
            if (prevSampleTimestamp is not None
                    and timestamp < prevSampleTimestamp):
                # Reject it; this could be the result of an unordered sample feed or
                # concurrent feeds of samples for the same metric
                # TODO: unit-test
                rejectedDataTimestamps.append(timestamp)
                self._log.error(
                    "Rejected input sample older than previous ts=%s (%s): "
                    "metric=%s; rejectedTs=%s (%s); rejectedValue=%s",
                    prevSampleTimestamp,
                    epochFromNaiveUTCDatetime(prevSampleTimestamp), metricID,
                    timestamp, epochFromNaiveUTCDatetime(timestamp),
                    metricValue)
            elif timestamp == prevSampleTimestamp:
                # Reject it; this could be the result of guaranteed delivery via message
                # publish retry following transient connection loss with the message bus
                self._log.error(
                    "Rejected input sample with duplicate ts=%s (%s): "
                    "metric=%s; rejectedValue=%s", prevSampleTimestamp,
                    epochFromNaiveUTCDatetime(prevSampleTimestamp), metricID,
                    metricValue)
                rejectedDataTimestamps.append(timestamp)
            else:
                passingSamples.append(sample)
                prevSampleTimestamp = timestamp

        if rejectedDataTimestamps:
            # TODO: unit-test
            self._log.error(
                "Rejected input rows: metric=%s; numRejected=%d; "
                "rejectedRange=[%s..%s]", metricID,
                len(rejectedDataTimestamps), min(rejectedDataTimestamps),
                max(rejectedDataTimestamps))

        return passingSamples
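
The scrubbing rule above does not depend on the database plumbing. Below is a minimal standalone sketch (the helper name is hypothetical, not from the source) of the same filter: a sample is kept only if its timestamp is strictly newer than the last accepted one.

from datetime import datetime, timedelta

def scrubSamples(samples, prevTimestamp=None):
  """samples: iterable of (datetime, float) pairs; returns the accepted pairs."""
  accepted = []
  for timestamp, value in samples:
    if prevTimestamp is not None and timestamp <= prevTimestamp:
      continue  # out-of-order or duplicate timestamp; the service logs and rejects these
    accepted.append((timestamp, value))
    prevTimestamp = timestamp
  return accepted

base = datetime(2015, 3, 20, 0, 0, 0)
raw = [(base, 1.0),
       (base + timedelta(minutes=5), 2.0),
       (base + timedelta(minutes=5), 2.5),   # duplicate timestamp -> rejected
       (base + timedelta(minutes=1), 3.0),   # older than previous -> rejected
       (base + timedelta(minutes=10), 4.0)]
assert len(scrubSamples(raw)) == 3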
Code Example #2
  def _scrubDataSamples(self, data, metricID, conn, lastDataRowID):
    """ Filter out metric data samples that are out of order or have duplicate
    timestamps.

    :param data: A sequence of data samples; each data sample is a pair:
                  (datetime.datetime, float)
    :param metricID: unique metric id
    :param sqlalchemy.engine.Connection conn: A sqlalchemy connection object
    :param lastDataRowID: last metric data row identifier for metric with given
      metric id

    :returns: a (possibly empty) sequence of metric data samples that passed
      the scrubbing.
    :rtype: sequence of pairs: (datetime.datetime, float)
    """
    passingSamples = []
    rejectedDataTimestamps = []
    prevSampleTimestamp = self._getTailMetricRowTimestamp(conn, metricID,
                                                          lastDataRowID)
    for sample in data:
      timestamp, metricValue = sample
      # Filter out those whose timestamp is not newer than previous sample's
      if prevSampleTimestamp is not None and timestamp < prevSampleTimestamp:
        # Reject it; this could be the result of an unordered sample feed or
        # concurrent feeds of samples for the same metric
        # TODO: unit-test
        rejectedDataTimestamps.append(timestamp)
        self._log.error(
          "Rejected input sample older than previous ts=%s (%s): "
          "metric=%s; rejectedTs=%s (%s); rejectedValue=%s",
          prevSampleTimestamp, epochFromNaiveUTCDatetime(prevSampleTimestamp),
          metricID, timestamp, epochFromNaiveUTCDatetime(timestamp),
          metricValue)
      elif timestamp == prevSampleTimestamp:
        # Reject it; this could be the result of guaranteed delivery via message
        # publish retry following transient connection loss with the message bus
        self._log.error(
          "Rejected input sample with duplicate ts=%s (%s): "
          "metric=%s; rejectedValue=%s",
          prevSampleTimestamp, epochFromNaiveUTCDatetime(prevSampleTimestamp),
          metricID, metricValue)
        rejectedDataTimestamps.append(timestamp)
      else:
        passingSamples.append(sample)
        prevSampleTimestamp = timestamp

    if rejectedDataTimestamps:
      # TODO: unit-test
      self._log.error("Rejected input rows: metric=%s; numRejected=%d; "
                      "rejectedRange=[%s..%s]",
                      metricID, len(rejectedDataTimestamps),
                      min(rejectedDataTimestamps), max(rejectedDataTimestamps))

    return passingSamples
Code Example #3
    def testEpochFromNaiveUTCDatetime(self):
        self.assertEqual(
            date_time_utils.epochFromNaiveUTCDatetime(
                datetime.utcfromtimestamp(0)), 0)

        self.assertEqual(
            date_time_utils.epochFromNaiveUTCDatetime(
                datetime.utcfromtimestamp(1426880474.306222)),
            1426880474.306222)

        self.assertEqual(
            date_time_utils.epochFromNaiveUTCDatetime(
                datetime.utcfromtimestamp(1426880474)), 1426880474)
Code Example #4
  def testEpochFromNaiveUTCDatetime(self):
    self.assertEqual(
      date_time_utils.epochFromNaiveUTCDatetime(datetime.utcfromtimestamp(0)),
      0)

    self.assertEqual(
      date_time_utils.epochFromNaiveUTCDatetime(
        datetime.utcfromtimestamp(1426880474.306222)),
      1426880474.306222)

    self.assertEqual(
      date_time_utils.epochFromNaiveUTCDatetime(
        datetime.utcfromtimestamp(1426880474)),
      1426880474)
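
The two tests above exercise epochFromNaiveUTCDatetime as the inverse of datetime.utcfromtimestamp. Below is a minimal sketch of an implementation that satisfies those assertions; it is an assumption for illustration, and the real nta.utils.date_time_utils implementation may differ.

import calendar
from datetime import datetime

def epochFromNaiveUTCDatetimeSketch(dt):
  """Seconds since the UNIX epoch for an offset-naive UTC datetime,
  preserving fractional seconds."""
  return calendar.timegm(dt.utctimetuple()) + dt.microsecond / 1e6

assert epochFromNaiveUTCDatetimeSketch(datetime.utcfromtimestamp(0)) == 0
assert epochFromNaiveUTCDatetimeSketch(
  datetime.utcfromtimestamp(1426880474)) == 1426880474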
Code Example #5
    def testModelResultHandlerSkipsStaleBatch(
        self, _amqpUtilsMock, deserializeModelResult, connectDynamoDB, _gracefulCreateTable
    ):
        """ Given a stale batch of model inference results, verify that it isn't
    saved to DynamoDB
    """

        # We're going to mostly mock out all of the arguments to
        # DynamoDBService.messageHandler() since it is normally called by amqp lib.
        # Then simulate the process of handling an inbound batch of model inference
        # results and assert that the appropriate put_item() calls are made at the
        # other end.

        message = amqp.messages.ConsumerMessage(
            body=Mock(),
            properties=Mock(headers=dict()),
            methodInfo=amqp.messages.MessageDeliveryInfo(
                consumerTag=Mock(), deliveryTag=Mock(), redelivered=False, exchange=Mock(), routingKey=""
            ),
            ackImpl=Mock(),
            nackImpl=Mock(),
        )

        # We will have to bypass the normal serialize/deserialize phases to avoid
        # dependency on sqlalchemy rowproxy.  Instead, we'll just mock out the
        # AnomalyService.deserializeModelResult() call, returning an object that
        # approximates a batch of model inference results as much as possible

        ts = epochFromNaiveUTCDatetime(
            datetime.utcnow().replace(microsecond=0) - timedelta(days=DynamoDBService._FRESH_DATA_THRESHOLD_DAYS + 1)
        )

        resultRow = dict(rowid=4790, ts=ts, value=9305.0, rawAnomaly=0.775, anomaly=0.999840891)

        metricId = "3b035a5916994f2bb950f5717138f94b"

        deserializeModelResult.return_value = dict(
            metric=dict(
                uid=metricId,
                name="XIGNITE.AGN.VOLUME",
                description="XIGNITE.AGN.VOLUME",
                resource="Resource-of-XIGNITE.AGN.VOLUME",
                location="",
                datasource="custom",
                spec=dict(userInfo=dict(symbol="AGN", metricType="StockVolume", metricTypeName="Stock Volume")),
            ),
            results=[resultRow],
        )

        service = DynamoDBService()
        publishMetricDataPatch = patch.object(service, "_publishMetricData", spec_set=service._publishMetricData)
        publishInstancePatch = patch.object(
            service, "_publishInstanceDataHourly", spec_set=service._publishInstanceDataHourly
        )
        with publishMetricDataPatch as publishMetricDataMock, publishInstancePatch as publishInstanceMock:
            service.messageHandler(message)

            deserializeModelResult.assert_called_once_with(message.body)
            self.assertEqual(publishMetricDataMock.call_count, 0)
            self.assertEqual(publishInstanceMock.call_count, 0)
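
The behavior under test is the staleness guard: a batch whose results are older than DynamoDBService._FRESH_DATA_THRESHOLD_DAYS is dropped without any put_item calls. A hedged sketch of one plausible form of that check follows; the threshold value and the helper name are assumptions, not taken from the source.

import time

FRESH_DATA_THRESHOLD_DAYS = 90  # placeholder value; the real constant is not shown in this listing

def isFreshResultRow(row, now=None):
  """row: dict with a "ts" key in epoch seconds, like resultRow above."""
  now = time.time() if now is None else now
  return row["ts"] >= now - FRESH_DATA_THRESHOLD_DAYS * 24 * 3600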
Code Example #6
    def testPublishMetricDataWithDuplicateKeys(self, connectDynamoDB,
                                               _gracefulCreateTable):
        """ Test for elimination of rows with duplicate keys by _publishMetricData
    """
        metricId = "3b035a5916994f2bb950f5717138f94b"

        rowTemplate = dict(rowid=99,
                           ts=epochFromNaiveUTCDatetime(
                               datetime(2015, 3, 20, 0, 46, 28)),
                           value=10305.0,
                           rawAnomaly=0.275,
                           anomaly=0.999840891)

        row1 = dict(rowTemplate)
        row2 = dict(rowTemplate)
        row2["rowid"] = row1["rowid"] + 1
        rows = [row1, row2]

        service = DynamoDBService()

        service._publishMetricData(metricId, rows)

        data = dynamodb_service.convertInferenceResultRowToMetricDataItem(
            metricId, row1)
        mockPutItem = (service._metric_data.batch_write.return_value.__enter__.
                       return_value.put_item)
        mockPutItem.assert_called_once_with(data=data._asdict(),
                                            overwrite=True)
Code Example #7
    def _loadAndSendData(self, sock, filePath, metricName):
        """
    Returns the list of labels from the csv at filePath. Date and value
    fields are sent to the metric specified. As a side effect this
    creates the metric.

    :param sock: A connected socket object
    :param filePath: The csv with data to handle
    :param metricName: The target custom metric we will send data to
    """
        labels = []
        for (dttm, value, label) in self._loadDataGen(filePath):
            # Parse date string
            dttm = parsedate(dttm)
            # Convert to seconds since epoch (Graphite wants this)
            dttm = epochFromNaiveUTCDatetime(dttm)
            dttm = int(dttm)

            # Add data
            sock.sendall("%s %r %s\n" % (metricName, float(value), dttm))

            # Save the label for use later
            # Convert strings to appropriate numerical type
            try:
                labels.append(int(label))
            except ValueError:
                labels.append(float(label))

        self.gracefullyCloseSocket(sock)

        return labels
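
Each sample is written to the socket in a plaintext line format: metric name, value, and integer epoch seconds separated by spaces, terminated by a newline. The sketch below builds the same line without a socket, substituting calendar.timegm for the parsedate/epochFromNaiveUTCDatetime pair; the helper name is hypothetical.

from calendar import timegm
from datetime import datetime

def formatSampleLine(metricName, value, dttm):
  """Render one sample line: "<metric> <value> <epochSeconds>" plus a trailing newline."""
  epochSeconds = int(timegm(dttm.utctimetuple()))
  return "%s %r %s\n" % (metricName, float(value), epochSeconds)

line = formatSampleLine("TEST.METRIC", "42.5", datetime(2015, 3, 20, 0, 46, 28))
assert line == "TEST.METRIC 42.5 1426812388\n"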
Code Example #8
  def _loadAndSendData(self, sock, filePath, metricName):
    """
    Returns the list of labels from the csv at filePath. Date and value
    fields are sent to the metric specified. As a side effect this
    creates the metric.

    :param sock: A connected socket object
    :param filePath: The csv with data to handle
    :param metricName: The target custom metric we will send data to
    """
    labels = []
    for (dttm, value, label) in self._loadDataGen(filePath):
      # Parse date string
      dttm = parsedate(dttm)
      # Convert to seconds since epoch (Graphite wants this)
      dttm = epochFromNaiveUTCDatetime(dttm)
      dttm = int(dttm)

      #LOGGER.info("{TAG:CLIENT.METRIC} metric=%s:%s:%s", metricName, dttm,
      #            value)

      # Add data
      sock.sendall("%s %r %s\n" % (metricName, float(value), dttm))

      # Save the label for use later
      # Convert strings to appropriate numerical type
      try:
        labels.append(int(label))
      except ValueError:
        labels.append(float(label))

    self.gracefullyCloseSocket(sock)

    return labels
Code Example #9
 def validateResultRow(resultRow, inputRow):
   self.assertEqual(resultRow["rowid"], inputRow.rowid)
   self.assertEqual(resultRow["ts"],
                    epochFromNaiveUTCDatetime(inputRow.timestamp))
   self.assertEqual(resultRow["value"], inputRow.metric_value)
   self.assertEqual(resultRow["rawAnomaly"], inputRow.raw_anomaly_score)
   self.assertEqual(resultRow["anomaly"], inputRow.anomaly_score)
Code Example #10
  def _composeModelInferenceResultsMessage(cls, metricRow, dataRows):
    """ Create a message body for publishing from the result of
    _processModelInferenceResults

    :param metricRow: Metric instance corresponding to the given metricID
    :param dataRows: a sequence of MutableMetricDataRow instances
      corresponding to the updated metric_data rows.
    :returns: JSON-ifiable dict conforming to
      model_inference_results_msg_schema.json
    :rtype: dict
    """
    return dict(
      metric=dict(
        uid=metricRow.uid,
        name=metricRow.name,
        description=metricRow.description,
        resource=metricRow.server,
        location=metricRow.location,
        datasource=metricRow.datasource,
        spec=json.loads(metricRow.parameters)["metricSpec"]
      ),

      results=[
        dict(
          rowid=row.rowid,
          ts=epochFromNaiveUTCDatetime(row.timestamp),
          value=row.metric_value,
          rawAnomaly=row.raw_anomaly_score,
          anomaly=row.anomaly_score,
          multiStepBestPredictions=row.multi_step_best_predictions
        )
        for row in dataRows
      ]
    )
Code Example #11
  def testPublishMetricDataWithDuplicateKeys(self, connectDynamoDB,
                                             _gracefulCreateTable):
    """ Test for elimination of rows with duplicate keys by _publishMetricData
    """
    metricId = "3b035a5916994f2bb950f5717138f94b"

    rowTemplate = dict(
      rowid=99,
      ts=epochFromNaiveUTCDatetime(datetime(2015, 3, 20, 0, 46, 28)),
      value=10305.0,
      rawAnomaly=0.275,
      anomaly=0.999840891
    )

    row1 = dict(rowTemplate)
    row2 = dict(rowTemplate)
    row2["rowid"] = row1["rowid"] + 1
    rows = [row1, row2]

    service = DynamoDBService()

    service._publishMetricData(metricId, rows)

    data = dynamodb_service.convertInferenceResultRowToMetricDataItem(metricId,
                                                                      row1)
    mockPutItem = (service._metric_data.batch_write.return_value.__enter__
                   .return_value.put_item)
    mockPutItem.assert_called_once_with(data=data._asdict(), overwrite=True)
Code Example #12
File: anomaly_service.py Project: bopopescu/what
    def _composeModelInferenceResultsMessage(cls, metricRow, dataRows):
        """ Create a message body for publishing from the result of
    _processModelInferenceResults

    :param metricRow: Metric instance corresponding to the given metricID
    :param dataRows: a sequence of MutableMetricDataRow instances
      corresponding to the updated metric_data rows.
    :returns: JSON-ifiable dict conforming to
      model_inference_results_msg_schema.json
    :rtype: dict
    """
        return dict(metric=dict(uid=metricRow.uid,
                                name=metricRow.name,
                                description=metricRow.description,
                                resource=metricRow.server,
                                location=metricRow.location,
                                datasource=metricRow.datasource,
                                spec=json.loads(
                                    metricRow.parameters)["metricSpec"]),
                    results=[
                        dict(rowid=row.rowid,
                             ts=epochFromNaiveUTCDatetime(row.timestamp),
                             value=row.metric_value,
                             rawAnomaly=row.raw_anomaly_score,
                             anomaly=row.anomaly_score) for row in dataRows
                    ])
Code Example #13
 def validateResultRow(resultRow, inputRow):
     self.assertEqual(resultRow["rowid"], inputRow.rowid)
     self.assertEqual(resultRow["ts"],
                      epochFromNaiveUTCDatetime(inputRow.timestamp))
     self.assertEqual(resultRow["value"], inputRow.metric_value)
     self.assertEqual(resultRow["rawAnomaly"],
                      inputRow.raw_anomaly_score)
     self.assertEqual(resultRow["anomaly"], inputRow.anomaly_score)
Code Example #14
  def _encodeDateTime(cls, dateTime):
    """ Encode a datetime instance for serialization. This encoder is non-lossy.

    :param dateTime: a datetime.datetime instance to encode

    :returns: an opaque datetime state value suitable for use in
      ModelInputRow.__getstate__().
    """
    return [int(epochFromNaiveUTCDatetime(dateTime)), dateTime.microsecond]
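
The encoder above stores a datetime as [whole epoch seconds, microseconds], so nothing is lost to truncation. Below is a round-trip sketch with a hypothetical decoder; the real __setstate__ counterpart is not shown in this listing.

import calendar
from datetime import datetime

def _decodeDateTimeSketch(state):
  wholeSeconds, microsecond = state
  return datetime.utcfromtimestamp(wholeSeconds).replace(microsecond=microsecond)

original = datetime(2015, 3, 20, 0, 46, 28, 306222)
# Mirrors what _encodeDateTime produces for post-1970 timestamps:
# int() drops the fractional part carried by epochFromNaiveUTCDatetime.
state = [calendar.timegm(original.utctimetuple()), original.microsecond]
assert _decodeDateTimeSketch(state) == original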
Code Example #15
    def _encodeDateTime(cls, dateTime):
        """ Encode a datetime instance for serialization. This encoder is non-lossy.

    :param dateTime: a datetime.datetime instance to encode

    :returns: an opaque datetime state value suitable for use in
      ModelInputRow.__getstate__().
    """
        return [int(epochFromNaiveUTCDatetime(dateTime)), dateTime.microsecond]
Code Example #16
 def _constructSortKey(agg_ts):
   """ Construct an initial sort key by converting agg_ts to an epoch time,
   and multiply it by some power of 10.  On update, the key will be
   incremented by one atomically.  The original range queries will be
   preserved while allowing a response with tweets sorted by popularity
   within the time range bucket
   """
   ts = epochFromNaiveUTCDatetime(datetime.strptime(agg_ts.partition(".")[0],
                                                    "%Y-%m-%dT%H:%M:%S"))
   return int(ts * 1e5)
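
A self-contained restatement of that sort-key computation for a single agg_ts value, using calendar.timegm in place of epochFromNaiveUTCDatetime:

import calendar
from datetime import datetime

agg_ts = "2015-02-19T19:43:24.870118"  # same shape as the tweet agg_ts fields used elsewhere
dt = datetime.strptime(agg_ts.partition(".")[0], "%Y-%m-%dT%H:%M:%S")
sortKey = int(calendar.timegm(dt.utctimetuple()) * 1e5)
# Popularity updates later increment sortKey by 1, so buckets stay ordered by
# time while ties within a bucket sort by popularity.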
Code Example #17
File: metric_utils.py Project: maparco/numenta-apps
def aggTimestampFromSampleTimestamp(sampleDatetime, aggRefDatetime, aggSec):
  """ Compute aggregation timestamp from the sample's timestamp as the lower
  aggregation boundary relative to the given reference.

  :param datetime sampleDatetime: offset-naive UTC timestamp of the sample (
    e.g., created_at property of a tweet)
  :param datetime aggRefDatetime: offset-naive UTC reference aggregation
    timestamp belonging to the sample stream; may precede, follow, or be equal
    to sampleDatetime
  :param int aggSec: the corresponding metric's aggregation period in seconds

  :returns: offset-naive UTC timestamp of the aggregation period that the sample
    belongs to, which is the bottom boundary of its aggregation window. E.g.,
      sample="2015-02-20 2:14:00", ref="2015-02-20 2:00:00", aggSec=300 (5min)
        would return "2015-02-20 2:10:00"
      sample="2015-02-20 2:14:00", ref="2015-02-20 2:20:00", aggSec=300 (5min)
        would return "2015-02-20 2:10:00"
      sample="2015-02-20 2:15:00", ref="2015-02-20 2:15:00", aggSec=300 (5min)
        would return "2015-02-20 2:15:00"
  :rtype: datetime
  """
  sampleEpoch = date_time_utils.epochFromNaiveUTCDatetime(sampleDatetime)
  aggRefEpoch = date_time_utils.epochFromNaiveUTCDatetime(aggRefDatetime)

  deltaSec = sampleEpoch - aggRefEpoch
  if deltaSec >= 0:
    # Sample timestamp equals or follows reference
    deltaAggIntervalSec = (deltaSec // aggSec) * aggSec
    aggEpoch = aggRefEpoch + deltaAggIntervalSec
  else:
    # Sample timestamp precedes reference

    # Back up to beginning of aggregation window
    deltaAggIntervalSec = ((abs(deltaSec) + (aggSec - 1)) // aggSec) * aggSec
    aggEpoch = aggRefEpoch - deltaAggIntervalSec


  return datetime.utcfromtimestamp(aggEpoch)
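
Assuming the function above is importable, the worked examples from its docstring can be checked directly:

from datetime import datetime

assert aggTimestampFromSampleTimestamp(
  datetime(2015, 2, 20, 2, 14, 0), datetime(2015, 2, 20, 2, 0, 0),
  aggSec=300) == datetime(2015, 2, 20, 2, 10, 0)
assert aggTimestampFromSampleTimestamp(
  datetime(2015, 2, 20, 2, 14, 0), datetime(2015, 2, 20, 2, 20, 0),
  aggSec=300) == datetime(2015, 2, 20, 2, 10, 0)
assert aggTimestampFromSampleTimestamp(
  datetime(2015, 2, 20, 2, 15, 0), datetime(2015, 2, 20, 2, 15, 0),
  aggSec=300) == datetime(2015, 2, 20, 2, 15, 0)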
Code Example #18
def aggTimestampFromSampleTimestamp(sampleDatetime, aggRefDatetime, aggSec):
    """ Compute aggregation timestamp from the sample's timestamp as the lower
  aggregation boundary relative to the given reference.

  :param datetime sampleDatetime: offset-naive UTC timestamp of the sample (
    e.g., created_at property of a tweet)
  :param datetime aggRefDatetime: offset-naive UTC reference aggregation
    timestamp belonging to the sample stream; may precede, follow, or be equal
    to sampleDatetime
  :param int aggSec: the corresponding metric's aggregation period in seconds

  :returns: offset-naive UTC timestamp of the aggregation period that the sample
    belongs to, which is the bottom boundary of its aggregation window. E.g.,
      sample="2015-02-20 2:14:00", ref="2015-02-20 2:00:00", aggSec=300 (5min)
        would return "2015-02-20 2:10:00"
      sample="2015-02-20 2:14:00", ref="2015-02-20 2:20:00", aggSec=300 (5min)
        would return "2015-02-20 2:10:00"
      sample="2015-02-20 2:15:00", ref="2015-02-20 2:15:00", aggSec=300 (5min)
        would return "2015-02-20 2:15:00"
  :rtype: datetime
  """
    sampleEpoch = date_time_utils.epochFromNaiveUTCDatetime(sampleDatetime)
    aggRefEpoch = date_time_utils.epochFromNaiveUTCDatetime(aggRefDatetime)

    deltaSec = sampleEpoch - aggRefEpoch
    if deltaSec >= 0:
        # Sample timestamp equals or follows reference
        deltaAggIntervalSec = (deltaSec // aggSec) * aggSec
        aggEpoch = aggRefEpoch + deltaAggIntervalSec
    else:
        # Sample timestamp precedes reference

        # Back up to beginning of aggregation window
        deltaAggIntervalSec = ((abs(deltaSec) +
                                (aggSec - 1)) // aggSec) * aggSec
        aggEpoch = aggRefEpoch - deltaAggIntervalSec

    return datetime.utcfromtimestamp(aggEpoch)
Code Example #19
def _forwardNewsVolumeMetrics(metricSpecs,
                              lastEmittedAggTime,
                              stopDatetime,
                              periodSec,
                              metricDestAddr):
  """ Query news volume metrics since the given last emitted timestamp through
  stopDatetime and forward them to htmengine's Metric Listener. Update the
  datetime of the last successfully-emitted news volume metric batch in the
  database.

  NOTE: forwarding will be aborted upon failure to connect to Metric Listener. In
    this case, an error will be logged, and the function will return the UTC
    timestamp of the last successfully-emitted sample aggregation interval. Once
    Metric Listener comes online, a subsequent call to this function will catch
    up by forwarding the stored samples since last successful emission.

  :param metricSpecs: a sequence of NewsVolumeMetricSpec objects corresponding to
    the metrics to be emitted
  :param lastEmittedAggTime: UTC datetime of last successfully-emitted sample
    batch
  :param stopDatetime: non-inclusive upper bound UTC datetime for forwarding
  :param periodSec: aggregation period in seconds
  :param metricDestAddr: two-tuple (metricDestHost, metricDestPort)
  :returns: UTC timestamp of the last successfully-emitted sample batch.
  :rtype: datetime.datetime
  """
  periodTimedelta = timedelta(seconds=periodSec)
  aggStartDatetime = lastEmittedAggTime + periodTimedelta
  while aggStartDatetime < stopDatetime:
    # Get News Volume metrics for one aggregation interval
    aggStopDatetime = aggStartDatetime + periodTimedelta
    symbolToNewsVolumeMap = defaultdict(
      int,
      _queryNewsVolumes(aggStartDatetime, aggStopDatetime))

    # Generate metric samples
    epochTimestamp = date_time_utils.epochFromNaiveUTCDatetime(aggStartDatetime)
    samples = tuple(
      dict(
        metricName=spec.metric,
        value=symbolToNewsVolumeMap[spec.symbol],
        epochTimestamp=epochTimestamp)
      for spec in metricSpecs
    )

    # Emit samples to Metric Listener
    try:
      with metric_utils.metricDataBatchWrite(log=g_log) as putSample:
        for sample in samples:
          putSample(**sample)
    except Exception:
      g_log.exception("Failure while emitting metric data for agg=%s "
                      "containing numSamples=%d",
                      aggStartDatetime, len(samples))
      return lastEmittedAggTime
    else:
      g_log.info("Forwarded numSamples=%d for agg=%s",
                 len(samples), aggStartDatetime)

    # Update db with last successfully-emitted datetime
    metric_utils.updateLastEmittedSampleDatetime(
      key=_EMITTED_NEWS_VOLUME_SAMPLE_TRACKER_KEY,
      sampleDatetime=aggStartDatetime)

    # Set up for next iteration
    lastEmittedAggTime = aggStartDatetime
    aggStartDatetime = aggStopDatetime


  return lastEmittedAggTime
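
The loop above walks fixed-size aggregation windows, starting one period after lastEmittedAggTime and stopping before stopDatetime (exclusive). A standalone sketch of that window enumeration (the helper name is hypothetical):

from datetime import datetime, timedelta

def aggWindows(lastEmittedAggTime, stopDatetime, periodSec):
  period = timedelta(seconds=periodSec)
  start = lastEmittedAggTime + period
  while start < stopDatetime:
    yield start, start + period
    start += period

windows = list(aggWindows(datetime(2015, 2, 20, 2, 0, 0),
                          datetime(2015, 2, 20, 2, 16, 0), 300))
assert [w[0].minute for w in windows] == [5, 10, 15]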
Code Example #20
def main():
    """
  NOTE: main also serves as entry point for "console script" generated by setup
  """
    logging_support.LoggingSupport.initService()

    options = _parseArgs()

    # See OP_MODE_ACTIVE, etc. in ApplicationConfig
    opMode = config.get("xignite_security_news_agent", "opmode")
    g_log.info("Starting: opMode=%s", opMode)

    aggSec = options.aggIntervalSec

    # Load metric specs from metric configuration
    metricSpecs = _loadNewsVolumeMetricSpecs()

    # Load securities from metric configuration
    securities = getAllMetricSecurities()
    g_log.info("Collecting headlines and releases for %s", securities)

    # Maps security symbols to the datetime.date of most recently-stored headlines
    lastSecurityHeadlineEndDates = _querySecurityNewsEndDates(
        schema.xigniteSecurityHeadline)

    # Map security symbols to the datetime.date of most recently-stored releases
    lastSecurityReleaseEndDates = _querySecurityNewsEndDates(
        schema.xigniteSecurityRelease)

    # Establish/retrieve datetime of last successfully-emitted metric data batch
    lastEmittedAggTime = metric_utils.establishLastEmittedSampleDatetime(
        key=_EMITTED_NEWS_VOLUME_SAMPLE_TRACKER_KEY, aggSec=aggSec)

    # Calculate next aggregation start time using lastEmittedAggTime as base
    lastAggStart = date_time_utils.epochFromNaiveUTCDatetime(
        lastEmittedAggTime)
    nextAggEnd = lastAggStart + (int(
        (time.time() - lastAggStart + aggSec - 1) / aggSec) * aggSec) + aggSec

    # Poll, store and emit samples
    pollingIntervalSec = aggSec / 2.0
    numPoolWorkers = max(_MIN_POOL_CONCURRENCY, multiprocessing.cpu_count())
    g_log.info("Entering main loop: pollingIntervalSec=%s; numPoolWorkers=%d",
               pollingIntervalSec, numPoolWorkers)
    pool = multiprocessing.Pool(processes=numPoolWorkers)
    try:
        while True:
            pollingIntervalEnd = time.time() + pollingIntervalSec

            # Retrieve all headlines and releases of interest
            headlineTasks = _generateTasks(securities,
                                           lastSecurityHeadlineEndDates,
                                           options.backfillDays,
                                           taskClass=_HistoricalHeadlinesTask,
                                           dryRun=options.dryRun)

            releaseTasks = _generateTasks(securities,
                                          lastSecurityReleaseEndDates,
                                          options.backfillDays,
                                          taskClass=_HistoricalReleasesTask,
                                          dryRun=options.dryRun)

            allTasks = itertools.chain(headlineTasks, releaseTasks)

            _processNewsCollectionTasks(
                pool=pool,
                tasksIter=allTasks,
                headlineEndDates=lastSecurityHeadlineEndDates,
                releaseEndDates=lastSecurityReleaseEndDates,
                options=options)

            # Aggregate and forward metric samples to htmengine's Metric Listener
            if time.time() >= nextAggEnd:
                if opMode == config.OP_MODE_ACTIVE and not options.dryRun:
                    lastEmittedAggTime = _forwardNewsVolumeMetrics(
                        metricSpecs=metricSpecs,
                        lastEmittedAggTime=lastEmittedAggTime,
                        stopDatetime=datetime.utcfromtimestamp(nextAggEnd),
                        periodSec=aggSec,
                        metricDestAddr=options.metricDestAddr)

                nextAggEnd += aggSec

            sleepSec = pollingIntervalEnd - time.time()
            if sleepSec > 0:
                g_log.info("Sleeping for %f seconds. zzzzzzzz...", sleepSec)
                time.sleep(sleepSec)
            elif sleepSec < 0:
                g_log.warning(
                    "Processing exceeded pollingInterval=%ss by overage=%ss",
                    pollingIntervalSec, -sleepSec)
    except KeyboardInterrupt:
        # Log with exception info to help debug deadlocks
        g_log.info("Observed KeyboardInterrupt", exc_info=True)
        pass
    finally:
        g_log.info("Closing multiprocessing.Pool")
        pool.close()

        g_log.info("Terminating multiprocessing.Pool")
        pool.terminate()
        g_log.info("Multiprocessing.Pool terminated")
Code Example #21
def _forwardNewsVolumeMetrics(metricSpecs, lastEmittedAggTime, stopDatetime,
                              periodSec, metricDestAddr):
    """ Query news volume metrics since the given last emitted timestamp through
  stopDatetime and forward them to htmengine's Metric Listener. Update the
  datetime of the last successfully-emitted news volume metric batch in the
  database.

  NOTE: forwarding will be aborted upon failure to connect to Metric Listener. In
    this case, an error will be logged, and the function will return the UTC
    timestamp of the last successfully-emitted sample aggregation interval. Once
    Metric Listener comes online, a subsequent call to this function will catch
    up by forwarding the stored samples since last successful emission.

  :param metricSpecs: a sequence of NewsVolumeMetricSpec objects corresponding to
    the metrics to be emitted
  :param lastEmittedAggTime: UTC datetime of last successfully-emitted sample
    batch
  :param stopDatetime: non-inclusive upper bound UTC datetime for forwarding
  :param periodSec: aggregation period in seconds
  :param metricDestAddr: two-tuple (metricDestHost, metricDestPort)
  :returns: UTC timestamp of the last successfully-emitted sample batch.
  :rtype: datetime.datetime
  """
    periodTimedelta = timedelta(seconds=periodSec)
    aggStartDatetime = lastEmittedAggTime + periodTimedelta
    while aggStartDatetime < stopDatetime:
        # Get News Volume metrics for one aggregation interval
        aggStopDatetime = aggStartDatetime + periodTimedelta
        symbolToNewsVolumeMap = defaultdict(
            int, _queryNewsVolumes(aggStartDatetime, aggStopDatetime))

        # Generate metric samples
        epochTimestamp = date_time_utils.epochFromNaiveUTCDatetime(
            aggStartDatetime)
        samples = tuple(
            dict(metricName=spec.metric,
                 value=symbolToNewsVolumeMap[spec.symbol],
                 epochTimestamp=epochTimestamp) for spec in metricSpecs)

        # Emit samples to Metric Listener
        try:
            with metric_utils.metricDataBatchWrite(log=g_log) as putSample:
                for sample in samples:
                    putSample(**sample)
        except Exception:
            g_log.exception(
                "Failure while emitting metric data for agg=%s "
                "containing numSamples=%d", aggStartDatetime, len(samples))
            return lastEmittedAggTime
        else:
            g_log.info("Forwarded numSamples=%d for agg=%s", len(samples),
                       aggStartDatetime)

        # Update db with last successfully-emitted datetime
        metric_utils.updateLastEmittedSampleDatetime(
            key=_EMITTED_NEWS_VOLUME_SAMPLE_TRACKER_KEY,
            sampleDatetime=aggStartDatetime)

        # Set up for next iteration
        lastEmittedAggTime = aggStartDatetime
        aggStartDatetime = aggStopDatetime

    return lastEmittedAggTime
Code Example #22
  def testPublishInstanceDataHourly(self, connectDynamoDB,
                                    _gracefulCreateTable):
    connectionMock = Mock(spec_set=DynamoDBConnection)
    connectionMock.update_item.side_effect = ResourceNotFoundException(
        400, "item not found")
    connectDynamoDB.return_value = connectionMock
    tableName = InstanceDataHourlyDynamoDBDefinition().tableName
    instanceName = "testName"
    condition = "attribute_not_exists(instance_id)"
    rows = [
        dict(
            rowid=99,
            ts=epochFromNaiveUTCDatetime(datetime(2015, 2, 20, 0, 46, 28)),
            value=10305.0,
            rawAnomaly=0.275,
            anomaly=0.999840891
        ),
        dict(
            rowid=100,
            ts=epochFromNaiveUTCDatetime(datetime(2015, 2, 20, 0, 51, 28)),
            value=9305.0,
            rawAnomaly=0.975,
            anomaly=0.999990891
        ),
        dict(
            rowid=101,
            ts=epochFromNaiveUTCDatetime(datetime(2015, 2, 20, 0, 56, 20)),
            value=6111.0,
            rawAnomaly=0.775,
            anomaly=0.999940891
        ),
        dict(
            rowid=102,
            ts=epochFromNaiveUTCDatetime(datetime(2015, 2, 20, 1, 1, 38)),
            value=7092.0,
            rawAnomaly=0.775,
            anomaly=0.999640891
        )
    ]

    service = DynamoDBService()

    # Run the function under test
    service._publishInstanceDataHourly(instanceName, "TwitterVolume", rows)

    # Validate results
    self.assertEqual(connectionMock.update_item.call_count, 2)
    self.assertEqual(connectionMock.put_item.call_count, 2)
    calls = connectionMock.put_item.call_args_list

    kwargs0 = calls[0][1]
    item0 = kwargs0["item"]
    self.assertDictEqual(item0["instance_id"], {"S": instanceName})
    self.assertEqual(item0["date_hour"], {"S": "2015-02-20T00"})
    self.assertEqual(item0["date"], {"S": "2015-02-20"})
    self.assertEqual(item0["hour"], {"S": "00"})
    self.assertDictEqual(item0["anomaly_score"]["M"]["TwitterVolume"],
                         {"N": "0.99999"})
    self.assertEqual(kwargs0["condition_expression"], condition)

    kwargs1 = calls[1][1]
    item1 = kwargs1["item"]
    self.assertEqual(item1["instance_id"], {"S": instanceName})
    self.assertEqual(item1["date_hour"], {"S": "2015-02-20T01"})
    self.assertEqual(item1["date"], {"S": "2015-02-20"})
    self.assertEqual(item1["hour"], {"S": "01"})
    self.assertDictEqual(item1["anomaly_score"]["M"]["TwitterVolume"],
                         {"N": "0.99964"})
    self.assertEqual(kwargs1["condition_expression"], condition)
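The hourly roll-up asserted above collapses each epoch timestamp into its UTC "<date>T<hour>" bucket and keeps the maximum anomaly score per bucket (the service stores that score in DynamoDB as a shortened decimal string, e.g. "0.99999" above). A self-contained sketch of that bucketing over the same four rows:

import calendar
from datetime import datetime

def epochUTC(*args):
  return calendar.timegm(datetime(*args).utctimetuple())

rowsSketch = [
  dict(ts=epochUTC(2015, 2, 20, 0, 46, 28), anomaly=0.999840891),
  dict(ts=epochUTC(2015, 2, 20, 0, 51, 28), anomaly=0.999990891),
  dict(ts=epochUTC(2015, 2, 20, 0, 56, 20), anomaly=0.999940891),
  dict(ts=epochUTC(2015, 2, 20, 1, 1, 38), anomaly=0.999640891),
]

buckets = {}
for row in rowsSketch:
  key = datetime.utcfromtimestamp(row["ts"]).strftime("%Y-%m-%dT%H")
  buckets[key] = max(buckets.get(key, 0.0), row["anomaly"])

assert buckets == {"2015-02-20T00": 0.999990891, "2015-02-20T01": 0.999640891}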
Code Example #23
def main():
    """
  NOTE: main also serves as entry point for "console script" generated by setup
  """
    logging_support.LoggingSupport().initTool()

    try:
        options = _parseArgs()

        g_log.info("Verifying that agents are in hot_standby mode")
        for section in config.sections():
            try:
                assert config.get(section, "opmode") == ApplicationConfig.OP_MODE_HOT_STANDBY
            except Exception, e:
                raise

        g_log.info("Verifying that the old symbol has been removed from the " "metrics configuration")
        for stockData in metric_utils.getMetricsConfiguration().itervalues():
            assert stockData["symbol"] != options.old_symbol

        if options.twitter and (not options.stocks):
            g_log.info(
                "Migrating ONLY twitter data from old-symbol=%s " "to new-symbol=%s",
                options.old_symbol,
                options.new_symbol,
            )
        elif options.stocks and (not options.twitter):
            g_log.info(
                "Migrating ONLY xignite stock data from old-symbol=%s " "to new-symbol=%s",
                options.old_symbol,
                options.new_symbol,
            )
            raise NotImplementedError
        else:
            g_log.info(
                "Migrating BOTH twitter and xignite stock data from " "old-symbol=%s to new-symbol=%s",
                options.old_symbol,
                options.new_symbol,
            )
            raise NotImplementedError

        oldSymbolTweetPrefix = "TWITTER.TWEET.HANDLE.{symbol}.".format(symbol=options.old_symbol)
        newSymbolTweetPrefix = "TWITTER.TWEET.HANDLE.{symbol}.".format(symbol=options.new_symbol)
        oldSymbolTweetMetricsList = []

        with collectorsdb.engineFactory().begin() as conn:

            g_log.info("Renaming metrics to new symbol")
            if options.twitter:
                oldSymbolTweetsQuery = sql.select([tweetSamplesSchema]).where(
                    tweetSamplesSchema.c.metric.contains(oldSymbolTweetPrefix)
                )
                oldSymbolTweets = conn.execute(oldSymbolTweetsQuery)
                for tweetSample in oldSymbolTweets:
                    newMetricName = "{newPrefix}{metric}".format(
                        newPrefix=newSymbolTweetPrefix, metric=tweetSample.metric[len(oldSymbolTweetPrefix) :]
                    )
                    if tweetSample.metric not in oldSymbolTweetMetricsList:
                        oldSymbolTweetMetricsList.append(tweetSample.metric)

                    updateSampleQuery = (
                        tweetSamplesSchema.update()
                        .where(tweetSamplesSchema.c.seq == tweetSample.seq)
                        .values(metric=newMetricName)
                    )

                    conn.execute(updateSampleQuery)

            g_log.info("Forwarding new twitter metric data to Taurus engine...")
            if options.twitter:
                oldestRecordTs = conn.execute(
                    sql.select([tweetSamplesSchema.c.agg_ts], order_by=tweetSamplesSchema.c.agg_ts.asc())
                ).first()[0]
                lastEmittedAggTime = metric_utils.establishLastEmittedSampleDatetime(
                    key=_EMITTED_TWEET_VOLUME_SAMPLE_TRACKER_KEY, aggSec=options.aggPeriod
                )
                aggOffset = (
                    math.ceil(
                        (epochFromNaiveUTCDatetime(lastEmittedAggTime) - epochFromNaiveUTCDatetime(oldestRecordTs))
                        / options.aggPeriod
                    )
                    * options.aggPeriod
                )
                aggStartDatetime = (
                    lastEmittedAggTime - timedelta(seconds=aggOffset) - timedelta(seconds=options.aggPeriod)
                )

                metric_utils.updateLastEmittedSampleDatetime(
                    key=_EMITTED_TWEET_VOLUME_SAMPLE_TRACKER_KEY, sampleDatetime=aggStartDatetime
                )

                MetricDataForwarder.runInThread(
                    metricSpecs=loadMetricSpecs(),
                    aggSec=options.aggPeriod,
                    symbolList=[options.new_symbol],
                    forwardOnlyBacklog=True,
                )

                metric_utils.updateLastEmittedSampleDatetime(
                    key=_EMITTED_TWEET_VOLUME_SAMPLE_TRACKER_KEY, sampleDatetime=lastEmittedAggTime
                )

        g_log.info("Forwarding metrics to dynamodb using new symbol...")
        if options.twitter:
            migrate_tweets_to_dynamodb.main(symbolList=[options.new_symbol])

        g_log.info("Unmonitoring and deleting existing metrics associated with " "symbol=%s", options.old_symbol)
        oldModels = metric_utils.getSymbolModels(options.htmServer, options.apikey, options.old_symbol)
        for model in oldModels:
            metric_utils.unmonitorMetric(options.htmServer, options.apikey, model.uid)
            metric_utils.deleteMetric(options.htmServer, options.apikey, model.name)
Code Example #24
    def testPathwayToDynamoDB(self):
        """ Test metric data pathway to dynamodb
    """

        metricName = "TEST." + "".join(random.sample(string.ascii_letters, 16))

        nativeMetric = {
            "modelParams": {
                "minResolution": 0.2,
                "min": 0.0,
                "max": 10000.0,
            },
            "datasource": "custom",
            "metricSpec": {
                "metric": metricName,
                "resource": "Test",
                "userInfo": {
                    "symbol": "TEST",
                    "metricType": "TwitterVolume",
                    "metricTypeName": "Twitter Volume",
                }
            }
        }
        metricName = nativeMetric["metricSpec"]["metric"]
        instanceName = nativeMetric["metricSpec"]["resource"]
        userInfo = nativeMetric["metricSpec"]["userInfo"]

        now = datetime.datetime.utcnow().replace(minute=0,
                                                 second=0,
                                                 microsecond=0)

        data = [
            (5000.0, now - datetime.timedelta(minutes=10)),
            (6000.0, now - datetime.timedelta(minutes=5)),
            (7000.0, now),
        ]

        # We'll be explicitly deleting the metric below, but we need to add a
        # cleanup step that runs in case there is some other failure that prevents
        # that part of the test from being reached.

        def gracefulDelete():
            try:
                self._deleteMetric(metricName)
            except ObjectNotFoundError:
                pass

        self.addCleanup(gracefulDelete)

        # Add custom metric data
        sock = socket.socket()
        sock.connect(("localhost", self.plaintextPort))
        for metricValue, ts in data:
            sock.sendall(
                "%s %r %s\n" %
                (metricName, metricValue, epochFromNaiveUTCDatetime(ts)))

        self.gracefullyCloseSocket(sock)

        uid = self.checkMetricCreated(metricName)

        # Save the uid for later
        LOGGER.info("Metric %s has uid: %s", metricName, uid)

        # Send model creation request
        model = self._createModel(nativeMetric)
        parameters = json.loads(model.parameters)
        self.assertEqual(parameters["metricSpec"]["userInfo"], userInfo)

        for _ in xrange(60):
            with self.engine.begin() as conn:
                metric = repository.getMetric(conn, uid)

            if metric.status == MetricStatus.ACTIVE:
                break
            LOGGER.info("Model=%s not ready. Sleeping 1 second...", uid)
            time.sleep(1)
        else:
            self.fail("Model results not available within 5 minutes")

        # Check that the data all got processed
        self.checkModelResultsSize(uid, 3)

        # Now check that the data was published to dynamodb...
        dynamodb = DynamoDBService.connectDynamoDB()

        metricTable = Table(MetricDynamoDBDefinition().tableName,
                            connection=dynamodb)
        metricItem = metricTable.lookup(uid)
        self.assertEqual(metricItem["uid"], uid)
        self.assertEqual(metricItem["name"], metricName)
        self.assertEqual(metricItem["metricType"], "TwitterVolume")
        self.assertEqual(metricItem["metricTypeName"], "Twitter Volume")
        self.assertEqual(metricItem["symbol"], "TEST")

        metricDataTable = Table(MetricDataDynamoDBDefinition().tableName,
                                connection=dynamodb)
        instanceDataAnomalyScores = {}
        for metricValue, ts in data:
            metricDataItem = _RETRY_ON_ITEM_NOT_FOUND_DYNAMODB_ERROR(
                metricDataTable.lookup)(uid, ts.isoformat())
            # There is no server-side cleanup for metric data, so remove it here for
            # now to avoid accumulating test data
            self.addCleanup(metricDataItem.delete)
            self.assertEqual(metricValue, metricDataItem["metric_value"])
            dt = datetime.datetime.strptime(metricDataItem["timestamp"],
                                            "%Y-%m-%dT%H:%M:%S")
            self.assertEqual(ts, dt)
            ts = ts.replace(minute=0, second=0, microsecond=0)
            date = ts.strftime("%Y-%m-%d")
            hour = ts.strftime("%H")
            key = (date, hour)
            maxVal = instanceDataAnomalyScores.get(key, 0.0)
            instanceDataAnomalyScores[key] = max(
                maxVal, metricDataItem["anomaly_score"])

        # And check that the aggregated instance data is updated
        instanceDataHourlyTable = Table(
            InstanceDataHourlyDynamoDBDefinition().tableName,
            connection=dynamodb)
        for key, anomalyScore in instanceDataAnomalyScores.iteritems():
            date, hour = key
            instanceDataHourlyItem = _RETRY_ON_ITEM_NOT_FOUND_DYNAMODB_ERROR(
                instanceDataHourlyTable.lookup)(instanceName,
                                                "%sT%s" % (date, hour))
            self.addCleanup(instanceDataHourlyItem.delete)
            self.assertAlmostEqual(
                anomalyScore,
                float(
                    instanceDataHourlyItem["anomaly_score"]["TwitterVolume"]))
            self.assertEqual(date, instanceDataHourlyItem["date"])
            self.assertEqual(hour, instanceDataHourlyItem["hour"])

        # Now send some twitter data and validate that it made it to dynamodb

        twitterData = [{
            "metric_name": metricName,
            "tweet_uid": uid,
            "created_at": "2015-02-19T19:43:24.870109",
            "agg_ts": "2015-02-19T19:43:24.870118",
            "text": "Tweet text",
            "userid": "10",
            "username": "******",
            "retweet_count": "0"
        }]

        with MessageBusConnector() as messageBus:
            messageBus.publishExg(
                exchange=self.config.get("non_metric_data", "exchange_name"),
                routingKey=(
                    self.config.get("non_metric_data", "exchange_name") +
                    ".twitter"),
                body=json.dumps(twitterData))

        metricTweetsTable = Table(MetricTweetsDynamoDBDefinition().tableName,
                                  connection=dynamodb)
        metricTweetItem = metricTweetsTable.lookup(
            "-".join((metricName, uid)), "2015-02-19T19:43:24.870118")
        # There is no server-side cleanup for tweet data, so remove it here for
        # now to avoid accumulating test data
        self.addCleanup(metricTweetItem.delete)
        self.assertEqual(metricTweetItem["username"],
                         twitterData[0]["username"])
        self.assertEqual(metricTweetItem["tweet_uid"],
                         twitterData[0]["tweet_uid"])
        self.assertEqual(metricTweetItem["created_at"],
                         twitterData[0]["created_at"])
        self.assertEqual(metricTweetItem["agg_ts"], twitterData[0]["agg_ts"])
        self.assertEqual(metricTweetItem["text"], twitterData[0]["text"])
        self.assertEqual(metricTweetItem["userid"], twitterData[0]["userid"])
        self.assertEqual(metricTweetItem["username"],
                         twitterData[0]["username"])
        self.assertEqual(metricTweetItem["retweet_count"],
                         twitterData[0]["retweet_count"])

        queryResult = metricTweetsTable.query_2(
            metric_name__eq=metricName,
            agg_ts__eq=twitterData[0]["agg_ts"],
            index="taurus.metric_data-metric_name_index")
        queriedMetricTweetItem = next(queryResult)

        self.assertEqual(queriedMetricTweetItem["username"],
                         twitterData[0]["username"])
        self.assertEqual(queriedMetricTweetItem["tweet_uid"],
                         twitterData[0]["tweet_uid"])
        self.assertEqual(queriedMetricTweetItem["created_at"],
                         twitterData[0]["created_at"])
        self.assertEqual(queriedMetricTweetItem["agg_ts"],
                         twitterData[0]["agg_ts"])
        self.assertEqual(queriedMetricTweetItem["text"],
                         twitterData[0]["text"])
        self.assertEqual(queriedMetricTweetItem["userid"],
                         twitterData[0]["userid"])
        self.assertEqual(queriedMetricTweetItem["username"],
                         twitterData[0]["username"])
        self.assertEqual(queriedMetricTweetItem["retweet_count"],
                         twitterData[0]["retweet_count"])

        # Delete metric and ensure metric is deleted from dynamodb, too
        self._deleteMetric(metricName)

        for _ in xrange(60):
            time.sleep(1)
            try:
                metricItem = metricTable.lookup(uid)
            except ItemNotFound as err:
                break
        else:
            self.fail("Metric not deleted from dynamodb")
Code Example #25
def replayMetricDataToModelResultsExchange(messageBus, chunksize=DEFAULT_CHUNKSIZE):
    """ Reads metric data and synthesizes model inference result messages to the
  "model results" exchange, simulating the end result of the AnomalyService.
  This will afford the dynamodb service an opportunity to backfill older data
  :param messageBus: message bus connection
  :type messageBus: nta.utils.message_bus_connector.MessageBusConnector
  """
    engine = repository.engineFactory()

    twoWeeksAgo = datetime.datetime.utcnow() - datetime.timedelta(days=14)

    # Properties for publishing model command results on RabbitMQ exchange
    # (same as AnomalyService)
    modelCommandResultProperties = MessageProperties(
        deliveryMode=amqp.constants.AMQPDeliveryModes.PERSISTENT_MESSAGE, headers=dict(dataType="model-cmd-result")
    )

    # Properties for publishing model inference results on RabbitMQ exchange
    # (same as AnomalyService)
    modelInferenceResultProperties = MessageProperties(deliveryMode=amqp.constants.AMQPDeliveryModes.PERSISTENT_MESSAGE)

    g_log.info("Getting metric data...")
    result = repository.getMetricData(
        engine, score=0, fromTimestamp=twoWeeksAgo, sort=[metric_data.c.uid, metric_data.c.rowid.asc()]
    )
    numMetricDataRows = result.rowcount
    g_log.info("Got %d rows", numMetricDataRows)

    numModels = 0
    for uid, group in groupby(result, key=lambda x: x.uid):

        @retryOnTransientErrors
        def _getMetric():
            return repository.getMetric(engine, uid)

        metricObj = _getMetric()

        # Send defineModel command to ensure that the metric table entry is created
        numModels += 1
        modelCommandResult = {
            "status": htmengineerrno.SUCCESS,
            "method": "defineModel",
            "modelId": uid,
            "modelInfo": {
                "metricName": metricObj.name,
                "resource": metricObj.server,
                "modelSpec": json.loads(metricObj.parameters),
            },
        }

        # Serialize
        payload = anomaly_service.AnomalyService._serializeModelResult(modelCommandResult)

        g_log.info("Sending `defineModel` command: %r", repr(modelCommandResult))
        messageBus.publishExg(
            exchange=config.get("metric_streamer", "results_exchange_name"),
            routingKey="",
            body=payload,
            properties=modelCommandResultProperties,
        )

        metricInfo = dict(
            uid=metricObj.uid,
            name=metricObj.name,
            description=metricObj.description,
            resource=metricObj.server,
            location=metricObj.location,
            datasource=metricObj.datasource,
            spec=json.loads(metricObj.parameters)["metricSpec"],
        )

        args = [iter(group)] * chunksize
        for num, chunk in enumerate(izip_longest(fillvalue=None, *args)):
            # Create
            inferenceResultsMessage = dict(
                metric=metricInfo,
                results=[
                    dict(
                        rowid=row.rowid,
                        ts=epochFromNaiveUTCDatetime(row.timestamp),
                        value=row.metric_value,
                        rawAnomaly=row.raw_anomaly_score,
                        anomaly=row.anomaly_score,
                    )
                    for row in chunk
                    if row is not None
                ],
            )

            # Serialize
            payload = anomaly_service.AnomalyService._serializeModelResult(inferenceResultsMessage)

            g_log.info(
                "uid=%s chunk=%d rows=%d payload_size=%d bytes from %s to %s",
                uid,
                num,
                len(inferenceResultsMessage["results"]),
                sys.getsizeof(payload),
                datetime.datetime.utcfromtimestamp(inferenceResultsMessage["results"][0]["ts"]),
                datetime.datetime.utcfromtimestamp(inferenceResultsMessage["results"][-1]["ts"]),
            )

            messageBus.publishExg(
                exchange=config.get("metric_streamer", "results_exchange_name"),
                routingKey="",
                body=payload,
                properties=modelInferenceResultProperties,
            )

    g_log.info("Done! numMetricDataRows=%d; numModels=%d", numMetricDataRows, numModels)
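
The chunking above (args = [iter(group)] * chunksize passed to izip_longest) is the standard "grouper" idiom: zipping N references to the same iterator yields fixed-size chunks padded with None, which is why the results comprehension filters on "if row is not None". A self-contained sketch:

try:
  from itertools import izip_longest  # Python 2, as in the script above
except ImportError:
  from itertools import zip_longest as izip_longest  # Python 3 fallback

def grouper(iterable, chunksize):
  args = [iter(iterable)] * chunksize
  return izip_longest(fillvalue=None, *args)

assert list(grouper(range(5), 2)) == [(0, 1), (2, 3), (4, None)]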
Code Example #26
    def testModelResultHandlerSkipsStaleBatch(self, _amqpUtilsMock,
                                              deserializeModelResult,
                                              connectDynamoDB,
                                              _gracefulCreateTable):
        """ Given a stale batch of model inference results, verify that it isn't
    saved to DynamoDB
    """

        # We're going to mostly mock out all of the arguments to
        # DynamoDBService.messageHandler() since it is normally called by amqp lib.
        # Then simulate the process of handling an inbound batch of model inference
        # results and assert that the appropriate put_item() calls are made at the
        # other end.

        message = amqp.messages.ConsumerMessage(
            body=Mock(),
            properties=Mock(headers=dict()),
            methodInfo=amqp.messages.MessageDeliveryInfo(consumerTag=Mock(),
                                                         deliveryTag=Mock(),
                                                         redelivered=False,
                                                         exchange=Mock(),
                                                         routingKey=""),
            ackImpl=Mock(),
            nackImpl=Mock())

        # We will have to bypass the normal serialize/deserialize phases to avoid
        # dependency on sqlalchemy rowproxy.  Instead, we'll just mock out the
        # AnomalyService.deserializeModelResult() call, returning an object that
        # approximates a batch of model inference results as much as possible

        ts = epochFromNaiveUTCDatetime(
            datetime.utcnow().replace(microsecond=0) -
            timedelta(days=DynamoDBService._FRESH_DATA_THRESHOLD_DAYS + 1))

        resultRow = dict(rowid=4790,
                         ts=ts,
                         value=9305.0,
                         rawAnomaly=0.775,
                         anomaly=0.999840891)

        metricId = "3b035a5916994f2bb950f5717138f94b"

        deserializeModelResult.return_value = dict(metric=dict(
            uid=metricId,
            name="XIGNITE.AGN.VOLUME",
            description="XIGNITE.AGN.VOLUME",
            resource="Resource-of-XIGNITE.AGN.VOLUME",
            location="",
            datasource="custom",
            spec=dict(userInfo=dict(symbol="AGN",
                                    metricType="StockVolume",
                                    metricTypeName="Stock Volume"))),
                                                   results=[resultRow])

        service = DynamoDBService()
        publishMetricDataPatch = patch.object(
            service, "_publishMetricData", spec_set=service._publishMetricData)
        publishInstancePatch = patch.object(
            service,
            "_publishInstanceDataHourly",
            spec_set=service._publishInstanceDataHourly)
        with publishMetricDataPatch as publishMetricDataMock, \
            publishInstancePatch as publishInstanceMock:
            service.messageHandler(message)

            deserializeModelResult.assert_called_once_with(message.body)
            self.assertEqual(publishMetricDataMock.call_count, 0)
            self.assertEqual(publishInstanceMock.call_count, 0)
Code Example #27
    def testPublishInstanceDataHourly(self, connectDynamoDB,
                                      _gracefulCreateTable):
        connectionMock = Mock(spec_set=DynamoDBConnection)
        connectionMock.update_item.side_effect = ResourceNotFoundException(
            400, "item not found")
        connectDynamoDB.return_value = connectionMock
        tableName = InstanceDataHourlyDynamoDBDefinition().tableName
        instanceName = "testName"
        condition = "attribute_not_exists(instance_id)"
        rows = [
            dict(rowid=99,
                 ts=epochFromNaiveUTCDatetime(datetime(2015, 2, 20, 0, 46,
                                                       28)),
                 value=10305.0,
                 rawAnomaly=0.275,
                 anomaly=0.999840891),
            dict(rowid=100,
                 ts=epochFromNaiveUTCDatetime(datetime(2015, 2, 20, 0, 51,
                                                       28)),
                 value=9305.0,
                 rawAnomaly=0.975,
                 anomaly=0.999990891),
            dict(rowid=101,
                 ts=epochFromNaiveUTCDatetime(datetime(2015, 2, 20, 0, 56,
                                                       20)),
                 value=6111.0,
                 rawAnomaly=0.775,
                 anomaly=0.999940891),
            dict(rowid=102,
                 ts=epochFromNaiveUTCDatetime(datetime(2015, 2, 20, 1, 1, 38)),
                 value=7092.0,
                 rawAnomaly=0.775,
                 anomaly=0.999640891)
        ]

        service = DynamoDBService()

        # Run the function under test
        service._publishInstanceDataHourly(instanceName, "TwitterVolume", rows)

        # Validate results
        self.assertEqual(connectionMock.update_item.call_count, 2)
        self.assertEqual(connectionMock.put_item.call_count, 2)
        calls = connectionMock.put_item.call_args_list

        kwargs0 = calls[0][1]
        item0 = kwargs0["item"]
        self.assertDictEqual(item0["instance_id"], {"S": instanceName})
        self.assertEqual(item0["date_hour"], {"S": "2015-02-20T00"})
        self.assertEqual(item0["date"], {"S": "2015-02-20"})
        self.assertEqual(item0["hour"], {"S": "00"})
        self.assertDictEqual(item0["anomaly_score"]["M"]["TwitterVolume"],
                             {"N": "0.99999"})
        self.assertEqual(kwargs0["condition_expression"], condition)

        kwargs1 = calls[1][1]
        item1 = kwargs1["item"]
        self.assertEqual(item1["instance_id"], {"S": instanceName})
        self.assertEqual(item1["date_hour"], {"S": "2015-02-20T01"})
        self.assertEqual(item1["date"], {"S": "2015-02-20"})
        self.assertEqual(item1["hour"], {"S": "01"})
        self.assertDictEqual(item1["anomaly_score"]["M"]["TwitterVolume"],
                             {"N": "0.99964"})
        self.assertEqual(kwargs1["condition_expression"], condition)
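
The date_hour/date/hour attributes asserted on each item are string projections of the row timestamp truncated to the hour, in UTC. A minimal sketch of that derivation follows; the helper name is illustrative and not part of the service API.

from datetime import datetime

def hourlyBucketKeys(epochTs):
    """Derive the (date_hour, date, hour) strings for an epoch timestamp, in UTC."""
    dt = datetime.utcfromtimestamp(epochTs)
    date = dt.strftime("%Y-%m-%d")
    hour = dt.strftime("%H")
    return ("%sT%s" % (date, hour), date, hour)

# For the first row in the test above (2015-02-20 00:46:28 UTC) this yields
# ("2015-02-20T00", "2015-02-20", "00"), matching the assertions on item0.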
Code Example #28
def main():
  """
  NOTE: main also serves as entry point for "console script" generated by setup
  """
  logging_support.LoggingSupport.initService()

  options = _parseArgs()

  # See OP_MODE_ACTIVE, etc. in ApplicationConfig
  opMode = config.get("xignite_security_news_agent", "opmode")
  g_log.info("Starting: opMode=%s", opMode)

  aggSec = options.aggIntervalSec

  # Load metric specs from metric configuration
  metricSpecs = _loadNewsVolumeMetricSpecs()

  # Load securities from metric configuration
  securities = getAllMetricSecurities()
  g_log.info("Collecting headlines and releases for %s", securities)

  # Maps security symbols to the datetime.date of most recently-stored headlines
  lastSecurityHeadlineEndDates = _querySecurityNewsEndDates(
    schema.xigniteSecurityHeadline)

  # Map security symbols to the datetime.date of most recently-stored releases
  lastSecurityReleaseEndDates = _querySecurityNewsEndDates(
    schema.xigniteSecurityRelease)

  # Establish/retrieve datetime of last successfully-emitted metric data batch
  lastEmittedAggTime = metric_utils.establishLastEmittedSampleDatetime(
    key=_EMITTED_NEWS_VOLUME_SAMPLE_TRACKER_KEY,
    aggSec=aggSec)

  # Calculate the end of the next aggregation window, using lastEmittedAggTime
  # as the base: round the elapsed time up to a whole number of aggregation
  # periods, then step one more period forward
  lastAggStart = date_time_utils.epochFromNaiveUTCDatetime(lastEmittedAggTime)
  nextAggEnd = lastAggStart + (
    int((time.time() - lastAggStart + aggSec - 1) / aggSec) * aggSec) + aggSec

  # Poll, store and emit samples
  pollingIntervalSec = aggSec / 2.0
  numPoolWorkers = max(_MIN_POOL_CONCURRENCY, multiprocessing.cpu_count())
  g_log.info("Entering main loop: pollingIntervalSec=%s; numPoolWorkers=%d",
             pollingIntervalSec, numPoolWorkers)
  pool = multiprocessing.Pool(processes=numPoolWorkers)
  try:
    while True:
      pollingIntervalEnd = time.time() + pollingIntervalSec

      # Retrieve all headlines and releases of interest
      headlineTasks = _generateTasks(
        securities,
        lastSecurityHeadlineEndDates,
        options.backfillDays,
        taskClass=_HistoricalHeadlinesTask,
        dryRun=options.dryRun)

      releaseTasks = _generateTasks(
        securities,
        lastSecurityReleaseEndDates,
        options.backfillDays,
        taskClass=_HistoricalReleasesTask,
        dryRun=options.dryRun)

      allTasks = itertools.chain(headlineTasks, releaseTasks)

      _processNewsCollectionTasks(pool=pool,
                                  tasksIter=allTasks,
                                  headlineEndDates=lastSecurityHeadlineEndDates,
                                  releaseEndDates=lastSecurityReleaseEndDates,
                                  options=options)

      # Aggregate and forward metric samples to htmengine's Metric Listener
      if time.time() >= nextAggEnd:
        if opMode == config.OP_MODE_ACTIVE and not options.dryRun:
          lastEmittedAggTime = _forwardNewsVolumeMetrics(
            metricSpecs=metricSpecs,
            lastEmittedAggTime=lastEmittedAggTime,
            stopDatetime=datetime.utcfromtimestamp(nextAggEnd),
            periodSec=aggSec,
            metricDestAddr=options.metricDestAddr)

        nextAggEnd += aggSec

      sleepSec = pollingIntervalEnd - time.time()
      if sleepSec > 0:
        g_log.info("Sleeping for %f seconds. zzzzzzzz...", sleepSec)
        time.sleep(sleepSec)
      elif sleepSec < 0:
        g_log.warning("Processing exceeded pollingInterval=%ss by overage=%ss",
                      pollingIntervalSec, -sleepSec)
  except KeyboardInterrupt:
    # Log with exception info to help debug deadlocks
    g_log.info("Observed KeyboardInterrupt", exc_info=True)
    pass
  finally:
    g_log.info("Closing multiprocessing.Pool")
    pool.close()

    g_log.info("Terminating multiprocessing.Pool")
    pool.terminate()
    g_log.info("Multiprocessing.Pool terminated")
Code Example #29
  def testPathwayToDynamoDB(self):
    """ Test metric data pathway to dynamodb
    """

    metricName = "TEST." + "".join(random.sample(string.ascii_letters, 16))

    nativeMetric = {
      "modelParams": {
        "minResolution": 0.2,
        "min": 0.0,
        "max": 10000.0,
      },
      "datasource": "custom",
      "metricSpec": {
        "metric": metricName,
        "resource": "Test",
        "userInfo": {
          "symbol": "TEST",
          "metricType": "TwitterVolume",
          "metricTypeName": "Twitter Volume",
        }
      }
    }
    metricName = nativeMetric["metricSpec"]["metric"]
    instanceName = nativeMetric["metricSpec"]["resource"]
    userInfo = nativeMetric["metricSpec"]["userInfo"]

    now = datetime.datetime.utcnow().replace(minute=0, second=0, microsecond=0)

    data = [
      (5000.0, now - datetime.timedelta(minutes=10)),
      (6000.0, now - datetime.timedelta(minutes=5)),
      (7000.0, now),
    ]

    # We'll be explicitly deleting the metric below, but we need to add a
    # cleanup step that runs in case there is some other failure that prevents
    # that part of the test from being reached.

    def gracefulDelete():
      try:
        self._deleteMetric(metricName)
      except ObjectNotFoundError:
        pass

    self.addCleanup(gracefulDelete)

    # Add custom metric data
    sock = socket.socket()
    sock.connect(("localhost", self.plaintextPort))
    for metricValue, ts in data:
      sock.sendall("%s %r %s\n" % (metricName,
                                   metricValue,
                                   epochFromNaiveUTCDatetime(ts)))

    self.gracefullyCloseSocket(sock)

    uid = self.checkMetricCreated(metricName)

    # Save the uid for later
    LOGGER.info("Metric %s has uid: %s", metricName, uid)

    # Send model creation request
    model = self._createModel(nativeMetric)
    parameters = json.loads(model.parameters)
    self.assertEqual(parameters["metricSpec"]["userInfo"], userInfo)

    for _ in xrange(60):
      with self.engine.begin() as conn:
        metric = repository.getMetric(conn, uid)

      if metric.status == MetricStatus.ACTIVE:
        break
      LOGGER.info("Model=%s not ready. Sleeping 1 second...", uid)
      time.sleep(1)
    else:
      self.fail("Model results not available within 5 minutes")

    # Check that the data all got processed
    self.checkModelResultsSize(uid, 3)

    # Now check that the data was published to dynamodb...
    dynamodb = DynamoDBService.connectDynamoDB()

    metricTable = Table(MetricDynamoDBDefinition().tableName,
                        connection=dynamodb)
    metricItem = metricTable.lookup(uid)
    self.assertEqual(metricItem["uid"], uid)
    self.assertEqual(metricItem["name"], metricName)
    self.assertEqual(metricItem["metricType"], "TwitterVolume")
    self.assertEqual(metricItem["metricTypeName"], "Twitter Volume")
    self.assertEqual(metricItem["symbol"], "TEST")

    metricDataTable = Table(MetricDataDynamoDBDefinition().tableName,
                            connection=dynamodb)
    instanceDataAnomalyScores = {}
    for metricValue, ts in data:
      metricDataItem = _RETRY_ON_ITEM_NOT_FOUND_DYNAMODB_ERROR(
        metricDataTable.lookup
      )(uid, ts.isoformat())
      # There is no server-side cleanup for metric data, so remove it here for
      # now to avoid accumulating test data
      self.addCleanup(metricDataItem.delete)
      self.assertEqual(metricValue, metricDataItem["metric_value"])
      dt = datetime.datetime.strptime(metricDataItem["timestamp"],
                                      "%Y-%m-%dT%H:%M:%S")
      self.assertEqual(ts, dt)
      ts = ts.replace(minute=0, second=0, microsecond=0)
      date = ts.strftime("%Y-%m-%d")
      hour = ts.strftime("%H")
      key = (date, hour)
      maxVal = instanceDataAnomalyScores.get(key, 0.0)
      instanceDataAnomalyScores[key] = max(
          maxVal, metricDataItem["anomaly_score"])

    # And check that the aggregated instance data is updated
    instanceDataHourlyTable = Table(
        InstanceDataHourlyDynamoDBDefinition().tableName, connection=dynamodb)
    for key, anomalyScore in instanceDataAnomalyScores.iteritems():
      date, hour = key
      instanceDataHourlyItem = _RETRY_ON_ITEM_NOT_FOUND_DYNAMODB_ERROR(
        instanceDataHourlyTable.lookup
      )(instanceName, "%sT%s" % (date, hour))
      self.addCleanup(instanceDataHourlyItem.delete)
      self.assertAlmostEqual(
          anomalyScore,
          float(instanceDataHourlyItem["anomaly_score"]["TwitterVolume"]))
      self.assertEqual(date, instanceDataHourlyItem["date"])
      self.assertEqual(hour, instanceDataHourlyItem["hour"])

    # Now send some twitter data and validate that it made it to dynamodb

    twitterData = [
      {
        "metric_name": metricName,
        "tweet_uid": uid,
        "created_at": "2015-02-19T19:43:24.870109",
        "agg_ts": "2015-02-19T19:43:24.870118",
        "text": "Tweet text",
        "userid": "10",
        "username": "******",
        "retweet_count": "0"
      }
    ]

    with MessageBusConnector() as messageBus:
      messageBus.publishExg(
        exchange=self.config.get("non_metric_data", "exchange_name"),
        routingKey=(
          self.config.get("non_metric_data", "exchange_name") + ".twitter"),
        body=json.dumps(twitterData)
      )


    metricTweetsTable = Table(MetricTweetsDynamoDBDefinition().tableName,
                              connection=dynamodb)
    for _ in range(30):
      try:
        metricTweetItem = metricTweetsTable.lookup(
          twitterData[0]["text"],
          twitterData[0]["agg_ts"]
        )
        break
      except ItemNotFound:
        # LOL eventual consistency
        time.sleep(1)
        continue
    else:
      self.fail("Tweet item did not appear in dynamodb within ~30 seconds")
    # There is no server-side cleanup for tweet data, so remove it here for
    # now to avoid accumulating test data
    self.addCleanup(metricTweetItem.delete)
    self.assertEqual(metricTweetItem["username"], twitterData[0]["username"])
    self.assertEqual(metricTweetItem["tweet_uid"], twitterData[0]["tweet_uid"])
    self.assertEqual(metricTweetItem["created_at"], twitterData[0]["created_at"])
    self.assertEqual(metricTweetItem["agg_ts"], twitterData[0]["agg_ts"])
    self.assertEqual(metricTweetItem["text"], twitterData[0]["text"])
    self.assertEqual(metricTweetItem["userid"], twitterData[0]["userid"])
    self.assertEqual(metricTweetItem["username"], twitterData[0]["username"])
    self.assertEqual(metricTweetItem["retweet_count"], twitterData[0]["retweet_count"])
    self.assertEqual(metricTweetItem["copy_count"], 0)

    sort_key = twitterData[0]["agg_ts"]

    ts = (epochFromNaiveUTCDatetime(
      datetime.datetime.strptime(twitterData[0]["agg_ts"].partition(".")[0],
                                 "%Y-%m-%dT%H:%M:%S")) * 1e5)
    queryResult = metricTweetsTable.query_2(
      metric_name__eq=metricName,
      sort_key__gte=ts,
      index="taurus.metric_data-metric_name_index")
    queriedMetricTweetItem = next(queryResult)

    self.assertEqual(queriedMetricTweetItem["username"], twitterData[0]["username"])
    self.assertEqual(queriedMetricTweetItem["tweet_uid"], twitterData[0]["tweet_uid"])
    self.assertEqual(queriedMetricTweetItem["created_at"], twitterData[0]["created_at"])
    self.assertEqual(queriedMetricTweetItem["agg_ts"], twitterData[0]["agg_ts"])
    self.assertEqual(queriedMetricTweetItem["text"], twitterData[0]["text"])
    self.assertEqual(queriedMetricTweetItem["userid"], twitterData[0]["userid"])
    self.assertEqual(queriedMetricTweetItem["username"], twitterData[0]["username"])
    self.assertEqual(queriedMetricTweetItem["retweet_count"], twitterData[0]["retweet_count"])
    self.assertEqual(queriedMetricTweetItem["copy_count"], 0)
    self.assertEqual(queriedMetricTweetItem["sort_key"], ts)

    duplicatedTwitterData = [
      {
        "metric_name": "copy of " + metricName,
        "tweet_uid": "copy of " + uid,
        "created_at": "2015-02-19T19:45:24.870109",
        "agg_ts": "2015-02-19T19:43:24.870118", # Same agg_ts!
        "text": "Tweet text", # Same text!
        "userid": "20",
        "username": "******",
        "retweet_count": "0"
      }
    ]

    with MessageBusConnector() as messageBus:
      messageBus.publishExg(
        exchange=self.config.get("non_metric_data", "exchange_name"),
        routingKey=(
          self.config.get("non_metric_data", "exchange_name") + ".twitter"),
        body=json.dumps(duplicatedTwitterData)
      )

    for _ in range(30):
      metricTweetItem = metricTweetsTable.lookup(
        twitterData[0]["text"],
        twitterData[0]["agg_ts"]
      )

      if metricTweetItem["copy_count"] != 1:
        time.sleep(1)
        continue

      # Assert same as original, except for copy_count, which should be 1

      self.assertEqual(metricTweetItem["username"], twitterData[0]["username"])
      self.assertEqual(metricTweetItem["tweet_uid"], twitterData[0]["tweet_uid"])
      self.assertEqual(metricTweetItem["created_at"], twitterData[0]["created_at"])
      self.assertEqual(metricTweetItem["agg_ts"], twitterData[0]["agg_ts"])
      self.assertEqual(metricTweetItem["text"], twitterData[0]["text"])
      self.assertEqual(metricTweetItem["userid"], twitterData[0]["userid"])
      self.assertEqual(metricTweetItem["username"], twitterData[0]["username"])
      self.assertEqual(metricTweetItem["retweet_count"], twitterData[0]["retweet_count"])
      self.assertEqual(metricTweetItem["sort_key"], ts + 1)

      break
    else:
      self.fail("copy_count of original tweet not updated within reasonable"
                " amount of time (~30s) for duplicated tweet.")

    # Delete metric and ensure metric is deleted from dynamodb, too
    self._deleteMetric(metricName)

    for _ in xrange(60):
      time.sleep(1)
      try:
        metricTable.lookup(uid)
      except ItemNotFound:
        break
    else:
      self.fail("Metric not deleted from dynamodb")
Code Example #30
def replayMetricDataToModelResultsExchange(messageBus,
                                           chunksize=DEFAULT_CHUNKSIZE):
    """ Reads metric data and synthesizes model inference result messages to the
  "model results" exchange, simulating the end result of the AnomalyService.
  This will afford the dynamodb service an opportunity to backfill older data
  :param messageBus: message bus connection
  :type messageBus: nta.utils.message_bus_connector.MessageBusConnector
  """
    engine = repository.engineFactory()

    twoWeeksAgo = datetime.datetime.utcnow() - datetime.timedelta(days=14)

    # Properties for publishing model command results on RabbitMQ exchange
    # (same as AnomalyService)
    modelCommandResultProperties = MessageProperties(
        deliveryMode=amqp.constants.AMQPDeliveryModes.PERSISTENT_MESSAGE,
        headers=dict(dataType="model-cmd-result"))

    # Properties for publishing model inference results on RabbitMQ exchange
    # (same as AnomalyService)
    modelInferenceResultProperties = MessageProperties(
        deliveryMode=amqp.constants.AMQPDeliveryModes.PERSISTENT_MESSAGE)

    g_log.info("Getting metric data...")
    result = repository.getMetricData(
        engine,
        score=0,
        fromTimestamp=twoWeeksAgo,
        sort=[metric_data.c.uid, metric_data.c.rowid.asc()])
    numMetricDataRows = result.rowcount
    g_log.info("Got %d rows", numMetricDataRows)

    numModels = 0
    for uid, group in groupby(result, key=lambda x: x.uid):

        @retryOnTransientErrors
        def _getMetric():
            return repository.getMetric(engine, uid)

        metricObj = _getMetric()

        # Send defineModel command to ensure that the metric table entry is created
        numModels += 1
        modelCommandResult = {
            "status": htmengineerrno.SUCCESS,
            "method": "defineModel",
            "modelId": uid,
            "modelInfo": {
                "metricName": metricObj.name,
                "resource": metricObj.server,
                "modelSpec": json.loads(metricObj.parameters)
            }
        }

        # Serialize
        payload = anomaly_service.AnomalyService._serializeModelResult(
            modelCommandResult)

        g_log.info("Sending `defineModel` command: %r",
                   repr(modelCommandResult))
        messageBus.publishExg(exchange=config.get("metric_streamer",
                                                  "results_exchange_name"),
                              routingKey="",
                              body=payload,
                              properties=modelCommandResultProperties)

        metricInfo = dict(uid=metricObj.uid,
                          name=metricObj.name,
                          description=metricObj.description,
                          resource=metricObj.server,
                          location=metricObj.location,
                          datasource=metricObj.datasource,
                          spec=json.loads(metricObj.parameters)["metricSpec"])

        args = [iter(group)] * chunksize
        for num, chunk in enumerate(izip_longest(fillvalue=None, *args)):
            # Build the inference results message for this chunk of rows
            inferenceResultsMessage = dict(
                metric=metricInfo,
                results=[
                    dict(rowid=row.rowid,
                         ts=epochFromNaiveUTCDatetime(row.timestamp),
                         value=row.metric_value,
                         rawAnomaly=row.raw_anomaly_score,
                         anomaly=row.anomaly_score) for row in chunk
                    if row is not None
                ])

            # Serialize
            payload = anomaly_service.AnomalyService._serializeModelResult(
                inferenceResultsMessage)

            g_log.info(
                "uid=%s chunk=%d rows=%d payload_size=%d bytes from %s to %s",
                uid, num, len(inferenceResultsMessage["results"]),
                sys.getsizeof(payload),
                datetime.datetime.utcfromtimestamp(
                    inferenceResultsMessage["results"][0]["ts"]),
                datetime.datetime.utcfromtimestamp(
                    inferenceResultsMessage["results"][-1]["ts"]))

            messageBus.publishExg(exchange=config.get("metric_streamer",
                                                      "results_exchange_name"),
                                  routingKey="",
                                  body=payload,
                                  properties=modelInferenceResultProperties)

    g_log.info("Done! numMetricDataRows=%d; numModels=%d", numMetricDataRows,
               numModels)
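
The [iter(group)] * chunksize construction above is the standard "grouper" idiom: all chunksize slots share the same iterator, so each tuple produced by izip_longest consumes that many consecutive rows, and the final tuple is padded with None (which the list comprehension then filters out). A standalone illustration of the idiom, not tied to the repository code:

from itertools import izip_longest  # itertools.zip_longest on Python 3

def grouper(iterable, chunksize):
    """Yield fixed-size chunks from iterable, padding the final chunk with None."""
    args = [iter(iterable)] * chunksize
    return izip_longest(fillvalue=None, *args)

# list(grouper(range(7), 3)) -> [(0, 1, 2), (3, 4, 5), (6, None, None)]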