Example #1
def establishLastEmittedSampleDatetime(key, aggSec):
    """ Query UTC timestamp of the last emitted sample batch; if one hasn't been
    saved yet, then synthesize one, using negative aggregation period offset
    from current time

    :param str key: caller's key in schema.emittedSampleTracker
    :param int aggSec: aggregation period in seconds
    :returns: (possibly synthesized) UTC timestamp of the last
      successfully-emitted sample batch
    :rtype: datetime.datetime
    """
    lastEmittedTimestamp = queryLastEmittedSampleDatetime(key)
    if lastEmittedTimestamp is not None:
        return lastEmittedTimestamp

    # Start at the present to avoid re-sending metric data that we may have
    # already sent to Taurus.
    lastEmittedTimestamp = (datetime.utcnow().replace(microsecond=0) -
                            timedelta(seconds=aggSec))
    collectorsdb.engineFactory().execute(
        schema.emittedSampleTracker.insert().prefix_with(
            "IGNORE", dialect="mysql").values(key=key,
                                              sample_ts=lastEmittedTimestamp))

    # Query again after saving to account for mysql's loss of accuracy
    return queryLastEmittedSampleDatetime(key)
def updateLastEmittedNonMetricSequence(key, seq):
  """ Update the last emitted sample timestamp value in the database for the
  Tweet Volume metrics

  :param str key: caller's key in schema.emittedNonMetricTracker
  :param int seq: sequence of last successfully-emitted non-metric
  """
  update = schema.emittedNonMetricTracker.update(  # pylint: disable=E1120
    ).values(
      last_seq=seq
    ).where(
      (schema.emittedNonMetricTracker.c.key == key)
    )

  result = collectorsdb.engineFactory().execute(update)

  # If update didn't find the key, then insert
  #
  # NOTE: sqlalchemy doesn't support "ON DUPLICATE KEY UPDATE" in its syntactic
  # sugar; see https://bitbucket.org/zzzeek/sqlalchemy/issue/960
  if result.rowcount == 0:
    # The row didn't exist, so create it
    collectorsdb.engineFactory().execute(
      schema.emittedNonMetricTracker.insert()  # pylint: disable=E1120
      .values(key=key, last_seq=seq))
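
A minimal usage sketch of the update-then-insert upsert above, assuming these
helpers live in taurus.metric_collectors.metric_utils as in the tests further
down this page; the key string is made up for illustration:

from taurus.metric_collectors import metric_utils

# First call: the UPDATE matches no row (rowcount == 0), so the row is inserted
metric_utils.updateLastEmittedNonMetricSequence("my-collector-key", 41)

# Later calls take the UPDATE path and overwrite last_seq in place
metric_utils.updateLastEmittedNonMetricSequence("my-collector-key", 42)

assert metric_utils.queryLastEmittedNonMetricSequence("my-collector-key") == 42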
Example #3
    def testEngineFactorySingletonPattern(self, sqlalchemyMock):

        # Explicitly spec out sqlalchemy.create_engine()
        firstCall = Mock(spec_set=sqlalchemy.engine.base.Engine)
        secondCall = Mock(spec_set=sqlalchemy.engine.base.Engine)
        sqlalchemyMock.create_engine.side_effect = iter(
            [firstCall, secondCall])

        # Call collectorsdb.engineFactory()
        engine = collectorsdb.engineFactory()
        self.assertIs(engine, firstCall)

        # Call collectorsdb.engineFactory() again and assert singleton
        engine2 = collectorsdb.engineFactory()
        self.assertIs(engine2, firstCall)
        self.assertEqual(sqlalchemyMock.create_engine.call_count, 1)

        # Call collectorsdb.engineFactory() in different process, assert new
        # instance
        with patch("taurus.metric_collectors.collectorsdb.os",
                   autospec=True) as osMock:
            osMock.getpid.return_value = collectorsdb._EngineSingleton._pid + 1
            engine3 = collectorsdb.engineFactory()
            self.assertTrue(engine.dispose.called)
            self.assertIs(engine3, secondCall)
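
The test above pins down the engine factory's contract: sqlalchemy.create_engine()
runs once per process, and a process with a different pid disposes the inherited
engine and builds a new one. A rough sketch of that pattern, as an illustration
of the idea rather than the project's actual _EngineSingleton code:

import os

import sqlalchemy


class _ProcessLocalEngine(object):
  """Process-aware holder for a singleton sqlalchemy engine (sketch)."""

  _engine = None
  _pid = None

  @classmethod
  def get(cls, dbUrl):
    pid = os.getpid()
    if cls._engine is not None and cls._pid != pid:
      # Forked child: the parent's pooled connections must not be reused
      cls._engine.dispose()
      cls._engine = None
    if cls._engine is None:
      cls._engine = sqlalchemy.create_engine(dbUrl)
      cls._pid = pid
    return cls._engine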
Example #4
def establishLastEmittedSampleDatetime(key, aggSec):
  """ Query UTC timestamp of the last emitted sample batch; if one hasn't been
  saved yet, then synthesize one, using negative aggregation period offset
  from current time

  :param str key: caller's key in schema.emittedSampleTracker
  :param int aggSec: aggregation period in seconds
  :returns: (possibly synthesized) UTC timestamp of the last
    successfully-emitted sample batch
  :rtype: datetime.datetime
  """
  lastEmittedTimestamp = queryLastEmittedSampleDatetime(key)
  if lastEmittedTimestamp is not None:
    return lastEmittedTimestamp

  # Start at the present to avoid re-sending metric data that we may have
  # already sent to Taurus.
  lastEmittedTimestamp = (datetime.utcnow().replace(microsecond=0) -
                          timedelta(seconds=aggSec))
  collectorsdb.engineFactory().execute(
    schema.emittedSampleTracker.insert(
      ).prefix_with("IGNORE", dialect="mysql"
      ).values(key=key,
               sample_ts=lastEmittedTimestamp))

  # Query again after saving to account for mysql's loss of accuracy
  return queryLastEmittedSampleDatetime(key)
def updateLastEmittedNonMetricSequence(key, seq):
  """ Update the last emitted sample timestamp value in the database for the
  News Volume metrics

  :param str key: caller's key in schema.emittedNonMetricTracker
  :param int seq: sequence of last successfully-emitted non-metric
  """
  update = schema.emittedNonMetricTracker.update(  # pylint: disable=E1120
    ).values(
      last_seq=seq
    ).where(
      (schema.emittedNonMetricTracker.c.key == key)
    )

  result = collectorsdb.engineFactory().execute(update)

  # If update didn't find the key, then insert
  #
  # NOTE: sqlalchemy doesn't support "ON DUPLICATE KEY UPDATE" in its syntactic
  # sugar; see https://bitbucket.org/zzzeek/sqlalchemy/issue/960
  if result.rowcount == 0:
    # The row didn't exist, so create it
    collectorsdb.engineFactory().execute(
      schema.emittedNonMetricTracker.insert()  # pylint: disable=E1120
      .values(key=key, last_seq=seq))
def _flagUnknownSymbolAsReported(symbol):
    """
  Flag unknown company symbol as reported in database

  :param str symbol: symbol of the company's security (e.g., "AAPL")
  """
    ins = schema.companySymbolFailures.insert().prefix_with(
        "IGNORE", dialect="mysql").values(symbol=symbol)

    collectorsdb.engineFactory().execute(ins)

    g_log.debug("Saved unknown company symbol=%s", symbol)
  def testEngineFactorySingletonPattern(self):
    # Call collectorsdb.engineFactory()
    engine = collectorsdb.engineFactory()

    # Call collectorsdb.engineFactory() again and assert singleton
    engine2 = collectorsdb.engineFactory()
    self.assertIs(engine2, engine)

    # Call collectorsdb.engineFactory() in different process, assert raises
    # AssertionError
    with self.assertRaises(AssertionError):
      multiprocessing.Pool(processes=1).apply(collectorsdb.engineFactory)
Example #8
def _flagUnknownSymbolAsReported(symbol):
    """
  Flag unknown company symbol as reported in database

  :param str symbol: symbol of the company's security (e.g., "AAPL")
  """
    ins = schema.companySymbolFailures.insert(  # pylint: disable=E1120
    ).prefix_with('IGNORE', dialect="mysql").values(symbol=symbol)

    collectorsdb.engineFactory().execute(ins)

    g_log.debug("Saved unknown company symbol=%s", symbol)
    def testEngineFactorySingletonPattern(self):
        # Call collectorsdb.engineFactory()
        engine = collectorsdb.engineFactory()

        # Call collectorsdb.engineFactory() again and assert singleton
        engine2 = collectorsdb.engineFactory()
        self.assertIs(engine2, engine)

        # Call collectorsdb.engineFactory() in different process, assert raises
        # AssertionError
        with self.assertRaises(AssertionError):
            multiprocessing.Pool(processes=1).apply(collectorsdb.engineFactory)
  def testEngineFactorySingletonPattern(self):
    # Call collectorsdb.engineFactory()
    engine = collectorsdb.engineFactory()

    # Call collectorsdb.engineFactory() again and assert singleton
    engine2 = collectorsdb.engineFactory()
    self.assertIs(engine2, engine)

    # Call collectorsdb.engineFactory() in different process, assert new
    # instance
    originalEngineId = id(engine)
    engine3 = multiprocessing.Pool(processes=1).apply(_forkedEngineId)
    self.assertNotEqual(id(engine3), originalEngineId)
    def testEngineFactorySingletonPattern(self):
        # Call collectorsdb.engineFactory()
        engine = collectorsdb.engineFactory()

        # Call collectorsdb.engineFactory() again and assert singleton
        engine2 = collectorsdb.engineFactory()
        self.assertIs(engine2, engine)

        # Call collectorsdb.engineFactory() in different process, assert new
        # instance
        originalEngineId = id(engine)
        engine3 = multiprocessing.Pool(processes=1).apply(_forkedEngineId)
        self.assertNotEqual(id(engine3), originalEngineId)
Example #12
def updateLastEmittedSampleDatetime(key, sampleDatetime):
    """ Update the last emitted sample timestamp value in the database for the
  News Volume metrics

  :param str key: caller's key in schema.emittedSampleTracker
  :param datetime sampleDatetime: UTC datetime of last successfully-emitted
    sample batch
  """
    update = schema.emittedSampleTracker.update().values(
        sample_ts=sampleDatetime).where(
            (schema.emittedSampleTracker.c.key == key))

    collectorsdb.engineFactory().execute(update)
def _saveScreenNameFailure(unmappedScreenName):
    """
  Save unmapped twitter handle in database

  :param unmappedScreenName: the twitter handle that is not valid anymore
  :type unmappedScreenName: string
  """

    ins = (collectorsdb.schema.twitterHandleFailures.insert().prefix_with(
        'IGNORE', dialect="mysql").values(handle=unmappedScreenName))

    collectorsdb.engineFactory().execute(ins)

    g_log.info("Saved unmapped twitter handle; handle=%s", unmappedScreenName)
def updateLastEmittedSampleDatetime(key, sampleDatetime):
    """ Update the last emitted sample timestamp value in the database for the
  News Volume metrics

  :param str key: caller's key in schema.emittedSampleTracker
  :param datetime sampleDatetime: UTC datetime of last successfully-emitted
    sample batch
  """
    update = (
        schema.emittedSampleTracker.update()  # pylint: disable=E1120
        .values(sample_ts=sampleDatetime)
        .where((schema.emittedSampleTracker.c.key == key))
    )

    collectorsdb.engineFactory().execute(update)
def _saveScreenNameFailure(unmappedScreenName):
  """
  Save unmapped twitter handle in database

  :param unmappedScreenName: the twitter handle that is not valid anymore
  :type unmappedScreenName: string
  """

  ins = (collectorsdb.schema.twitterHandleFailures.insert()
         .prefix_with('IGNORE', dialect="mysql")
         .values(handle=unmappedScreenName))

  collectorsdb.engineFactory().execute(ins)

  g_log.info("Saved unmapped twitter handle; handle=%s", unmappedScreenName)
Example #16
def _purgeTweetsSlatedForDeletion(limit):
    """ Purge tweets that are slated for deletion as indicated by entries in the
  schema.twitterDeletion table

  :param limit: max records to purge per call
  :returns: a sequence of id's of deleted tweets
  """
    twitterTweetsSchema = collectorsdb.schema.twitterTweets
    twitterDeletionSchema = collectorsdb.schema.twitterDeletion

    # NOTE: we first query the row id's to delete, so we can return them for
    # accountability and debugging
    rowsToDeleteSel = sqlalchemy.select([twitterTweetsSchema.c.uid]).where(
        twitterTweetsSchema.c.uid.in_(
            sqlalchemy.select([twitterDeletionSchema.c.tweet_uid
                               ]))).limit(limit)

    numDeleted = 0
    with collectorsdb.engineFactory().begin() as conn:
        rowIdsToDelete = tuple(
            str(row[0]) for row in conn.execute(rowsToDeleteSel).fetchall())

        if rowIdsToDelete:
            tweetDeletion = twitterTweetsSchema.delete().where(
                twitterTweetsSchema.c.uid.in_(rowIdsToDelete))

            numDeleted = conn.execute(tweetDeletion).rowcount

    if len(rowIdsToDelete) != numDeleted:
        g_log.error(
            "Expected to delete %d tweets, but actually deleted %d tweets",
            len(rowIdsToDelete), numDeleted)

    return rowIdsToDelete
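
Because the function caps its work at limit rows per call, a caller that wants
to drain the whole deletion backlog would invoke it in a loop until a short
batch signals completion. A hedged sketch; the wrapper name and batch size are
illustrative:

_PURGE_BATCH_LIMIT = 1000

def purgeAllSlatedTweets():
  """Purge slated tweets in batches until a partial batch comes back."""
  totalDeleted = 0
  while True:
    deletedIds = _purgeTweetsSlatedForDeletion(limit=_PURGE_BATCH_LIMIT)
    totalDeleted += len(deletedIds)
    if len(deletedIds) < _PURGE_BATCH_LIMIT:
      return totalDeleted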
Example #17
def _purgeStaleDeletionRecords(limit):
    """ Delete stale rows in schema.twitterDeletion table

  :param limit: max records to purge per call
  :returns: a sequence of tweet_uid's of deleted schema.twitterDeletion rows
  """
    twitterDeletionSchema = collectorsdb.schema.twitterDeletion

    # NOTE: we first query the row id's to delete, so we can return them for
    # accountability and debugging
    rowsToDeleteSel = sqlalchemy.select([
        twitterDeletionSchema.c.tweet_uid
    ]).where(twitterDeletionSchema.c.created_at < sqlalchemy.func.date_sub(
        sqlalchemy.func.current_timestamp(),
        sqlalchemy.text("INTERVAL %i DAY" %
                        (_DELETION_ROW_EXPIRY_DAYS, )))).limit(limit)

    numDeleted = 0
    with collectorsdb.engineFactory().begin() as conn:
        rowIdsToDelete = tuple(
            str(row[0]) for row in conn.execute(rowsToDeleteSel).fetchall())

        if rowIdsToDelete:
            deletion = twitterDeletionSchema.delete().where(
                twitterDeletionSchema.c.tweet_uid.in_(rowIdsToDelete))

            numDeleted = conn.execute(deletion).rowcount

    if len(rowIdsToDelete) != numDeleted:
        g_log.error(
            "Expected to delete %d tweet delition request rows, but "
            "actually deleted %d rows", len(rowIdsToDelete), numDeleted)

    return rowIdsToDelete
def _deleteSecurity(symbol):
  """Delete security from xignite_security table"""
  with collectorsdb.engineFactory().begin() as conn:
    conn.execute(
      schema.xigniteSecurity  # pylint: disable=E1120
      .delete()
      .where(schema.xigniteSecurity.c.symbol == symbol))
def main():
  parser = argparse.ArgumentParser()
  parser.add_argument("--symbol", required=True)

  args = parser.parse_args()

  expectedAnswer = "Yes-%s" % (random.randint(1, 30),)

  with collectorsdb.engineFactory().begin() as conn:
    answer = raw_input(
      "Attention!  You are about to reset the emitted status for the \"{}\""
      " stock symbol at {}.\n"
      "\n"
      "To back out immediately without making any changes, feel free to type "
      "anything but \"{}\" in the prompt below, and press return.\n"
      "\n"
      "Are you sure you want to continue? ".format(args.symbol,
                                                   str(conn.engine),
                                                   str(expectedAnswer)))

    if answer.strip() != expectedAnswer:
      print "Aborting - Wise choice, my friend. Bye."
      return 1

    deleteFromEmitted(conn, schema.emittedStockPrice, args.symbol)
    deleteFromEmitted(conn, schema.emittedStockVolume, args.symbol)
def _queryNewsVolumes(aggStartDatetime, aggStopDatetime):
  """ Query the database for the counts of security releases+headlines for each
  company that were detected during the specified time window.

  :param aggStartDatetime: inclusive start of aggregation interval as
    UTC datetime
  :param aggStopDatetime: non-inclusive upper bound of aggregation interval as
    UTC datetime
  :returns: a sparse sequence of two-tuples: (symbol, count); companies that
    have no detected news in the given aggregation period will be absent from
    the result.
  """
  headlineSel = sql.select(
    [schema.xigniteSecurityHeadline.c.symbol.label("symbol")]
    ).where(
      (schema.xigniteSecurityHeadline.c.discovered_at >= aggStartDatetime) &
      (schema.xigniteSecurityHeadline.c.discovered_at < aggStopDatetime))

  releaseSel = sql.select(
    [schema.xigniteSecurityRelease.c.symbol]
    ).where(
      (schema.xigniteSecurityRelease.c.discovered_at >= aggStartDatetime) &
      (schema.xigniteSecurityRelease.c.discovered_at < aggStopDatetime))

  allNewsUnion = sql.union_all(headlineSel, releaseSel)

  aggSel = sql.select(
    ["symbol", sql.func.count("symbol").label("sym_count")]
    ).select_from(allNewsUnion.alias("union_of_tables")
    ).group_by("symbol")

  return collectorsdb.engineFactory().execute(aggSel).fetchall()
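
Since the result is sparse, downstream aggregation code typically re-densifies
it against the full list of symbols of interest; a minimal sketch, where
allSymbols is assumed to come from something like _queryCachedCompanySymbols
shown elsewhere on this page:

def densifyNewsVolumes(sparseCounts, allSymbols):
  """Map every symbol to its news count, defaulting absent symbols to 0.

  :param sparseCounts: sequence of (symbol, count) two-tuples
  :param allSymbols: full sequence of symbols of interest
  :rtype: dict of symbol -> count
  """
  counts = dict(sparseCounts)
  return dict((symbol, counts.get(symbol, 0)) for symbol in allSymbols)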
def main():
  """
  NOTE: main also serves as entry point for "console script" generated by setup
  """
  logging_support.LoggingSupport().initTool()

  try:
    options = _parseArgs()

    days = options["days"]

    g_log.info("Purging records from table=%s older than numDays=%s",
               collectorsdb.schema.twitterTweets, days)


    twitterTweetsSchema = collectorsdb.schema.twitterTweets

    query = twitterTweetsSchema.delete().where(
      twitterTweetsSchema.c.created_at <
      sqlalchemy.func.date_sub(
        sqlalchemy.func.utc_timestamp(),
        sqlalchemy.text("INTERVAL %i DAY" % (days,)))
    )
    with collectorsdb.engineFactory().begin() as conn:
      result = conn.execute(query)

    g_log.info("Purged numRows=%s from table=%s",
               result.rowcount, collectorsdb.schema.twitterTweets)
  except SystemExit as e:
    if e.code != 0:
      g_log.exception("Failed!")
    raise
  except Exception:
    g_log.exception("Failed!")
    raise
def _purgeStaleDeletionRecords(limit):
  """ Delete stale rows in schema.twitterDeletion table

  :param limit: max records to purge per call
  :returns: a sequence of tweet_uid's of deleted schema.twitterDeletion rows
  """
  twitterDeletionSchema = collectorsdb.schema.twitterDeletion

  # NOTE: we first query the row id's to delete, so we can return them for
  # accountability and debugging
  rowsToDeleteSel = sqlalchemy.select(
    [twitterDeletionSchema.c.tweet_uid]).where(
      twitterDeletionSchema.c.created_at <
      sqlalchemy.func.date_sub(
        sqlalchemy.func.current_timestamp(),
        sqlalchemy.text("INTERVAL %i DAY" % (_DELETION_ROW_EXPIRY_DAYS,)))
      ).limit(limit)

  numDeleted = 0
  with collectorsdb.engineFactory().begin() as conn:
    rowIdsToDelete = tuple(
      str(row[0]) for row in conn.execute(rowsToDeleteSel).fetchall()
    )

    if rowIdsToDelete:
      deletion = twitterDeletionSchema.delete().where(
        twitterDeletionSchema.c.tweet_uid.in_(rowIdsToDelete))

      numDeleted = conn.execute(deletion).rowcount

  if len(rowIdsToDelete) != numDeleted:
    g_log.error("Expected to delete %d tweet delition request rows, but "
                "actually deleted %d rows", len(rowIdsToDelete), numDeleted)

  return rowIdsToDelete
  def testEmittedSampleDatetime(self):
    key = "bogus-test-key"

    # Establish initial sample datetime

    result = metric_utils.establishLastEmittedSampleDatetime(key, 300)

    # Cleanup
    self.addCleanup(collectorsdb.engineFactory().execute,
      schema.emittedSampleTracker.delete().where(
        (schema.emittedSampleTracker.c.key == key)
      )
    )

    self.assertIsInstance(result, datetime)

    # Update latest emitted sample datetime to now

    now = datetime.utcnow().replace(microsecond=0)
    metric_utils.updateLastEmittedSampleDatetime(key, now)

    # Verify that it was updated

    lastEmittedSample = metric_utils.queryLastEmittedSampleDatetime(key)

    self.assertEqual(now, lastEmittedSample)
    self.assertLess(result, lastEmittedSample)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--symbol", required=True)

    args = parser.parse_args()

    expectedAnswer = "Yes-%s" % (random.randint(1, 30), )

    with collectorsdb.engineFactory().begin() as conn:
        answer = raw_input(
            "Attention!  You are about to reset the emitted status for the \"{}\""
            " stock symbol at {}.\n"
            "\n"
            "To back out immediately without making any changes, feel free to type "
            "anything but \"{}\" in the prompt below, and press return.\n"
            "\n"
            "Are you sure you want to continue? ".format(
                args.symbol, str(conn.engine), str(expectedAnswer)))

        if answer.strip() != expectedAnswer:
            print "Aborting - Wise choice, my friend. Bye."
            return 1

        deleteFromEmitted(conn, schema.emittedStockPrice, args.symbol)
        deleteFromEmitted(conn, schema.emittedStockVolume, args.symbol)
    def testEmittedSampleDatetime(self):
        key = "bogus-test-key"

        # Establish initial sample datetime

        result = metric_utils.establishLastEmittedSampleDatetime(key, 300)

        # Cleanup
        self.addCleanup(
            collectorsdb.engineFactory().execute,
            schema.emittedSampleTracker.delete().where(
                (schema.emittedSampleTracker.c.key == key)))

        self.assertIsInstance(result, datetime)

        # Update latest emitted sample datetime to now

        now = datetime.utcnow().replace(microsecond=0)
        metric_utils.updateLastEmittedSampleDatetime(key, now)

        # Verify that it was updated

        lastEmittedSample = metric_utils.queryLastEmittedSampleDatetime(key)

        self.assertEqual(now, lastEmittedSample)
        self.assertLess(result, lastEmittedSample)
Example #26
def _queryNewsVolumes(aggStartDatetime, aggStopDatetime):
    """ Query the database for the counts of security releases+headlines for each
  company that were detected during the specified time window.

  :param aggStartDatetime: inclusive start of aggregation interval as
    UTC datetime
  :param aggStopDatetime: non-inclusive upper bound of aggregation interval as
    UTC datetime
  :returns: a sparse sequence of two-tuples: (symbol, count); companies that
    have no detected news in the given aggregation period will be absent from
    the result.
  """
    headlineSel = sql.select([
        schema.xigniteSecurityHeadline.c.symbol.label("symbol")
    ]).where(
        (schema.xigniteSecurityHeadline.c.discovered_at >= aggStartDatetime)
        & (schema.xigniteSecurityHeadline.c.discovered_at < aggStopDatetime))

    releaseSel = sql.select([schema.xigniteSecurityRelease.c.symbol]).where(
        (schema.xigniteSecurityRelease.c.discovered_at >= aggStartDatetime)
        & (schema.xigniteSecurityRelease.c.discovered_at < aggStopDatetime))

    allNewsUnion = sql.union_all(headlineSel, releaseSel)

    aggSel = sql.select([
        "symbol", sql.func.count("symbol").label("sym_count")
    ]).select_from(allNewsUnion.alias("union_of_tables")).group_by("symbol")

    return collectorsdb.engineFactory().execute(aggSel).fetchall()
def _purgeTweetsSlatedForDeletion(limit):
  """ Purge tweets that are slated for deletion as indicated by entries in the
  schema.twitterDeletion table

  :param limit: max records to purge per call
  :returns: a sequence of id's of deleted tweets
  """
  twitterTweetsSchema = collectorsdb.schema.twitterTweets
  twitterDeletionSchema = collectorsdb.schema.twitterDeletion


  # NOTE: we first query the row id's to delete, so we can return them for
  # accountability and debugging
  rowsToDeleteSel = sqlalchemy.select([twitterTweetsSchema.c.uid]).where(
    twitterTweetsSchema.c.uid.in_(
      sqlalchemy.select([twitterDeletionSchema.c.tweet_uid]))).limit(limit)

  numDeleted = 0
  with collectorsdb.engineFactory().begin() as conn:
    rowIdsToDelete = tuple(
      str(row[0]) for row in conn.execute(rowsToDeleteSel).fetchall()
    )

    if rowIdsToDelete:
      tweetDeletion = twitterTweetsSchema.delete().where(
        twitterTweetsSchema.c.uid.in_(rowIdsToDelete))

      numDeleted = conn.execute(tweetDeletion).rowcount

  if len(rowIdsToDelete) != numDeleted:
    g_log.error("Expected to delete %d tweets, but actually deleted %d tweets",
                len(rowIdsToDelete), numDeleted)

  return rowIdsToDelete
Example #28
    def testPurgeOldTweets(self):

        gcThresholdDays = 90

        now = datetime.utcnow()

        oldRows = [
            dict(uid=uuid.uuid1().hex,
                 created_at=now - timedelta(days=gcThresholdDays + 1),
                 retweet=False,
                 lang="en-us"),
            dict(uid=uuid.uuid1().hex,
                 created_at=now - timedelta(days=gcThresholdDays + 2),
                 retweet=False,
                 lang="en-us"),
        ]

        youngRows = [
            dict(uid=uuid.uuid1().hex,
                 created_at=now,
                 retweet=False,
                 lang="en-us"),
            dict(uid=uuid.uuid1().hex,
                 created_at=now - timedelta(days=gcThresholdDays - 1),
                 retweet=False,
                 lang="en-us"),
            dict(uid=uuid.uuid1().hex,
                 created_at=now - timedelta(days=gcThresholdDays - 2),
                 retweet=False,
                 lang="en-us"),
        ]

        allRows = oldRows + youngRows

        # Patch collectorsdb config to use a temporary database
        with collectorsdb_test_utils.ManagedTempRepository("purgetweets"):
            engine = collectorsdb.engineFactory()

            numInserted = engine.execute(
                schema.twitterTweets.insert(),  # pylint: disable=E1120
                allRows).rowcount

            self.assertEqual(numInserted, len(allRows))

            # Execute
            numDeleted = purge_old_tweets.purgeOldTweets(gcThresholdDays)

            # Verify

            self.assertEqual(numDeleted, len(oldRows))

            # Verify that only the old tweets got purged
            remainingRows = engine.execute(
                sql.select([schema.twitterTweets.c.uid])).fetchall()

            self.assertEqual(len(remainingRows), len(youngRows))

            self.assertItemsEqual([row["uid"] for row in youngRows],
                                  [row.uid for row in remainingRows])  # pylint: disable=E1101
  def testTransientErrorRetryDecorator(self):
    # Setup proxy.  We'll patch config later, so we need to cache the values
    # so that the original proxy may be restarted with the original params
    config = collectorsdb.CollectorsDbConfig()

    originalHost = config.get("repository", "host")
    originalPort = config.getint("repository", "port")

    def _startProxy():
      p = startProxy(originalHost, originalPort, 6033)
      p.next()
      return p

    proxy = _startProxy()
    self.addCleanup(proxy.send, "kill")

    # Patch collectorsdb config with local proxy
    with ConfigAttributePatch(
          config.CONFIG_NAME,
          config.baseConfigDir,
          (("repository", "host", "127.0.0.1"),
           ("repository", "port", "6033"))):

      # Force refresh of engine singleton
      collectorsdb.resetEngineSingleton()
      engine = collectorsdb.engineFactory()

      # First, make sure valid query returns expected results
      res = collectorsdb.retryOnTransientErrors(engine.execute)("select 1")
      self.assertEqual(res.scalar(), 1)

      @collectorsdb.retryOnTransientErrors
      def _killProxyTryRestartProxyAndTryAgain(n=[]):
        if not n:
          # Kill the proxy on first attempt
          proxy.send("kill")
          proxy.next()
          try:
            engine.execute("select 1")
            self.fail("Proxy did not terminate as expected...")
          except sqlalchemy.exc.OperationalError:
            pass
          n.append(None)
        elif len(n) == 1:
          # Restore proxy in second attempt
          newProxy = _startProxy()
          self.addCleanup(newProxy.send, "kill")
          n.append(None)

        res = engine.execute("select 2")

        return res

      # Try again w/ retry decorator
      result = _killProxyTryRestartProxyAndTryAgain()

      # Verify that the expected value is eventually returned
      self.assertEqual(result.scalar(), 2)
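
The retryOnTransientErrors decorator exercised here retries the wrapped callable
when the database raises a transient operational error; its actual implementation
is not shown on this page. A simplified sketch of the idea, with the attempt
count and backoff chosen arbitrarily:

import functools
import time

import sqlalchemy.exc


def retryOnTransientErrors(func):
  """Retry func on transient DB errors with linear backoff (sketch)."""
  @functools.wraps(func)
  def wrapper(*args, **kwargs):
    maxAttempts = 10
    for attempt in xrange(maxAttempts):
      try:
        return func(*args, **kwargs)
      except sqlalchemy.exc.OperationalError:
        if attempt == maxAttempts - 1:
          raise
        time.sleep(0.5 * (attempt + 1))
  return wrapper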
    def testTransientErrorRetryDecorator(self):
        # Setup proxy.  We'll patch config later, so we need to cache the values
        # so that the original proxy may be restarted with the original params
        config = collectorsdb.CollectorsDbConfig()

        originalHost = config.get("repository", "host")
        originalPort = config.getint("repository", "port")

        def _startProxy():
            p = startProxy(originalHost, originalPort, 6033)
            p.next()
            return p

        proxy = _startProxy()
        self.addCleanup(proxy.send, "kill")

        # Patch collectorsdb config with local proxy
        with ConfigAttributePatch(config.CONFIG_NAME, config.baseConfigDir,
                                  (("repository", "host", "127.0.0.1"),
                                   ("repository", "port", "6033"))):

            # Force refresh of engine singleton
            collectorsdb.resetEngineSingleton()
            engine = collectorsdb.engineFactory()

            # First, make sure valid query returns expected results
            res = collectorsdb.retryOnTransientErrors(
                engine.execute)("select 1")
            self.assertEqual(res.scalar(), 1)

            @collectorsdb.retryOnTransientErrors
            def _killProxyTryRestartProxyAndTryAgain(n=[]):  # pylint: disable=W0102
                if not n:
                    # Kill the proxy on first attempt
                    proxy.send("kill")
                    proxy.next()
                    try:
                        engine.execute("select 1")
                        self.fail("Proxy did not terminate as expected...")
                    except sqlalchemy.exc.OperationalError:
                        pass
                    n.append(None)
                elif len(n) == 1:
                    # Restore proxy in second attempt
                    newProxy = _startProxy()
                    self.addCleanup(newProxy.send, "kill")
                    n.append(None)

                res = engine.execute("select 2")

                return res

            # Try again w/ retry decorator
            result = _killProxyTryRestartProxyAndTryAgain()

            # Verify that the expected value is eventually returned
            self.assertEqual(result.scalar(), 2)
def _clearUnknownSymbols():
    """
    Remove all rows from the company_symbol_failures table. 
    """

    result = collectorsdb.engineFactory().execute(
        schema.companySymbolFailures.delete())

    if result.rowcount:
        g_log.info("Deleted %s rows from %s table", result.rowcount,
                   schema.companySymbolFailures)
def queryLastEmittedNonMetricSequence(key):
    """
  :param str key: caller's key in schema.emittedNonMetricTracker
  :returns: last emitted sequence number for non-metric source; None if one
    hasn't been saved yet.
  :rtype: int if not None
  """
    sel = sql.select([schema.emittedNonMetricTracker.c.last_seq]).where(
        schema.emittedNonMetricTracker.c.key == key)

    return collectorsdb.engineFactory().execute(sel).scalar()
def queryLastEmittedSampleDatetime(key):
    """
  :param str key: caller's key in schema.emittedSampleTracker
  :returns: UTC timestamp of the last successfully-emitted sample batch; None if
    one hasn't been set up yet; see establishLastEmittedSampleDatetime
  :rtype: datetime.datetime if not None
  """
    sel = sql.select([schema.emittedSampleTracker.c.sample_ts]).where(
        schema.emittedSampleTracker.c.key == key)

    return collectorsdb.engineFactory().execute(sel).scalar()
    def securityExists(symbol):
      security = collectorsdb.engineFactory().execute(
        sql.select([schema.xigniteSecurity.c.symbol])
        .where(schema.xigniteSecurity.c.symbol == symbol)
      ).scalar()

      if security is not None:
        self.assertEqual(security, symbol)
        return True

      return False
Example #35
def queryLastEmittedNonMetricSequence(key):
    """
  :param str key: caller's key in schema.emittedNonMetricTracker
  :returns: last emitted sequence number for non-metric source; None if one
    hasn't been saved yet.
  :rtype: int if not None
  """
    sel = sql.select([schema.emittedNonMetricTracker.c.last_seq
                      ]).where(schema.emittedNonMetricTracker.c.key == key)

    return collectorsdb.engineFactory().execute(sel).scalar()
Example #36
def queryLastEmittedSampleDatetime(key):
    """
  :param str key: caller's key in schema.emittedSampleTracker
  :returns: UTC timestamp of the last successfully-emitted sample batch; None if
    one hasn't been set up yet; see establishLastEmittedSampleDatetime
  :rtype: datetime.datetime if not None
  """
    sel = sql.select([schema.emittedSampleTracker.c.sample_ts
                      ]).where(schema.emittedSampleTracker.c.key == key)

    return collectorsdb.engineFactory().execute(sel).scalar()
def _queryCachedCompanySymbols():
  """Get the cached security symbols from the xignite_security table

  :returns: A sequence of stock symbols from the xignite_security table
  :rtype: sequence
  """
  engine = collectorsdb.engineFactory()

  return tuple(
    row.symbol for row in
    engine.execute(sql.select([schema.xigniteSecurity.c.symbol])).fetchall())
Example #38
def _clearUnknownSymbols():
    """
  Remove all rows from the company_symbol_failures table.
  """

    result = collectorsdb.engineFactory().execute(
        schema.companySymbolFailures.delete())  # pylint: disable=E1120

    if result.rowcount:
        g_log.info("Deleted %s rows from %s table", result.rowcount,
                   schema.companySymbolFailures)
def _deleteScreenNameFailures():
    """
  Clear rows from the twitter_handle_failures table. 
  """

    result = collectorsdb.engineFactory().execute(
        collectorsdb.schema.twitterHandleFailures.delete())

    if result.rowcount:
        g_log.info("Deleted %s rows from %s table", result.rowcount,
                   collectorsdb.schema.twitterHandleFailures)
        def securityExists(symbol):
            security = collectorsdb.engineFactory().execute(
                sql.select([
                    schema.xigniteSecurity.c.symbol
                ]).where(schema.xigniteSecurity.c.symbol == symbol)).scalar()

            if security is not None:
                self.assertEqual(security, symbol)
                return True

            return False
def _deleteScreenNameFailures():
    """
  Clear rows from the twitter_handle_failures table.
  """

    result = collectorsdb.engineFactory().execute(
        collectorsdb.schema.twitterHandleFailures.delete()
    )  # pylint: disable=E1120

    if result.rowcount:
        g_log.info("Deleted %s rows from %s table", result.rowcount, collectorsdb.schema.twitterHandleFailures)
def _unknownSymbolReported(symbol):
    """ Check if a specific company symbol already exists in the
  company_symbol_failures table.

  :param str symbol: symbol of the company's security (e.g., "AAPL")
  :returns: True, if symbol is already in the table. False, otherwise
  :rtype: bool
  """
    sel = schema.companySymbolFailures.select().where(
        schema.companySymbolFailures.c.symbol == symbol)
    rows = collectorsdb.engineFactory().execute(sel).fetchall()

    return len(rows) > 0
def _resymbolStockMetrics(oldSymbol, newSymbol):
    """ Resymbol stock metrics

  :param str oldSymbol: old stock symbol, upper case
  :param str newSymbol: new stock symbol, upper case
  """
    g_log.info("Renaming stock metrics: oldSymbol=%s, newSymbol=%s", oldSymbol,
               newSymbol)

    sqlEngine = collectorsdb.engineFactory()

    with sqlEngine.begin() as conn:
        # NOTE: the foreign key cascade-on-update relationship between
        # emitted_stock_price, emitted_stock_volume, xignite_security_bars, and the
        # xignite_security tables causes the symbol to be automatically updated or
        # the corresponding rows to be deleted in the former tables when the symbol
        # in xignite_security table is updated or deleted.

        # Delete emitted stock price rows for old symbol
        conn.execute(
            schema.emittedStockPrice  # pylint: disable=E1120
            .delete().where(schema.emittedStockPrice.c.symbol == oldSymbol))

        # Delete emitted stock volume rows for old symbol
        conn.execute(
            schema.emittedStockVolume  # pylint: disable=E1120
            .delete().where(schema.emittedStockVolume.c.symbol == oldSymbol))

        # Re-symbol xignite security row associated with the old symbol
        #
        # NOTE: we use IGNORE to ignore integrity errors (most likely duplicate),
        # because stock agent might insert a security row for the new symbol before
        # we do.
        conn.execute(schema.xigniteSecurity  # pylint: disable=E1120
                     .update().prefix_with('IGNORE', dialect="mysql").where(
                         schema.xigniteSecurity.c.symbol == oldSymbol).values(
                             symbol=newSymbol))
        # Delete old xignite security row just in case the rename aborted due to
        # integrity error
        conn.execute(
            schema.xigniteSecurity  # pylint: disable=E1120
            .delete().where(schema.xigniteSecurity.c.symbol == oldSymbol))

    # Forward stock metric data samples to Taurus Engine
    g_log.info(
        "Forwarding new stock metric data samples for symbol=%s to Taurus "
        "engine...", newSymbol)
    xignite_stock_agent.transmitMetricData(metricSpecs=[
        spec for spec in xignite_stock_agent.loadMetricSpecs()
        if spec.symbol == newSymbol
    ],
                                           symbol=newSymbol,
                                           engine=sqlEngine)
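
The cascade behavior that the first NOTE in _resymbolStockMetrics relies on
comes from the schema's foreign-key definitions. A hedged sketch of the kind of
definition that produces it; the table and column details are abbreviated and
not the project's actual schema module:

import sqlalchemy

metadata = sqlalchemy.MetaData()

xigniteSecurity = sqlalchemy.Table(
  "xignite_security", metadata,
  sqlalchemy.Column("symbol", sqlalchemy.String(32), primary_key=True))

# ON UPDATE CASCADE propagates a re-symbol to child rows; ON DELETE CASCADE
# removes the child rows when the parent security row is deleted
emittedStockPrice = sqlalchemy.Table(
  "emitted_stock_price", metadata,
  sqlalchemy.Column("symbol",
                    sqlalchemy.String(32),
                    sqlalchemy.ForeignKey("xignite_security.symbol",
                                          onupdate="CASCADE",
                                          ondelete="CASCADE"),
                    primary_key=True))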
Example #44
def _unknownSymbolReported(symbol):
    """ Check if a specific company symbol already exists in the
  company_symbol_failures table.

  :param str symbol: symbol of the company's security (e.g., "AAPL")
  :returns: True, if symbol is already in the table. False, otherwise
  :rtype: bool
  """
    sel = schema.companySymbolFailures.select().where(
        schema.companySymbolFailures.c.symbol == symbol)
    rows = collectorsdb.engineFactory().execute(sel).fetchall()

    return len(rows) > 0
  def testEngineFactorySingletonPattern(self, sqlalchemyMock):

    # Explicitly spec out sqlalchemy.create_engine()
    firstCall = Mock(spec_set=sqlalchemy.engine.base.Engine)
    sqlalchemyMock.create_engine.side_effect = [firstCall]

    # Call collectorsdb.engineFactory()
    engine = collectorsdb.engineFactory()
    self.assertIs(engine, firstCall)

    # Call collectorsdb.engineFactory() again and assert singleton
    engine2 = collectorsdb.engineFactory()
    self.assertIs(engine2, firstCall)
    self.assertEqual(sqlalchemyMock.create_engine.call_count, 1)

    # Call collectorsdb.engineFactory() in different process, assert raises
    # assertion error
    with patch("taurus.metric_collectors.collectorsdb.os.getpid",
               return_value=collectorsdb._EngineSingleton._pid + 1,
               autospec=True):
      with self.assertRaises(AssertionError):
        collectorsdb.engineFactory()
Example #46
    def queryEndDates():
        sel = sql.select([
            srcSchema.c.symbol,
            sql.func.max(srcSchema.c.local_pub_date)
        ]).group_by(srcSchema.c.symbol)

        resultProxy = collectorsdb.engineFactory().execute(sel)

        endDateMap = dict((row[0], row[1]) for row in resultProxy)

        g_log.debug("%s endDateMap=%s", srcSchema, endDateMap)

        return endDateMap
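
A sketch of how such an end-date map might be consumed to resume an incremental
backfill; the helper name and the default lookback are assumptions:

from datetime import date, timedelta

def resolveBackfillStart(endDateMap, symbol):
  """Resume the day after the last stored pub date, else use a default."""
  lastPubDate = endDateMap.get(symbol)
  if lastPubDate is not None:
    return lastPubDate + timedelta(days=1)
  return date.today() - timedelta(days=30)  # arbitrary default lookback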
    def testEngineFactorySingletonPattern(self, sqlalchemyMock):

        # Explicitly spec out sqlalchemy.create_engine()
        firstCall = Mock(spec_set=sqlalchemy.engine.base.Engine)
        secondCall = Mock(spec_set=sqlalchemy.engine.base.Engine)
        sqlalchemyMock.create_engine.side_effect = iter([firstCall, secondCall])

        # Call collectorsdb.engineFactory()
        engine = collectorsdb.engineFactory()
        self.assertIs(engine, firstCall)

        # Call collectorsdb.engineFactory() again and assert singleton
        engine2 = collectorsdb.engineFactory()
        self.assertIs(engine2, firstCall)
        self.assertEqual(sqlalchemyMock.create_engine.call_count, 1)

        # Call collectorsdb.engineFactory() in different process, assert new
        # instance
        with patch("taurus.metric_collectors.collectorsdb.os", autospec=True) as osMock:
            osMock.getpid.return_value = collectorsdb._EngineSingleton._pid + 1
            engine3 = collectorsdb.engineFactory()
            self.assertTrue(engine.dispose.called)
            self.assertIs(engine3, secondCall)
    def testEmittedNonMetricSequence(self):
        key = "bogus-test-key"

        metric_utils.updateLastEmittedNonMetricSequence(key, 1)

        # Cleanup
        self.addCleanup(
            collectorsdb.engineFactory().execute,
            schema.emittedNonMetricTracker.delete().where(
                (schema.emittedNonMetricTracker.c.key == key)))

        lastEmittedSample = metric_utils.queryLastEmittedNonMetricSequence(key)

        self.assertEqual(1, lastEmittedSample)
  def queryEndDates():
    sel = sql.select(
      [srcSchema.c.symbol,
       sql.func.max(srcSchema.c.local_pub_date)]
      ).group_by(srcSchema.c.symbol)

    resultProxy = collectorsdb.engineFactory().execute(sel)

    endDateMap = dict(
      (row[0], row[1]) for row in resultProxy
    )

    g_log.debug("%s endDateMap=%s", srcSchema, endDateMap)

    return endDateMap
def _screenNameFailureReported(screenName):
    """ Check if a specific twitter handle already exists in the
  twitter_handle_failures table.

  :param screenName: twitter handle
  :type screenName: string
  :returns: True, if twitter handle is already in the table. False, otherwise
  :rtype: Boolean
  """
    table = collectorsdb.schema.twitterHandleFailures

    sel = (table.select().where(table.c.handle == screenName))
    rows = collectorsdb.engineFactory().execute(sel)

    return rows.rowcount != 0
def _screenNameFailureReported(screenName):
    """ Check if a specific twitter handle already exists in the
  twitter_handle_failures table.

  :param screenName: twitter handle
  :type screenName: string
  :returns: True, if twitter handle is already in the table. False, otherwise
  :rtype: Boolean
  """
    table = collectorsdb.schema.twitterHandleFailures

    sel = table.select().where(table.c.handle == screenName)
    rows = collectorsdb.engineFactory().execute(sel)

    return rows.rowcount != 0
  def testEmittedNonMetricSequence(self):
    key = "bogus-test-key"

    metric_utils.updateLastEmittedNonMetricSequence(key, 1)

    # Cleanup
    self.addCleanup(collectorsdb.engineFactory().execute,
      schema.emittedNonMetricTracker.delete().where(
        (schema.emittedNonMetricTracker.c.key == key)
      )
    )

    lastEmittedSample = metric_utils.queryLastEmittedNonMetricSequence(key)

    self.assertEqual(1, lastEmittedSample)
Example #53
    def _queryExistingSecurityNewsRows(cls, table, symbol, startDate, endDate):
        """ Query the given headline or release table for rows matching the given
    ticker symbol within the given date range

    :param sqlalchemy.Table table: table to query
    :param symbol: symbol of the security (e.g., stock ticker symbol)
    :param datetime.date startDate: UTC start date for the operation; inclusive
    :param datetime.date endDate: UTC end date for the operation; inclusive
    :returns: (possibly empty) sequence of matching sqlalchemy.engine.RowProxy
      objects with the following fields: local_pub_date, url, source
    """
        sel = sql.select([table.c.local_pub_date, table.c.url, table.c.source
                          ]).where((table.c.symbol == symbol)
                                   & (table.c.local_pub_date >= startDate)
                                   & (table.c.local_pub_date <= endDate))

        return collectorsdb.engineFactory().execute(sel).fetchall()
  def _queryExistingSecurityNewsRows(cls, table, symbol, startDate, endDate):
    """ Query the given headline or release table for rows matching the given
    ticker symbol within the given date range

    :param sqlalchemy.Table table: table to query
    :param symbol: symbol of the security (e.g., stock ticker symbol)
    :param datetime.date startDate: UTC start date for the operation; inclusive
    :param datetime.date endDate: UTC end date for the operation; inclusive
    :returns: (possibly empty) sequence of matching sqlalchemy.engine.RowProxy
      objects with the following fields: local_pub_date, url, source
    """
    sel = sql.select([table.c.local_pub_date, table.c.url, table.c.source]
      ).where(
          (table.c.symbol == symbol) &
          (table.c.local_pub_date >= startDate) &
          (table.c.local_pub_date <= endDate))

    return collectorsdb.engineFactory().execute(sel).fetchall()
Example #55
    def _saveSecurityNews(self, headlineRows, xigniteSecurity):
        """ Store security news in the destination schema specified via
    _NEWS_SCHEMA member variable.

    :param headlineRows: rows of field values for target security news table
    :type headlineRows: sequence of dicts
    :param dict xigniteSecurity: Security info from xignite API results (e.g.,
      global security news, security bars, etc.)

    :returns: The count of new news rows that were saved; 0 if the news object
      has no headlines.
    """
        destSchema = self._NEWS_SCHEMA

        if not headlineRows:
            return 0

        if self.dryRun:
            g_log.info("%r.process(dryRun=True): security=%s, news=%s", self,
                       xigniteSecurity, headlineRows)
            return 0

        engine = collectorsdb.engineFactory()

        @collectorsdb.retryOnTransientErrors
        def saveNews():
            with engine.begin() as conn:
                # Save headlines
                newsIns = destSchema.insert().prefix_with("IGNORE",
                                                          dialect="mysql")
                return conn.execute(newsIns, headlineRows).rowcount

        try:
            return saveNews()
        except sql.exc.IntegrityError:
            # Most likely foreign key constraint violation against the
            # xignite_security table
            g_log.info("Inserting security row for symbol=%s",
                       xigniteSecurity["Symbol"])
            xignite_agent_utils.insertSecurity(engine, xigniteSecurity)

            # Re-insert news after resolving IntegrityError
            return saveNews()
    def addSecurity(symbol):
      self.addCleanup(_deleteSecurity, symbol)
      xignite_agent_utils.insertSecurity(
        engine=collectorsdb.engineFactory(),
        xigniteSecurity={
          "Symbol": symbol,
          "CIK": "CIK",
          "CUSIP": "CUSIP",
          "ISIN": "ISIN",
          "Valoren": "Valoren",
          "Name": "{sym} Inc.".format(sym=symbol),
          "Market": "Market",
          "MarketIdentificationCode": "mic1",
          "MostLiquidExchange": True,
          "CategoryOrIndustry": "CategoryOrIndustry"
        })

      self.assertTrue(securityExists(symbol),
                      "inserted {symbol} not found".format(symbol=symbol))
  def _saveSecurityNews(self, headlineRows, xigniteSecurity):
    """ Store security news in the destination schema specified via
    _NEWS_SCHEMA member variable.

    :param headlineRows: rows of field values for target security news table
    :type headlineRows: sequence of dicts
    :param dict xigniteSecurity: Security info from xignite API results (e.g.,
      global security news, security bars, etc.)

    :returns: The count of new news rows that were saved; 0 if the news object
      has no headlines.
    """
    destSchema = self._NEWS_SCHEMA

    if not headlineRows:
      return 0

    if self.dryRun:
      g_log.info("%r.process(dryRun=True): security=%s, news=%s", self,
                 xigniteSecurity, headlineRows)
      return 0

    engine = collectorsdb.engineFactory()

    @collectorsdb.retryOnTransientErrors
    def saveNews():
      with engine.begin() as conn:
        # Save headlines
        newsIns = destSchema.insert().prefix_with("IGNORE", dialect="mysql")
        return conn.execute(newsIns, headlineRows).rowcount

    try:
      return saveNews()
    except sql.exc.IntegrityError:
      # Most likely foreign key constraint violation against the
      # xignite_security table
      g_log.info("Inserting security row for symbol=%s",
                 xigniteSecurity["Symbol"])
      xignite_agent_utils.insertSecurity(engine, xigniteSecurity)

      # Re-insert news after resolving IntegrityError
      return saveNews()
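
The except-then-retry shape above is the mirror image of the INSERT IGNORE idiom
used elsewhere on this page: instead of suppressing a constraint violation, it
repairs the missing parent row and replays the insert. A condensed sketch of the
pattern in isolation; the function and parameter names are illustrative:

import sqlalchemy as sql


def insertWithParentRepair(conn, childIns, childRows, repairParent):
  """Execute childIns; on an IntegrityError, create the parent and retry once.

  :param conn: sqlalchemy connection
  :param childIns: child-table Insert statement
  :param childRows: sequence of row dicts for the child table
  :param repairParent: zero-arg callable that inserts the missing parent row
  """
  try:
    return conn.execute(childIns, childRows).rowcount
  except sql.exc.IntegrityError:
    repairParent()
    return conn.execute(childIns, childRows).rowcount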