def updateLastEmittedNonMetricSequence(key, seq):
  """ Update the last emitted non-metric sequence number in the database

  :param str key: caller's key in schema.emittedNonMetricTracker
  :param int seq: sequence of last successfully-emitted non-metric
  """
  update = schema.emittedNonMetricTracker.update(  # pylint: disable=E1120
    ).values(
      last_seq=seq
    ).where(
      (schema.emittedNonMetricTracker.c.key == key)
    )

  result = collectorsdb.engineFactory().execute(update)

  # If update didn't find the key, then insert
  #
  # NOTE: sqlalchemy doesn't support "ON DUPLICATE KEY UPDATE" in its syntactic
  # sugar; see https://bitbucket.org/zzzeek/sqlalchemy/issue/960
  if result.rowcount == 0:
    # The row didn't exist, so create it
    collectorsdb.engineFactory().execute(
      schema.emittedNonMetricTracker.insert()  # pylint: disable=E1120
      .values(key=key, last_seq=seq))
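# The update-then-insert dance above exists because this sqlalchemy version
# offers no "ON DUPLICATE KEY UPDATE" construct. A minimal alternative sketch
# that does the upsert in one round trip with a textual statement; it assumes
# the underlying table is named emitted_non_metric_tracker with (key, last_seq)
# columns, and is illustrative rather than the shipped implementation:
def _upsertLastEmittedNonMetricSequenceSketch(key, seq):
  from sqlalchemy import text  # local import keeps the sketch self-contained

  statement = text(
    "INSERT INTO emitted_non_metric_tracker (`key`, last_seq) "
    "VALUES (:key, :seq) "
    "ON DUPLICATE KEY UPDATE last_seq = :seq")

  # Parameters are bound by name, not interpolated into the SQL string
  collectorsdb.engineFactory().execute(statement, key=key, seq=seq)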
def establishLastEmittedSampleDatetime(key, aggSec):
  """ Query UTC timestamp of the last emitted sample batch; if one hasn't been
  saved yet, then synthesize one, using negative aggregation period offset from
  current time

  :param str key: caller's key in schema.emittedSampleTracker
  :param int aggSec: aggregation period in seconds

  :returns: (possibly synthesized) UTC timestamp of the last
    successfully-emitted sample batch
  :rtype: datetime.datetime
  """
  lastEmittedTimestamp = queryLastEmittedSampleDatetime(key)
  if lastEmittedTimestamp is not None:
    return lastEmittedTimestamp

  # Start at the present to avoid re-sending metric data that we may have
  # already sent to Taurus.
  lastEmittedTimestamp = (datetime.utcnow().replace(microsecond=0) -
                          timedelta(seconds=aggSec))

  collectorsdb.engineFactory().execute(
    schema.emittedSampleTracker.insert()  # pylint: disable=E1120
    .prefix_with("IGNORE", dialect="mysql")
    .values(key=key, sample_ts=lastEmittedTimestamp))

  # Query again after saving to account for mysql's loss of accuracy
  return queryLastEmittedSampleDatetime(key)
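# A minimal usage sketch of the emitted-sample tracking cycle, assuming a
# hypothetical emitter with a 5-minute aggregation period; the tracker key and
# the elided emission step are placeholders, not taken from the real emitters:
def _emitterCycleSketch():
  aggSec = 300
  key = "my-emitter-key"  # hypothetical tracker key

  # On the first run this synthesizes a starting point one aggregation period
  # in the past; afterwards it returns whatever was last recorded.
  lastEmitted = establishLastEmittedSampleDatetime(key=key, aggSec=aggSec)

  # ... emit all sample batches newer than lastEmitted (not shown) ...
  newLastEmitted = lastEmitted + timedelta(seconds=aggSec)

  # Persist progress so a restart doesn't re-send the same batches
  updateLastEmittedSampleDatetime(key, newLastEmitted)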
def _flagUnknownSymbolAsReported(symbol):
  """ Flag unknown company symbol as reported in database

  :param str symbol: symbol of the company's security (e.g., "AAPL")
  """
  ins = schema.companySymbolFailures.insert(  # pylint: disable=E1120
    ).prefix_with('IGNORE', dialect="mysql").values(symbol=symbol)

  collectorsdb.engineFactory().execute(ins)

  g_log.debug("Saved unknown company symbol=%s", symbol)
def testEngineFactorySingletonPattern(self):
  # Call collectorsdb.engineFactory()
  engine = collectorsdb.engineFactory()

  # Call collectorsdb.engineFactory() again and assert singleton
  engine2 = collectorsdb.engineFactory()
  self.assertIs(engine2, engine)

  # Call collectorsdb.engineFactory() in different process, assert raises
  # AssertionError
  with self.assertRaises(AssertionError):
    multiprocessing.Pool(processes=1).apply(collectorsdb.engineFactory)
def _saveScreenNameFailure(unmappedScreenName):
  """ Save unmapped twitter handle in database

  :param unmappedScreenName: the twitter handle that is no longer valid
  :type unmappedScreenName: string
  """
  ins = (collectorsdb.schema  # pylint: disable=E1120
         .twitterHandleFailures.insert()
         .prefix_with('IGNORE', dialect="mysql")
         .values(handle=unmappedScreenName))

  collectorsdb.engineFactory().execute(ins)

  g_log.info("Saved unmapped twitter handle; handle=%s", unmappedScreenName)
def testEmittedSampleDatetime(self):
  key = "bogus-test-key"

  # Establish initial sample datetime
  result = metric_utils.establishLastEmittedSampleDatetime(key, 300)

  # Cleanup
  self.addCleanup(
    collectorsdb.engineFactory().execute,
    schema.emittedSampleTracker.delete().where(  # pylint: disable=E1120
      (schema.emittedSampleTracker.c.key == key)))

  self.assertIsInstance(result, datetime)

  # Update latest emitted sample datetime to now
  now = datetime.utcnow().replace(microsecond=0)
  metric_utils.updateLastEmittedSampleDatetime(key, now)

  # Verify that it was updated
  lastEmittedSample = metric_utils.queryLastEmittedSampleDatetime(key)
  self.assertEqual(now, lastEmittedSample)
  self.assertLess(result, lastEmittedSample)
def _purgeStaleDeletionRecords(limit):
  """ Delete stale rows in schema.twitterDeletion table

  :param limit: max records to purge per call

  :returns: a sequence of tweet_uid's of deleted schema.twitterDeletion rows
  """
  twitterDeletionSchema = collectorsdb.schema.twitterDeletion

  # NOTE: we first query the row id's to delete, so we can return them for
  # accountability and debugging
  rowsToDeleteSel = sqlalchemy.select(
    [twitterDeletionSchema.c.tweet_uid]).where(
      twitterDeletionSchema.c.created_at <
      sqlalchemy.func.date_sub(
        sqlalchemy.func.current_timestamp(),
        sqlalchemy.text("INTERVAL %i DAY" % (_DELETION_ROW_EXPIRY_DAYS,)))
    ).limit(limit)

  numDeleted = 0

  with collectorsdb.engineFactory().begin() as conn:
    rowIdsToDelete = tuple(
      str(row[0]) for row in conn.execute(rowsToDeleteSel).fetchall())

    if rowIdsToDelete:
      deletion = twitterDeletionSchema.delete(  # pylint: disable=E1120
        ).where(twitterDeletionSchema.c.tweet_uid.in_(rowIdsToDelete))

      numDeleted = conn.execute(deletion).rowcount

  if len(rowIdsToDelete) != numDeleted:
    g_log.error("Expected to delete %d tweet deletion request rows, but "
                "actually deleted %d rows", len(rowIdsToDelete), numDeleted)

  return rowIdsToDelete
def main():
  parser = argparse.ArgumentParser()
  parser.add_argument("--symbol", required=True)

  args = parser.parse_args()

  expectedAnswer = "Yes-%s" % (random.randint(1, 30),)

  with collectorsdb.engineFactory().begin() as conn:
    answer = raw_input(
      "Attention! You are about to reset the emitted status for the \"{}\""
      " stock symbol at {}.\n"
      "\n"
      "To back out immediately without making any changes, feel free to type "
      "anything but \"{}\" in the prompt below, and press return.\n"
      "\n"
      "Are you sure you want to continue? ".format(args.symbol,
                                                   str(conn.engine),
                                                   str(expectedAnswer)))

    if answer.strip() != expectedAnswer:
      print "Aborting - Wise choice, my friend. Bye."
      return 1

    deleteFromEmitted(conn, schema.emittedStockPrice, args.symbol)
    deleteFromEmitted(conn, schema.emittedStockVolume, args.symbol)
def _purgeTweetsSlatedForDeletion(limit):
  """ Purge tweets that are slated for deletion as indicated by entries in the
  schema.twitterDeletion table

  :param limit: max records to purge per call

  :returns: a sequence of id's of deleted tweets
  """
  twitterTweetsSchema = collectorsdb.schema.twitterTweets
  twitterDeletionSchema = collectorsdb.schema.twitterDeletion

  # NOTE: we first query the row id's to delete, so we can return them for
  # accountability and debugging
  rowsToDeleteSel = sqlalchemy.select([twitterTweetsSchema.c.uid]).where(
    twitterTweetsSchema.c.uid.in_(
      sqlalchemy.select([twitterDeletionSchema.c.tweet_uid]))).limit(limit)

  numDeleted = 0

  with collectorsdb.engineFactory().begin() as conn:
    rowIdsToDelete = tuple(
      str(row[0]) for row in conn.execute(rowsToDeleteSel).fetchall())

    if rowIdsToDelete:
      tweetDeletion = twitterTweetsSchema.delete(  # pylint: disable=E1120
        ).where(twitterTweetsSchema.c.uid.in_(rowIdsToDelete))

      numDeleted = conn.execute(tweetDeletion).rowcount

  if len(rowIdsToDelete) != numDeleted:
    g_log.error("Expected to delete %d tweets, but actually deleted %d tweets",
                len(rowIdsToDelete), numDeleted)

  return rowIdsToDelete
def _deleteSecurity(symbol):
  """Delete security from xignite_security table"""
  with collectorsdb.engineFactory().begin() as conn:
    conn.execute(
      schema.xigniteSecurity  # pylint: disable=E1120
      .delete()
      .where(schema.xigniteSecurity.c.symbol == symbol))
def updateLastEmittedSampleDatetime(key, sampleDatetime):
  """ Update the last emitted sample timestamp value in the database for the
  Tweet Volume metrics

  :param str key: caller's key in schema.emittedSampleTracker
  :param datetime sampleDatetime: UTC datetime of last successfully-emitted
    sample batch
  """
  update = schema.emittedSampleTracker.update(  # pylint: disable=E1120
    ).values(
      sample_ts=sampleDatetime
    ).where(
      (schema.emittedSampleTracker.c.key == key)
    )

  collectorsdb.engineFactory().execute(update)
def testPurgeOldTweets(self):
  gcThresholdDays = 90

  now = datetime.utcnow()

  oldRows = [
    dict(uid=uuid.uuid1().hex,
         created_at=now - timedelta(days=gcThresholdDays + 1),
         retweet=False,
         lang="en-us"),
    dict(uid=uuid.uuid1().hex,
         created_at=now - timedelta(days=gcThresholdDays + 2),
         retweet=False,
         lang="en-us"),
  ]

  youngRows = [
    dict(uid=uuid.uuid1().hex,
         created_at=now,
         retweet=False,
         lang="en-us"),
    dict(uid=uuid.uuid1().hex,
         created_at=now - timedelta(days=gcThresholdDays - 1),
         retweet=False,
         lang="en-us"),
    dict(uid=uuid.uuid1().hex,
         created_at=now - timedelta(days=gcThresholdDays - 2),
         retweet=False,
         lang="en-us"),
  ]

  allRows = oldRows + youngRows

  # Patch collectorsdb config to use a temporary database
  with collectorsdb_test_utils.ManagedTempRepository("purgetweets"):
    engine = collectorsdb.engineFactory()

    numInserted = engine.execute(
      schema.twitterTweets.insert(),  # pylint: disable=E1120
      allRows).rowcount

    self.assertEqual(numInserted, len(allRows))

    # Execute
    numDeleted = purge_old_tweets.purgeOldTweets(gcThresholdDays)

    # Verify
    self.assertEqual(numDeleted, len(oldRows))

    # Verify that only the old tweets got purged
    remainingRows = engine.execute(
      sql.select([schema.twitterTweets.c.uid])).fetchall()

    self.assertEqual(len(remainingRows), len(youngRows))
    self.assertItemsEqual(
      [row["uid"] for row in youngRows],
      [row.uid for row in remainingRows])  # pylint: disable=E1101
def testTransientErrorRetryDecorator(self):
  # Setup proxy. We'll patch config later, so we need to cache the values
  # so that the original proxy may be restarted with the original params
  config = collectorsdb.CollectorsDbConfig()

  originalHost = config.get("repository", "host")
  originalPort = config.getint("repository", "port")

  def _startProxy():
    p = startProxy(originalHost, originalPort, 6033)
    p.next()
    return p

  proxy = _startProxy()
  self.addCleanup(proxy.send, "kill")

  # Patch collectorsdb config with local proxy
  with ConfigAttributePatch(
      config.CONFIG_NAME,
      config.baseConfigDir,
      (("repository", "host", "127.0.0.1"),
       ("repository", "port", "6033"))):

    # Force refresh of engine singleton
    collectorsdb.resetEngineSingleton()
    engine = collectorsdb.engineFactory()

    # First, make sure valid query returns expected results
    res = collectorsdb.retryOnTransientErrors(engine.execute)("select 1")
    self.assertEqual(res.scalar(), 1)

    @collectorsdb.retryOnTransientErrors
    def _killProxyTryRestartProxyAndTryAgain(n=[]):  # pylint: disable=W0102
      if not n:
        # Kill the proxy on first attempt
        proxy.send("kill")
        proxy.next()
        try:
          engine.execute("select 1")
          self.fail("Proxy did not terminate as expected...")
        except sqlalchemy.exc.OperationalError:
          pass
        n.append(None)
      elif len(n) == 1:
        # Restore proxy in second attempt
        newProxy = _startProxy()
        self.addCleanup(newProxy.send, "kill")
        n.append(None)

      res = engine.execute("select 2")

      return res

    # Try again w/ retry decorator
    result = _killProxyTryRestartProxyAndTryAgain()

    # Verify that the expected value is eventually returned
    self.assertEqual(result.scalar(), 2)
def queryLastEmittedSampleDatetime(key):
  """
  :param str key: caller's key in schema.emittedSampleTracker

  :returns: UTC timestamp of the last successfully-emitted sample batch; None
    if one hasn't been set up yet; see establishLastEmittedSampleDatetime
  :rtype: datetime.datetime if not None
  """
  sel = sql.select([schema.emittedSampleTracker.c.sample_ts]).where(
    schema.emittedSampleTracker.c.key == key)

  return collectorsdb.engineFactory().execute(sel).scalar()
def queryLastEmittedNonMetricSequence(key):
  """
  :param str key: caller's key in schema.emittedNonMetricTracker

  :returns: last emitted sequence number for non-metric source; None if one
    hasn't been saved yet.
  :rtype: int if not None
  """
  sel = sql.select([schema.emittedNonMetricTracker.c.last_seq]).where(
    schema.emittedNonMetricTracker.c.key == key)

  return collectorsdb.engineFactory().execute(sel).scalar()
def _clearUnknownSymbols():
  """ Remove all rows from the company_symbol_failures table.
  """
  result = collectorsdb.engineFactory().execute(
    schema.companySymbolFailures.delete())  # pylint: disable=E1120

  if result.rowcount:
    g_log.info("Deleted %s rows from %s table",
               result.rowcount, schema.companySymbolFailures)
def _queryCachedCompanySymbols():
  """Get the cached security symbols from the xignite_security table

  :returns: A sequence of stock symbols from the xignite_security table
  :rtype: sequence
  """
  engine = collectorsdb.engineFactory()

  return tuple(
    row.symbol for row in
    engine.execute(sql.select([schema.xigniteSecurity.c.symbol])).fetchall())
def _deleteScreenNameFailures():
  """ Clear rows from the twitter_handle_failures table.
  """
  result = collectorsdb.engineFactory().execute(
    collectorsdb.schema.twitterHandleFailures.delete())  # pylint: disable=E1120

  if result.rowcount:
    g_log.info("Deleted %s rows from %s table",
               result.rowcount, collectorsdb.schema.twitterHandleFailures)
def securityExists(symbol):
  # Nested test helper: `self` is captured from the enclosing test method's
  # scope
  security = collectorsdb.engineFactory().execute(
    sql.select([schema.xigniteSecurity.c.symbol])
    .where(schema.xigniteSecurity.c.symbol == symbol)).scalar()

  if security is not None:
    self.assertEqual(security, symbol)
    return True

  return False
def _unknownSymbolReported(symbol):
  """ Check if a specific company symbol already exists in the
  company_symbol_failures table.

  :param str symbol: symbol of the company's security (e.g., "AAPL")

  :returns: True, if symbol is already in the table. False, otherwise
  :rtype: bool
  """
  sel = schema.companySymbolFailures.select().where(
    schema.companySymbolFailures.c.symbol == symbol)

  rows = collectorsdb.engineFactory().execute(sel).fetchall()

  return len(rows) > 0
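# A minimal sketch of how the failure-tracking helpers above combine into a
# report-once pattern: check the table, alert only on the first sighting, then
# flag the symbol so subsequent polls stay quiet. The _notifyOperators() call
# is a hypothetical stand-in for whatever alerting the real caller performs:
def _reportUnknownSymbolOnceSketch(symbol):
  if not _unknownSymbolReported(symbol):
    _notifyOperators("Unknown company symbol=%s" % (symbol,))  # hypothetical
    _flagUnknownSymbolAsReported(symbol)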
def testEngineFactorySingletonPattern(self, sqlalchemyMock):
  # Explicitly spec out sqlalchemy.create_engine()
  firstCall = Mock(spec_set=sqlalchemy.engine.base.Engine)
  sqlalchemyMock.create_engine.side_effect = [firstCall]

  # Call collectorsdb.engineFactory()
  engine = collectorsdb.engineFactory()
  self.assertIs(engine, firstCall)

  # Call collectorsdb.engineFactory() again and assert singleton
  engine2 = collectorsdb.engineFactory()
  self.assertIs(engine2, firstCall)
  self.assertEqual(sqlalchemyMock.create_engine.call_count, 1)

  # Call collectorsdb.engineFactory() in different process, assert raises
  # AssertionError
  with patch("taurus_metric_collectors.collectorsdb.os.getpid",
             return_value=collectorsdb._EngineSingleton._pid + 1,
             autospec=True):
    with self.assertRaises(AssertionError):
      collectorsdb.engineFactory()
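# The per-process assertion exercised above comes from collectorsdb's engine
# singleton, which refuses to hand an engine created in a parent process to a
# forked child (pooled connections must not be shared across fork). A rough
# sketch of such a pid-guarded singleton, assuming only that _EngineSingleton
# records the creating pid as the test implies; this is illustrative, not the
# actual implementation:
import os

class _EngineSingletonSketch(object):
  _pid = None     # pid of the process that created the engine
  _engine = None  # cached sqlalchemy engine

  @classmethod
  def get(cls, dbUrl):
    if cls._engine is None:
      cls._engine = sqlalchemy.create_engine(dbUrl)
      cls._pid = os.getpid()

    assert cls._pid == os.getpid(), (
      "engineFactory must not be called in a child process after the engine "
      "was created in the parent")

    return cls._engine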
def testEmittedNonMetricSequence(self):
  key = "bogus-test-key"

  metric_utils.updateLastEmittedNonMetricSequence(key, 1)

  # Cleanup
  self.addCleanup(
    collectorsdb.engineFactory().execute,
    schema.emittedNonMetricTracker.delete().where(  # pylint: disable=E1120
      (schema.emittedNonMetricTracker.c.key == key)))

  lastEmittedSample = metric_utils.queryLastEmittedNonMetricSequence(key)

  self.assertEqual(1, lastEmittedSample)
def _screenNameFailureReported(screenName):
  """ Check if a specific twitter handle already exists in the
  twitter_handle_failures table.

  :param screenName: twitter handle
  :type screenName: string

  :returns: True, if twitter handle is already in the table. False, otherwise
  :rtype: Boolean
  """
  table = collectorsdb.schema.twitterHandleFailures

  sel = (table.select().where(table.c.handle == screenName))
  rows = collectorsdb.engineFactory().execute(sel)

  return rows.rowcount != 0
def addSecurity(symbol):
  self.addCleanup(_deleteSecurity, symbol)

  xignite_agent_utils.insertSecurity(
    engine=collectorsdb.engineFactory(),
    xigniteSecurity={
      "Symbol": symbol,
      "CIK": "CIK",
      "CUSIP": "CUSIP",
      "ISIN": "ISIN",
      "Valoren": "Valoren",
      "Name": "{sym} Inc.".format(sym=symbol),
      "Market": "Market",
      "MarketIdentificationCode": "mic1",
      "MostLiquidExchange": True,
      "CategoryOrIndustry": "CategoryOrIndustry"
    })

  self.assertTrue(securityExists(symbol),
                  "inserted {symbol} not found".format(symbol=symbol))
def poll(metricSpecs, apitoken, barlength, days):
  """ Poll XIgnite data for given metricspecs associated with the same symbol,
  returning only new data relative to previously fetched data

  :param metricSpecs: Sequence of one or more StockMetricSpec objects
    associated with the same stock symbol for which to conduct polling
  :param apitoken: XIgnite API Token
  :param barlength: Aggregation time period (in minutes)
  :param days: Number of days to request
  :type days: int

  :returns: security details (dict), and new data as a sequence of dicts
  :rtype: 2-tuple
  """
  try:
    symbol = metricSpecs[0].symbol

    now = datetime.datetime.now(_UTC_TZ)  # Now, in UTC time

    # Align and pad end time to prevent too recent of a bucket from being
    # returned
    now -= datetime.timedelta(minutes=(barlength + now.minute % barlength),
                              seconds=now.second)

    engine = collectorsdb.engineFactory()

    lastSample = _getLatestSample(engine, symbol)
    if lastSample:
      localizedLastEndTime = (
        getEasternLocalizedEndTimestampFromSampleRow(lastSample))
    else:
      # Need to bootstrap from existing file-based .history/ approach
      symbolFilename = getSymbolFilename(symbol)
      if os.path.isfile(symbolFilename):
        # TODO: TAUR-779 Remove this case once we've successfully migrated
        # away from file-based approach.
        with open(symbolFilename, "r+") as symbolFile:
          try:
            # Seek to end of file for latest sample
            lastline = StringIO(symbolFile.readlines()[-1])
            csvin = csv.reader(lastline)
            lastSample = dict(zip(_COLS, next(csvin)))
            localizedLastEndTime = (
              getEasternLocalizedTimestampFromSample(lastSample["EndDate"],
                                                     lastSample["EndTime"],
                                                     lastSample["UTCOffset"]))
          except IndexError:
            # File is empty
            lastSample = {}
            localizedLastEndTime = (
              (now - datetime.timedelta(days=days)).astimezone(_EASTERN_TZ))
      else:
        localizedLastEndTime = (
          (now - datetime.timedelta(days=days)).astimezone(_EASTERN_TZ))

    # Set start time to match last end, and end to be now
    # Use Eastern because that's what XIgnite assumes in the API
    localizedStartTime = localizedLastEndTime
    localizedEndTime = now.astimezone(_EASTERN_TZ)

    # Fetch XIgnite data
    fields = ["Outcome", "Message", "Identity", "Delay",
              "Security", "Security.CIK", "Security.CUSIP", "Security.Symbol",
              "Security.ISIN", "Security.Valoren", "Security.Name",
              "Security.Market", "Security.MarketIdentificationCode",
              "Security.MostLiquidExchange", "Security.CategoryOrIndustry",
              "Bars", "Bars.StartDate", "Bars.StartTime", "Bars.EndDate",
              "Bars.EndTime", "Bars.UTCOffset", "Bars.Open", "Bars.High",
              "Bars.Low", "Bars.Trades"]

    for spec in metricSpecs:
      fields.append("Bars.%s" % (spec.sampleKey,))

    try:
      data = getData(symbol=symbol,
                     apitoken=apitoken,
                     barlength=barlength,
                     startTime=localizedStartTime.strftime(DATE_FMT),
                     endTime=localizedEndTime.strftime(DATE_FMT),
                     fields=fields)
    except Exception as e:
      _LOG.exception("Unexpected error while retrieving data from XIgnite.")
      return {"Symbol": symbol}, []

    if (data and "Bars" in data and "Outcome" in data and
        data["Outcome"] == "Success"):
      # Return only the new data

      def sampleStartIsGreaterThanOrEqualToLastEndTime(sample):
        # Compare w/ consistent timezones.
        return (
          getEasternLocalizedTimestampFromSample(sample["StartDate"],
                                                 sample["StartTime"],
                                                 sample["UTCOffset"]) >=
          localizedLastEndTime)

      return (data["Security"],
              [sample for sample in data["Bars"]
               if (not lastSample or
                   sampleStartIsGreaterThanOrEqualToLastEndTime(sample))])
    else:
      return {"Symbol": symbol}, []
  except Exception:
    _LOG.exception("poll failed for metricSpecs=%s", metricSpecs)
    raise
def testCollectorsdbIsAccessible(self):  # pylint: disable=R0201
  transactionContext = collectorsdb.engineFactory().begin()
  transactionContext.transaction.rollback()
def purgeOldTweets(thresholdDays):
  """ Purge tweets from twitter_tweets table that are older than the given
  number of days.

  :param int thresholdDays: tweets older than this many days will be deleted

  :returns: number of rows that were deleted
  """
  twitterTweetsSchema = collectorsdb.schema.twitterTweets

  g_log.info("Estimating number of tweets in table=%s older than numDays=%s",
             twitterTweetsSchema, thresholdDays)

  sqlEngine = collectorsdb.engineFactory()

  selectionPredicate = (
    twitterTweetsSchema.c.created_at <
    sql.func.date_sub(sql.func.utc_timestamp(),
                      sql.text("INTERVAL {:d} DAY".format(thresholdDays)))
  )

  estimate = _estimateNumTweetsToDelete(sqlEngine, selectionPredicate)

  g_log.info("Number of candidate old tweets to purge: estimate=%s", estimate)

  if estimate == 0:
    return 0

  # NOTE: We'll be deleting in smaller batches to avoid "Lock wait timeout
  # exceeded".
  #
  # When the number of old rows is huge, if we try to delete them in a single
  # transaction, we get perpetually mired in the error "Lock wait timeout
  # exceeded; try restarting transaction". mysql/innodb provides the setting
  # `innodb_lock_wait_timeout` that may be overridden, but there isn't a good
  # way to estimate a value that guarantees success. Doing it in one
  # transaction also doesn't facilitate progress updates, thus creating the
  # perception that the operation is "stuck".
  totalDeleted = 0
  while totalDeleted < estimate:
    # NOTE: we're dealing with a couple of issues here:
    #
    # 1. sqlalchemy core doesn't support LIMIT in delete statements, so we
    #    can't use delete directly with LIMIT and ORDER BY
    # 2. MySql (5.6.21) doesn't support LIMIT & IN subqueries: "This version of
    #    MySQL doesn't yet support 'LIMIT & IN/ALL/ANY/SOME subquery'"
    #
    # So, we're going to stick with sqlalchemy, and break the operation into
    # two queries: get the candidate uids, then delete rows with those uids
    limit = min(_MAX_DELETE_BATCH_SIZE, estimate - totalDeleted)

    uids = _queryCandidateRows(sqlEngine=sqlEngine,
                               selectionPredicate=selectionPredicate,
                               limit=limit)

    if uids:
      numDeleted = _deleteRows(sqlEngine=sqlEngine, uids=uids)
    else:
      # This could happen if something else deleted tweets in our range, such
      # as the process_tweet_deletions service that services deletion requests
      # from Twitter.
      break

    totalDeleted += numDeleted

    g_log.info("Purged %s old tweets [%s of %s]",
               numDeleted, totalDeleted, estimate)

  g_log.info("Purged numRows=%s of estimated=%s old tweets from table=%s",
             totalDeleted, estimate, twitterTweetsSchema)

  return totalDeleted
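# The two helpers that purgeOldTweets relies on are not shown in this file. A
# minimal sketch of what they plausibly look like, reconstructed from their
# call sites above: names and signatures come from the calls, the bodies are
# assumptions rather than the shipped implementation.
def _queryCandidateRowsSketch(sqlEngine, selectionPredicate, limit):
  """Return up to `limit` uids of twitter_tweets rows matching the predicate."""
  twitterTweetsSchema = collectorsdb.schema.twitterTweets

  sel = (sql.select([twitterTweetsSchema.c.uid])
         .where(selectionPredicate)
         .limit(limit))

  return tuple(row[0] for row in sqlEngine.execute(sel).fetchall())


def _deleteRowsSketch(sqlEngine, uids):
  """Delete twitter_tweets rows with the given uids; return number deleted."""
  twitterTweetsSchema = collectorsdb.schema.twitterTweets

  deletion = twitterTweetsSchema.delete().where(  # pylint: disable=E1120
    twitterTweetsSchema.c.uid.in_(uids))

  return sqlEngine.execute(deletion).rowcount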
def migrate(metrics=None):
  """ Migrate _BACKLOG_DAYS worth of previously-collected tweets to the Taurus
  non-metric-data RabbitMQ exchange.

  :param metrics: optional sequence of metric names; if specified (not None),
    the migration will be limited to tweets corresponding to the given metric
    names.
  """
  startingTimestamp = datetime.utcnow() - timedelta(days=_BACKLOG_DAYS)

  sqlEngine = collectorsdb.engineFactory()

  # Retrieve the first twitter samples sequence number in the desired range
  # select min(seq) from twitter_tweet_samples where agg_ts >= startingTimestamp
  @collectorsdb.retryOnTransientErrors
  def queryMigrationRange():
    sel = (sql.select([sql.func.count(),
                       sql.func.min(schema.twitterTweetSamples.c.seq),
                       sql.func.max(schema.twitterTweetSamples.c.seq)])
           .where(schema.twitterTweetSamples.c.agg_ts >= startingTimestamp))

    if metrics is not None:
      sel = sel.where(schema.twitterTweetSamples.c.metric.in_(metrics))

    return sqlEngine.execute(sel).first()

  totalNumItems, minSeq, maxSeq = queryMigrationRange()

  if totalNumItems == 0:
    g_log.info("Nothing forwarded: no tweet samples found since %s UTC",
               startingTimestamp.isoformat())
    return

  if metrics is None:
    g_log.info(
      "Starting migration of tweets from %s UTC; totalNumItems=%d; "
      "minSeq=%s, maxSeq=%s",
      startingTimestamp.isoformat(), totalNumItems, minSeq, maxSeq)
  else:
    g_log.info(
      "Starting migration of tweets from %s UTC; totalNumItems=%d; "
      "minSeq=%s, maxSeq=%s; metrics=%s",
      startingTimestamp.isoformat(), totalNumItems, minSeq, maxSeq, metrics)

  # Retrieve and publish batches
  totalNumPublished = 0
  totalNumBatches = 0
  batchMinSeq = minSeq

  with MessageBusConnector() as messageBus:
    while True:
      batchEndSeq, batch = TweetForwarder.queryNonMetricTweetBatch(
        sqlEngine=sqlEngine,
        minSeq=batchMinSeq,
        maxItems=200,
        maxSeq=maxSeq,
        metrics=metrics)

      if batchEndSeq is None:
        break

      TweetForwarder.publishNonMetricTweetBatch(messageBus=messageBus,
                                                batch=batch)

      totalNumPublished += len(batch)
      totalNumBatches += 1

      g_log.debug(
        "Published numItems=%d; batchMinSeq=%s; batchEndSeq=%s "
        "(%d of %d: %s%%)",
        len(batch), batchMinSeq, batchEndSeq, totalNumPublished, totalNumItems,
        int(float(totalNumPublished) / totalNumItems * 100))

      if (totalNumBatches % 250) == 0 or totalNumPublished == totalNumItems:
        # Progress report
        g_log.info("Published %d of %d: %s%%",
                   totalNumPublished, totalNumItems,
                   int(float(totalNumPublished) / totalNumItems * 100))

      # Prepare for next query
      batchMinSeq = batchEndSeq + 1

  g_log.info(
    "Done publishing! publishedBatches=%d, publishedItems=%d, "
    "expectedItems=%d; minSeq=%s, maxSeq=%s",
    totalNumBatches, totalNumPublished, totalNumItems, minSeq, maxSeq)
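# migrate() pages through twitter_tweet_samples by sequence number rather than
# by OFFSET, which keeps each query cheap on a large table and lets the loop
# resume at batchEndSeq + 1. A sketch of the kind of query that
# TweetForwarder.queryNonMetricTweetBatch presumably issues, reconstructed
# only from its call site above; signature from the call site, body is an
# assumption:
def _queryNonMetricTweetBatchSketch(sqlEngine, minSeq, maxItems, maxSeq,
                                    metrics=None):
  samples = schema.twitterTweetSamples

  sel = (sql.select([samples])
         .where(samples.c.seq >= minSeq)
         .where(samples.c.seq <= maxSeq)
         .order_by(samples.c.seq.asc())
         .limit(maxItems))

  if metrics is not None:
    sel = sel.where(samples.c.metric.in_(metrics))

  rows = sqlEngine.execute(sel).fetchall()
  if not rows:
    return None, []

  # Last sequence number in this batch; the caller resumes at batchEndSeq + 1
  return rows[-1].seq, rows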
def deleteCompanies(tickerSymbols,
                    engineServer,
                    engineApiKey,
                    warnAboutDestructiveAction=True,
                    warningTimeout=_DEFAULT_WARNING_PROMPT_TIMEOUT_SEC):
  """Delete companies from Taurus Collector and their metrics/models from
  Taurus Engine.

  :param sequence tickerSymbols: stock ticker symbols of companies to be
    deleted

  :param str engineServer: DNS name or IP address of Taurus API server

  :param str engineApiKey: API Key of Taurus HTM Engine

  :param bool warnAboutDestructiveAction: whether to warn about destructive
    action; defaults to True.

  :param float warningTimeout: Timeout for the warning prompt; ignored if
    warnAboutDestructiveAction is False

  :raises WarningPromptTimeout: if warning prompt timed out
  :raises UserAbortedOperation: if user chose to abort the operation
  :raises FlusherMetricNotFound:
  """
  tickerSymbols = tuple(symbol.upper() for symbol in tickerSymbols)

  # Check for duplicate symbols
  repeatedSymbols = set(sym for sym in tickerSymbols
                        if tickerSymbols.count(sym) > 1)
  if repeatedSymbols:
    raise ValueError("{numRepeats} symbol(s) are present more than once in "
                     "tickerSymbols arg: {repeats}"
                     .format(numRepeats=len(repeatedSymbols),
                             repeats=repeatedSymbols))

  # Set will be handier going forward
  tickerSymbols = set(tickerSymbols)

  if warnAboutDestructiveAction:
    _warnAboutDestructiveAction(timeout=warningTimeout,
                                tickerSymbols=tickerSymbols,
                                engineServer=engineServer)

  # If any of the ticker symbols still appear in the collector's metrics
  # config, abort the operation as a precautionary measure.
  allSymbols = set(security[0].upper() for security in
                   metric_utils.getAllMetricSecurities())

  problemSymbols = tickerSymbols & allSymbols

  assert not problemSymbols, (
    "Can't delete - {numProblem} of the specified companies [{symbols}] are "
    "in active metrics configuration".format(numProblem=len(problemSymbols),
                                             symbols=problemSymbols))

  # First, we need to synchronize with Taurus Engine's metric data path.
  # If any of the data still in the pipeline is for any of the companies being
  # deleted, then the metrics may be re-created in the Engine after we delete
  # them. This is a yet-unresolved subtlety with custom metrics in htmengine.
  _flushTaurusEngineMetricDataPath(engineServer, engineApiKey)

  # NOTE: We must query custom metrics after flushing the metric data path,
  # since metrics may get created as a side-effect of processing metric data.
  allMetricsMap = {
    obj["name"]: obj
    for obj in metric_utils.getAllCustomMetrics(host=engineServer,
                                                apiKey=engineApiKey)
  }
  allMetricNames = allMetricsMap.keys()

  for symbolNum, symbol in enumerate(tickerSymbols, 1):
    # Delete corresponding metrics from Taurus Engine
    metricNamesToDelete = metric_utils.filterCompanyMetricNamesBySymbol(
      allMetricNames, symbol)

    if not metricNamesToDelete:
      g_log.info("No metrics to delete for symbol=%s (%d of %d)",
                 symbol, symbolNum, len(tickerSymbols))
      continue

    g_log.info("Deleting metrics and models for ticker symbol=%s from Taurus "
               "Engine=%s (%d of %d)",
               symbol, engineServer, symbolNum, len(tickerSymbols))

    for metricName in metricNamesToDelete:
      metric_utils.deleteMetric(host=engineServer,
                                apiKey=engineApiKey,
                                metricName=metricName)
      g_log.info("Deleted metric name=%s, uid=%s",
                 metricName, allMetricsMap[metricName]["uid"])

    # Delete the symbol from xignite_security table last; this cascades to
    # delete related rows in other tables via cascading delete relationship.
    #
    # NOTE: garbage collection from other tables not tied to xignite_security
    # symbols presently depends on aging of the rows (e.g., twitter tables).
    # After ENG-83, all company-specific rows from all tables will be
    # cleaned up and THIS NOTE SHOULD THEN BE REMOVED
    with collectorsdb.engineFactory().begin() as conn:
      numDeleted = (
        conn.execute(
          collectorsdb.schema.xigniteSecurity  # pylint: disable=E1120
          .delete()
          .where(collectorsdb.schema.xigniteSecurity.c.symbol == symbol))
        ).rowcount

      if numDeleted:
        g_log.info("Deleted row=%s from table=%s",
                   symbol, collectorsdb.schema.xigniteSecurity)
      else:
        g_log.warning(
          "Couldn't delete security row=%s: not found in table=%s",
          symbol, collectorsdb.schema.xigniteSecurity)
def forward(metricSpecs, data, security, server=DEFAULT_SERVER,
            port=DEFAULT_PORT, dryrun=DEFAULT_DRYRUN):
  """ Forward stock data to HTM-IT/Taurus instance via custom metric

  :param metricSpecs: Sequence of one or more StockMetricSpec objects
    associated with the same stock symbol for which polling was conducted
  :param list data: List of sample dicts
  :param dict security: Details of security from XIgnite API
  """
  try:
    symbol = security["Symbol"]

    engine = collectorsdb.engineFactory()

    lastSample = _getLatestSample(engine, symbol)
    if lastSample:
      localizedLastEndTime = (
        getEasternLocalizedEndTimestampFromSampleRow(lastSample))
    else:
      localizedLastEndTime = None

    # Implemented in two phases:
    #
    # 1. Buffer records to collectorsdb
    for sample in data:
      localizedSampleStartTime = (
        getEasternLocalizedTimestampFromSample(sample["StartDate"],
                                               sample["StartTime"],
                                               sample["UTCOffset"]))

      if localizedSampleStartTime.time() < NAIVE_MARKET_OPEN_TIME:
        # Ignore samples that precede market open
        _LOG.info("Skipping data before market hours: %s @ %s sample=%s",
                  symbol, localizedSampleStartTime, sample)
        continue

      if localizedSampleStartTime.time() >= NAIVE_MARKET_CLOSE_TIME:
        # Ignore a quirk of the xignite API that duplicates some data at
        # end of trading day. This also excludes the closing auction on
        # NYSE.
        _LOG.info("Skipping data after market hours: %s @ %s sample=%s",
                  symbol, localizedSampleStartTime, sample)
        continue

      if not lastSample or (localizedSampleStartTime >= localizedLastEndTime):
        # Current sample starts at, or after last recorded timestamp ends
        localizedSampleEndTime = (
          getEasternLocalizedTimestampFromSample(sample["EndDate"],
                                                 sample["EndTime"],
                                                 sample["UTCOffset"]))

        ins = (xigniteSecurityBars
               .insert()
               .values(symbol=symbol,
                       StartDate=localizedSampleStartTime.date(),
                       StartTime=localizedSampleStartTime.time(),
                       EndDate=localizedSampleEndTime.date(),
                       EndTime=localizedSampleEndTime.time(),
                       UTCOffset=sample["UTCOffset"],
                       Open=sample["Open"],
                       High=sample["High"],
                       Low=sample["Low"],
                       Close=sample["Close"],
                       Volume=sample["Volume"],
                       Trades=sample["Trades"]))

        @collectorsdb.retryOnTransientErrors
        def _insertBar():
          engine.execute(ins)

        try:
          _insertBar()
        except IntegrityError:
          # Most likely foreign key constraint violation against the
          # xignite_security table
          _LOG.info("Inserting security row for symbol=%s", symbol)
          xignite_agent_utils.insertSecurity(engine, security)

          # Re-insert after resolving IntegrityError
          _insertBar()

    # 2. If in active mode, send ALL un-sent records to Taurus
    if g_opMode != ApplicationConfig.OP_MODE_ACTIVE:
      return

    transmitMetricData(metricSpecs=metricSpecs, symbol=symbol, engine=engine)
  except Exception:
    _LOG.exception("forward failed for metricSpecs=%s", metricSpecs)
    raise
def runQueryWithRetries():
  # Nested helper: `cleanupQuery` is expected to come from the enclosing scope
  return collectorsdb.engineFactory().execute(cleanupQuery).rowcount
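# runQueryWithRetries is presumably defined inline near its caller and wrapped
# with collectorsdb's transient-error retry helper, which appears throughout
# this module. A minimal usage sketch, assuming a hypothetical cleanupQuery
# built elsewhere; this illustrates the decoration pattern rather than the
# actual call site:
def _runCleanupWithRetriesSketch(cleanupQuery):

  @collectorsdb.retryOnTransientErrors
  def runQueryWithRetries():
    return collectorsdb.engineFactory().execute(cleanupQuery).rowcount

  numRowsAffected = runQueryWithRetries()
  g_log.info("Cleanup affected numRows=%s", numRowsAffected)
  return numRowsAffected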