def test_agg_id(self):
    model = Model(
        settings={
            'name': "foo",
            'type': "generic",
            'features': [
                {
                    'name': 'f1',
                    'measurement': 'm',
                    'field': 'f',
                    'metric': 'avg',
                },
                {
                    'name': 'f2',
                    'measurement': 'm',
                    'field': 'f',
                    'metric': 'avg',
                    'match_all': [{'key': 'key', 'value': 'value'}],
                },
                {
                    'name': 'f3',
                    'measurement': 'm',
                    'field': 'f',
                    'metric': 'avg',
                    'match_all': [{'key': 'key', 'value': 'value'}],
                },
                {
                    'name': 'f4',
                    'measurement': 'm',
                    'field': 'f',
                    'metric': 'avg',
                    'match_all': [{'key': 'key', 'value': 'value2'}],
                },
                {
                    'name': 'f5',
                    'field': 'f',
                    'metric': 'avg',
                },
            ],
        })

    agg_ids = [feature.agg_id for feature in model.features]
    self.assertEqual(agg_ids[0], 'm')
    self.assertEqual(agg_ids[1], 'm_ced1b023686195d411caee8450821ff77ed0c5eb')
    self.assertEqual(agg_ids[2], agg_ids[1])
    self.assertEqual(agg_ids[3], 'm_7359bacde7a306a62e35501cc9bb905e6b2c6f72')
    self.assertEqual(agg_ids[4], 'all')
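# The assertions above pin down the observable contract of Feature.agg_id:
# features sharing a measurement and an identical 'match_all' filter collapse
# onto the same aggregation id, a different filter yields a different id, and
# a feature without a measurement falls back to 'all'. The 40-hex-digit
# suffix suggests a SHA-1 digest of the filter spec. Below is a minimal
# sketch of such a scheme; agg_id_sketch, the JSON dump, and sort_keys are
# illustrative assumptions, not the actual Feature.agg_id implementation.
import hashlib
import json


def agg_id_sketch(measurement, match_all=None):
    # Hypothetical: no filter -> the measurement itself, or 'all' when the
    # feature has no measurement either.
    if not match_all:
        return measurement or 'all'
    # Hypothetical: a stable digest of the filter spec, appended to the
    # measurement name, so identical filters map to the same bucket.
    digest = hashlib.sha1(
        json.dumps(match_all, sort_keys=True).encode('utf-8')
    ).hexdigest()
    return '{}_{}'.format(measurement, digest)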
def test_match_all(self):
    model = Model(
        dict(
            name="test-model",
            offset=30,
            span=300,
            bucket_interval=3,
            interval=60,
            features=FEATURES_MATCH_ALL_TAG1,
        ))
    res = self.source.get_times_data(
        bucket_interval=model.bucket_interval,
        features=model.features,
        from_date=self.t0,
        to_date=self.t0 + 9,
    )
    baz_avg = []
    for line in res:
        baz_avg.append(nan_to_none(line[1][0]))

    self.assertEqual(baz_avg, [216.0, None, 18.0])

    model = Model(
        dict(
            name="test-model",
            offset=30,
            span=300,
            bucket_interval=3,
            interval=60,
            features=FEATURES_MATCH_ALL_TAG2,
        ))
    res = self.source.get_times_data(
        bucket_interval=model.bucket_interval,
        features=model.features,
        from_date=self.t0,
        to_date=self.t0 + 9,
    )
    baz_avg = []
    for line in res:
        baz_avg.append(nan_to_none(line[1][0]))

    self.assertEqual(baz_avg, [-216.0, None, -18.0])
def setUp(self):
    db = os.environ.get('MONGODB_DB')
    if not db:
        db = "test-{}".format(int(datetime.datetime.now().timestamp()))

    settings = {
        'name': 'test',
        'addr': os.environ.get('MONGODB_ADDR', "localhost:27017"),
        'database': db,
    }

    username = os.environ.get('MONGODB_USER')
    if username:
        settings['username'] = username
        settings['password'] = os.environ.get('MONGODB_PWD')

    auth_source = os.environ.get('MONGODB_AUTH_SRC')
    if auth_source:
        settings['auth_source'] = auth_source

    self.source = MongoDataSource(settings)

    self.model = Model(
        dict(
            name="test-model",
            offset=30,
            span=300,
            bucket_interval=3600,
            interval=60,
            features=[
                {
                    'name': 'avg_foo',
                    'metric': 'avg',
                    'collection': 'coll',
                    'field': 'foo',
                },
                {
                    'name': 'count_bar',
                    'metric': 'count',
                    'collection': 'coll',
                    'field': 'bar',
                    'default': 0,
                },
            ],
        ))

    self.t0 = datetime.datetime.now(datetime.timezone.utc).replace(
        hour=0,
        minute=0,
        second=0,
        microsecond=0,
    ).timestamp()
@classmethod
def setUpClass(cls):
    cls.prefix = "test-{}".format(datetime.datetime.now().timestamp())
    cls.source = Warp10Bucket({
        'name': 'test',
        'url': os.environ['WARP10_URL'],
        'read_token': os.environ['WARP10_READ_TOKEN'],
        'write_token': os.environ['WARP10_WRITE_TOKEN'],
        'global_prefix': cls.prefix,
    })
    logger = logging.getLogger('warp10client.client')
    logger.setLevel(logging.INFO)
    cls.tag = {'test': cls.prefix}

    cls.model = Model(
        dict(
            name="test-model",
            offset=30,
            span=3,
            bucket_interval=3600,
            interval=60,
            features=[
                {
                    'name': 'avg_foo',
                    'metric': 'avg',
                    'field': 'foo',
                },
                {
                    'name': 'count_bar',
                    'metric': 'count',
                    'field': 'bar',
                    'default': 0,
                },
            ],
        ))

    cls.t0 = datetime.datetime.now(datetime.timezone.utc).replace(
        hour=0,
        minute=0,
        second=0,
        microsecond=0,
    ).timestamp()
def setUp(self):
    bucket_interval = 3

    t0 = int(datetime.datetime.now().timestamp())
    t0 -= t0 % bucket_interval
    self.t0 = t0

    self.source = PrometheusBucket({
        'name': 'test',
        'addr': ADDR,
    })

    self.model = Model(dict(
        name="test-model",
        offset=30,
        span=300,
        bucket_interval=3,
        interval=60,
        features=FEATURES,
    ))
def test_match_all(self):
    model = Model(
        dict(
            name="test-model",
            offset=30,
            span=300,
            bucket_interval=3,
            interval=60,
            features=[
                {
                    'name': 'avg_foo',
                    'metric': 'avg',
                    'collection': 'coll1',
                    'field': 'foo',
                    'match_all': [
                        {'tag': 'tag_1', 'value': 'tag_A'},
                    ],
                },
            ],
        ))

    t0 = self.t0
    data = [
        # (foo, timestamp)
        (33, t0 - 1),  # excluded
        (120, t0 + 1),
        (312, t0 + 2),
        # empty
        (18, t0 + 7),
        (78, t0 + 10),  # excluded
    ]
    for foo, ts in data:
        self.source.insert_times_data(
            collection='coll1',
            ts=ts,
            data={'foo': foo},
        )
        self.source.insert_times_data(
            collection='coll1',
            ts=ts,
            tags={
                'tag_1': 'tag_A',
                'tag_2': 'tag_B',
            },
            data={'foo': foo},
        )
        self.source.insert_times_data(
            collection='coll1',
            ts=ts,
            tags={
                'tag_1': 'tag_B',
                'tag_2': 'tag_C',
            },
            data={'foo': -foo},
        )
    self.source.commit()

    res = self.source.get_times_data(
        model,
        from_date=t0,
        to_date=t0 + 3 * model.bucket_interval,
    )
    foo_avg = []
    for line in res:
        foo_avg.append(line[1][0])

    np.testing.assert_allclose(
        np.array(foo_avg),
        np.array([216.0, np.nan, 18.0]),
        rtol=0,
        atol=0,
    )

    model = Model(
        dict(
            name="test-model",
            offset=30,
            span=300,
            bucket_interval=3,
            interval=60,
            features=[
                {
                    'collection': 'coll1',
                    'name': 'avg_foo',
                    'metric': 'avg',
                    'field': 'foo',
                    'match_all': [
                        {'tag': 'tag_1', 'value': 'tag_B'},
                    ],
                },
            ],
        ))

    res = self.source.get_times_data(
        model,
        from_date=self.t0,
        to_date=self.t0 + 8,
    )
    avg_foo = []
    for line in res:
        avg_foo.append(line[1][0])

    np.testing.assert_allclose(
        np.array(avg_foo),
        np.array([-216.0, np.nan, -18.0]),
        rtol=0,
        atol=0,
    )
class TestElasticBucket(unittest.TestCase):
    def setUp(self):
        bucket_interval = 3

        t0 = int(datetime.datetime.now().timestamp())
        t0 -= t0 % bucket_interval
        self.t0 = t0

        self.index = "test-{}".format(t0)
        self.sink_index = "test-{}-prediction".format(t0)
        logging.info("creating index %s", self.index)

        if os.environ.get('ELASTICSEARCH_ADDR', None) is None:
            # tip: useful tool to query ES AWS remotely:
            # npm install aws-es-curl -g
            settings = dict(
                name='aws',
                type='elasticsearch_aws',
                doc_type='doc',
                host=os.environ['ELASTICSEARCH_HOST'],
                region='eu-west-1',
                get_boto_credentials=False,
                access_key=os.environ['AWS_ACCESS_KEY_ID'],
                secret_key=os.environ['AWS_SECRET_ACCESS_KEY'],
            )
            settings['index'] = self.index
            self.source = loudml.bucket.load_bucket(settings)
            settings = copy.deepcopy(settings)
            settings['index'] = self.sink_index
            self.sink = loudml.bucket.load_bucket(settings)
        else:
            settings = {
                'name': 'test',
                'addr': os.environ['ELASTICSEARCH_ADDR'],
                'index': self.index,
                'doc_type': 'nosetests',
            }
            self.source = ElasticsearchBucket(settings)
            settings = copy.deepcopy(settings)
            settings['index'] = self.sink_index
            self.sink = ElasticsearchBucket(settings)

        data_schema = {
            "foo": {"type": "integer"},
            "bar": {"type": "integer"},
            "baz": {"type": "integer"},
            "tag_kw": {"type": "keyword"},
            "tag_int": {"type": "integer"},
            "tag_bool": {"type": "boolean"},
        }
        self.source.drop()
        self.source.init(data_schema=data_schema)

        self.model = Model(
            dict(
                name='times-model',  # not test-model due to TEMPLATE
                offset=30,
                span=300,
                bucket_interval=bucket_interval,
                interval=60,
                features=FEATURES,
                threshold=30,
            ))

        data = [
            # (foo, bar|baz, timestamp)
            (1, 33, t0 - 1),  # excluded
            (2, 120, t0),
            (3, 312, t0 + 1),
            # empty
            (4, 18, t0 + 7),
            (5, 78, t0 + 9),  # excluded
        ]

        for foo, bar, ts in data:
            self.source.insert_times_data(ts=ts, data={'foo': foo})
            self.source.insert_times_data(ts=ts, data={'bar': bar})
            self.source.insert_times_data(
                ts=ts,
                tags={
                    'tag_kw': 'tag1',
                    'tag_int': 9,
                    'tag_bool': False,
                },
                data={'baz': bar},
            )
            self.source.insert_times_data(
                ts=ts,
                tags={
                    'tag_kw': 'tag2',
                    'tag_int': 7,
                    'tag_bool': True,
                },
                data={'baz': -bar},
            )
        self.source.commit()

        # Let Elasticsearch index the data before querying it
        time.sleep(10)

    def tearDown(self):
        self.sink.drop()
        self.source.drop()

    def test_get_index_name(self):
        ts = 1527156069

        self.assertEqual(self.source.get_index_name(), self.index)
        self.assertEqual(self.source.get_index_name("test"), "test")
        self.assertEqual(
            self.source.get_index_name("test", timestamp=ts), "test")
        self.assertEqual(
            self.source.get_index_name("test-*", timestamp=ts),
            "test-2018.05.24",
        )

    def test_get_times_data(self):
        res = self.source.get_times_data(
            bucket_interval=self.model.bucket_interval,
            features=self.model.features,
            from_date=self.t0,
            to_date=self.t0 + 8,
        )

        foo_avg = []
        for line in res:
            foo_avg.append(line[1][0])

        np.testing.assert_allclose(
            np.array(foo_avg),
            np.array([2.5, np.nan, 4.0]),
            rtol=0,
            atol=0,
        )

    def test_save_timeseries_prediction(self):
        now_ts = datetime.datetime.now().timestamp()
        timestamps = [
            now_ts,
            now_ts + self.model.bucket_interval,
        ]
        predicted = [4.0, 2.0]
        observed = [4.1, 1.9]

        prediction = TimeSeriesPrediction(
            self.model,
            timestamps=timestamps,
            predicted=np.array(predicted),
            observed=np.array(observed),
        )

        self.sink.init(data_schema=prediction.get_schema())
        self.sink.save_timeseries_prediction(
            prediction, tags=self.model.get_tags())
        self.sink.refresh()

        res = self.sink.search(
            routing=self.model.routing,
            size=100,
            body={},
        )
        hits = res['hits']['hits']
        self.assertEqual(len(hits), 2)

        for i, hit in enumerate(
                sorted(hits, key=lambda x: x['_source']['timestamp'])):
            source = hit['_source']
            self.assertEqual(
                source,
                {
                    'avg_foo': predicted[i],
                    '@avg_foo': observed[i],
                    'timestamp': int(timestamps[i] * 1000),
                    'model': self.model.name,
                })

    def test_match_all(self):
        model = Model(
            dict(
                name="times-model",
                offset=30,
                span=300,
                bucket_interval=3,
                interval=60,
                features=FEATURES_MATCH_ALL_TAG1,
                threshold=30,
            ))
        res = self.source.get_times_data(
            bucket_interval=model.bucket_interval,
            features=model.features,
            from_date=self.t0,
            to_date=self.t0 + 8,
        )

        baz_avg = []
        for line in res:
            baz_avg.append(line[1][0])

        np.testing.assert_allclose(
            np.array(baz_avg),
            np.array([216.0, np.nan, 18.0]),
            rtol=0,
            atol=0,
        )

        model = Model(
            dict(
                name="times-model",
                offset=30,
                span=300,
                bucket_interval=3,
                interval=60,
                features=FEATURES_MATCH_ALL_TAG2,
                threshold=30,
            ))
        res = self.source.get_times_data(
            bucket_interval=model.bucket_interval,
            features=model.features,
            from_date=self.t0,
            to_date=self.t0 + 8,
        )

        baz_avg = []
        for line in res:
            baz_avg.append(line[1][0])

        np.testing.assert_allclose(
            np.array(baz_avg),
            np.array([-216.0, np.nan, -18.0]),
            rtol=0,
            atol=0,
        )
def test_multi_fetch(self):
    model = Model(
        dict(
            name="test-model",
            offset=30,
            span=3,
            bucket_interval=3600,
            interval=60,
            features=[
                {
                    'name': 'avg_foo',
                    'metric': 'avg',
                    'field': 'foo',
                    'match_all': [
                        {'tag': 'a', 'value': 'b'},
                    ],
                },
                {
                    'name': 'count_bar',
                    'metric': 'count',
                    'field': 'bar',
                    'default': 0,
                },
            ],
        ))
    res = self.source.build_multi_fetch(
        bucket_interval=model.bucket_interval,
        features=model.features,
        from_str="2018-07-21T00:00:00Z",
        to_str="2018-07-22T00:00:00Z",
    )
    # Expected WarpScript: one FETCH + BUCKETIZE per feature. Warp10 time
    # units are microseconds, so the 3600000000 bucketspan is one hour.
    self.assertEqual(
        res,
        """
[
  [
    [ '{}' '{}.foo' {{ 'a' 'b' }} '2018-07-21T00:00:00Z' '2018-07-22T00:00:00Z' ]
    FETCH
    bucketizer.mean
    0
    3600000000
    0
  ]
  BUCKETIZE
  [
    [ '{}' '{}.bar' {{ }} '2018-07-21T00:00:00Z' '2018-07-22T00:00:00Z' ]
    FETCH
    bucketizer.count
    0
    3600000000
    0
  ]
  BUCKETIZE
]
""".strip().format(
            self.source.read_token,
            self.prefix,
            self.source.read_token,
            self.prefix,
        ))
def test_train_predict(self):
    model = DonutModel(
        dict(
            name='test',
            offset=30,
            span=5,
            bucket_interval=60 * 60,
            interval=60,
            features=[
                {
                    'name': 'count_foo',
                    'metric': 'count',
                    'field': 'prefix.foo',
                    'default': 0,
                },
                {
                    'name': 'avg_foo',
                    'metric': 'avg',
                    'field': 'prefix.foo',
                    'default': 5,
                },
            ],
            max_evals=1,
        ))

    generator = SinEventGenerator(base=3, sigma=0.05)

    to_date = datetime.datetime.now(datetime.timezone.utc).replace(
        hour=0,
        minute=0,
        second=0,
        microsecond=0,
    ).timestamp()
    from_date = to_date - 3600 * 24

    for ts in generator.generate_ts(from_date, to_date, step_ms=60000):
        self.source.insert_times_data(
            ts=ts,
            data={'prefix.foo': random.lognormvariate(10, 1)},
        )
    self.source.commit()

    # Train
    model.train(self.source, from_date=from_date, to_date=to_date)

    # Check
    self.assertTrue(model.is_trained)

    # Predict
    pred_from = to_date - 3 * model.bucket_interval
    pred_to = to_date
    prediction = model.predict(
        bucket=self.source,
        from_date=pred_from,
        to_date=pred_to,
    )
    self.source.save_timeseries_prediction(prediction, tags=self.tag)

    # Fake model just for extracting saved prediction
    model2 = Model(
        dict(
            name='test-prediction',
            offset=30,
            span=5,
            bucket_interval=60 * 60,
            interval=60,
            features=[
                {
                    'name': 'count_foo',
                    'metric': 'avg',
                    'field': "{}.count_foo".format(model.name),
                },
                {
                    'name': 'avg_foo',
                    'metric': 'avg',
                    'field': "{}.avg_foo".format(model.name),
                },
            ],
            max_evals=1,
        ))

    res = self.source.get_times_data(
        bucket_interval=model2.bucket_interval,
        features=model2.features,
        from_date=pred_from,
        to_date=pred_to,
        tags=self.tag,
    )

    for i, pred_ts in enumerate(prediction.timestamps):
        values, ts = res[i][1:]
        self.assertEqual(ts, pred_ts)
        np.testing.assert_allclose(
            np.array(values),
            prediction.predicted[i],
        )
def setUp(self):
    bucket_interval = 3

    t0 = int(datetime.datetime.now().timestamp())
    # XXX Buckets returned by InfluxDB are aligned on
    # modulo(bucket_interval); that's why timestamps must be
    # aligned for unit tests.
    t0 -= t0 % bucket_interval
    self.t0 = t0

    self.db = 'test-{}'.format(t0)
    logging.info("creating database %s", self.db)

    self.source = InfluxBucket({
        'name': 'test',
        'addr': ADDR,
        'database': self.db,
        'measurement': 'nosetests',
    })
    self.source.drop()
    self.source.init()

    self.model = Model(
        dict(
            name="test-model",
            offset=30,
            span=300,
            bucket_interval=3,
            interval=60,
            features=FEATURES,
        ))

    data = [
        # (foo, bar, timestamp)
        (1, 33, t0 - 1),  # excluded
        (2, 120, t0),
        (3, 312, t0 + 1),
        # empty
        (4, 18, t0 + 7),
        (5, 78, t0 + 9),  # excluded
    ]

    for foo, bar, ts in data:
        self.source.insert_times_data(
            measurement='measure1',
            ts=ts,
            data={'foo': foo},
        )
        self.source.insert_times_data(
            measurement='measure2',
            ts=ts,
            data={'bar': bar},
        )
        self.source.insert_times_data(
            measurement='measure3',
            ts=ts,
            tags={
                'tag_kw': 'tag1',
                'tag_int': 9,
                'tag_bool': False,
            },
            data={'baz': bar},
        )
        self.source.insert_times_data(
            measurement='measure3',
            ts=ts,
            tags={
                'tag_kw': 'tag2',
                'tag_int': 7,
                'tag_bool': True,
            },
            data={'baz': -bar},
        )
    self.source.commit()
def test_validate_model(self):
    # Valid
    Model(
        settings={
            'name': "foo",
            'type': "generic",
            'features': [
                {
                    'name': 'bar',
                    'field': 'baz',
                    'metric': 'avg',
                },
                {
                    'name': 'bar',
                    'field': 'baz',
                    'metric': 'count',
                },
            ],
        }
    )
    Model(
        settings={
            'name': "foo",
            'type': "generic",
            'features': [
                {
                    'name': 'bar',
                    'field': 'baz',
                    'metric': 'avg',
                },
            ],
            'routing': 'cux',
        }
    )
    Model(
        settings={
            'name': "foo",
            'type': "generic",
            'features': [
                {
                    'name': 'bar',
                    'measurement': 'prefix.measurement',
                    'field': 'prefix.baz',
                    'metric': 'avg',
                },
            ],
            'routing': 'cux',
        }
    )

    # Invalid: missing 'name'
    self.invalid_model(
        settings={
            'type': 'generic',
            'features': [
                {
                    'name': 'bar',
                    'field': 'baz',
                    'metric': 'avg',
                },
            ],
        }
    )
    # Invalid: empty feature list
    self.invalid_model(
        settings={
            'name': 'foo',
            'type': 'generic',
            'features': [],
        }
    )
    # Invalid 'io' settings
    self.invalid_model(
        settings={
            'name': "foo",
            'type': "generic",
            'features': [
                {
                    'name': 'bar',
                    'field': 'baz',
                    'metric': 'avg',
                    'io': 'i',
                },
            ],
        }
    )
    self.invalid_model(
        settings={
            'name': "foo",
            'type': "generic",
            'features': [
                {
                    'name': 'bar',
                    'field': 'baz',
                    'metric': 'count',
                    'io': 'o',
                },
            ],
        }
    )
    # Invalid: '/' is not allowed in a model name
    self.invalid_model(
        settings={
            'name': 'foo/invalid',
            'type': 'generic',
            'features': [
                {
                    'name': 'bar',
                    'field': 'baz',
                    'metric': 'avg',
                },
            ],
        }
    )
def invalid_model(self, **kwargs):
    with self.assertRaises(errors.Invalid):
        Model(**kwargs)
@classmethod
def setUpClass(cls):
    bucket_interval = 3

    t0 = int(datetime.datetime.now().timestamp())
    t0 -= t0 % bucket_interval
    cls.t0 = t0

    cls.source = OpenTSDBBucket({
        'name': 'test',
        'addr': ADDR,
    })
    cls.source.drop()
    cls.source.init()

    cls.model = Model(
        dict(
            name="test-model",
            offset=30,
            span=300,
            bucket_interval=3,
            interval=60,
            features=FEATURES,
        ))

    data = [
        # (foo, bar, timestamp)
        (1, 33, t0 - 1),  # excluded
        (2, 120, t0),
        (3, 312, t0 + 1),
        # empty
        (4, 18, t0 + 7),
        (5, 78, t0 + 9),  # excluded
    ]

    for foo, bar, ts in data:
        cls.source.insert_times_data(ts=ts, data={'foo': foo})
        cls.source.insert_times_data(ts=ts, data={'bar': bar})
        cls.source.insert_times_data(
            ts=ts,
            tags={
                'tag_kw': 'tag1',
                'tag_int': 9,
                'tag_bool': False,
            },
            data={'baz': bar},
        )
        cls.source.insert_times_data(
            ts=ts,
            tags={
                'tag_kw': 'tag2',
                'tag_int': 7,
                'tag_bool': True,
            },
            data={'baz': -bar},
        )
    cls.source.commit()