def run_MLDBFB_545_with_ds_type(self, ds_type):
    """MLDBFB-545: a WHERE filter must behave the same on a merged dataset
    as it does when run directly on one of its constituent datasets."""
    left_id = ds_type + 'mldbfb545_1'
    dataset = mldb.create_dataset({'id': left_id, 'type': ds_type + '.mutable'})
    dataset.record_row('user1', [['converted', 'n', 0]])
    dataset.commit()

    right_id = ds_type + 'mldbfb545_2'
    dataset = mldb.create_dataset({'id': right_id, 'type': ds_type + '.mutable'})
    dataset.record_row('user2', [['blah', 'blah', 0]])
    dataset.commit()

    # query directly on the dataset works
    res = mldb.query("""
        SELECT * FROM {} WHERE converted='c' LIMIT 1
    """.format(left_id))
    self.assertEqual(len(res), 1)

    merge_id = ds_type + 'mldbfb545merged'
    mldb.put("/v1/datasets/" + merge_id, {
        "type": "merged",
        "params": {
            "datasets": [{"id": left_id}, {"id": right_id}]
        }
    })

    # query on the merged dataset yields incorrect results
    res = mldb.query("""
        SELECT * FROM {} WHERE converted='c' LIMIT 1
    """.format(merge_id))
    mldb.log(res)
    self.assertEqual(len(res), 1)
def setUpClass(cls):
    """Build three one-row datasets whose 'x' cells carry string timestamps
    in different years (2015, end of 2014, a few days earlier in 2014).

    NOTE(review): `d` is a module-level global read for the first two rows
    and reassigned to "now" before the third — TODO confirm the module
    defines `d` before this runs.
    """
    global d

    ds = mldb.create_dataset({'type': 'sparse.mutable', 'id': 'example'})
    ds.record_row('row1', [["x", "2015-01-01T15:14:39.123456Z", d]])
    log("Committing dataset")
    ds.commit()

    ds2 = mldb.create_dataset({'type': 'sparse.mutable', 'id': 'example2'})
    ds2.record_row('row1', [["x", "2014-12-31T15:14:39.123456Z", d]])
    log("Committing dataset")
    ds2.commit()

    # previous year
    ds3 = mldb.create_dataset({'type': 'sparse.mutable', 'id': 'example3'})
    d = datetime.datetime.now()
    ds3.record_row('row1', [["x", "2014-12-28T15:14:39.123456Z", d]])
    log("Committing dataset")
    ds3.commit()
def test_left_join_with_and(self):
    """Left joins whose ON clause ANDs two equalities, or compares sums."""
    left = mldb.create_dataset({"id": "left_table", "type": "tabular"})
    for idx in range(10):
        left.record_row("a" + str(idx),
                        [["asc", idx, 0],
                         ["desc", 10 - idx, 0],
                         ["const", 729, 0]])
    left.commit()

    right = mldb.create_dataset({"id": "right_table", "type": "tabular"})
    for idx in range(10):
        right.record_row("b" + str(idx),
                         [["index", idx, 0],
                          ["mod", idx % 2, 0],
                          ["const", 729, 0]])
    right.commit()

    # conjunction of two equality conditions
    self.run_query_and_compare("""
        select * from left_table
        left join right_table on (left_table.asc = right_table.index
                                  and left_table.const = right_table.const)
    """)
    # arithmetic on both sides of the ON condition
    self.run_query_and_compare("""
        select * from left_table
        left join right_table on (left_table.asc + left_table.const
                                  = right_table.index + right_table.const)
    """)
def setUpClass(self):
    """Create a 2-d word embedding and a bag-of-words dataset."""
    now = datetime.datetime.strptime('Jun 1 2005 1:33PM', '%b %d %Y %I:%M%p')

    # Create embedding dataset
    embedding = mldb.create_dataset({'type': 'embedding',
                                     'id': 'wordEmbedding'})
    for word, x, y in (("allo", 0.2, 0),
                       ("mon", 0.8, 0.95),
                       ("beau", 0.4, 0.01),
                       ("coco", 0, 0.5)):
        embedding.record_row(word, [["x", x, now], ["y", y, now]])
    embedding.commit()

    # Create bag of words dataset
    bag = mldb.create_dataset({'type': 'sparse.mutable', 'id': 'bag_o_words'})
    bag.record_row("doc1", [["allo", 1, now], ["coco", 1, now]])
    bag.record_row("doc2",
                   [["allo", 1, now], ["mon", 1, now], ["beau", 1, now]])
    bag.record_row("doc3", [["patate", 1, now]])
    bag.record_row("doc4", [["j'ai", 1, now]])
    bag.commit()
def test_mldbfb_516_aggregator_incorrect_with_join(self):
    """MLDBFB-516: temporal_count must return the same counts whether or
    not the source dataset is joined with another one."""
    beh = mldb.create_dataset({'id': 'ds516', 'type': 'sparse.mutable'})
    beh.record_row('user3', [['behA', 1, 11], ['conv', 1, 70],
                             ['behB', 1, 14], ['behA', 1, 14]])
    beh.commit()

    conv = mldb.create_dataset({'id': 'conv', 'type': 'sparse.mutable'})
    conv.record_row('user3', [['ts', 70, 0]])
    conv.commit()

    # baseline: no join
    res = mldb.query("""
        SELECT temporal_count({ds516.*}) AS * FROM ds516
    """)
    mldb.log(res)
    self.assertTableResultEquals(res, [
        ['_rowName', 'behA', 'behB', 'conv'],
        ['user3', 2, 1, 1],
    ])

    # same aggregation, but through an inner join on the row name
    res = mldb.query("""
        SELECT temporal_count({ds516.* as *}) AS *
        FROM ds516 INNER JOIN conv ON ds516.rowName() = conv.rowName()
    """)
    mldb.log(res)
    self.assertTableResultEquals(res, [
        ['_rowName', 'behA', 'behB', 'conv'],
        ['[user3]-[user3]', 2, 1, 1],
    ])
    mldb.log(res)
def test_mldbfb_520_join(self):
    """MLDBFB-520: temporal_earliest / temporal_latest must yield correct
    results when applied on top of a join expression."""
    left = mldb.create_dataset({
        'id': 'mldbfb520_join_left',
        'type': 'sparse.mutable'
    })
    left.record_row('user1',
                    [['behA', 1, 1], ['behA', 1, 2], ['behA', 1, 3],
                     ['behB', 1, 9], ['behC', 1, 8]])
    left.commit()

    right = mldb.create_dataset({
        'id': 'mldbfb520_join_right',
        'type': 'sparse.mutable'
    })
    right.record_row('user1',
                     [['behD', 1, 1], ['behD', 1, 2], ['behD', 1, 3],
                      ['behB', 1, 9]])
    right.commit()

    # earliest: first occurrence of each selected column is kept
    query = """
        SELECT temporal_earliest({
            COLUMN EXPR (WHERE columnName() IN ('l.behA', 'l.behB', 'r.behD'))})
        AS *
        FROM mldbfb520_join_left AS l
        INNER JOIN mldbfb520_join_right as r ON l.behB = r.behB
    """
    res = mldb.get('/v1/query', q=query)
    expected = [{
        "rowName": "[user1]-[user1]",
        "columns": [["l.behA", 1, "1970-01-01T00:00:01Z"],
                    ["l.behB", 1, "1970-01-01T00:00:09Z"],
                    ["r.behD", 1, "1970-01-01T00:00:01Z"]]
    }]
    mldb.log(res)
    self.assertFullResultEquals(res.json(), expected)

    # latest: last occurrence of each selected column is kept
    query = """
        SELECT temporal_latest({
            COLUMN EXPR (WHERE columnName() IN ('l.behA', 'l.behB', 'r.behD'))})
        AS *
        FROM mldbfb520_join_left AS l
        INNER JOIN mldbfb520_join_right as r ON l.behB = r.behB
    """
    res = mldb.get('/v1/query', q=query)
    expected = [{
        "rowName": "[user1]-[user1]",
        "columns": [["l.behA", 1, "1970-01-01T00:00:03Z"],
                    ["l.behB", 1, "1970-01-01T00:00:09Z"],
                    ["r.behD", 1, "1970-01-01T00:00:03Z"]]
    }]
    mldb.log(res)
    self.assertFullResultEquals(res.json(), expected)
def setUpClass(cls):
    """Two single-row datasets that share the row name 'row1'."""
    for ds_id, col, val in (('ds1', 'A', 1), ('ds2', 'B', 2)):
        ds = mldb.create_dataset({'id': ds_id, 'type': 'sparse.mutable'})
        ds.record_row('row1', [[col, val, 0]])
        ds.commit()
def setUpClass(cls):
    """One three-column dataset plus one holding the reference time cls.t."""
    ds = mldb.create_dataset({'id': 'ds', 'type': 'sparse.mutable'})
    ds.record_row('row1',
                  [['colA', 1, 0], ['colB', 1, 1], ['colC', 1, 2]])
    ds.commit()

    time_ds = mldb.create_dataset({'id': 'timeDs', 'type': 'sparse.mutable'})
    time_ds.record_row('row1', [['time', cls.t, 0]])
    time_ds.commit()
def setUpClass(cls):
    """Dataset 'a' covers the cross product {1,2}x{1,2}; 'empty' has no rows."""
    ds = mldb.create_dataset({'id': 'a', 'type': 'sparse.mutable'})
    row_num = 0
    for one in (1, 2):
        for two in (1, 2):
            row_num += 1
            ds.record_row('row%d' % row_num,
                          [['one', one, 0], ['two', two, 0]])
    ds.commit()

    empty = mldb.create_dataset({'id': 'empty', 'type': 'sparse.mutable'})
    empty.commit()
def setUpClass(self): # create a dummy dataset ds = mldb.create_dataset({ "id": "dataset1", "type": "sparse.mutable" }) ds.record_row("a",[["x", "toy story", 0]]) ds.commit() ds = mldb.create_dataset({ "id": "dataset2", "type": "sparse.mutable" }) ds.record_row("row_a",[["x", "toy story", 0]]) ds.record_row("row_b",[["x", "terminator", 0]]) ds.commit()
def test_mixed_utf8_escape(self):
    """The parser assumes utf-8 is already escaped: a dataset whose id
    mixes a raw and a percent-encoded utf-8 character must still resolve."""
    dataset_id = 'éé'
    mldb.create_dataset({'id': dataset_id, 'type': 'sparse.mutable'}).commit()

    # fetch escaped ascii: first char raw, second percent-encoded
    url = '/v1/datasets/é' + quote('é')
    mldb.log(url)
    res = mldb.get(url)
    mldb.log(res)
def setUpClass(cls):
    """'ds' covers x,y in {1,2} at distinct timestamps; 'ds2' records two
    values of x in a single row."""
    ds = mldb.create_dataset({'id': 'ds', 'type': 'sparse.mutable'})
    rows = [
        ('0', (1, 1), (1, 2)),
        ('1', (1, 3), (2, 4)),
        ('2', (2, 5), (1, 6)),
        ('3', (2, 7), (2, 8)),
    ]
    for name, (x, ts_x), (y, ts_y) in rows:
        ds.record_row(name, [['x', x, ts_x], ['y', y, ts_y]])
    ds.commit()

    ds2 = mldb.create_dataset({'id': 'ds2', 'type': 'sparse.mutable'})
    ds2.record_row('0', [['x', 1, 1], ['x', 2, 2]])
    ds2.commit()
def setUpClass(cls):
    """dataset1 has ten rows of x; dataset2 links its first five rows back
    to dataset1 through the 'ds1_row' column."""
    ds1 = mldb.create_dataset({'type': 'sparse.mutable', 'id': 'dataset1'})
    ds2 = mldb.create_dataset({'type': 'sparse.mutable', 'id': 'dataset2'})

    for idx in range(10):
        name = 'row_' + str(idx)
        ds1.record_row(name, [['x', idx, 0]])
        if idx < 5:
            ds2.record_row(name, [['ds1_row', name, 0], ['y', idx, 0]])

    ds1.commit()
    ds2.commit()
def setUpClass(self): # create a dummy dataset ds = mldb.create_dataset({"id": "text", "type": "sparse.mutable"}) ds.record_row("a", [["txt", "raise shields", 0]]) ds.record_row("b", [["txt", "set a course", 0]]) ds.commit() ds = mldb.create_dataset({"id": "sub1", "type": "sparse.mutable"}) ds.record_row("row_a", [["warp", 8, 0]]) ds.commit() ds = mldb.create_dataset({"id": "sub2", "type": "sparse.mutable"}) ds.record_row("row_b", [["warp", 9, 0]]) ds.commit()
def test_MLDB_1453(self):
    """MLDB-1453: IS [NOT] TIMESTAMP and IS [NOT] INTERVAL predicates."""
    dataset_config = {
        'type' : 'sparse.mutable',
        'id' : 'test5',
    }
    dataset = mldb.create_dataset(dataset_config)
    # one row with a numeric atom; self.ts fixes the expected cell timestamps
    dataset.record_row('myrow', [ [ "a", 0, self.ts ] ])
    dataset.commit()

    # 'a' is a plain number: IS NOT TIMESTAMP -> 1, IS TIMESTAMP -> 0
    query1 = mldb.get('/v1/query', q = 'SELECT a IS NOT TIMESTAMP as x, a IS TIMESTAMP as y from test5')
    self.assertFullResultEquals(query1.json(),
        [{"rowName":"myrow","rowHash":"fbdba4c9be68f633","columns":[["x",1,"2015-01-01T00:00:00Z"],["y",0,"2015-01-01T00:00:00Z"]]}]
    )

    # latest_timestamp(1) produces a timestamp value, so the predicates flip
    query1 = mldb.get('/v1/query', q = 'SELECT latest_timestamp(1) IS NOT TIMESTAMP as x, latest_timestamp(1) IS TIMESTAMP as y from test5')
    self.assertFullResultEquals(query1.json(),
        [{"rowName":"myrow","rowHash":"fbdba4c9be68f633","columns":[["x",0,"-Inf"],["y",1,"-Inf"]]}]
    )

    # NOTE(review): both expressions are aliased 'as x' — apparently
    # deliberate, since the expected result lists two 'x' columns
    query1 = mldb.get('/v1/query', q = "SELECT interval '3d' IS NOT INTERVAL as x, interval '3d' IS INTERVAL as x from test5")
    self.assertFullResultEquals(query1.json(),
        [{"rowName": "myrow","rowHash": "fbdba4c9be68f633","columns":[["x",0,"-Inf"],["x",1,"-Inf"]]}]
    )
def setUpClass(self): # create a dummy dataset ds = mldb.create_dataset({ "id": "dataset1", "type": "sparse.mutable" }) ds.record_row("row1",[["x", "1", 0]]) ds.record_row("row3",[["x", "3", 0]]) ds.record_row("row2",[["x", "2", 0]]) ds.commit()
def setUpClass(cls): dataset_config = { 'type' : 'sparse.mutable', 'id' : "my_json_dataset" } dataset = mldb.create_dataset(dataset_config) row1 = { "name": "bill", "age": 25, "friends": [{"name": "mich", "age": 20}, {"name": "jean", "age": 18}] } dataset.record_row("row1" , [["data", json.dumps(row1), 0]]) row2 = { "name": "alexis", "age": 22, "friends": [{"name": "cross", "age": 20}, {"name": "fit", "age": 18}, {"name": "foot", "region": "south"}] } dataset.record_row("row2" , [["data", json.dumps(row2), 0]]) dataset.commit() res = mldb.get("/v1/query", q="SELECT parse_json(data, {arrays: 'encode'}) AS * NAMED rowPath() " "FROM my_json_dataset " "WHERE rowName()='row1'").json() assert_val(res, "row1", "age", 25) assert_val(res, "row1", "friends.1", "{\"age\":18,\"name\":\"jean\"}")
def setUpClass(self): # create a dummy dataset dataset = mldb.create_dataset({ "type": "sparse.mutable", "id": "dataset" }) # Horizontal functions first flatten to the latest value # of each column and then operate on the resulting row. # As an example, horizontal_count with first flatten col1 in row # x and then count the resulting atoms. The count will therefore # be 3 not 4. dataset.record_row("x",[ ["col1", 1, HorizontalTest.sometime], ["col1", 2, HorizontalTest.before], ["col2", 1, HorizontalTest.sometime], ["pwet", 1, HorizontalTest.sometime]]) dataset.record_row("y",[ ["col1", 0, HorizontalTest.sometime], ["col1", 1, HorizontalTest.after], ["col2", 1, HorizontalTest.sometime], ["prout", 1, HorizontalTest.sometime]]) dataset.record_row("z",[ ["col1", 5, HorizontalTest.sometime], ["col1", 1, HorizontalTest.before], ["col1", 10, HorizontalTest.after], ["col2", 1, HorizontalTest.sometime]]) dataset.commit()
def test_r2(self):
    """classifier.test in regression mode must report the expected r2 for
    a good score column and a bad one."""
    ds = mldb.create_dataset({"id": "r2_sample", "type": "sparse.mutable"})
    samples = [
        ("a", 2.5, 25, 3),
        ("b", 0, -5, -0.5),
        ("c", 2, 22, 2),
        ("d", 8, 5, 7),
    ]
    for row_name, score, score2, target in samples:
        ds.record_row(row_name, [["score", score, 0],
                                 ["score2", score2, 0],
                                 ["target", target, 0]])
    ds.commit()

    for scoreCol, r2 in [("score", 0.948), ("score2", -30.1177)]:
        rez = mldb.put("/v1/procedures/patate", {
            "type": "classifier.test",
            "params": {
                "testingData":
                    "select %s as score, target as label from r2_sample"
                    % scoreCol,
                "mode": "regression",
                "runOnCreation": True
            }
        })
        mldb.log(rez.json()["status"])
        self.assertAlmostEqual(
            rez.json()["status"]["firstRun"]["status"]["r2"], r2, places=2)
def test_it(self):
    """A JS exception thrown inside jseval must surface in the query's
    error message."""
    ds = mldb.create_dataset({'id': 'ds', 'type': 'sparse.mutable'})
    ds.record_row('row1', [])
    ds.commit()

    # {} is the .format() slot for injected JS; doubled braces are literal
    # braces in the jseval body
    query = """
        SELECT jseval('
            {}
            return {{"foo" : "bar"}};
        ', 'cols', {{*}}
        ) AS * FROM ds
    """

    # the query works
    mldb.log(mldb.query(query.format("")))

    # add an exception, good luck understanding what's going on now...
    try:
        mldb.query(query.format('throw "this query is weird";'))
    except ResponseException as exc:
        mldb.log(exc.response.json())
        # the thrown message must be visible in the reported error
        assert 'this query is weird' in exc.response.json()['error']
    else:
        assert False, 'should not be here'
def setUpClass(cls):
    """Single-row, single-column dataset shared by the tests."""
    dataset = mldb.create_dataset({
        'id': 'ds',
        'type': 'sparse.mutable',
    })
    dataset.record_row('row1', [['colA', 1, 1]])
    dataset.commit()
def test_c_over_a_or_b(self):
    """Alternate test of unknown values: column a_or_b always holds 'c',
    a value never seen at training time (the model only ever saw 'a' or
    'b' there)."""
    ds = mldb.create_dataset({
        'id': 'test_c_over_a_or_b_ds',
        'type': 'sparse.mutable'
    })
    for idx in range(10):
        ds.record_row('row{}'.format(idx), [
            ['line', idx, 0],
            ['label', 0, 0],
            ['feature', random.random() + 0.6, 0],
            ['noise', random.random(), 0],
            ['a_or_b', 'c', 0],
        ])
    ds.commit()

    # scoring must not choke on the unseen category
    mldb.post('/v1/procedures', {
        'type': 'classifier.test',
        'params': {
            'mode': 'boolean',
            'testingData':
                'SELECT score_it({features: {feature, noise, a_or_b}})[score] AS score, '
                'label FROM test_c_over_a_or_b_ds'
        }
    })
def test_string_over_null(self):
    """The primary test: the 'issue' column, always null during training,
    now carries a string value at testing time."""
    ds = mldb.create_dataset({
        'id': 'test_string_over_null_ds',
        'type': 'sparse.mutable'
    })
    for idx in range(10):
        ds.record_row('row{}'.format(idx), [
            ['line', idx, 0],
            ['label', 0, 0],
            ['feature', random.random() + 0.6, 0],
            ['noise', random.random(), 0],
            ['issue', 'STRING', 0],
        ])
    ds.commit()

    # scoring must tolerate a string where training only ever saw null
    mldb.post('/v1/procedures', {
        'type': 'classifier.test',
        'params': {
            'mode': 'boolean',
            'testingData':
                'SELECT score_it({features: {feature, noise, issue}})[score] AS score, '
                'label FROM test_string_over_null_ds'
        }
    })
def setUpClass(self): ds = mldb.create_dataset({"id": "dataset1", "type": "sparse.mutable"}) ds.record_row("row_c", [["x", 1, 0], ["y", 3, 0]]) ds.record_row("row_b", [["x", 2, 0], ["y", 2, 0]]) ds.record_row("row_a", [["x", 3, 0], ["y", 1, 0]]) ds.commit()
def setUpClass(cls):
    """Train a small l2-regularized GLZ on three labelled rows."""
    ds = mldb.create_dataset({'id' : 'ds', 'type' : 'sparse.mutable'})
    rows = [
        ('row1', 1, 1, 1),
        ('row2', 0, 1, 0),
        ('row3', 0, 0, 0),
    ]
    for row_name, label, feat1, feat2 in rows:
        ds.record_row(row_name, [['label', label, 12],
                                 ['feat1', feat1, 0],
                                 ['feat2', feat2, 0]])
    ds.commit()

    mldb.post('/v1/procedures', {
        'type' : 'classifier.train',
        'params' : {
            'runOnCreation' : True,
            "mode": "boolean",
            'algorithm' : 'glz',
            "configuration": {
                "glz": {
                    "type": "glz",
                    "verbosity": 3,
                    "normalize": False,
                    "regularization": 'l2'
                }
            },
            'trainingData' : """
                SELECT {* EXCLUDING(label)} AS features, label FROM ds
            """,
            "modelFileUrl": "file://build/x86_64/tmp/fmlhTODO.cls",
        }
    })
def test_most_frequent(self):
    """summary.statistics must report the most frequent items of a
    categorical column.

    Thirteen distinct values are recorded: 'a'..'d' with descending
    counts and nine singletons.  The expected output keeps ten
    most-frequent entries; which singletons make the cut is decided by
    the procedure.

    Fixes over the original: the unused local `row_num = 0` is put to
    work as the row-name counter, replacing the hand-rolled `Counter`
    iterator class that only re-implemented an incrementing integer.
    """
    ds = mldb.create_dataset({
        'id': 'most_freq_source',
        'type': 'sparse.mutable'
    })

    vals = {
        'a': 5,
        'b': 4,
        'c': 3,
        'd': 2,
        'e': 1,
        'f': 1,
        'g': 1,
        'h': 1,
        'i': 1,
        'j': 1,
        'k': 1,
        'l': 1,
        'm': 1,
    }

    # sequential integer row names starting at 1, one row per occurrence
    row_num = 0
    for val, count in vals.items():
        for _ in range(count):
            row_num += 1
            ds.record_row(row_num, [['col', val, 0]])
    ds.commit()

    mldb.post('/v1/procedures', {
        'type': 'summary.statistics',
        'params': {
            'runOnCreation': True,
            'inputData': "SELECT * FROM most_freq_source",
            'outputDataset': {
                'id': 'most_freq_output',
                'type': 'sparse.mutable'
            }
        }
    })

    res = mldb.query("SELECT * FROM most_freq_output ORDER BY rowName()")
    self.assertTableResultEquals(res, [
        [
            "_rowName", "value.data_type",
            "value.most_frequent_items.a", "value.most_frequent_items.b",
            "value.most_frequent_items.c", "value.most_frequent_items.d",
            "value.most_frequent_items.h", "value.most_frequent_items.i",
            "value.most_frequent_items.j", "value.most_frequent_items.k",
            "value.most_frequent_items.l", "value.most_frequent_items.m",
            "value.num_null", "value.num_unique"
        ],
        ["col", "categorical", 5, 4, 3, 2, 1, 1, 1, 1, 1, 1, 0, 13]
    ])
def setUpClass(self): # create a dummy dataset ds = mldb.create_dataset({ "id": "sample", "type": "sparse.mutable" }) ds.record_row("a",[["x", 1, 0]]) ds.record_row("b",[["x", 2, 0], ["y", 25, 0]]) ds.record_row("c",[["y", 3, 0]]) ds.commit()
def test_dottest_col_names(self):
    """summary.statistics on a column whose name contains a dot: the
    output row name must come back quoted."""
    ds = mldb.create_dataset({
        'id': 'dotted_col_ds',
        'type': 'sparse.mutable'
    })
    ds.record_row('row1', [['col.a', 1, 0]])
    ds.commit()

    mldb.post('/v1/procedures', {
        'type': 'summary.statistics',
        'params': {
            'runOnCreation': True,
            'inputData': 'SELECT * FROM dotted_col_ds',
            'outputDataset': {
                'id': 'output_dotted_col_ds',
                'type': 'sparse.mutable'
            }
        }
    })

    res = mldb.query("SELECT * FROM output_dotted_col_ds")
    self.assertTableResultEquals(res, [
        [
            "_rowName", "value.1st_quartile", "value.3rd_quartile",
            "value.data_type", "value.max", "value.avg", "value.median",
            "value.min", "value.most_frequent_items.1", "value.num_null",
            "value.num_unique", "value.stddev"
        ],
        ['"col.a"', 1, 1, "number", 1, 1, 1, 1, 1, 0, 1, "NaN"]
    ])
def load_test_dataset():
    """Create 'dataset1': user1 recorded tomorrow, user2 recorded now."""
    ds = mldb.create_dataset({'type': 'sparse.mutable', 'id': 'dataset1'})
    ds.record_row('user1', [['x', 1, same_time_tomorrow],
                            ['y', 2, same_time_tomorrow]])
    ds.record_row('user2', [['x', 3, now], ['y', 4, now]])
    ds.commit()
def test_base(self):
    """COLUMN EXPR ordered by rowCount(), with and without a WHERE filter.

    Fix over the original: a stray trailing comma after the
    record_row("rowB", ...) call turned that statement into a useless
    one-element tuple expression; it has been removed.
    """
    dataset = mldb.create_dataset({
        'type' : 'sparse.mutable',
        'id' : 'toy'
    })
    # feat1 appears in 3 rows, feat2 in 2, feat3 in 1
    dataset.record_row("rowA", [["feat1", 1, 0], ["feat2", 1, 0],
                                ["feat3", 1, 0]])
    dataset.record_row("rowB", [["feat1", 1, 0], ["feat2", 1, 0]])
    dataset.record_row("rowC", [["feat1", 1, 0]])
    dataset.commit()

    mldb.get(
        "/v1/query",
        q="select COLUMN EXPR (ORDER BY rowCount() DESC LIMIT 2) from toy")
    mldb.get(
        "/v1/query",
        q="""SELECT COLUMN EXPR (
                 WHERE regex_match(columnName(), 'feat[[:digit:]]')
                 ORDER BY rowCount() DESC LIMIT 2) from toy""")