def test_no_table_error_message(self):
    # Any wildcard SELECT without a FROM clause must be rejected with
    # the same explanatory message.
    expected_msg = 'Wildcard usage requires a FROM statement'
    for query in ("SELECT *", "SELECT * WHERE columnCount() > 0"):
        with self.assertRaisesRegex(ResponseException, expected_msg):
            mldb.query(query)
def test_reshape_row(self):
    """reshape() must relabel the flat values according to the target shape.

    Every case reshapes the same four values 1..4; only the resulting
    column names (row-path structure) differ.
    """
    cases = [
        ('SELECT reshape({"0": 1, "1": 2, "2": 3, "3": 4}, [2, 2]) as *',
         ["_rowName", "0.0", "0.1", "1.0", "1.1"]),
        ('SELECT reshape({"0": 1, "1": 2, "2": 3, "3": 4}, [1, 4]) as *',
         ["_rowName", "0.0", "0.1", "0.2", "0.3"]),
        # nested input flattened to a single dimension
        ('SELECT reshape({"0": {"0": 1, "1": 2}, "1": {"0": 3, "1": 4}}, [4]) as *',
         ["_rowName", "0", "1", "2", "3"]),
        # nested input reshaped to 1x4
        ('SELECT reshape({"0": {"0": 1, "1": 2}, "1": {"0": 3, "1": 4}}, [1, 4]) as *',
         ["_rowName", "0.0", "0.1", "0.2", "0.3"]),
    ]
    for query, header in cases:
        result = mldb.query(query)
        self.assertTableResultEquals(result, [header, ["result", 1, 2, 3, 4]])
def test_reshape_not_embedding(self):
    # Passing a non-embedding to either argument of reshape() is an error;
    # the message depends on which argument is bad.
    failing_cases = [
        ('Null embedding',
         "SELECT shape(reshape('not an embedding', [1])) as dim"),
        ('requires an embedding',
         "SELECT shape(reshape([1], 'not an embedding')) as dim"),
    ]
    for message, query in failing_cases:
        with self.assertRaisesRegex(ResponseException, message):
            mldb.query(query)
def test_rowHash(self):
    """Importing with a rowHash() % 3 filter should keep about a third
    of the 891 titanic rows (287), while the unfiltered import keeps all.
    """
    import_params = [
        {
            "dataFileUrl": "https://raw.githubusercontent.com/datacratic/mldb-pytanic-plugin/master/titanic_train.csv",
            'outputDataset': "titanic_hashed",
            "where": "rowHash() % 3 = 0",
            'runOnCreation': True,
        },
        {
            "dataFileUrl": "https://raw.githubusercontent.com/datacratic/mldb-pytanic-plugin/master/titanic_train.csv",
            'outputDataset': "titanic_no_hashed",
            'runOnCreation': True,
        },
    ]
    for params in import_params:
        mldb.post('/v1/procedures', {'type': 'import.text', 'params': params})

    for dataset_id, expected_count in (("titanic_hashed", 287),
                                       ("titanic_no_hashed", 891)):
        self.assertTableResultEquals(
            mldb.query("select count(*) from " + dataset_id),
            [["_rowName", "count(*)"], ["[]", expected_count]])
def test_sequence(self):
    """Builtin functions must report precise binding/execution errors.

    NOTE(review): the original bound each context manager ``as re``,
    which shadowed the stdlib ``re`` module name and was never used;
    the unused bindings were removed. The intermediate ``query``
    variables were also inlined.
    """
    # Execution-time error: a non-numeric value passed to exp().
    with self.assertRaisesRegex(
            ResponseException,
            "Executing builtin function exp: Can't convert value 'a' of type 'ASCII_STRING' to double"):
        mldb.query("SELECT exp('a')")
    # Binding-time error: wrong arity for sqrt().
    with self.assertRaisesRegex(
            ResponseException,
            "Binding builtin function sqrt: expected 1 argument, got 3"):
        mldb.query("SELECT sqrt(1,2,3)")
def test_bad_alias_rhs_where(self):
    # An undefined alias on the right-hand side of a WHERE comparison
    # must raise rather than silently match nothing.
    bad_query = """
        SELECT * FROM a
        WHERE a.rowName()=undefined.column
    """
    with self.assertRaises(ResponseException):
        mldb.query(bad_query)
def test_toy_regression_works(self):
    """Run classifier.test in regression mode on the toy dataset and
    check mse, quantile errors and the accuracy output dataset size."""
    put_response = mldb.put(
        "/v1/procedures/toy_reg", {
            "type": "classifier.test",
            "params": {
                "mode": "regression",
                "testingData": """
                    SELECT score as score, label as label from toy_regression
                """,
                "outputDataset": "toy_reg_output",
                "runOnCreation": True
            }
        })
    run_result = put_response.json()
    mldb.log(run_result)

    run_status = run_result["status"]["firstRun"]["status"]
    self.assertEqual(run_status["mse"], 0.375)

    # Rows ordered by relative error so that the quantile values can be
    # matched against specific row positions.
    quartiles = mldb.query(
        """select abs((label-score)/label) as prnct_error, label, score
           from toy_regression order by prnct_error ASC""")
    mldb.log("------------------------ here")
    mldb.log(quartiles)

    self.assertAlmostEqual(run_status["quantileErrors"]["0.5"],
                           quartiles[2][2])
    self.assertAlmostEqual(run_status["quantileErrors"]["0.9"],
                           quartiles[3][2])

    # Check the accuracy dataset: header + 4 scored rows.
    self.assertEqual(len(mldb.query("select * from toy_reg_output")), 5)
def test_incomplete(self):
    """A reproducible split must report labels that do not appear in
    every output dataset, and distribute rows accordingly."""
    split_response = mldb.put(
        "/v1/procedures/split", {
            "type": "split",
            "params": {
                "reproducible": True,
                "labels": "SELECT * FROM ds5",
                "splits": [0.8, 0.2],
                "outputDatasets": [{
                    "id": "ds_train",
                    "type": "sparse.mutable"
                }, {
                    "id": "ds_test",
                    "type": "sparse.mutable"
                }],
            }
        })
    # 'y' cannot be represented in both splits, so it is incomplete.
    self.assertEqual(
        split_response.json()["status"]["firstRun"]["status"]["incompleteLabels"],
        ["y"])

    train_sums = mldb.query("SELECT sum({*}) FROM ds_train")
    test_sums = mldb.query("SELECT sum({*}) FROM ds_test")
    self.assertEqual(
        train_sums,
        [["_rowName", "sum({*}).x", "sum({*}).y"], ["[]", 2, 1]])
    self.assertEqual(test_sums, [["_rowName", "sum({*}).x"], ["[]", 1]])
def test_no_duplicate_rows_in_left_join_with_batch_exec(self):
    """LEFT JOIN under the batch executor must not duplicate left rows."""

    def count_rows(query):
        return mldb.query(query)[1][1]

    # the left condition is always true
    self.assertEqual(
        count_rows("""
        SELECT count(*)
        FROM left_table
        LEFT JOIN right_table ON left_table.c = right_table.c
        """),
        1000, "expected 1000 rows to be returned")

    # the right condition is always false
    self.assertEqual(
        count_rows("""
        SELECT count(*)
        FROM left_table
        LEFT JOIN right_table ON left_table.c = right_table.c
        AND 2 < right_table.d
        """),
        100, "expected 100 rows to be returned")

    # the right condition is half the time true
    # when the row index is even the condition always fails. This accounts for
    # 50 rows. When the index is odd, each of the left row match 10 different
    # right rows. So this account for 50 * 10 rows.
    self.assertEqual(
        count_rows("""
        SELECT count(*)
        FROM left_table
        LEFT JOIN right_table ON left_table.c = right_table.c
        AND right_table.d = 1
        """),
        550, "expected 550 rows to be returned")
def test_join_with_and(self):
    """A transform with a compound (AND) join condition should yield the
    same number of rows as the reference dataset ds_train."""
    reference = mldb.query('select * from ds_train')
    mldb.log(reference)

    mldb.post(
        "/v1/procedures", {
            "type": "transform",
            "params": {
                "inputData": """
                    select * from ds
                    left join ds_stats on
                        (ds.dow=ds_stats.dow and ds.a_int=ds_stats.a_int)
                    limit 10
                """,
                "outputDataset": {
                    "id": "ds_train2",
                    "type": "tabular",
                    "params": {
                        "unknownColumns": "add"
                    }
                },
                "runOnCreation": True
            }
        })
    transformed = mldb.query('select * from ds_train2')
    mldb.log(transformed)

    # equivalent join conditions should be returning the same dataset
    # this is a very weak check because the columns and the row ordering
    # of these two equivalent joins are currently very different
    self.assertEqual(len(reference), len(transformed),
                     'expected response sizes to match')
def test_fasttext_explain(self):
    """Train a fasttext classifier, then verify classifier.explain output
    for a known label and the error raised for an unknown label."""
    mldb.log("explain")

    classifier_config = {
        "my_fasttext": {
            "type": "fasttext",
            "verbosity": 0,
            "dims": 4,
            "epoch": 5,
        }
    }

    # model artefact lives under the build tree for the test's duration
    model_file = tempfile.NamedTemporaryFile(
        prefix=os.getcwd() + '/build/x86_64/tmp/')

    mldb.put("/v1/procedures/trainer", {
        "type": "classifier.train",
        "params": {
            "trainingData": "SELECT {tokens.*} as features, Theme as label FROM bag_of_words",
            "modelFileUrl": "file:///" + model_file.name,
            "functionName": 'myclassify',
            "algorithm": "my_fasttext",
            "mode": "categorical",
            "runOnCreation": True,
            "configuration": classifier_config
        }
    })

    mldb.put("/v1/functions/explain", {
        "type": "classifier.explain",
        "params": {
            "modelFileUrl": "file:///" + model_file.name,
        }
    })

    explanation = mldb.query("""SELECT explain({features : {tokenize(lower(' hockey Alabama Futbol'), {splitChars:' ,.:;«»[]()%!?', quoteChar:'', minTokenLength: 2}) as tokens}, label : 'Politique'}) as * """)
    self.assertTableResultEquals(explanation, [
        [
            "_rowName", "bias", "explanation.tokens.alabama",
            "explanation.tokens.futbol", "explanation.tokens.hockey"
        ],
        [
            "result", 0, -0.006820799317210913, -0.07053825259208679,
            -0.08547607064247131
        ]
    ])

    # asking to explain a label the model never saw must fail
    with self.assertRaisesRegex(ResponseException, "label not in model"):
        mldb.query("""SELECT explain({features : {tokenize(lower(' hockey Alabama Futbol'), {splitChars:' ,.:;«»[]()%!?', quoteChar:'', minTokenLength: 2}) as tokens}, label : 'Futurama'}) as * """)
def test_const_and_var(self):
    # An AND expression involving a dataset column is never a constant,
    # regardless of operand order or constant folding opportunities.
    non_const_queries = (
        "SELECT __isconst(a AND true) as isconst FROM ds1 ORDER BY rowName()",
        "SELECT __isconst(true AND a) as isconst FROM ds1 ORDER BY rowName()",
        "SELECT __isconst(a AND a) as isconst FROM ds1 ORDER BY rowName()",
    )
    for query in non_const_queries:
        self.assertTableResultEquals(
            mldb.query(query),
            [['_rowName', 'isconst'], ['row1', False]])
def test_bad_alias_rhs_inner_join(self):
    # An undefined alias in an INNER JOIN condition must raise.
    bad_query = """
        SELECT * FROM a
        INNER JOIN b ON a.rowName() = undefined.rowName()
    """
    with self.assertRaises(ResponseException):
        mldb.query(bad_query)
def test_hour_equivalence(self):
    # Intervals expressed in hours/minutes/seconds have a fixed length,
    # so these pairs must all compare equal.
    equivalent_pairs = (
        "select INTERVAL '2H' = INTERVAL '120m' as equal",
        "select INTERVAL '2 hour' = INTERVAL '2 HOUR' as equal",
        "select INTERVAL '24 H' = INTERVAL '1440 m' as equal",
        "select INTERVAL '24 H' = INTERVAL '86400 s' as equal",
    )
    for query in equivalent_pairs:
        self.assertTableResultEquals(
            mldb.query(query),
            [["_rowName", "equal"], ["result", True]])
def test_mldbfb_516_aggregator_incorrect_with_join(self):
    """MLDBFB-516: temporal_count must give the same per-column counts
    whether the dataset is queried directly or through an inner join."""
    source = mldb.create_dataset({'id': 'ds516', 'type': 'sparse.mutable'})
    source.record_row('user3', [['behA', 1, 11], ['conv', 1, 70],
                                ['behB', 1, 14], ['behA', 1, 14]])
    source.commit()

    conversions = mldb.create_dataset({'id': 'conv', 'type': 'sparse.mutable'})
    conversions.record_row('user3', [['ts', 70, 0]])
    conversions.commit()

    # baseline: direct query on the dataset
    direct = mldb.query("""
        SELECT temporal_count({ds516.*}) AS * FROM ds516
    """)
    mldb.log(direct)
    self.assertTableResultEquals(
        direct,
        [['_rowName', 'behA', 'behB', 'conv'], ['user3', 2, 1, 1]])

    # same aggregation through an inner join must match (modulo row name)
    joined = mldb.query("""
        SELECT temporal_count({ds516.* as *}) AS *
        FROM ds516
        INNER JOIN conv ON ds516.rowName() = conv.rowName()
    """)
    mldb.log(joined)
    self.assertTableResultEquals(
        joined,
        [['_rowName', 'behA', 'behB', 'conv'], ['[user3]-[user3]', 2, 1, 1]])
    mldb.log(joined)
def test_it(self):
    """A javascript exception thrown inside jseval must surface its
    message in the query error payload."""
    dataset = mldb.create_dataset({'id': 'ds', 'type': 'sparse.mutable'})
    dataset.record_row('row1', [])
    dataset.commit()

    # {} is the injection point for extra javascript; literal braces in
    # the jseval body are doubled for str.format.
    template = """
    SELECT jseval('
        {}
        return {{"foo" : "bar"}};
    ', 'cols', {{*}} ) AS * FROM ds
    """

    # sanity check: with nothing injected, the query works
    mldb.log(mldb.query(template.format("")))

    # add an exception, good luck understanding what's going on now...
    try:
        mldb.query(template.format('throw "this query is weird";'))
    except ResponseException as exc:
        error_body = exc.response.json()
        mldb.log(error_body)
        assert 'this query is weird' in error_body['error']
    else:
        assert False, 'should not be here'
def test_columnPathElem(self):
    # Selecting columnPathElement(1) as the column name produces NULL for
    # single-element paths, which must be rejected.
    expected_msg = "Cannot have a NULL column name"
    query = '''
        select COLUMN EXPR (AS columnPathElement(1)
                            WHERE columnName() LIKE '%topics%Junk%')
        from example
    '''
    with self.assertRaisesRegex(ResponseException, expected_msg):
        mldb.query(query)
def test_it(self):
    """parse_json: valid objects parse, invalid input raises, and try()
    converts the failure into a value."""
    # valid JSON object parses and exposes its fields
    parsed = mldb.query("""
        SELECT parse_json('{"a" : 5}')
    """)
    self.assertEqual(parsed[1][1], 5)

    # invalid JSON raises a ResponseException mentioning the builtin
    with self.assertRaisesRegex(ResponseException,
                                'Executing builtin function parse_json'):
        mldb.query("""
            SELECT parse_json('coco')
        """)

    # try() passes through the successful result unchanged
    guarded_ok = mldb.query("""
        SELECT try(parse_json('{"a" : 5}'), 'err')
    """)
    self.assertEqual(guarded_ok[1][1], 5)

    # try() with an explicit fallback returns the fallback on failure
    guarded_fallback = mldb.query("""
        SELECT try(parse_json('coco'), 'err')
    """)
    self.assertEqual(guarded_fallback[1][1], 'err')

    # try() without a fallback returns the error message itself
    guarded_message = mldb.query("""
        SELECT try(parse_json('coco'))
    """)
    self.assertRegex(
        guarded_message[1][1],
        "JSON passed to parse_json must be an object or an array")
def test_no_duplicate_rows_in_left_join_with_pipeline_exec(self):
    """LEFT JOIN under the pipeline executor must not duplicate left rows
    when the ON clause mixes equality and cross conditions."""

    def count_rows(query):
        return mldb.query(query)[1][1]

    # the cross condition is always true
    self.assertEqual(
        count_rows("""
        SELECT count(*)
        FROM left_table
        LEFT JOIN right_table ON left_table.c = right_table.c
        AND left_table.const > right_table.d
        """),
        1000, "expected 1000 rows to be returned")

    # the cross condition is always false
    self.assertEqual(
        count_rows("""
        SELECT count(*)
        FROM left_table
        LEFT JOIN right_table ON left_table.c = right_table.c
        AND left_table.const < right_table.d
        """),
        100, "expected 100 rows to be returned")

    # the right condition is half the time true, the cross condition is always true
    self.assertEqual(
        count_rows("""
        SELECT count(*)
        FROM left_table
        LEFT JOIN right_table ON left_table.c = right_table.c
        AND left_table.const > right_table.d
        AND right_table.d = 1
        order by rowName()
        """),
        550, "expected 550 rows to be returned")
def test_spread(self):
    """A reproducible 80/20 split of ds1 must always distribute the same
    rows to the train and test datasets."""
    mldb.put(
        "/v1/procedures/split", {
            "type": "split",
            "params": {
                "labels": "SELECT * FROM ds1",
                "reproducible": True,
                "splits": [0.8, 0.2],
                "outputDatasets": [{
                    "id": "ds_train",
                    "type": "sparse.mutable"
                }, {
                    "id": "ds_test",
                    "type": "sparse.mutable"
                }],
            }
        })

    train_rows = mldb.query("SELECT * FROM ds_train ORDER BY rowName() DESC")
    test_rows = mldb.query("SELECT * FROM ds_test ORDER BY rowName() DESC")

    self.assertEqual(
        train_rows,
        [["_rowName", "y", "x"], ["3", 1, None], ["0", None, 1]])
    self.assertEqual(
        test_rows,
        [["_rowName", "y", "x"], ["2", 1, None], ["1", None, 1]])
def test_const_userfunction_var(self):
    """User functions must not be treated as constants when their input
    varies, nor when they are declared non-deterministic."""
    expected = [['_rowName', 'isconst'], ['row1', False]]

    # fetcher with a column-dependent argument: cannot be constant
    mldb.put('/v1/functions/fetch', {'type': 'fetcher'})
    varying_input = mldb.query(
        "SELECT __isconst(fetch({url: a})) as isconst FROM ds1 ORDER BY rowName()"
    )
    self.assertTableResultEquals(varying_input, expected)

    # non-deterministic fetcher: constant argument, still not constant
    mldb.put('/v1/functions/fetch2', {
        'type': 'fetcher',
        'deterministic': False
    })
    non_deterministic = mldb.query(
        "SELECT __isconst(fetch2({url: 'itdoesntreallymatter'})) as isconst FROM ds1 ORDER BY rowName()"
    )
    self.assertTableResultEquals(non_deterministic, expected)
def test_not_equivalent(self):
    # Calendar-based intervals have no fixed length, so none of these
    # pairs may compare equal.
    non_equivalent_pairs = (
        # because of daylight saving
        "select INTERVAL '1 day' = INTERVAL '24H' as equal",
        # because months were not all created equal
        "select INTERVAL '1 month' = INTERVAL '30day' as equal",
        # because months were not all created equal
        "select INTERVAL '1 month' = INTERVAL '4 week' as equal",
        # because of leap years
        "select INTERVAL '1 year' = INTERVAL '365 day' as equal",
    )
    for query in non_equivalent_pairs:
        self.assertTableResultEquals(
            mldb.query(query),
            [["_rowName", "equal"], ["result", False]])
def run_MLDBFB_545_with_ds_type(self, ds_type):
    """MLDBFB-545: a WHERE filter must behave the same on a merged
    dataset as on one of its constituents, for the given dataset type."""
    first_id = ds_type + 'mldbfb545_1'
    first = mldb.create_dataset({'id': first_id, 'type': ds_type + '.mutable'})
    first.record_row('user1', [['converted', 'n', 0]])
    first.commit()

    second_id = ds_type + 'mldbfb545_2'
    second = mldb.create_dataset({'id': second_id, 'type': ds_type + '.mutable'})
    second.record_row('user2', [['blah', 'blah', 0]])
    second.commit()

    # query directly on the dataset works
    direct = mldb.query("""
        SELECT * FROM {} WHERE converted='c' LIMIT 1
    """.format(first_id))
    self.assertEqual(len(direct), 1)

    merge_id = ds_type + 'mldbfb545merged'
    mldb.put("/v1/datasets/" + merge_id, {
        "type": "merged",
        "params": {
            "datasets": [{
                "id": first_id
            }, {
                "id": second_id
            }]
        }
    })

    # query on the merged dataset yields incorrect results
    merged = mldb.query("""
        SELECT * FROM {} WHERE converted='c' LIMIT 1
    """.format(merge_id))
    mldb.log(merged)
    self.assertEqual(len(merged), 1)
def test_string_mixing(self):
    # Single-quoted brackets after a path element are treated as a string
    # literal, not as path indexing...
    bracket_after = mldb.query(
        "SELECT a.f['b'] FROM (SELECT {f: {b: 123}} AS a)")
    self.assertEqual(bracket_after,
                     [['_rowName', '"a.f[\'b\']"'], ['result', 'b']])

    # ...and using them before a dotted element is a parse error.
    with self.assertRaises(ResponseException):
        mldb.query("SELECT a['f'].b FROM (SELECT {f: {b: 123}} AS a)")
def test_domain_parsing(self):
    """extract_domain(): with and without subdomain removal, scheme
    validation, and NULL propagation on non-URL columns."""
    # one query covering both modes on three hosts
    self.assertTableResultEquals(
        mldb.query("""
        select extract_domain('http://www.datacratic.com/pwetpwet/houa.html') as c1,
               extract_domain('http://datacratic.com/pwetpwet/houa.html') as c2,
               extract_domain('http://data.datacratic.com/pwetpwet/houa.html') as c3,
               extract_domain('http://www.datacratic.com/pwetpwet/houa.html',
                              {removeSubdomain:1}) as c1nosub,
               extract_domain('http://datacratic.com/pwetpwet/houa.html',
                              {removeSubdomain:1}) as c2nosub,
               extract_domain('http://data.datacratic.com/pwetpwet/houa.html',
                              {removeSubdomain:1}) as c3nosub
        """),
        [["_rowName", "c1", "c1nosub", "c2", "c2nosub", "c3", "c3nosub"],
         [
             "result", "www.datacratic.com", "datacratic.com",
             "datacratic.com", "datacratic.com", "data.datacratic.com",
             "datacratic.com"
         ]])

    # a bare host without a scheme is not a valid URL
    with self.assertRaisesRegex(
            ResponseException,
            'Attempt to create a URL without a scheme'):
        mldb.query("SELECT extract_domain('pwet.com') as c4")

    # non-URL input yields NULL; the value column parses normally
    self.assertTableResultEquals(
        mldb.query("""
        select extract_domain(patate) as domain,
               extract_domain(value) as domain2
        from (
            select * from row_dataset({"domain": 'http://www.domain.com'})
        )
        """),
        [["_rowName", "domain", "domain2"], ["0", None, "www.domain.com"]])
def test_valid_aggregator_on_wildcard_builtin(self):
    # Wildcard builtins wrapped in an aggregator are legal under GROUP BY;
    # both queries only need to parse and execute without error.
    valid_queries = (
        "select count(*), earliest(temporal_earliest({*})) from sample group by x",
        "select count(*), earliest({horizontal_earliest({*})}) from sample group by x",
    )
    for query in valid_queries:
        mldb.query(query)
def test_it(self):
    # avg() is unary; passing two arguments must raise a binding error.
    dataset = mldb.create_dataset({'id': 'ds', 'type': 'sparse.mutable'})
    dataset.record_row('row1', [['colA', 1, 0]])
    dataset.commit()

    expected_msg = "function avg expected 1 argument, got 2"
    with self.assertRaisesRegex(ResponseException, expected_msg):
        mldb.query("SELECT avg(colA, 2) FROM ds")
def test_mixing_double_quotes(self):
    # Double-quoted brackets are path elements, so both spellings resolve
    # to the same column a.f[b] with the stored value.
    equivalent_queries = (
        """SELECT a.f["b"] FROM (SELECT {f: {b: 123}} AS a)""",
        """SELECT a["f"].b FROM (SELECT {f: {b: 123}} AS a)""",
    )
    for query in equivalent_queries:
        self.assertEqual(mldb.query(query),
                         [['_rowName', 'a.f[b]'], ['result', 123]])
def test_invalid_group_by_and_wildcard_builtin(self):
    """A bare wildcard builtin (no aggregator) is illegal under GROUP BY.

    Fix: the expected-message pattern escapes the regex metacharacters
    ``( { * )`` with backslashes, which are invalid escape sequences in a
    plain string literal (SyntaxWarning in modern CPython, slated to
    become an error); the pattern is now a raw string.
    """
    with self.assertRaisesRegex(
            ResponseException,
            r"Non-aggregator 'temporal_earliest\(\{\*\}\)' with GROUP BY clause is not allowed"):
        mldb.query(
            "select temporal_earliest({*}) as earliest from sample group by x"
        )
def test_value_desc_on_wrong_params(self):
    # sqrt() is unary; a NULL second argument is still an arity error.
    expected_msg = 'Binding builtin function sqrt: expected 1 argument, got 2'
    for query in ("select sqrt(2, NULL)", "select sqrt(2, 1)"):
        with self.assertRaisesRegex(ResponseException, expected_msg):
            mldb.query(query)