def test_MLDB_1453(self):
    """IS [NOT] TIMESTAMP / IS [NOT] INTERVAL type predicates (MLDB-1453)."""
    dataset_config = {
        'type' : 'sparse.mutable',
        'id' : 'test5',
    }
    dataset = mldb.create_dataset(dataset_config)
    dataset.record_row('myrow', [["a", 0, self.ts]])
    dataset.commit()

    # a plain column value is not a timestamp
    query1 = mldb.get(
        '/v1/query',
        q='SELECT a IS NOT TIMESTAMP as x, a IS TIMESTAMP as y from test5')
    self.assertFullResultEquals(query1.json(), [
        {"rowName": "myrow",
         "rowHash": "fbdba4c9be68f633",
         "columns": [["x", 1, "2015-01-01T00:00:00Z"],
                     ["y", 0, "2015-01-01T00:00:00Z"]]}
    ])

    # latest_timestamp() yields a timestamp value
    query1 = mldb.get(
        '/v1/query',
        q='SELECT latest_timestamp(1) IS NOT TIMESTAMP as x, latest_timestamp(1) IS TIMESTAMP as y from test5')
    self.assertFullResultEquals(query1.json(), [
        {"rowName": "myrow",
         "rowHash": "fbdba4c9be68f633",
         "columns": [["x", 0, "-Inf"], ["y", 1, "-Inf"]]}
    ])

    # BUGFIX: the second alias was a duplicate "x"; renamed to "y" to match
    # the x/y pattern of the two queries above (and the expected result
    # updated accordingly).
    query1 = mldb.get(
        '/v1/query',
        q="SELECT interval '3d' IS NOT INTERVAL as x, interval '3d' IS INTERVAL as y from test5")
    self.assertFullResultEquals(query1.json(), [
        {"rowName": "myrow",
         "rowHash": "fbdba4c9be68f633",
         "columns": [["x", 0, "-Inf"], ["y", 1, "-Inf"]]}
    ])
def test_vertical_avg_is_avg(self):
    """avg() and vertical_avg() must produce identical results."""
    plain = mldb.get("/v1/query",
                     q="SELECT avg(x) AS avg FROM test GROUP BY x")
    vertical = mldb.get(
        "/v1/query",
        q="SELECT vertical_avg(x) AS avg FROM test GROUP BY x")
    self.assertFullResultEquals(plain.json(), vertical.json())
def test_progress(self):
    """Progress of a serial procedure's steps advances over time."""
    proc = {
        'type': 'mock',
        'params': {
            'durationMs': 900,
            'refreshRateMs': 100
        }
    }
    res = mldb.post_async('/v1/procedures', {
        'type': "serial",
        'params': {
            'steps': [proc, proc, proc, proc, proc]
        }
    }).json()
    proc_id = res['id']
    run_id = res['status']['firstRun']['id']

    # let the run get going before the first poll
    time.sleep(0.5)
    url = '/v1/procedures/{}/runs/{}'.format(proc_id, run_id)
    res = mldb.get(url).json()
    self.assertEqual(res['state'], 'executing')
    self.assertTrue('subProgress' in res['progress'])
    self.assertEqual(len(res['progress']['steps']), 5)

    def step_total(steps):
        # Sum of per-step progress values; replaces the original
        # reduce()-based accumulation with the equivalent, clearer sum().
        return sum(step['value'] for step in steps)

    total1 = step_total(res['progress']['steps'])
    time.sleep(1)
    res = mldb.get(url).json()
    total2 = step_total(res['progress']['steps'])
    self.assertGreater(total2, total1)
def train_svd_with_default():
    """Train an SVD and verify both default output datasets are embeddings."""
    svd_procedure = "/v1/procedures/svd"
    # svd procedure configuration
    svd_config = {
        'type': 'svd.train',
        'params': {
            "trainingData": "select * from dataset1",
            # first way to specify output dataset using default
            "rowOutputDataset": "svd_row",
            # second way to specify an output dataset using default
            "columnOutputDataset": {
                "id": "svd_column"
            }
        }
    }
    mldb.log(mldb.put(svd_procedure, svd_config))
    mldb.log(mldb.post(svd_procedure + '/runs'))

    # both output datasets must have been created with the embedding type
    for ds_name in ('svd_column', 'svd_row'):
        result = mldb.get('/v1/datasets/' + ds_name)
        assert result.json()['type'] == 'embedding', \
            'expected an embedding output dataset'
def test_base(self):
    """COLUMN EXPR with ORDER BY rowCount() and a WHERE regex filter."""
    dataset_config = {
        'type' : 'sparse.mutable',
        'id' : 'toy'
    }
    dataset = mldb.create_dataset(dataset_config)
    dataset.record_row("rowA", [["feat1", 1, 0], ["feat2", 1, 0],
                                ["feat3", 1, 0]])
    # BUGFIX: dropped the stray trailing comma that turned this statement
    # into a one-element tuple expression.
    dataset.record_row("rowB", [["feat1", 1, 0], ["feat2", 1, 0]])
    dataset.record_row("rowC", [["feat1", 1, 0]])
    dataset.commit()

    # both queries only need to succeed (mldb.get asserts 2xx/3xx)
    mldb.get(
        "/v1/query",
        q="select COLUMN EXPR (ORDER BY rowCount() DESC LIMIT 2) from toy")
    mldb.get(
        "/v1/query",
        q="""SELECT COLUMN EXPR (
                 WHERE regex_match(columnName(), 'feat[[:digit:]]')
                 ORDER BY rowCount() DESC LIMIT 2) from toy""")
def test_vertical_count_is_count_star(self):
    """count(*) and vertical_count(*) must produce identical results."""
    plain = mldb.get("/v1/query",
                     q="SELECT count(*) AS count FROM test GROUP BY x")
    vertical = mldb.get(
        "/v1/query",
        q="SELECT vertical_count(*) AS count FROM test GROUP BY x")
    self.assertFullResultEquals(plain.json(), vertical.json())
def test_no_set_return(self):
    """A route handler that never calls set_return fails with a 500."""
    mldb.put(
        "/v1/plugins/mldb2114", {
            "type": "python",
            "params": {
                "source": {
                    "routes": """
from mldb import mldb
mldb.log('no return')
"""
                }
            }
        })
    msg = "Return value is required for route handlers but not set"
    # every HTTP verb on the route must fail identically
    for verb in (mldb.get, mldb.post, mldb.put, mldb.delete):
        with self.assertRaisesRegex(ResponseException, msg) as e:
            verb('/v1/plugins/mldb2114/routes/foo')
        self.assertEqual(e.exception.response.status_code, 500)
def test_tokenize_ngram_range(self):
    """tokenize() with ngramRange produces the expected n-gram columns."""
    result = mldb.get('/v1/query', q="""SELECT tokenize('I would want a burger', {splitChars: ' ', ngramRange: [1, 3], minTokenLength:2}) AS tokens""")
    # unigrams, bigrams and trigrams must all appear exactly once
    for col in ("tokens.would_want_burger", "tokens.burger",
                "tokens.would_want", "tokens.want", "tokens.want_burger",
                "tokens.would"):
        self.find_column(result, col, 1)

    # trigrams only: the repeated phrase yields the trigram twice and
    # no bigram column at all
    result = mldb.get(
        '/v1/query',
        q="""SELECT tokenize('I would want a burger I would want a burger', {splitChars: ' ', ngramRange: [3, 3], minTokenLength:2}) AS tokens""")
    self.find_column(result, "tokens.would_want_burger", 2)
    self.not_find_column(result, "tokens.would_want")

    # without minTokenLength the single-char token 'I' is kept
    result = mldb.get('/v1/query', q="""SELECT tokenize('I would want a burger', {splitChars: ' ', ngramRange: [1, 2]}) AS tokens""")
    self.find_column(result, "tokens.I_would", 1)
def test_vertical_earliest_is_earliest(self):
    """earliest({*}) and vertical_earliest({*}) must produce identical results."""
    plain = mldb.get("/v1/query",
                     q="SELECT earliest({*}) AS count FROM test GROUP BY x")
    vertical = mldb.get(
        "/v1/query",
        q="SELECT vertical_earliest({*}) AS count FROM test GROUP BY x")
    self.assertFullResultEquals(plain.json(), vertical.json())
def run_and_cancel(self, name, config):
    """Start the procedure `config` asynchronously, cancel it, and poll
    until the cancellation is observed.

    Fails the test if the procedure finishes or errors instead of
    reaching the 'cancelled' state.
    """
    mldb.log('running ' + name)
    location = self.run_procedure_async(name, config)
    mldb.log('cancelling ' + name)
    resp = mldb.put(location + "/state", {'state': 'cancelled'})
    self.assertEqual(resp.status_code, 200)

    while True:
        resp = mldb.get(location + "/state")
        mldb.log(resp)
        # BUGFIX: parse the response body once per poll instead of up to
        # three times, and stop as soon as the cancellation is observed.
        state = resp.json()['state']
        if state == 'cancelled':
            break
        if state == 'finished':
            self.fail(
                "suspicious - the procedure finished before it was cancelled"
            )
        if state == 'error':
            mldb.log(mldb.get(location))
            self.fail(
                "the procedure generated an error before or after cancellation"
            )
        sleep(0.1)
def test_set_return_0(self):
    """set_return with a 0 status code is rejected with a 500 on every verb."""
    mldb.put(
        "/v1/plugins/mldb2114", {
            "type": "python",
            "params": {
                "source": {
                    "routes": """request.set_return("", 0)"""
                }
            }
        })
    for verb in (mldb.get, mldb.post, mldb.put, mldb.delete):
        with self.assertRaises(ResponseException) as e:
            verb('/v1/plugins/mldb2114/routes/foo')
        self.assertEqual(e.exception.response.status_code, 500)
def test_svd(self):
    """SVD can write to a dataset id whose earlier resolution failed."""
    # this is throwing because the not_yet_created dataset does not exist
    # (renamed the context var: `re` shadowed the stdlib regex module)
    with self.assertRaises(ResponseException) as err:
        mldb.put("/v1/datasets/training_data", {
            "type": "merged",
            "params": {
                "datasets": [
                    {"id": "sample"},
                    {"id": "not_yet_created"}  # attention
                ]
            }
        })

    # we want to store output in 'not_yet_created'
    # the fact we tried to access 'not_yet_created' above
    # makes the first attempt to create it fail
    mldb.put("/v1/procedures/train_svd", {
        "type": "svd.train",
        "params": {
            "rowOutputDataset": "not_yet_created",  # attention
            "outputColumn": "svd.embedding.00",
            "modelFileUrl": "file://tmp/svd.bin.test.gz",
            "trainingData": "select * from sample",
            "numSingularValues": 1,
            "runOnCreation": True
        }
    })

    # this should now work
    mldb.get("/v1/query", q="select x from not_yet_created")
def test_query_first_row(self):
    """Query passed straight to mongodb; result comes back MLDB-formatted."""
    mldb.put('/v1/functions/mongo_query', {
        'type' : 'mongodb.query',
        'params' : {
            'uriConnectionScheme' : self.connection_scheme,
            'collection' : 'test_collection'
        }
    })

    # fetch by document type first...
    query = json.dumps({'type' : {'$eq' : 'nested_obj'}})
    res = mldb.get('/v1/functions/mongo_query/application',
                   input={'query' : query}).json()
    self.assertEqual(res['output']['type'], 'nested_obj')

    # ...then re-fetch the very same document by its _id
    _id = res['output']['_id']
    query = json.dumps({'_id' : _id})
    res = mldb.get('/v1/functions/mongo_query/application',
                   input={'query' : query}).json()
    self.assertEqual(res['output']['type'], 'nested_obj')
def test_import_text_progress(self):
    """import.text reports a monotonically non-decreasing progress count."""
    # BUGFIX: manage the NamedTemporaryFile with a context manager so the
    # handle is closed (and the file removed) deterministically instead of
    # relying on garbage collection at interpreter exit.
    with tempfile.NamedTemporaryFile(dir='build/x86_64/tmp', mode='wt',
                                     encoding="utf-8") as tmp_file:
        tmp_file.write('a,b,c\n')
        for i in range(100000):
            tmp_file.write('{},{},{}\n'.format(i, i * 10, i / 2))
        tmp_file.flush()

        mldb.put(
            '/v1/procedures/import_da_file', {
                'type': 'import.text',
                'params': {
                    'dataFileUrl': 'file://' + tmp_file.name,
                    'outputDataset': {
                        'id': 'ds',
                        'type': 'tabular'
                    }
                }
            })
        location = mldb.post_async("/v1/procedures/import_da_file/runs") \
            .headers['Location']

        res = mldb.get(location).json()
        prev_count = 0
        while res['state'] != 'finished':
            if res['state'] == 'executing':
                # progress must never go backwards
                self.assertGreaterEqual(res['progress']['value'], prev_count)
                prev_count = res['progress']['value']
            time.sleep(0.001)
            res = mldb.get(location).json()
        self.assertGreater(prev_count, 0)
def test_base(self):
    """Ranking procedure assigns 0-based index ranks ordered by score."""
    mldb.put('/v1/datasets/ds', {
        'type': 'sparse.mutable',
    })
    size = 123
    for i in range(size):
        mldb.post(
            '/v1/datasets/ds/rows', {
                'rowName': 'row{}'.format(i),
                'columns': [['score', i, 1], ['index', i * 2, 2],
                            ['prob', i * 3, 3]]
            })
    mldb.post('/v1/datasets/ds/commit')

    mldb.post(
        '/v1/procedures', {
            'type': 'ranking',
            'params': {
                'inputData': 'SELECT * FROM ds ORDER BY score',
                'outputDataset': 'out',
                'rankingType': 'index',
                'runOnCreation': True
            }
        })

    # MLDB-1267
    mldb.log(mldb.query("SELECT * FROM out"))
    res = mldb.get("/v1/query",
                   q="SELECT latest_timestamp({*}) FROM out",
                   format='table')
    table = res.json()
    self.assertEqual(table[1][1], '1970-01-01T00:00:01Z')
    mldb.log(table[1])

    # merge input and output to compare score against the computed rank
    mldb.put(
        '/v1/datasets/result', {
            'type': 'merged',
            'params': {
                'datasets': [{'id': 'ds'}, {'id': 'out'}]
            }
        })
    res = mldb.get('/v1/query',
                   q='SELECT score, rank FROM result ORDER BY rank',
                   format='table')
    table = res.json()
    # first, second and last data rows must carry ranks 0, 1 and size-1,
    # with score equal to rank on each
    for row_idx, expected in [(1, 0), (2, 1), (size, size - 1)]:
        self.assertEqual(table[row_idx][1], expected, str(table[row_idx]))
        self.assertEqual(table[row_idx][2], expected, str(table[row_idx]))
def test_mldbfb_520_join(self):
    """
    temporal_earliest doesn't yield correct result when used with join
    expressions.
    """
    ds = mldb.create_dataset({
        'id': 'mldbfb520_join_left',
        'type': 'sparse.mutable'
    })
    ds.record_row('user1', [['behA', 1, 1], ['behA', 1, 2], ['behA', 1, 3],
                            ['behB', 1, 9], ['behC', 1, 8]])
    ds.commit()

    ds = mldb.create_dataset({
        'id': 'mldbfb520_join_right',
        'type': 'sparse.mutable'
    })
    ds.record_row(
        'user1',
        [['behD', 1, 1], ['behD', 1, 2], ['behD', 1, 3], ['behB', 1, 9]])
    ds.commit()

    def check(query, expected):
        # run the join query, log the raw response, compare the full result
        res = mldb.get('/v1/query', q=query)
        mldb.log(res)
        self.assertFullResultEquals(res.json(), expected)

    # temporal_earliest must keep the earliest timestamp per cell
    check(
        """
        SELECT temporal_earliest({
            COLUMN EXPR (WHERE columnName() IN ('l.behA', 'l.behB', 'r.behD'))
        }) AS *
        FROM mldbfb520_join_left AS l
        INNER JOIN mldbfb520_join_right as r ON l.behB = r.behB
        """,
        [{
            "rowName": "[user1]-[user1]",
            "columns": [["l.behA", 1, "1970-01-01T00:00:01Z"],
                        ["l.behB", 1, "1970-01-01T00:00:09Z"],
                        ["r.behD", 1, "1970-01-01T00:00:01Z"]]
        }])

    # temporal_latest must keep the latest timestamp per cell
    check(
        """
        SELECT temporal_latest({
            COLUMN EXPR (WHERE columnName() IN ('l.behA', 'l.behB', 'r.behD'))
        }) AS *
        FROM mldbfb520_join_left AS l
        INNER JOIN mldbfb520_join_right as r ON l.behB = r.behB
        """,
        [{
            "rowName": "[user1]-[user1]",
            "columns": [["l.behA", 1, "1970-01-01T00:00:03Z"],
                        ["l.behB", 1, "1970-01-01T00:00:09Z"],
                        ["r.behD", 1, "1970-01-01T00:00:03Z"]]
        }])
def test_calling_from_application_double_params(self):
    """Mixing query-string input with body parameters must be rejected."""
    payload = {"data1": {"x": 1}, "data2": {"y": 2}}
    msg = "You cannot mix query string and body parameters"
    with self.assertRaisesRegex(ResponseException, msg):
        mldb.get("/v1/functions/func/application",
                 input=json.dumps(payload),
                 data={"input": payload})
def test_queries(self):
    """sample() returns exactly the requested number of rows."""
    # with replacement, more rows than the dataset holds can be drawn
    rez = mldb.get(
        "/v1/query",
        q="select * from sample(toy, {rows: 25000, withReplacement: 1})")
    self.assertEqual(len(rez.json()), 25000)

    rez = mldb.get("/v1/query", q="select * from sample(toy, {rows: 25})")
    self.assertEqual(len(rez.json()), 25)
def assert_fail(qry):
    """Run a query that must fail, and log the error response body."""
    try:
        mldb.get('/v1/query', q=qry)
    except ResponseException as exc:
        # expected path: log the server's error text
        mldb.log(exc.response.text)
    else:
        assert False, 'should not be here'
def test_mldb_post_dataset(self):
    """A dataset with a non-ASCII id can be created and fetched (URL-quoted)."""
    name = 'époque'
    mldb.post('/v1/datasets', {'id': name, 'type': 'sparse.mutable'})
    mldb.log(mldb.get('/v1/datasets'))
    # the id must be percent-encoded to build a valid URL
    url = quote(('/v1/datasets/' + name).encode('utf8'))
    mldb.log(url)
    res = mldb.get(url).json()
    self.assertEqual(res['id'], name)
def test_select_x_works(self):
    """A plain SELECT succeeds and returns the expected table."""
    # try something that should work
    # mldb.get asserts the result status_code is >= 200 and < 400
    mldb.get("/v1/query", q="select x from sample")

    # assert the result, all unittest asserts are available and
    # assertTableResultEquals was added to facilitate validating query
    # results
    self.assertTableResultEquals(mldb.query("select x from sample"),
                                 [["_rowName", "x"], ["a", 1]])
def test_noIgnore_with_unknown_get(self):
    """Extra input pins are accepted via REST and via SQL application."""
    mldb.get("/v1/functions/noIgnore/application",
             input={"a": 1, "b": 2, "c": 5})
    self.assertTableResultEquals(
        mldb.query("select noIgnore({a:1, b:2, c:5}) as *"),
        [["_rowName", "rez"], ["result", 3]])
def test_response_exception(self):
    """A GET on a nonexistent route raises ResponseException carrying
    the 404 response and the requested url."""
    url = "/unexisting"
    # Replaces the manual try/except (which left a dead `pass` and a
    # None sentinel) with the idiomatic assertRaises context manager;
    # if no exception is raised the test fails here just as the old
    # None-attribute access would have.
    with self.assertRaises(ResponseException) as ctx:
        mldb.get(url)
    res = ctx.exception.response
    self.assertEqual(res.status_code, 404)
    self.assertEqual(res.url, url)
def test_seed_works(self):
    """Identical seeds reproduce the sample; omitting the seed does not."""
    first = mldb.get("/v1/query",
                     q="select * from sample(toy, {rows: 1, seed: 5})")
    second = mldb.get("/v1/query",
                      q="select * from sample(toy, {rows: 1, seed: 5})")
    self.assertEqual(first.json()[0], second.json()[0])

    first = mldb.get("/v1/query", q="select * from sample(toy, {rows: 1})")
    second = mldb.get("/v1/query", q="select * from sample(toy, {rows: 1})")
    self.assertNotEqual(first.json()[0], second.json()[0])
def test_threesplits(self):
    """split procedure with three fold weights fills three output datasets."""
    mldb.put(
        "/v1/procedures/split", {
            "type": "split",
            "params": {
                "reproducible": True,
                "labels": "SELECT * FROM ds4",
                "splits": [0.8, 0.1, 0.1],
                "foldImportance": 1.0,
                "outputDatasets": [{
                    "id": "ds_train",
                    "type": "sparse.mutable"
                }, {
                    "id": "ds_test",
                    "type": "sparse.mutable"
                }, {
                    "id": "ds_validate",
                    "type": "sparse.mutable"
                }],
            }
        })

    # row counts per split (reproducible=True makes these deterministic)
    n = mldb.get('/v1/query', q="SELECT count(*) FROM ds_train",
                 format='atom').json()
    self.assertEqual(19, n)
    n = mldb.get('/v1/query', q="SELECT count(*) FROM ds_test",
                 format='atom').json()
    self.assertEqual(2, n)
    n = mldb.get('/v1/query', q="SELECT count(*) FROM ds_validate",
                 format='atom').json()
    self.assertEqual(3, n)

    res1 = mldb.query("SELECT sum({*}) FROM ds_train")
    res2 = mldb.query("SELECT sum({*}) FROM ds_test")
    res3 = mldb.query("SELECT sum({*}) FROM ds_validate")
    self.assertEqual(
        res1, [["_rowName", "sum({*}).x", "sum({*}).y", "sum({*}).z"],
               ["[]", 8, 11, 7]])
    self.assertEqual(
        res2, [["_rowName", "sum({*}).x", "sum({*}).y", "sum({*}).z"],
               ["[]", 1, 2, 1]])
    # BUGFIX: the original asserted res2 a second time (copy-paste) and
    # never checked res3. The duplicate assertion is removed.
    # TODO(review): res3 (ds_validate column sums) is still unverified —
    # the expected values must be captured from a live run before an
    # assertion can be added.
def test_error(self):
    """ MLDB-1478 """
    csv_conf = {
        "type": "import.text",
        "params": {
            'dataFileUrl' : 'https://raw.githubusercontent.com/datacratic/mldb-pytanic-plugin/master/titanic_train.csv',
            "outputDataset": {
                "id": "titanic2",
            },
            "runOnCreation": True
        }
    }
    mldb.put("/v1/procedures/csv_proc", csv_conf)

    def expect_error(query, expected):
        # the query must be rejected with exactly this error message
        # (context var renamed: `re` shadowed the stdlib regex module)
        with self.assertRaises(ResponseException) as ctx:
            mldb.get("/v1/query", q=query)
        mldb.log(ctx.exception.response.json()["error"])
        self.assertEqual(ctx.exception.response.json()["error"], expected)

    expect_error(
        'select COLUMN EXPR (WHERE regex_match(columnName(), "P.*") ) from titanic2',
        'Binding error: Cannot read column \'"P.*"\' inside COLUMN EXPR')
    expect_error(
        'select COLUMN EXPR (WHERE regex_match(columnName(), {P.*}) ) from titanic2',
        'Binding error: Cannot use wildcard \'P.*\' inside COLUMN EXPR')
    expect_error('SELECT a', 'Cannot read column "a" with no FROM clause.')
    expect_error('SELECT 1 named a',
                 'Cannot read column "a" with no FROM clause.')
def test_query_no_query(self):
    """Applying mongodb.query without the mandatory 'query' param fails."""
    mldb.put('/v1/functions/mongo_query_no_query', {
        'type' : 'mongodb.query',
        'params' : {
            'uriConnectionScheme' : self.connection_scheme,
            'collection' : 'test_collection'
        }
    })
    msg = 'You must define the parameter \\\\"query\\\\"'
    with self.assertRaisesRegex(ResponseException, msg):
        mldb.get('/v1/functions/mongo_query_no_query/application')
def execute_sequence(self, _id):
    """Create, fetch and delete the dataset `_id`, once via PUT on its
    url and once via POST to the collection; both paths must behave
    identically."""
    url = '/v1/datasets/' + quote(_id, safe='')
    mldb.log(url)

    def verify_then_delete(creation_res):
        # the Location header and the direct url must both resolve to _id,
        # and after deletion the url must 404
        res = mldb.get(creation_res.headers['Location']).json()
        self.assertEqual(res['id'], _id)
        res = mldb.get(url).json()
        self.assertEqual(res['id'], _id)
        mldb.delete(url)
        with self.assertMldbRaises(status_code=404):
            mldb.get(url)

    verify_then_delete(mldb.put(url, {'type': 'sparse.mutable'}))
    verify_then_delete(
        mldb.post('/v1/datasets', {'id': _id, 'type': 'sparse.mutable'}))
def test_token_extract_splitChars_and_limit_and_offset(self):
    """token_extract honours splitChars together with limit and offset."""
    # positive index counts forward through the limited/offset window
    result = mldb.get('/v1/query', q="""
        SELECT token_extract('a b c d e f', 3,
            {' ' AS splitChars, 2 AS limit, 1 AS offset}) AS token""")
    self.assertEqual(result.json()[0]['columns'][0][1], "e")

    # negative index counts backward
    result = mldb.get('/v1/query', q="""
        SELECT token_extract('a b c d e f', -1,
            {' ' AS splitChars, 2 AS limit, 1 AS offset}) AS token""")
    self.assertEqual(result.json()[0]['columns'][0][1], "c")
def test_MLDB_1370(self):
    """earliest_timestamp on a column equals it on the whole row (MLDB-1370)."""
    dataset_config = {
        'type' : 'sparse.mutable',
        'id' : 'test4',
    }
    ds = mldb.create_dataset(dataset_config)
    # same cell recorded at two timestamps, a day apart
    ds.record_row('myrow', [["a", 0, self.ts], ["a", 0, self.ts_plus_1d]])
    ds.commit()

    per_column = mldb.get(
        '/v1/query',
        q='SELECT earliest_timestamp(a) as earliest from test4')
    per_row = mldb.get(
        '/v1/query',
        q='SELECT earliest_timestamp({*}) as earliest from test4')
    self.assertFullResultEquals(per_column.json(), per_row.json())