Esempio n. 1
0
class TestGenerator(TestCase):
    create_statement = "create table test_table_float_fea " \
                       "(features float, label int)"
    drop_statement = "drop table if exists test_table_float_fea"
    insert_statement = "insert into test_table_float_fea (features,label)" \
                       " values(1.0, 0), (2.0, 1)"

    @unittest.skipUnless(testing.get_driver() == "mysql",
                         "skip non mysql tests")
    def test_generator(self):
        driver = testing.get_driver()
        user, password, host, port, database, _ = parseMySQLDSN(
            testing.get_mysql_dsn())
        conn = connect(driver,
                       database,
                       user=user,
                       password=password,
                       host=host,
                       port=int(port))
        # prepare test data
        execute(driver, conn, self.drop_statement)
        execute(driver, conn, self.create_statement)
        execute(driver, conn, self.insert_statement)

        column_name_to_type = {
            "features": {
                "feature_name": "features",
                "delimiter": "",
                "dtype": "float32",
                "is_sparse": False,
                "shape": []
            }
        }
        label_meta = {"feature_name": "label", "shape": [], "delimiter": ""}
        gen = db_generator(conn, "SELECT * FROM test_table_float_fea",
                           label_meta)
        idx = 0
        for row, label in gen():
            features = read_features_from_row(row, ["features"], ["features"],
                                              column_name_to_type)
            d = (features, label)
            if idx == 0:
                self.assertEqual(d, (((1.0, ), ), 0))
            elif idx == 1:
                self.assertEqual(d, (((2.0, ), ), 1))
            idx += 1
        self.assertEqual(idx, 2)

    @unittest.skipUnless(testing.get_driver() == "mysql",
                         "skip non mysql tests")
    def test_generate_fetch_size(self):
        label_meta = {"feature_name": "label", "shape": [], "delimiter": ""}
        gen = db_generator(testing.get_singleton_db_connection(),
                           'SELECT * FROM iris.train limit 10',
                           label_meta,
                           fetch_size=4)
        self.assertEqual(len([g for g in gen()]), 10)
Esempio n. 2
0
class TestXGBoostTrain(unittest.TestCase):
    @unittest.skipUnless(testing.get_driver() == "mysql",
                         "skip non mysql tests")
    def test_train(self):
        ds = testing.get_datasource()
        original_sql = """SELECT * FROM iris.train
        TO TRAIN xgboost.gbtree
        WITH
            objective="multi:softmax",
            num_boost_round=20,
            num_class=3,
            validation.select="SELECT * FROM iris.test"
        INTO iris.xgboost_train_model_test;
        """
        select = "SELECT * FROM iris.train"
        val_select = "SELECT * FROM iris.test"
        train_params = {
            "num_boost_round": 20,
        }
        model_params = {"num_class": 3, "objective": "multi:softmax"}
        eval_result = train(ds, original_sql, select, val_select,
                            "xgboost.gbtree", "", None,
                            NumericColumn(FieldDesc(name="class")),
                            model_params, train_params, None,
                            "iris.xgboost_train_model_test", None)
        self.assertLess(eval_result['train']['merror'][-1], 0.01)
        self.assertLess(eval_result['validate']['merror'][-1], 0.01)
Esempio n. 3
0
class TestXGBoostTrain(TestCase):
    @unittest.skipUnless(testing.get_driver() == "mysql",
                         "skip non mysql tests")
    def test_train(self):
        ds = testing.get_datasource()
        select = "SELECT * FROM iris.train"
        val_select = "SELECT * FROM iris.test"
        feature_column_names = [
            feature_metas[k]["feature_name"] for k in feature_metas
        ]
        is_pai = False
        pai_train_table = ""
        train_params = {"num_boost_round": 20}
        model_params = {"num_classes": 3}
        with tempfile.TemporaryDirectory() as tmp_dir_name:
            train_fn = os.path.join(tmp_dir_name, 'train.txt')
            val_fn = os.path.join(tmp_dir_name, 'val.txt')
            dtrain = xgb_dataset(ds, train_fn, select, feature_metas,
                                 feature_column_names, label_meta, is_pai,
                                 pai_train_table)
            dval = xgb_dataset(ds, val_fn, val_select, feature_metas,
                               feature_column_names, label_meta, is_pai,
                               pai_train_table)
            eval_result = train(dtrain, train_params, model_params, dval)
            self.assertLess(eval_result['train']['rmse'][-1], 0.01)
            self.assertLess(eval_result['validate']['rmse'][-1], 0.01)
Esempio n. 4
0
class TestFetchVerifyColumnNameAndType(unittest.TestCase):
    def generate_select(self, table, columns):
        return "SELECT %s FROM %s" % (",".join(columns), table)

    @unittest.skipUnless(testing.get_driver() in ["mysql", "hive"],
                         "skip non mysql/hive tests")
    def test_verify_column_name_and_type(self):
        conn = testing.get_singleton_db_connection()

        train_table = "iris.train"
        test_table = "iris.test"

        train_select = [
            "petal_length", "petal_width", "sepal_length", "sepal_width",
            "class"
        ]
        test_select = train_select
        verify_column_name_and_type(
            conn, self.generate_select(train_table, train_select),
            self.generate_select(test_table, test_select), "class")

        test_select = [
            "petal_length", "petal_width", "sepal_length", "sepal_width"
        ]
        verify_column_name_and_type(
            conn, self.generate_select(train_table, train_select),
            self.generate_select(test_table, test_select), "class")

        test_select = ["petal_length", "petal_width", "sepal_length"]
        with self.assertRaises(ValueError):
            verify_column_name_and_type(
                conn, self.generate_select(train_table, train_select),
                self.generate_select(test_table, test_select), "class")

        cursor = conn.cursor()
        name_and_type = dict(db.get_table_schema(conn, test_table))
        new_table_name = "iris.verifier_test_table"

        name_and_type["petal_length"] = "VARCHAR(255)"  # change the data type
        create_column_str = ",".join(
            ["%s %s" % (n, t) for n, t in name_and_type.items()])

        drop_sql = "DROP TABLE IF EXISTS %s" % new_table_name
        create_sql = "CREATE TABLE %s(%s)" % (new_table_name,
                                              create_column_str)
        cursor.execute(drop_sql)
        cursor.execute(create_sql)
        with self.assertRaises(ValueError):
            test_select = train_select
            verify_column_name_and_type(
                conn, self.generate_select(train_table, train_select),
                self.generate_select(new_table_name, test_select), "class")
        cursor.execute(drop_sql)
        cursor.close()
Esempio n. 5
0
class TestXGBoostTrain(unittest.TestCase):
    @unittest.skipUnless(testing.get_driver() == "mysql",
                         "skip non mysql tests")
    def test_train(self):
        ds = testing.get_datasource()
        original_sql = """SELECT * FROM iris.train
        TO TRAIN xgboost.gbtree
        WITH
            objective="multi:softmax",
            num_boost_round=20,
            num_class=3,
            validation.select="SELECT * FROM iris.test"
        INTO iris.xgboost_train_model_test;
        """
        select = "SELECT * FROM iris.train"
        val_select = "SELECT * FROM iris.test"
        train_params = {
            "num_boost_round": 20,
        }
        model_params = {"num_class": 3, "objective": "multi:softmax"}
        with temp_file.TemporaryDirectory(as_cwd=True):
            eval_result = train(ds, original_sql, select, val_select,
                                "xgboost.gbtree", "", None,
                                NumericColumn(FieldDesc(name="class")),
                                model_params, train_params, None,
                                "iris.xgboost_train_model_test", None)
            self.assertLess(eval_result['train']['merror'][-1], 0.01)
            self.assertLess(eval_result['validate']['merror'][-1], 0.01)

        with temp_file.TemporaryDirectory(as_cwd=True):
            pred_original_sql = """SELECT * FROM iris.test
            TO PREDICT iris.xgboost_pred_result.pred_val
            USING iris.xgboost_train_model_test;"""
            pred(ds, pred_original_sql, "SELECT * FROM iris.test",
                 "iris.xgboost_train_model_test", "pred_val", model_params,
                 "iris.xgboost_pred_result")

        with temp_file.TemporaryDirectory(as_cwd=True):
            explain_original_sql = """SELECT * FROM iris.test
            TO EXPLAIN iris.xgboost_train_model_test
            INTO iris.xgboost_explain_result;"""
            explain(ds, explain_original_sql, "SELECT * FROM iris.test",
                    "iris.xgboost_train_model_test", model_params,
                    "iris.xgboost_explain_result")

        with temp_file.TemporaryDirectory(as_cwd=True):
            evaluate_original_sql = """SELECT * FROM iris.test
            TO EVALUATE iris.xgboost_train_model_test
            WITH label_col=class
            INTO iris.xgboost_evaluate_result;"""
            evaluate(ds, evaluate_original_sql, "SELECT * FROM iris.test",
                     "class", "iris.xgboost_train_model_test", model_params,
                     "iris.xgboost_evaluate_result")
Esempio n. 6
0
    def test_mysql(self):
        driver = testing.get_driver()
        user, password, host, port, database, _ = parseMySQLDSN(
            testing.get_mysql_dsn())
        conn = connect(driver,
                       database,
                       user=user,
                       password=password,
                       host=host,
                       port=port)
        self._do_test(driver, conn)
        conn.close()

        conn = testing.get_singleton_db_connection()
        self._do_test(driver, conn)
Esempio n. 7
0
class TestFetchSamples(unittest.TestCase):
    @unittest.skipUnless(testing.get_driver() in ["mysql", "hive"],
                         "skip non mysql/hive tests")
    def test_fetch_sample(self):
        conn = testing.get_singleton_db_connection()

        select = "SELECT * FROM iris.train"
        name_and_type = db.selected_columns_and_types(conn, select)
        expect_field_names = [item[0] for item in name_and_type]
        expect_field_types = [item[1] for item in name_and_type]
        column_num = len(name_and_type)

        gen = fetch_samples(conn, select, n=0)
        self.assertTrue(gen is None)

        gen = fetch_samples(conn, select, n=-1)
        row_num = length(gen())
        self.assertTrue(np.array_equal(gen.field_names, expect_field_names))
        self.assertTrue(np.array_equal(gen.field_types, expect_field_types))
        self.assertGreater(row_num, 25)

        gen = fetch_samples(conn, select, n=25)
        n = 0

        self.assertTrue(np.array_equal(gen.field_names, expect_field_names))
        self.assertTrue(np.array_equal(gen.field_types, expect_field_types))

        for rows in gen():
            self.assertEqual(len(rows), column_num)
            n += 1

        self.assertEqual(n, 25)

        gen = fetch_samples(conn, select, n=10)
        self.assertTrue(np.array_equal(gen.field_names, expect_field_names))
        self.assertTrue(np.array_equal(gen.field_types, expect_field_types))
        self.assertEqual(length(gen()), 10)

        gen = fetch_samples(conn, "%s LIMIT 1" % select, n=1000)
        self.assertTrue(np.array_equal(gen.field_names, expect_field_names))
        self.assertTrue(np.array_equal(gen.field_types, expect_field_types))
        self.assertEqual(length(gen()), 1)

        gen = fetch_samples(conn, select, n=row_num * 2)
        self.assertTrue(np.array_equal(gen.field_names, expect_field_names))
        self.assertTrue(np.array_equal(gen.field_types, expect_field_types))
        self.assertEqual(length(gen()), row_num)
Esempio n. 8
0
    def test_hive(self):
        driver = testing.get_driver()
        user, password, host, port, database, _, _ = parseHiveDSN(
            testing.get_hive_dsn())
        conn = connect(driver,
                       database,
                       user=user,
                       password=password,
                       host=host,
                       port=port)
        self._do_test(driver,
                      conn,
                      hdfs_namenode_addr="127.0.0.1:8020",
                      hive_location="/sqlflow")
        conn.close()

        conn = testing.get_singleton_db_connection()
        self._do_test(driver, conn)
        self._do_test_hive_specified_db(driver,
                                        conn,
                                        hdfs_namenode_addr="127.0.0.1:8020",
                                        hive_location="/sqlflow")
Esempio n. 9
0
    def test_generator(self):
        driver = testing.get_driver()
        user, password, host, port, database, _ = parseMySQLDSN(
            testing.get_mysql_dsn())
        conn = connect(driver,
                       database,
                       user=user,
                       password=password,
                       host=host,
                       port=int(port))
        # prepare test data
        execute(driver, conn, self.drop_statement)
        execute(driver, conn, self.create_statement)
        execute(driver, conn, self.insert_statement)

        column_name_to_type = {
            "features": {
                "feature_name": "features",
                "delimiter": "",
                "dtype": "float32",
                "is_sparse": False,
                "shape": []
            }
        }
        label_meta = {"feature_name": "label", "shape": [], "delimiter": ""}
        gen = db_generator(conn, "SELECT * FROM test_table_float_fea",
                           label_meta)
        idx = 0
        for row, label in gen():
            features = read_features_from_row(row, ["features"], ["features"],
                                              column_name_to_type)
            d = (features, label)
            if idx == 0:
                self.assertEqual(d, (((1.0, ), ), 0))
            elif idx == 1:
                self.assertEqual(d, (((2.0, ), ), 1))
            idx += 1
        self.assertEqual(idx, 2)
Esempio n. 10
0
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License

import unittest

from runtime import testing
from runtime.dbapi.pyalisa.pop import Pop


@unittest.skipUnless(testing.get_driver() == "alisa", "Skip non-alisa test")
class TestPop(unittest.TestCase):
    def test_signature(self):
        params = {
            "name": "由由",
            "age": "3",
            "homepage": "http://little4.kg?true"
        }
        sign = Pop.signature(params, "POST", "test_secret_key")
        self.assertEqual("6kvgvUDEHtFdZKj8+HhtAS1ovHY=", sign)

    def test_percent_encode(self):
        # "~-_." should not be encoded
        param = "~-_."
        encoded = Pop.percent_encode(param)
        self.assertEqual("~-_.", encoded)
Esempio n. 11
0
class TestDB(TestCase):

    create_statement = "create table test_db (features text, label int)"
    hive_create_statement = 'create table test_db (features string, ' \
                            'label int) ROW FORMAT DELIMITED FIELDS ' \
                            'TERMINATED BY "\001"'
    select_statement = "select * from test_db"
    drop_statement = "drop table if exists test_db"

    @unittest.skipUnless(testing.get_driver() == "mysql",
                         "skip non mysql tests")
    def test_mysql(self):
        driver = testing.get_driver()
        user, password, host, port, database, _ = parseMySQLDSN(
            testing.get_mysql_dsn())
        conn = connect(driver,
                       database,
                       user=user,
                       password=password,
                       host=host,
                       port=port)
        self._do_test(driver, conn)
        conn.close()

        conn = testing.get_singleton_db_connection()
        self._do_test(driver, conn)

    @unittest.skipUnless(testing.get_driver() == "hive", "skip non hive tests")
    def test_hive(self):
        driver = testing.get_driver()
        user, password, host, port, database, _, _ = parseHiveDSN(
            testing.get_hive_dsn())
        conn = connect(driver,
                       database,
                       user=user,
                       password=password,
                       host=host,
                       port=port)
        self._do_test(driver,
                      conn,
                      hdfs_namenode_addr="127.0.0.1:8020",
                      hive_location="/sqlflow")
        conn.close()

        conn = testing.get_singleton_db_connection()
        self._do_test(driver, conn)
        self._do_test_hive_specified_db(driver,
                                        conn,
                                        hdfs_namenode_addr="127.0.0.1:8020",
                                        hive_location="/sqlflow")

    def _do_test_hive_specified_db(self,
                                   driver,
                                   conn,
                                   hdfs_namenode_addr="",
                                   hive_location=""):
        create_db = '''create database if not exists test_db'''
        create_tbl = '''create table test_db.tbl (features string, label int)
                        ROW FORMAT DELIMITED FIELDS TERMINATED BY "\001"'''
        drop_tbl = '''drop table if exists test_db.tbl'''
        select_tbl = '''select * from test_db.tbl'''
        table_schema = ["label", "features"]
        values = [(1, '5,6,1,2')] * 10
        execute(driver, conn, create_db)
        execute(driver, conn, drop_tbl)
        execute(driver, conn, create_tbl)
        with buffered_db_writer(driver,
                                conn,
                                "test_db.tbl",
                                table_schema,
                                buff_size=10,
                                hdfs_namenode_addr=hdfs_namenode_addr,
                                hive_location=hive_location) as w:
            for row in values:
                w.write(row)

        field_names, data = execute(driver, conn, select_tbl)

        expect_features = ['5,6,1,2'] * 10
        expect_labels = [1] * 10

        self.assertEqual(field_names, ['features', 'label'])
        self.assertEqual(expect_features, data[0])
        self.assertEqual(expect_labels, data[1])

    def _do_test(self, driver, conn, hdfs_namenode_addr="", hive_location=""):
        table_name = "test_db"
        table_schema = ["label", "features"]
        values = [(1, '5,6,1,2')] * 10

        execute(driver, conn, self.drop_statement)

        if driver == "hive":
            execute(driver, conn, self.hive_create_statement)
        else:
            execute(driver, conn, self.create_statement)
        with buffered_db_writer(driver,
                                conn,
                                table_name,
                                table_schema,
                                buff_size=10,
                                hdfs_namenode_addr=hdfs_namenode_addr,
                                hive_location=hive_location) as w:
            for row in values:
                w.write(row)

        field_names, data = execute(driver, conn, self.select_statement)

        expect_features = ['5,6,1,2'] * 10
        expect_labels = [1] * 10

        self.assertEqual(field_names, ['features', 'label'])
        self.assertEqual(expect_features, data[0])
        self.assertEqual(expect_labels, data[1])
Esempio n. 12
0
    def test_get_table_schema(self):
        driver = testing.get_driver()
        conn = testing.get_singleton_db_connection()
        if driver == "mysql":
            schema = get_table_schema(conn, "iris.train")
            expect = (
                ('sepal_length', 'FLOAT'),
                ('sepal_width', 'FLOAT'),
                ('petal_length', 'FLOAT'),
                ('petal_width', 'FLOAT'),
                ('class', 'INT(11)'),
            )
            self.assertTrue(np.array_equal(expect, schema))

            schema = selected_columns_and_types(
                conn,
                "SELECT sepal_length, petal_width * 2.3 new_petal_width, "
                "class FROM iris.train")
            expect = [
                ("sepal_length", "FLOAT"),
                ("new_petal_width", "DOUBLE"),
                ("class", "INT"),
            ]
            self.assertTrue(np.array_equal(expect, schema))
        elif driver == "hive":
            schema = get_table_schema(conn, "iris.train")
            expect = (
                ('sepal_length', 'FLOAT'),
                ('sepal_width', 'FLOAT'),
                ('petal_length', 'FLOAT'),
                ('petal_width', 'FLOAT'),
                ('class', 'INT'),
            )
            self.assertTrue(np.array_equal(expect, schema))

            schema = selected_columns_and_types(
                conn,
                "SELECT sepal_length, petal_width * 2.3 AS new_petal_width, "
                "class FROM iris.train")
            expect = [
                ("sepal_length", "FLOAT"),
                ("new_petal_width", "FLOAT"),
                ("class", "INT"),
            ]
            self.assertTrue(np.array_equal(expect, schema))
        elif driver == "maxcompute":
            case_db = os.getenv("SQLFLOW_TEST_DB_MAXCOMPUTE_PROJECT")
            table = "%s.sqlflow_test_iris_train" % case_db
            schema = get_table_schema(conn, table)
            expect = [
                ('sepal_length', 'DOUBLE'),
                ('sepal_width', 'DOUBLE'),
                ('petal_length', 'DOUBLE'),
                ('petal_width', 'DOUBLE'),
                ('class', 'BIGINT'),
            ]
            self.assertTrue(np.array_equal(expect, schema))

            schema = selected_columns_and_types(
                conn,
                "SELECT sepal_length, petal_width * 2.3 new_petal_width, "
                "class FROM %s" % table)
            expect = [
                ("sepal_length", "DOUBLE"),
                ("new_petal_width", "DOUBLE"),
                ("class", "BIGINT"),
            ]
            self.assertTrue(np.array_equal(expect, schema))
Esempio n. 13
0
class TestLimitSelect(TestCase):
    def test_limit_select(self):
        self.assertEqual("SELECT * FROM t LIMIT 2",
                         limit_select("SELECT * FROM t LIMIT 30", 2))

        self.assertEqual("SELECT * FROM t LIMIT 30; \t",
                         limit_select("SELECT * FROM t LIMIT 30; \t", 100))

        self.assertEqual("SELECT * FROM t LIMIT 3",
                         limit_select("SELECT * FROM t", 3))

        self.assertEqual("SELECT * FROM t \t  LIMIT 4; ",
                         limit_select("SELECT * FROM t \t ; ", 4))


@unittest.skipIf(testing.get_driver() == "maxcompute", "skip non mysql tests")
class TestQuery(TestCase):
    def test_query(self):
        conn = connect_with_data_source(testing.get_datasource())
        gen = query(conn, "select * from iris.train limit 1")
        rows = [row for row in gen()]
        self.assertEqual(1, len(rows))

        query(conn, "drop table if exists A")

        query(conn, "create table A(a int);")
        query(conn, "insert into A values(1)")
        gen = query(conn, "select * from A;")
        rows = [row for row in gen()]
        self.assertEqual(1, len(rows))
Esempio n. 14
0
class TestXGBoostTrain(unittest.TestCase):
    def get_table_row_count(self, conn, table):
        ret = list(conn.query("SELECT COUNT(*) FROM %s" % table))
        self.assertEqual(len(ret), 1)
        ret = ret[0]
        self.assertEqual(len(ret), 1)
        return ret[0]

    def get_table_schema(self, conn, table):
        name_and_types = conn.get_table_schema(table)
        return dict(name_and_types)

    @unittest.skipUnless(testing.get_driver() == "mysql",
                         "skip non mysql tests")
    def test_main(self):
        ds = testing.get_datasource()
        original_sql = """SELECT * FROM iris.train
        TO TRAIN xgboost.gbtree
        WITH
            objective="multi:softmax",
            num_boost_round=20,
            num_class=3,
            validation.select="SELECT * FROM iris.test"
        INTO iris.xgboost_train_model_test;
        """

        select = "SELECT * FROM iris.train"
        val_select = "SELECT * FROM iris.test"
        train_params = {"num_boost_round": 20}
        model_params = {"num_class": 3, "objective": "multi:softmax"}
        save_name = "iris.xgboost_train_model_test"
        class_name = "class"

        with temp_file.TemporaryDirectory(as_cwd=True):
            eval_result = train(original_sql=original_sql,
                                model_image="sqlflow:step",
                                estimator_string="xgboost.gbtree",
                                datasource=ds,
                                select=select,
                                validation_select=val_select,
                                model_params=model_params,
                                train_params=train_params,
                                feature_column_map=None,
                                label_column=NumericColumn(
                                    FieldDesc(name=class_name)),
                                save=save_name)
            self.assertLess(eval_result['train']['merror'][-1], 0.01)
            self.assertLess(eval_result['validate']['merror'][-1], 0.01)

            conn = db.connect_with_data_source(ds)

            pred_select = "SELECT * FROM iris.test"
            pred(ds, pred_select, "iris.predict_result_table", class_name,
                 save_name)

            self.assertEqual(
                self.get_table_row_count(conn, "iris.test"),
                self.get_table_row_count(conn, "iris.predict_result_table"))

            schema1 = self.get_table_schema(conn, "iris.test")
            schema2 = self.get_table_schema(conn, "iris.predict_result_table")
            self.assertEqual(len(schema1), len(schema2))
            for name in schema1:
                if name == 'class':
                    self.assertEqual(schema2[name], "BIGINT")
                    continue

                self.assertTrue(name in schema2)
                self.assertEqual(schema1[name], schema2[name])

            diff_schema = schema2.keys() - schema1.keys()
            self.assertEqual(len(diff_schema), 0)

            evaluate(ds, pred_select, "iris.evaluate_result_table", save_name,
                     'class', ['accuracy_score'])
            eval_schema = self.get_table_schema(conn,
                                                "iris.evaluate_result_table")
            self.assertEqual(eval_schema.keys(),
                             set(['loss', 'accuracy_score']))
Esempio n. 15
0
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License

import unittest
from unittest import TestCase

from runtime import testing
from runtime.dbapi.hive import HiveConnection


@unittest.skipUnless(testing.get_driver() == "hive", "Skip non-hive test")
class TestHiveConnection(TestCase):
    def test_connecion(self):
        try:
            conn = HiveConnection(testing.get_datasource())
            conn.close()
        except:  # noqa: E722
            self.fail()

    def test_query(self):
        conn = HiveConnection(testing.get_datasource())
        rs = conn.query("select * from notexist limit 1")
        self.assertFalse(rs.success())
        self.assertTrue("Table not found" in rs.error())

        rs = conn.query("select * from train limit 1")
Esempio n. 16
0
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License

import unittest
from unittest import TestCase

from runtime import testing
from runtime.dbapi.mysql import MySQLConnection


@unittest.skipUnless(testing.get_driver() == "mysql", "Skip non-mysql test")
class TestMySQLConnection(TestCase):
    def test_connecion(self):
        try:
            conn = MySQLConnection(testing.get_datasource())
            conn.close()
        except:  # noqa: E722
            self.fail()

    def test_query(self):
        conn = MySQLConnection(testing.get_datasource())
        rs = conn.query("select * from notexist limit 1")
        self.assertFalse(rs.success())

        rs = conn.query("select * from train limit 1")
        self.assertTrue(rs.success())
Esempio n. 17
0
class TestGenerator(TestCase):
    create_statement = """create table test_table_float_fea
(f1 float, f2 int, f3str VARCHAR(255),
f4sparse VARCHAR(255), f5dense VARCHAR(255), label int)"""
    drop_statement = "drop table if exists test_table_float_fea"
    insert_statement = """insert into test_table_float_fea
(f1,f2,f3str,f4sparse,f5dense,label)
values(1.0,1,'a','1:1.0 2:2.0','1,2,3',0), (NULL,NULL,NULL,NULL,'1,2,3',1)"""

    @unittest.skipUnless(testing.get_driver() == "mysql",
                         "skip non mysql tests")
    def test_generator(self):
        conn = connect(testing.get_datasource())
        # prepare test data
        conn.execute(self.drop_statement)
        conn.execute(self.create_statement)
        conn.execute(self.insert_statement)

        column_name_to_type = {
            "f1": {
                "feature_name": "f1",
                "delimiter": "",
                "dtype": "float32",
                "is_sparse": False,
                "shape": []
            },
            "f2": {
                "feature_name": "f2",
                "delimiter": "",
                "dtype": "int64",
                "is_sparse": False,
                "shape": []
            },
            "f3str": {
                "feature_name": "f3str",
                "delimiter": "",
                "dtype": "string",
                "is_sparse": False,
                "shape": []
            },
            "f4sparse": {
                "feature_name": "f4sparse",
                "delimiter": "",
                "dtype": "float32",
                "is_sparse": True,
                "shape": [],
                "format": "kv"
            },
            "f5dense": {
                "feature_name": "f5dense",
                "delimiter": ",",
                "dtype": "int64",
                "is_sparse": False,
                "shape": [3]
            }
        }
        label_meta = {"feature_name": "label", "shape": [], "delimiter": ""}
        gen = db_generator(conn, "SELECT * FROM test_table_float_fea",
                           label_meta)
        idx = 0
        for row, label in gen():
            if idx == 0:
                features = read_features_from_row(
                    row, ["f1", "f2", "f3str", "f4sparse", "f5dense"],
                    ["f1", "f2", "f3str", "f4sparse", "f5dense"],
                    column_name_to_type)
                self.assertEqual(1.0, features[0][0])
                self.assertEqual(1, features[1][0])
                self.assertEqual('a', features[2][0])
                self.assertTrue(
                    np.array_equal(np.array([[1], [2]]), features[3][0]))
                self.assertTrue(
                    np.array_equal(np.array([1., 2.], dtype=np.float32),
                                   features[3][1]))
                self.assertTrue(
                    np.array_equal(np.array([1, 2, 3]), features[4][0]))
                self.assertEqual(0, label)
            elif idx == 1:
                try:
                    features = read_features_from_row(
                        row, ["f1", "f2", "f3str", "f4sparse", "f5dense"],
                        ["f1", "f2", "f3str", "f4sparse", "f5dense"],
                        column_name_to_type)
                except Exception as e:
                    self.assertTrue(isinstance(e, ValueError))
                features = read_features_from_row(
                    row, ["f1", "f2", "f3str", "f4sparse", "f5dense"],
                    ["f1", "f2", "f3str", "f4sparse", "f5dense"],
                    column_name_to_type,
                    is_xgboost=True)
                self.assertEqual(XGBOOST_NULL_MAGIC, features[0][0])
                self.assertEqual(int(XGBOOST_NULL_MAGIC), features[1][0])
                self.assertEqual("", features[2][0])
                self.assertTrue(np.array_equal(np.array([]), features[3][0]))
                self.assertTrue(np.array_equal(np.array([]), features[3][1]))
                self.assertTrue(
                    np.array_equal(np.array([1, 2, 3]), features[4][0]))
                self.assertEqual(1, label)
            idx += 1
        self.assertEqual(idx, 2)

    @unittest.skipUnless(testing.get_driver() == "mysql",
                         "skip non mysql tests")
    def test_generate_fetch_size(self):
        label_meta = {"feature_name": "label", "shape": [], "delimiter": ""}
        gen = db_generator(testing.get_singleton_db_connection(),
                           'SELECT * FROM iris.train limit 10', label_meta)
        self.assertEqual(len([g for g in gen()]), 10)
Esempio n. 18
0
    def check_main_impl(self, estimator):
        if testing.get_driver() != "mysql":
            return

        ds = testing.get_datasource()
        original_sql = """SELECT * FROM iris.train
        TO TRAIN %s
        WITH
            model.hidden_units=[32,64],
            model.n_classes=3,
            validation.select="SELECT * FROM iris.test"
        LABEL class
        INTO iris.tensorflow_train_model_test;
        """ % estimator

        select = "SELECT * FROM iris.train"
        val_select = "SELECT * FROM iris.test"
        train_params = {"batch_size": 10}
        model_params = {"n_classes": 3, "hidden_units": [32, 64]}
        save_name = "iris.tensorflow_train_model_test"
        class_name = "class"

        with temp_file.TemporaryDirectory(as_cwd=True):
            train(original_sql=original_sql,
                  model_image="sqlflow:step",
                  estimator_string=estimator,
                  datasource=ds,
                  select=select,
                  validation_select=val_select,
                  model_params=model_params,
                  train_params=train_params,
                  validation_params=None,
                  feature_column_map=None,
                  label_column=NumericColumn(
                      FieldDesc(name=class_name, shape=[])),
                  save=save_name,
                  load=None)

        conn = db.connect_with_data_source(ds)

        pred_select = "SELECT * FROM iris.test"

        with temp_file.TemporaryDirectory(as_cwd=True):
            pred(ds, pred_select, "iris.predict_result_table", class_name,
                 save_name)

        self.assertEqual(
            self.get_table_row_count(conn, "iris.test"),
            self.get_table_row_count(conn, "iris.predict_result_table"))

        schema1 = self.get_table_schema(conn, "iris.test")
        schema2 = self.get_table_schema(conn, "iris.predict_result_table")
        self.assertEqual(len(schema1), len(schema2))
        for name in schema1:
            if name == 'class':
                self.assertEqual(schema2[name], "BIGINT")
                continue

            self.assertTrue(name in schema2)
            self.assertEqual(schema1[name], schema2[name])

        diff_schema = schema2.keys() - schema1.keys()
        self.assertEqual(len(diff_schema), 0)

        with temp_file.TemporaryDirectory(as_cwd=True):
            evaluate(ds, select, "iris.evaluate_result_table", save_name,
                     class_name, {'validation.metrics': 'Accuracy'})

        eval_schema = self.get_table_schema(conn, "iris.evaluate_result_table")
        eval_schema = set([k.lower() for k in eval_schema.keys()])
        self.assertEqual(eval_schema, set(['loss', 'accuracy']))

        with temp_file.TemporaryDirectory(as_cwd=True):
            explain(ds, select, None, {"plot_type": "bar"},
                    "iris.explain_result_table", save_name)

        explain_schema = self.get_table_schema(conn,
                                               "iris.explain_result_table")
        self.assertEqual(
            explain_schema.keys(),
            set(['petal_length', 'petal_width', 'sepal_length',
                 'sepal_width']))
        conn.close()
Esempio n. 19
0

iris_feature_column_names = [
    "sepal_length",
    "sepal_width",
    "petal_length",
    "petal_width",
]

feature_column_map = {
    "feature_columns": [fc.NumericColumn(fd.FieldDesc(name="sepal_length"))]
}
label_column = fc.NumericColumn(fd.FieldDesc(name="class"))


@unittest.skipUnless(testing.get_driver() == "maxcompute"
                     and testing.get_submitter() == "pai",
                     "skip non PAI tests")
class SubmitPAITrainTask(TestCase):
    def test_submit_pai_train_task(self):
        model_params = dict()
        model_params["hidden_units"] = [10, 20]
        model_params["n_classes"] = 3

        original_sql = """
SELECT * FROM alifin_jtest_dev.sqlflow_test_iris_train
TO TRAIN DNNClassifier
WITH model.n_classes = 3, model.hidden_units = [10, 20]
LABEL class
INTO e2etest_pai_dnn;"""
Esempio n. 20
0
class TestDB(TestCase):

    create_statement = "create table test_db (features text, label int)"
    hive_create_statement = 'create table test_db (features string, ' \
                            'label int) ROW FORMAT DELIMITED FIELDS ' \
                            'TERMINATED BY "\001"'
    select_statement = "select * from test_db"
    drop_statement = "drop table if exists test_db"

    @unittest.skipUnless(testing.get_driver() == "mysql",
                         "skip non mysql tests")
    def test_mysql(self):
        conn = connect(testing.get_datasource())
        self._do_test(conn)
        conn.close()

    @unittest.skipUnless(testing.get_driver() == "hive", "skip non hive tests")
    def test_hive(self):
        uri = testing.get_datasource()
        conn = connect(uri)
        self._do_test(conn)
        self._do_test_hive_specified_db(conn)

    def _do_test_hive_specified_db(self, conn):
        create_db = '''create database if not exists test_db'''
        create_tbl = '''create table test_db.tbl (features string, label int)
                        ROW FORMAT DELIMITED FIELDS TERMINATED BY "\001"'''
        drop_tbl = '''drop table if exists test_db.tbl'''
        select_tbl = '''select * from test_db.tbl'''
        table_schema = ["label", "features"]
        values = [(1, '5,6,1,2')] * 10
        self.assertTrue(conn.execute(create_db))
        self.assertTrue(conn.execute(drop_tbl))
        self.assertTrue(conn.execute(create_tbl))

        with buffered_db_writer(conn,
                                "test_db.tbl",
                                table_schema,
                                buff_size=10) as w:
            for row in values:
                w.write(row)

        field_names, data = execute(conn, select_tbl)

        expect_result = [('5,6,1,2', 1)] * 10

        self.assertEqual(field_names, ['features', 'label'])
        self.assertEqual(expect_result, data)

    def _do_test(self, conn):
        table_name = "test_db"
        table_schema = ["features", "label"]
        values = [('5,6,1,2', 1)] * 10

        conn.execute(self.drop_statement)

        if conn.driver == "hive":
            conn.execute(self.hive_create_statement)
        else:
            conn.execute(self.create_statement)
        with buffered_db_writer(conn, table_name, table_schema,
                                buff_size=10) as w:
            for row in values:
                w.write(row)

        field_names, data = execute(conn, self.select_statement)

        self.assertEqual(table_schema, field_names)
        self.assertEqual(values, data)
Esempio n. 21
0
class TestLimitSelect(TestCase):
    def test_limit_select(self):
        self.assertEqual("SELECT * FROM t LIMIT 2",
                         limit_select("SELECT * FROM t LIMIT 30", 2))

        self.assertEqual("SELECT * FROM t LIMIT 30; \t",
                         limit_select("SELECT * FROM t LIMIT 30; \t", 100))

        self.assertEqual("SELECT * FROM t LIMIT 3",
                         limit_select("SELECT * FROM t", 3))

        self.assertEqual("SELECT * FROM t \t  LIMIT 4; ",
                         limit_select("SELECT * FROM t \t ; ", 4))


@unittest.skipIf(testing.get_driver() == "maxcompute", "skip maxcompute tests")
class TestQuery(TestCase):
    def test_query(self):
        conn = connect_with_data_source(testing.get_datasource())
        rs = conn.query("select * from iris.train limit 1")
        rows = [row for row in rs]
        self.assertEqual(1, len(rows))

        conn.execute("drop table if exists A")
        conn.execute("create table A(a int);")
        conn.execute("insert into A values(1)")
        rs = conn.query("select * from A;")
        rows = [row for row in rs]
        self.assertEqual(1, len(rows))

        conn.query("truncate table A")
Esempio n. 22
0
            self.assertEqual(fd.infer_string_data_format(s), DataFormat.PLAIN)


class TestGetMaxIndexOfKeyValueString(unittest.TestCase):
    def infer_index(self, string):
        field_desc = FieldDesc(shape=[0])
        fd.fill_kv_field_desc(string, field_desc)
        return field_desc.shape[0]

    def test_infer_index(self):
        self.assertEqual(self.infer_index("1:2 3:4 2:1"), 4)
        self.assertEqual(self.infer_index("7:2\t 3:-4 10:20"), 11)
        self.assertEqual(self.infer_index("1:7"), 2)


@unittest.skipUnless(testing.get_driver() in ["mysql", "hive"],
                     "skip non MySQL and Hive tests")
class TestFeatureDerivationWithMockedFeatures(unittest.TestCase):
    def check_json_dump(self, features):
        dump_json = json.dumps(features, cls=fc.JSONEncoderWithFeatureColumn)
        new_features = json.loads(dump_json,
                                  cls=fc.JSONDecoderWithFeatureColumn)
        new_dump_json = json.dumps(new_features,
                                   cls=fc.JSONEncoderWithFeatureColumn)
        self.assertEqual(dump_json, new_dump_json)

    def test_without_cross(self):
        features = {
            'feature_columns': [
                EmbeddingColumn(dimension=256, combiner="mean", name="c3"),
                EmbeddingColumn(category_column=CategoryIDColumn(
Esempio n. 23
0
class TestXGBoostTrain(unittest.TestCase):
    def get_table_row_count(self, conn, table):
        rs = conn.query("SELECT COUNT(*) FROM %s" % table)
        ret = list(rs)
        rs.close()
        self.assertEqual(len(ret), 1)
        ret = ret[0]
        self.assertEqual(len(ret), 1)
        return ret[0]

    def get_table_schema(self, conn, table):
        name_and_types = conn.get_table_schema(table)
        return dict(name_and_types)

    @unittest.skipUnless(testing.get_driver() == "mysql",
                         "skip non mysql tests")
    def test_main(self):
        ds = testing.get_datasource()
        original_sql = """SELECT * FROM iris.train
        TO TRAIN xgboost.gbtree
        WITH
            objective="multi:softprob",
            num_boost_round=20,
            num_class=3,
            validation.select="SELECT * FROM iris.test"
        LABEL class
        INTO iris.xgboost_train_model_test;
        """

        select = "SELECT * FROM iris.train"
        val_select = "SELECT * FROM iris.test"
        train_params = {"num_boost_round": 20}
        model_params = {"num_class": 3, "objective": "multi:softprob"}
        save_name = "iris.xgboost_train_model_test"
        class_name = "class"

        with temp_file.TemporaryDirectory(as_cwd=True):
            eval_result = train(datasource=ds,
                                original_sql=original_sql,
                                select=select,
                                validation_select=val_select,
                                estimator_string="xgboost.gbtree",
                                model_image="sqlflow:step",
                                feature_column_map=None,
                                label_column=NumericColumn(
                                    FieldDesc(name=class_name)),
                                model_params=model_params,
                                train_params=train_params,
                                validation_params=None,
                                save=save_name,
                                load=None)

        self.assertLess(eval_result['train']['merror'][-1], 0.01)
        self.assertLess(eval_result['validate']['merror'][-1], 0.01)

        conn = db.connect_with_data_source(ds)
        pred_select = "SELECT * FROM iris.test"

        with temp_file.TemporaryDirectory(as_cwd=True):
            result_column_names, train_label_idx = create_predict_table(
                conn, select, "iris.predict_result_table",
                FieldDesc(name=class_name), "class")
            predict(ds, pred_select, "iris.predict_result_table",
                    result_column_names, train_label_idx, save_name)

        self.assertEqual(
            self.get_table_row_count(conn, "iris.test"),
            self.get_table_row_count(conn, "iris.predict_result_table"))

        schema1 = self.get_table_schema(conn, "iris.test")
        schema2 = self.get_table_schema(conn, "iris.predict_result_table")
        self.assertEqual(len(schema1), len(schema2))
        for name in schema1:
            if name == 'class':
                self.assertEqual(schema2[name], "BIGINT")
                continue

            self.assertTrue(name in schema2)
            self.assertEqual(schema1[name], schema2[name])

        diff_schema = schema2.keys() - schema1.keys()
        self.assertEqual(len(diff_schema), 0)

        with temp_file.TemporaryDirectory(as_cwd=True):
            result_column_names = create_evaluate_table(
                conn, "iris.evaluate_result_table", ["accuracy_score"])
            evaluate(ds,
                     pred_select,
                     "iris.evaluate_result_table",
                     save_name,
                     label_name='class',
                     model_params={'validation.metrics': 'accuracy_score'},
                     result_column_names=result_column_names)

        eval_schema = self.get_table_schema(conn, "iris.evaluate_result_table")
        self.assertEqual(eval_schema.keys(), set(['loss', 'accuracy_score']))

        with temp_file.TemporaryDirectory(as_cwd=True):
            feature_column_names = [
                "petal_width", "petal_length", "sepal_width", "sepal_length"
            ]
            create_explain_table(conn, EstimatorType.XGBOOST, "TreeExplainer",
                                 "xgboost.gbtree", "iris.explain_result_table",
                                 feature_column_names)
            explain(ds, select, "TreeExplainer", {"plot_type": "decision"},
                    "iris.explain_result_table", save_name)

        explain_schema = self.get_table_schema(conn,
                                               "iris.explain_result_table")
        self.assertEqual(explain_schema.keys(), set(feature_column_names))

        with temp_file.TemporaryDirectory(as_cwd=True):
            create_explain_table(conn, EstimatorType.XGBOOST,
                                 "XGBoostExplainer", "xgboost.gbtree",
                                 "iris.explain_result_table_2",
                                 feature_column_names)
            explain(ds, select, "XGBoostExplainer", {},
                    "iris.explain_result_table_2", save_name)

        explain_schema = self.get_table_schema(conn,
                                               "iris.explain_result_table_2")
        self.assertEqual(explain_schema.keys(),
                         set(['feature', 'fscore', 'gain']))
        conn.close()