Example 1
    def test_generate_fetch_size(self):
        label_meta = {"feature_name": "label", "shape": [], "delimiter": ""}
        gen = db_generator(testing.get_singleton_db_connection(),
                           'SELECT * FROM iris.train limit 10',
                           label_meta,
                           fetch_size=4)
        self.assertEqual(len([g for g in gen()]), 10)
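A side note on the final assertion: the test appears to verify that fetch_size=4 only controls how many rows are pulled per database round trip, while the generator still yields all 10 selected rows. The list comprehension materializes every row just to count it; an equivalent count that avoids the intermediate list (a sketch using the same gen as above) would be:

        # Count the yielded rows lazily instead of building a throwaway list.
        row_count = sum(1 for _ in gen())
        self.assertEqual(row_count, 10)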
Example 2
    def test_mysql(self):
        driver = testing.get_driver()
        user, password, host, port, database, _ = parseMySQLDSN(
            testing.get_mysql_dsn())
        conn = connect(driver,
                       database,
                       user=user,
                       password=password,
                       host=host,
                       port=port)
        self._do_test(driver, conn)
        conn.close()

        conn = testing.get_singleton_db_connection()
        self._do_test(driver, conn)
Example 3
    def test_verify_column_name_and_type(self):
        conn = testing.get_singleton_db_connection()

        train_table = "iris.train"
        test_table = "iris.test"

        train_select = [
            "petal_length", "petal_width", "sepal_length", "sepal_width",
            "class"
        ]
        test_select = train_select
        verify_column_name_and_type(
            conn, self.generate_select(train_table, train_select),
            self.generate_select(test_table, test_select), "class")

        test_select = [
            "petal_length", "petal_width", "sepal_length", "sepal_width"
        ]
        verify_column_name_and_type(
            conn, self.generate_select(train_table, train_select),
            self.generate_select(test_table, test_select), "class")

        test_select = ["petal_length", "petal_width", "sepal_length"]
        with self.assertRaises(ValueError):
            verify_column_name_and_type(
                conn, self.generate_select(train_table, train_select),
                self.generate_select(test_table, test_select), "class")

        cursor = conn.cursor()
        name_and_type = dict(db.get_table_schema(conn, test_table))
        new_table_name = "iris.verifier_test_table"

        name_and_type["petal_length"] = "VARCHAR(255)"  # change the data type
        create_column_str = ",".join(
            ["%s %s" % (n, t) for n, t in name_and_type.items()])

        drop_sql = "DROP TABLE IF EXISTS %s" % new_table_name
        create_sql = "CREATE TABLE %s(%s)" % (new_table_name,
                                              create_column_str)
        cursor.execute(drop_sql)
        cursor.execute(create_sql)
        with self.assertRaises(ValueError):
            test_select = train_select
            verify_column_name_and_type(
                conn, self.generate_select(train_table, train_select),
                self.generate_select(new_table_name, test_select), "class")
        cursor.execute(drop_sql)
        cursor.close()
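Note that generate_select is a helper on the test class and is not shown in these excerpts. Judging from how it is called, it presumably builds a SELECT statement from a table name and a column list; a hypothetical minimal sketch:

    # Hypothetical sketch only; the real helper is not shown in these excerpts.
    def generate_select(self, table, columns):
        return "SELECT %s FROM %s" % (", ".join(columns), table)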
Example 4
    def test_fetch_sample(self):
        conn = testing.get_singleton_db_connection()

        select = "SELECT * FROM iris.train"
        name_and_type = db.selected_columns_and_types(conn, select)
        expect_field_names = [item[0] for item in name_and_type]
        expect_field_types = [item[1] for item in name_and_type]
        column_num = len(name_and_type)

        gen = fetch_samples(conn, select, n=0)
        self.assertTrue(gen is None)

        gen = fetch_samples(conn, select, n=-1)
        row_num = length(gen())
        self.assertTrue(np.array_equal(gen.field_names, expect_field_names))
        self.assertTrue(np.array_equal(gen.field_types, expect_field_types))
        self.assertGreater(row_num, 25)

        gen = fetch_samples(conn, select, n=25)
        n = 0

        self.assertTrue(np.array_equal(gen.field_names, expect_field_names))
        self.assertTrue(np.array_equal(gen.field_types, expect_field_types))

        for rows in gen():
            self.assertEqual(len(rows), column_num)
            n += 1

        self.assertEqual(n, 25)

        gen = fetch_samples(conn, select, n=10)
        self.assertTrue(np.array_equal(gen.field_names, expect_field_names))
        self.assertTrue(np.array_equal(gen.field_types, expect_field_types))
        self.assertEqual(length(gen()), 10)

        gen = fetch_samples(conn, "%s LIMIT 1" % select, n=1000)
        self.assertTrue(np.array_equal(gen.field_names, expect_field_names))
        self.assertTrue(np.array_equal(gen.field_types, expect_field_types))
        self.assertEqual(length(gen()), 1)

        gen = fetch_samples(conn, select, n=row_num * 2)
        self.assertTrue(np.array_equal(gen.field_names, expect_field_names))
        self.assertTrue(np.array_equal(gen.field_types, expect_field_types))
        self.assertEqual(length(gen()), row_num)
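The length() calls above are not a Python builtin; they presumably refer to a small module-level helper that counts the rows yielded by a generator. A minimal sketch under that assumption:

    # Assumed helper: count items yielded by a generator without materializing them.
    def length(iterable):
        return sum(1 for _ in iterable)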
Example 5
    def test_no_column_clause(self):
        columns = [
            "sepal_length",
            "sepal_width",
            "petal_length",
            "petal_width",
        ]

        select = "select %s, class from iris.train" % ",".join(columns)

        conn = testing.get_singleton_db_connection()
        features = None
        label = NumericColumn(
            FieldDesc(name='class', dtype=DataType.INT64, shape=[1]))
        features, label = fd.infer_feature_columns(conn, select, features,
                                                   label)

        self.check_json_dump(features)
        self.check_json_dump(label)

        self.assertEqual(len(features), 1)
        self.assertTrue("feature_columns" in features)
        features = features["feature_columns"]
        self.assertEqual(len(features), 4)

        for i, f in enumerate(features):
            self.assertTrue(isinstance(f, NumericColumn))
            self.assertEqual(len(f.get_field_desc()), 1)
            field_desc = f.get_field_desc()[0]
            self.assertEqual(field_desc.name, columns[i])
            self.assertEqual(field_desc.dtype, DataType.FLOAT32)
            self.assertEqual(field_desc.format, DataFormat.PLAIN)
            self.assertFalse(field_desc.is_sparse)
            self.assertEqual(field_desc.shape, [1])

        self.assertTrue(isinstance(label, NumericColumn))
        self.assertEqual(len(label.get_field_desc()), 1)
        field_desc = label.get_field_desc()[0]
        self.assertEqual(field_desc.name, "class")
        self.assertEqual(field_desc.dtype, DataType.INT64)
        self.assertEqual(field_desc.format, DataFormat.PLAIN)
        self.assertFalse(field_desc.is_sparse)
        self.assertEqual(field_desc.shape, [])
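check_json_dump is another helper not shown in these excerpts. Judging from the calls, it asserts that a feature column (or label) survives JSON serialization. A purely illustrative sketch, assuming a hypothetical to_dict()-style representation on the column classes and an import json at module level:

    # Purely illustrative; assumes a hypothetical to_dict() method, which may
    # not match the project's real serialization API.
    def check_json_dump(self, column):
        dumped = json.dumps(column.to_dict())
        self.assertEqual(json.loads(dumped), column.to_dict())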
Example 6
    def test_hive(self):
        driver = testing.get_driver()
        user, password, host, port, database, _, _ = parseHiveDSN(
            testing.get_hive_dsn())
        conn = connect(driver,
                       database,
                       user=user,
                       password=password,
                       host=host,
                       port=port)
        self._do_test(driver,
                      conn,
                      hdfs_namenode_addr="127.0.0.1:8020",
                      hive_location="/sqlflow")
        conn.close()

        conn = testing.get_singleton_db_connection()
        self._do_test(driver, conn)
        self._do_test_hive_specified_db(driver,
                                        conn,
                                        hdfs_namenode_addr="127.0.0.1:8020",
                                        hive_location="/sqlflow")
Example 7
    def test_with_cross(self):
        c1 = NumericColumn(
            FieldDesc(name='c1', dtype=DataType.INT64, shape=[1]))
        c2 = NumericColumn(
            FieldDesc(name='c2', dtype=DataType.INT64, shape=[1]))
        c4 = NumericColumn(
            FieldDesc(name='c4', dtype=DataType.INT64, shape=[1]))
        c5 = NumericColumn(
            FieldDesc(name='c5',
                      dtype=DataType.INT64,
                      shape=[1],
                      is_sparse=True))

        features = {
            'feature_columns': [
                c1,
                c2,
                CrossColumn([c4, c5], 128),
                CrossColumn([c1, c2], 256),
            ]
        }

        label = NumericColumn(
            FieldDesc(name='class', dtype=DataType.INT64, shape=[1]))
        select = "select c1, c2, c3, c4, c5, class " \
                 "from feature_derivation_case.train"

        conn = testing.get_singleton_db_connection()
        features, label = fd.infer_feature_columns(conn, select, features,
                                                   label)

        self.check_json_dump(features)
        self.check_json_dump(label)

        self.assertEqual(len(features), 1)
        self.assertTrue("feature_columns" in features)
        features = features["feature_columns"]
        self.assertEqual(len(features), 5)

        fc1 = features[0]
        self.assertTrue(isinstance(fc1, NumericColumn))
        self.assertEqual(len(fc1.get_field_desc()), 1)
        field_desc = fc1.get_field_desc()[0]
        self.assertEqual(field_desc.name, "c1")
        self.assertEqual(field_desc.dtype, DataType.FLOAT32)
        self.assertEqual(field_desc.format, DataFormat.PLAIN)
        self.assertFalse(field_desc.is_sparse)
        self.assertEqual(field_desc.shape, [1])

        fc2 = features[1]
        self.assertTrue(isinstance(fc2, NumericColumn))
        self.assertEqual(len(fc2.get_field_desc()), 1)
        field_desc = fc2.get_field_desc()[0]
        self.assertEqual(field_desc.name, "c2")
        self.assertEqual(field_desc.dtype, DataType.FLOAT32)
        self.assertEqual(field_desc.format, DataFormat.PLAIN)
        self.assertFalse(field_desc.is_sparse)
        self.assertEqual(field_desc.shape, [1])

        fc3 = features[2]
        self.assertTrue(isinstance(fc3, NumericColumn))
        self.assertEqual(len(fc3.get_field_desc()), 1)
        field_desc = fc3.get_field_desc()[0]
        self.assertEqual(field_desc.name, "c3")
        self.assertEqual(field_desc.dtype, DataType.INT64)
        self.assertEqual(field_desc.format, DataFormat.CSV)
        self.assertFalse(field_desc.is_sparse)
        self.assertEqual(field_desc.shape, [4])

        fc4 = features[3]
        self.assertTrue(isinstance(fc4, CrossColumn))
        self.assertEqual(len(fc4.get_field_desc()), 2)
        field_desc1 = fc4.get_field_desc()[0]
        self.assertEqual(field_desc1.name, "c4")
        self.assertEqual(field_desc1.dtype, DataType.FLOAT32)
        self.assertEqual(field_desc1.format, DataFormat.CSV)
        self.assertEqual(field_desc1.shape, [4])
        self.assertFalse(field_desc1.is_sparse)
        field_desc2 = fc4.get_field_desc()[1]
        self.assertEqual(field_desc2.name, "c5")
        self.assertEqual(field_desc2.dtype, DataType.INT64)
        self.assertEqual(field_desc2.format, DataFormat.CSV)
        self.assertTrue(field_desc2.is_sparse)

        fc5 = features[4]
        self.assertTrue(isinstance(fc5, CrossColumn))
        self.assertEqual(len(fc5.get_field_desc()), 2)
        field_desc1 = fc5.get_field_desc()[0]
        self.assertEqual(field_desc1.name, "c1")
        self.assertEqual(field_desc1.dtype, DataType.FLOAT32)
        self.assertEqual(field_desc1.format, DataFormat.PLAIN)
        self.assertEqual(field_desc1.shape, [1])
        self.assertFalse(field_desc1.is_sparse)
        field_desc2 = fc5.get_field_desc()[1]
        self.assertEqual(field_desc2.name, "c2")
        self.assertEqual(field_desc2.dtype, DataType.FLOAT32)
        self.assertEqual(field_desc2.format, DataFormat.PLAIN)
        self.assertEqual(field_desc2.shape, [1])
        self.assertFalse(field_desc2.is_sparse)

        self.assertTrue(isinstance(label, NumericColumn))
        self.assertEqual(len(label.get_field_desc()), 1)
        field_desc = label.get_field_desc()[0]
        self.assertEqual(field_desc.name, "class")
        self.assertEqual(field_desc.dtype, DataType.INT64)
        self.assertEqual(field_desc.format, DataFormat.PLAIN)
        self.assertFalse(field_desc.is_sparse)
        self.assertEqual(field_desc.shape, [])
Example 8
    def test_without_cross(self):
        features = {
            'feature_columns': [
                EmbeddingColumn(dimension=256, combiner="mean", name="c3"),
                EmbeddingColumn(category_column=CategoryIDColumn(
                    FieldDesc(name="c5",
                              dtype=DataType.INT64,
                              shape=[10000],
                              delimiter=",",
                              is_sparse=True),
                    bucket_size=5000),
                                dimension=64,
                                combiner="sqrtn",
                                name="c5"),
            ]
        }

        label = NumericColumn(
            FieldDesc(name="class", dtype=DataType.INT64, shape=[1]))

        select = "select c1, c2, c3, c4, c5, c6, class " \
                 "from feature_derivation_case.train"
        conn = testing.get_singleton_db_connection()
        features, label = fd.infer_feature_columns(conn, select, features,
                                                   label)

        self.check_json_dump(features)
        self.check_json_dump(label)

        self.assertEqual(len(features), 1)
        self.assertTrue("feature_columns" in features)
        features = features["feature_columns"]
        self.assertEqual(len(features), 6)

        fc1 = features[0]
        self.assertTrue(isinstance(fc1, NumericColumn))
        self.assertEqual(len(fc1.get_field_desc()), 1)
        field_desc = fc1.get_field_desc()[0]
        self.assertEqual(field_desc.name, "c1")
        self.assertEqual(field_desc.dtype, DataType.FLOAT32)
        self.assertEqual(field_desc.format, DataFormat.PLAIN)
        self.assertFalse(field_desc.is_sparse)
        self.assertEqual(field_desc.shape, [1])

        fc2 = features[1]
        self.assertTrue(isinstance(fc2, NumericColumn))
        self.assertEqual(len(fc2.get_field_desc()), 1)
        field_desc = fc2.get_field_desc()[0]
        self.assertEqual(field_desc.name, "c2")
        self.assertEqual(field_desc.dtype, DataType.FLOAT32)
        self.assertEqual(field_desc.format, DataFormat.PLAIN)
        self.assertFalse(field_desc.is_sparse)
        self.assertEqual(field_desc.shape, [1])

        fc3 = features[2]
        self.assertTrue(isinstance(fc3, EmbeddingColumn))
        self.assertEqual(len(fc3.get_field_desc()), 1)
        field_desc = fc3.get_field_desc()[0]
        self.assertEqual(field_desc.name, "c3")
        self.assertEqual(field_desc.dtype, DataType.INT64)
        self.assertEqual(field_desc.format, DataFormat.CSV)
        self.assertFalse(field_desc.is_sparse)
        self.assertEqual(field_desc.shape, [4])
        self.assertEqual(fc3.dimension, 256)
        self.assertEqual(fc3.combiner, "mean")
        self.assertEqual(fc3.name, "c3")
        self.assertTrue(isinstance(fc3.category_column, CategoryIDColumn))
        self.assertEqual(fc3.category_column.bucket_size, 10)

        fc4 = features[3]
        self.assertTrue(isinstance(fc4, NumericColumn))
        self.assertEqual(len(fc4.get_field_desc()), 1)
        field_desc = fc4.get_field_desc()[0]
        self.assertEqual(field_desc.name, "c4")
        self.assertEqual(field_desc.dtype, DataType.FLOAT32)
        self.assertEqual(field_desc.format, DataFormat.CSV)
        self.assertFalse(field_desc.is_sparse)
        self.assertEqual(field_desc.shape, [4])

        fc5 = features[4]
        self.assertTrue(isinstance(fc5, EmbeddingColumn))
        self.assertEqual(len(fc5.get_field_desc()), 1)
        field_desc = fc5.get_field_desc()[0]
        self.assertEqual(field_desc.name, "c5")
        self.assertEqual(field_desc.dtype, DataType.INT64)
        self.assertEqual(field_desc.format, DataFormat.CSV)
        self.assertTrue(field_desc.is_sparse)
        self.assertEqual(field_desc.shape, [10000])
        self.assertEqual(fc5.dimension, 64)
        self.assertEqual(fc5.combiner, "sqrtn")
        self.assertEqual(fc5.name, "c5")
        self.assertTrue(isinstance(fc5.category_column, CategoryIDColumn))
        self.assertEqual(fc5.category_column.bucket_size, 5000)

        fc6 = features[5]
        self.assertTrue(isinstance(fc6, EmbeddingColumn))
        self.assertEqual(len(fc6.get_field_desc()), 1)
        field_desc = fc6.get_field_desc()[0]
        self.assertEqual(field_desc.name, "c6")
        self.assertEqual(field_desc.dtype, DataType.STRING)
        self.assertEqual(field_desc.format, DataFormat.PLAIN)
        self.assertFalse(field_desc.is_sparse)
        self.assertEqual(field_desc.shape, [1])
        self.assertEqual(field_desc.vocabulary, set(['FEMALE', 'MALE',
                                                     'NULL']))
        self.assertEqual(fc6.dimension, 128)
        self.assertEqual(fc6.combiner, "sum")
        self.assertEqual(fc6.name, "c6")
        self.assertTrue(isinstance(fc6.category_column, CategoryIDColumn))
        self.assertEqual(fc6.category_column.bucket_size, 3)

        self.assertTrue(isinstance(label, NumericColumn))
        self.assertEqual(len(label.get_field_desc()), 1)
        field_desc = label.get_field_desc()[0]
        self.assertEqual(field_desc.name, "class")
        self.assertEqual(field_desc.dtype, DataType.INT64)
        self.assertEqual(field_desc.format, DataFormat.PLAIN)
        self.assertFalse(field_desc.is_sparse)
        self.assertEqual(field_desc.shape, [])
Example 9
    def test_get_table_schema(self):
        driver = testing.get_driver()
        conn = testing.get_singleton_db_connection()
        if driver == "mysql":
            schema = get_table_schema(conn, "iris.train")
            expect = (
                ('sepal_length', 'FLOAT'),
                ('sepal_width', 'FLOAT'),
                ('petal_length', 'FLOAT'),
                ('petal_width', 'FLOAT'),
                ('class', 'INT(11)'),
            )
            self.assertTrue(np.array_equal(expect, schema))

            schema = selected_columns_and_types(
                conn,
                "SELECT sepal_length, petal_width * 2.3 new_petal_width, "
                "class FROM iris.train")
            expect = [
                ("sepal_length", "FLOAT"),
                ("new_petal_width", "DOUBLE"),
                ("class", "INT"),
            ]
            self.assertTrue(np.array_equal(expect, schema))
        elif driver == "hive":
            schema = get_table_schema(conn, "iris.train")
            expect = (
                ('sepal_length', 'FLOAT'),
                ('sepal_width', 'FLOAT'),
                ('petal_length', 'FLOAT'),
                ('petal_width', 'FLOAT'),
                ('class', 'INT'),
            )
            self.assertTrue(np.array_equal(expect, schema))

            schema = selected_columns_and_types(
                conn,
                "SELECT sepal_length, petal_width * 2.3 AS new_petal_width, "
                "class FROM iris.train")
            expect = [
                ("sepal_length", "FLOAT"),
                ("new_petal_width", "FLOAT"),
                ("class", "INT"),
            ]
            self.assertTrue(np.array_equal(expect, schema))
        elif driver == "maxcompute":
            case_db = os.getenv("SQLFLOW_TEST_DB_MAXCOMPUTE_PROJECT")
            table = "%s.sqlflow_test_iris_train" % case_db
            schema = get_table_schema(conn, table)
            expect = [
                ('sepal_length', 'DOUBLE'),
                ('sepal_width', 'DOUBLE'),
                ('petal_length', 'DOUBLE'),
                ('petal_width', 'DOUBLE'),
                ('class', 'BIGINT'),
            ]
            self.assertTrue(np.array_equal(expect, schema))

            schema = selected_columns_and_types(
                conn,
                "SELECT sepal_length, petal_width * 2.3 new_petal_width, "
                "class FROM %s" % table)
            expect = [
                ("sepal_length", "DOUBLE"),
                ("new_petal_width", "DOUBLE"),
                ("class", "BIGINT"),
            ]
            self.assertTrue(np.array_equal(expect, schema))