def test_generate_fetch_size(self):
    """db_generator must honor fetch_size yet still yield every row.

    The query LIMITs the result to 10 rows while fetch_size=4, so the
    generator has to issue several fetches (4 + 4 + 2) to drain the
    cursor; the total row count must still be 10.
    """
    label_meta = {"feature_name": "label", "shape": [], "delimiter": ""}
    gen = db_generator(testing.get_singleton_db_connection(),
                       'SELECT * FROM iris.train limit 10',
                       label_meta,
                       fetch_size=4)
    # Count rows lazily instead of materializing a throwaway list.
    self.assertEqual(sum(1 for _ in gen()), 10)
def test_mysql(self):
    """Run the shared checks on a freshly opened MySQL connection, then
    again on the cached singleton connection."""
    driver = testing.get_driver()
    dsn_parts = parseMySQLDSN(testing.get_mysql_dsn())
    user, password, host, port, database = dsn_parts[:5]
    fresh_conn = connect(driver,
                         database,
                         user=user,
                         password=password,
                         host=host,
                         port=port)
    self._do_test(driver, fresh_conn)
    fresh_conn.close()
    # The singleton connection must behave identically.
    self._do_test(driver, testing.get_singleton_db_connection())
def test_verify_column_name_and_type(self):
    """verify_column_name_and_type must accept a test table whose
    columns are a compatible subset of the train columns (the label may
    be absent), and raise ValueError on a missing feature column or on
    a column whose type differs.

    Fix over the original: the cursor and the scratch table were leaked
    if any verify call or assertion raised; cleanup is now guaranteed
    with try/finally.
    """
    conn = testing.get_singleton_db_connection()
    train_table = "iris.train"
    test_table = "iris.test"
    train_select = [
        "petal_length", "petal_width", "sepal_length", "sepal_width",
        "class"
    ]
    # Identical column sets: must pass.
    test_select = train_select
    verify_column_name_and_type(
        conn, self.generate_select(train_table, train_select),
        self.generate_select(test_table, test_select), "class")
    # Test set missing only the label column: still valid.
    test_select = [
        "petal_length", "petal_width", "sepal_length", "sepal_width"
    ]
    verify_column_name_and_type(
        conn, self.generate_select(train_table, train_select),
        self.generate_select(test_table, test_select), "class")
    # Missing a feature column: must raise.
    test_select = ["petal_length", "petal_width", "sepal_length"]
    with self.assertRaises(ValueError):
        verify_column_name_and_type(
            conn, self.generate_select(train_table, train_select),
            self.generate_select(test_table, test_select), "class")
    # Same column names but one differing type must also raise; build a
    # scratch table that clones iris.test with petal_length retyped.
    name_and_type = dict(db.get_table_schema(conn, test_table))
    new_table_name = "iris.verifier_test_table"
    name_and_type["petal_length"] = "VARCHAR(255)"  # change the data type
    create_column_str = ",".join(
        ["%s %s" % (n, t) for n, t in name_and_type.items()])
    drop_sql = "DROP TABLE IF EXISTS %s" % new_table_name
    create_sql = "CREATE TABLE %s(%s)" % (new_table_name,
                                          create_column_str)
    cursor = conn.cursor()
    try:
        cursor.execute(drop_sql)
        cursor.execute(create_sql)
        try:
            with self.assertRaises(ValueError):
                test_select = train_select
                verify_column_name_and_type(
                    conn, self.generate_select(train_table, train_select),
                    self.generate_select(new_table_name, test_select),
                    "class")
        finally:
            # Always remove the scratch table, even on failure.
            cursor.execute(drop_sql)
    finally:
        cursor.close()
def test_fetch_sample(self):
    """fetch_samples must cap the number of yielded rows at ``n``
    (None for n == 0, unlimited for n < 0) and expose the selected
    column names and types on the returned generator."""
    conn = testing.get_singleton_db_connection()
    select = "SELECT * FROM iris.train"
    name_and_type = db.selected_columns_and_types(conn, select)
    expect_field_names = [item[0] for item in name_and_type]
    expect_field_types = [item[1] for item in name_and_type]
    column_num = len(name_and_type)

    def assert_schema(sample_gen):
        # Every generator must carry the selected schema.
        self.assertTrue(
            np.array_equal(sample_gen.field_names, expect_field_names))
        self.assertTrue(
            np.array_equal(sample_gen.field_types, expect_field_types))

    # n == 0 means "no samples at all".
    gen = fetch_samples(conn, select, n=0)
    self.assertTrue(gen is None)

    # n < 0 means "no limit": drain the whole table.
    gen = fetch_samples(conn, select, n=-1)
    row_num = length(gen())
    assert_schema(gen)
    self.assertGreater(row_num, 25)

    # An exact cap: every yielded row keeps all selected columns.
    gen = fetch_samples(conn, select, n=25)
    n = 0
    assert_schema(gen)
    for rows in gen():
        self.assertEqual(len(rows), column_num)
        n += 1
    self.assertEqual(n, 25)

    gen = fetch_samples(conn, select, n=10)
    assert_schema(gen)
    self.assertEqual(length(gen()), 10)

    # The SQL LIMIT wins when smaller than n.
    gen = fetch_samples(conn, "%s LIMIT 1" % select, n=1000)
    assert_schema(gen)
    self.assertEqual(length(gen()), 1)

    # n larger than the table size returns each row exactly once.
    gen = fetch_samples(conn, select, n=row_num * 2)
    assert_schema(gen)
    self.assertEqual(length(gen()), row_num)
def test_no_column_clause(self):
    """With no COLUMN clause, infer_feature_columns must derive one
    dense FLOAT32 NumericColumn per selected feature column and leave
    the declared label dtype intact."""
    columns = [
        "sepal_length",
        "sepal_width",
        "petal_length",
        "petal_width",
    ]
    select = "select %s, class from iris.train" % ",".join(columns)
    conn = testing.get_singleton_db_connection()
    label = NumericColumn(
        FieldDesc(name='class', dtype=DataType.INT64, shape=[1]))
    features, label = fd.infer_feature_columns(conn, select, None, label)
    self.check_json_dump(features)
    self.check_json_dump(label)

    # Derivation yields exactly one group: "feature_columns".
    self.assertEqual(len(features), 1)
    self.assertTrue("feature_columns" in features)
    derived = features["feature_columns"]
    self.assertEqual(len(derived), 4)
    for expected_name, fc in zip(columns, derived):
        self.assertTrue(isinstance(fc, NumericColumn))
        descs = fc.get_field_desc()
        self.assertEqual(len(descs), 1)
        desc = descs[0]
        self.assertEqual(desc.name, expected_name)
        self.assertEqual(desc.dtype, DataType.FLOAT32)
        self.assertEqual(desc.format, DataFormat.PLAIN)
        self.assertFalse(desc.is_sparse)
        self.assertEqual(desc.shape, [1])

    # The label keeps INT64 but gets an inferred scalar shape.
    self.assertTrue(isinstance(label, NumericColumn))
    self.assertEqual(len(label.get_field_desc()), 1)
    desc = label.get_field_desc()[0]
    self.assertEqual(desc.name, "class")
    self.assertEqual(desc.dtype, DataType.INT64)
    self.assertEqual(desc.format, DataFormat.PLAIN)
    self.assertFalse(desc.is_sparse)
    self.assertEqual(desc.shape, [])
def test_hive(self):
    """Exercise the Hive code path on a freshly opened connection and
    then on the shared singleton connection (including the
    specified-database variant)."""
    driver = testing.get_driver()
    user, password, host, port, database, _, _ = parseHiveDSN(
        testing.get_hive_dsn())
    fresh = connect(driver,
                    database,
                    user=user,
                    password=password,
                    host=host,
                    port=port)
    self._do_test(driver,
                  fresh,
                  hdfs_namenode_addr="127.0.0.1:8020",
                  hive_location="/sqlflow")
    fresh.close()
    shared = testing.get_singleton_db_connection()
    self._do_test(driver, shared)
    self._do_test_hive_specified_db(driver,
                                    shared,
                                    hdfs_namenode_addr="127.0.0.1:8020",
                                    hive_location="/sqlflow")
def test_with_cross(self):
    """Feature derivation with explicit CROSS columns.

    c1/c2 are declared NUMERIC, c4 x c5 and c1 x c2 are declared CROSS;
    derivation must additionally infer c3 from the data and complete
    every FieldDesc (dtype/format/shape/sparsity).

    Fix over the original: the field-desc-count assertion for fc5
    mistakenly re-checked fc4 (copy-paste); it now checks fc5.
    """
    c1 = NumericColumn(
        FieldDesc(name='c1', dtype=DataType.INT64, shape=[1]))
    c2 = NumericColumn(
        FieldDesc(name='c2', dtype=DataType.INT64, shape=[1]))
    c4 = NumericColumn(
        FieldDesc(name='c4', dtype=DataType.INT64, shape=[1]))
    c5 = NumericColumn(
        FieldDesc(name='c5',
                  dtype=DataType.INT64,
                  shape=[1],
                  is_sparse=True))
    features = {
        'feature_columns': [
            c1,
            c2,
            CrossColumn([c4, c5], 128),
            CrossColumn([c1, c2], 256),
        ]
    }
    label = NumericColumn(
        FieldDesc(name='class', dtype=DataType.INT64, shape=[1]))
    select = "select c1, c2, c3, c4, c5, class " \
             "from feature_derivation_case.train"
    conn = testing.get_singleton_db_connection()
    features, label = fd.infer_feature_columns(conn, select, features,
                                               label)
    self.check_json_dump(features)
    self.check_json_dump(label)
    self.assertEqual(len(features), 1)
    self.assertTrue("feature_columns" in features)
    features = features["feature_columns"]
    self.assertEqual(len(features), 5)

    # c1: declared INT64 but derived as a dense, plain FLOAT32 scalar.
    fc1 = features[0]
    self.assertTrue(isinstance(fc1, NumericColumn))
    self.assertEqual(len(fc1.get_field_desc()), 1)
    field_desc = fc1.get_field_desc()[0]
    self.assertEqual(field_desc.name, "c1")
    self.assertEqual(field_desc.dtype, DataType.FLOAT32)
    self.assertEqual(field_desc.format, DataFormat.PLAIN)
    self.assertFalse(field_desc.is_sparse)
    self.assertEqual(field_desc.shape, [1])

    # c2: same derivation as c1.
    fc2 = features[1]
    self.assertTrue(isinstance(fc2, NumericColumn))
    self.assertEqual(len(fc2.get_field_desc()), 1)
    field_desc = fc2.get_field_desc()[0]
    self.assertEqual(field_desc.name, "c2")
    self.assertEqual(field_desc.dtype, DataType.FLOAT32)
    self.assertEqual(field_desc.format, DataFormat.PLAIN)
    self.assertFalse(field_desc.is_sparse)
    self.assertEqual(field_desc.shape, [1])

    # c3: not declared by the user, inferred as dense CSV INT64.
    fc3 = features[2]
    self.assertTrue(isinstance(fc3, NumericColumn))
    self.assertEqual(len(fc3.get_field_desc()), 1)
    field_desc = fc3.get_field_desc()[0]
    self.assertEqual(field_desc.name, "c3")
    self.assertEqual(field_desc.dtype, DataType.INT64)
    self.assertEqual(field_desc.format, DataFormat.CSV)
    self.assertFalse(field_desc.is_sparse)
    self.assertEqual(field_desc.shape, [4])

    # c4 x c5 cross: c4 completed to CSV FLOAT32, c5 stays sparse INT64.
    fc4 = features[3]
    self.assertTrue(isinstance(fc4, CrossColumn))
    self.assertEqual(len(fc4.get_field_desc()), 2)
    field_desc1 = fc4.get_field_desc()[0]
    self.assertEqual(field_desc1.name, "c4")
    self.assertEqual(field_desc1.dtype, DataType.FLOAT32)
    self.assertEqual(field_desc1.format, DataFormat.CSV)
    self.assertEqual(field_desc1.shape, [4])
    self.assertFalse(field_desc1.is_sparse)
    field_desc2 = fc4.get_field_desc()[1]
    self.assertEqual(field_desc2.name, "c5")
    self.assertEqual(field_desc2.dtype, DataType.INT64)
    self.assertEqual(field_desc2.format, DataFormat.CSV)
    self.assertTrue(field_desc2.is_sparse)

    # c1 x c2 cross keeps the plain scalar descriptions of c1/c2.
    fc5 = features[4]
    self.assertTrue(isinstance(fc5, CrossColumn))
    # BUGFIX: original asserted len(fc4.get_field_desc()) here.
    self.assertEqual(len(fc5.get_field_desc()), 2)
    field_desc1 = fc5.get_field_desc()[0]
    self.assertEqual(field_desc1.name, "c1")
    self.assertEqual(field_desc1.dtype, DataType.FLOAT32)
    self.assertEqual(field_desc1.format, DataFormat.PLAIN)
    self.assertEqual(field_desc1.shape, [1])
    self.assertFalse(field_desc1.is_sparse)
    field_desc2 = fc5.get_field_desc()[1]
    self.assertEqual(field_desc2.name, "c2")
    self.assertEqual(field_desc2.dtype, DataType.FLOAT32)
    self.assertEqual(field_desc2.format, DataFormat.PLAIN)
    self.assertEqual(field_desc2.shape, [1])
    self.assertFalse(field_desc2.is_sparse)

    # The label keeps INT64 and gets an inferred scalar shape.
    self.assertTrue(isinstance(label, NumericColumn))
    self.assertEqual(len(label.get_field_desc()), 1)
    field_desc = label.get_field_desc()[0]
    self.assertEqual(field_desc.name, "class")
    self.assertEqual(field_desc.dtype, DataType.INT64)
    self.assertEqual(field_desc.format, DataFormat.PLAIN)
    self.assertFalse(field_desc.is_sparse)
    self.assertEqual(field_desc.shape, [])
def test_without_cross(self):
    """Feature derivation with declared EMBEDDING columns and no CROSS.

    The user declares embeddings for c3 (no category column) and c5
    (explicit sparse CategoryIDColumn); derivation must fill in the
    remaining columns c1/c2 (dense numeric), c4 (CSV numeric) and c6
    (string with an inferred vocabulary), and synthesize a category
    column for c3 from the data.
    """
    features = {
        'feature_columns': [
            # c3: embedding with no category column; derivation must
            # create one (bucket size checked below).
            EmbeddingColumn(dimension=256, combiner="mean", name="c3"),
            # c5: fully specified sparse embedding; must pass through.
            EmbeddingColumn(category_column=CategoryIDColumn(
                FieldDesc(name="c5",
                          dtype=DataType.INT64,
                          shape=[10000],
                          delimiter=",",
                          is_sparse=True),
                bucket_size=5000),
                            dimension=64,
                            combiner="sqrtn",
                            name="c5"),
        ]
    }
    label = NumericColumn(
        FieldDesc(name="class", dtype=DataType.INT64, shape=[1]))
    select = "select c1, c2, c3, c4, c5, c6, class " \
        "from feature_derivation_case.train"
    conn = testing.get_singleton_db_connection()
    features, label = fd.infer_feature_columns(conn, select, features,
                                               label)
    # Both results must survive a JSON round trip.
    self.check_json_dump(features)
    self.check_json_dump(label)
    self.assertEqual(len(features), 1)
    self.assertTrue("feature_columns" in features)
    features = features["feature_columns"]
    # One column object per selected feature column (c1..c6).
    self.assertEqual(len(features), 6)
    # c1: derived dense plain FLOAT32 scalar.
    fc1 = features[0]
    self.assertTrue(isinstance(fc1, NumericColumn))
    self.assertEqual(len(fc1.get_field_desc()), 1)
    field_desc = fc1.get_field_desc()[0]
    self.assertEqual(field_desc.name, "c1")
    self.assertEqual(field_desc.dtype, DataType.FLOAT32)
    self.assertEqual(field_desc.format, DataFormat.PLAIN)
    self.assertFalse(field_desc.is_sparse)
    self.assertEqual(field_desc.shape, [1])
    # c2: same derivation as c1.
    fc2 = features[1]
    self.assertTrue(isinstance(fc2, NumericColumn))
    self.assertEqual(len(fc2.get_field_desc()), 1)
    field_desc = fc2.get_field_desc()[0]
    self.assertEqual(field_desc.name, "c2")
    self.assertEqual(field_desc.dtype, DataType.FLOAT32)
    self.assertEqual(field_desc.format, DataFormat.PLAIN)
    self.assertFalse(field_desc.is_sparse)
    self.assertEqual(field_desc.shape, [1])
    # c3: declared embedding kept; field desc inferred from the data
    # (dense CSV INT64 of length 4).
    fc3 = features[2]
    self.assertTrue(isinstance(fc3, EmbeddingColumn))
    self.assertEqual(len(fc3.get_field_desc()), 1)
    field_desc = fc3.get_field_desc()[0]
    self.assertEqual(field_desc.name, "c3")
    self.assertEqual(field_desc.dtype, DataType.INT64)
    self.assertEqual(field_desc.format, DataFormat.CSV)
    self.assertFalse(field_desc.is_sparse)
    self.assertEqual(field_desc.shape, [4])
    self.assertEqual(fc3.dimension, 256)
    self.assertEqual(fc3.combiner, "mean")
    self.assertEqual(fc3.name, "c3")
    # The synthesized category column for c3.
    self.assertTrue(isinstance(fc3.category_column, CategoryIDColumn))
    self.assertEqual(fc3.category_column.bucket_size, 10)
    # c4: derived dense CSV FLOAT32 of length 4.
    fc4 = features[3]
    self.assertTrue(isinstance(fc4, NumericColumn))
    self.assertEqual(len(fc4.get_field_desc()), 1)
    field_desc = fc4.get_field_desc()[0]
    self.assertEqual(field_desc.name, "c4")
    self.assertEqual(field_desc.dtype, DataType.FLOAT32)
    self.assertEqual(field_desc.format, DataFormat.CSV)
    self.assertFalse(field_desc.is_sparse)
    self.assertEqual(field_desc.shape, [4])
    # c5: user declaration passed through unchanged.
    fc5 = features[4]
    self.assertTrue(isinstance(fc5, EmbeddingColumn))
    self.assertEqual(len(fc5.get_field_desc()), 1)
    field_desc = fc5.get_field_desc()[0]
    self.assertEqual(field_desc.name, "c5")
    self.assertEqual(field_desc.dtype, DataType.INT64)
    self.assertEqual(field_desc.format, DataFormat.CSV)
    self.assertTrue(field_desc.is_sparse)
    self.assertEqual(field_desc.shape, [10000])
    self.assertEqual(fc5.dimension, 64)
    self.assertEqual(fc5.combiner, "sqrtn")
    self.assertEqual(fc5.name, "c5")
    self.assertTrue(isinstance(fc5.category_column, CategoryIDColumn))
    self.assertEqual(fc5.category_column.bucket_size, 5000)
    # c6: string column; an embedding with default dimension/combiner
    # and a vocabulary inferred from the data is created for it.
    fc6 = features[5]
    self.assertTrue(isinstance(fc6, EmbeddingColumn))
    self.assertEqual(len(fc6.get_field_desc()), 1)
    field_desc = fc6.get_field_desc()[0]
    self.assertEqual(field_desc.name, "c6")
    self.assertEqual(field_desc.dtype, DataType.STRING)
    self.assertEqual(field_desc.format, DataFormat.PLAIN)
    self.assertFalse(field_desc.is_sparse)
    self.assertEqual(field_desc.shape, [1])
    self.assertEqual(field_desc.vocabulary,
                     set(['FEMALE', 'MALE', 'NULL']))
    self.assertEqual(fc6.dimension, 128)
    self.assertEqual(fc6.combiner, "sum")
    self.assertEqual(fc6.name, "c6")
    self.assertTrue(isinstance(fc6.category_column, CategoryIDColumn))
    # Bucket size equals the vocabulary size (3 distinct values).
    self.assertEqual(fc6.category_column.bucket_size, 3)
    # The label keeps INT64 and gets an inferred scalar shape.
    self.assertTrue(isinstance(label, NumericColumn))
    self.assertEqual(len(label.get_field_desc()), 1)
    field_desc = label.get_field_desc()[0]
    self.assertEqual(field_desc.name, "class")
    self.assertEqual(field_desc.dtype, DataType.INT64)
    self.assertEqual(field_desc.format, DataFormat.PLAIN)
    self.assertFalse(field_desc.is_sparse)
    self.assertEqual(field_desc.shape, [])
def test_get_table_schema(self):
    """get_table_schema / selected_columns_and_types must report the
    per-driver column names and type strings.

    One branch per supported driver (mysql, hive, maxcompute) — type
    spellings differ by backend, e.g. INT(11) vs INT vs BIGINT, and the
    derived-expression type for ``petal_width * 2.3`` differs too.
    """
    driver = testing.get_driver()
    conn = testing.get_singleton_db_connection()
    if driver == "mysql":
        schema = get_table_schema(conn, "iris.train")
        expect = (
            ('sepal_length', 'FLOAT'),
            ('sepal_width', 'FLOAT'),
            ('petal_length', 'FLOAT'),
            ('petal_width', 'FLOAT'),
            ('class', 'INT(11)'),
        )
        self.assertTrue(np.array_equal(expect, schema))
        # A computed column: MySQL reports FLOAT * literal as DOUBLE.
        schema = selected_columns_and_types(
            conn,
            "SELECT sepal_length, petal_width * 2.3 new_petal_width, "
            "class FROM iris.train")
        expect = [
            ("sepal_length", "FLOAT"),
            ("new_petal_width", "DOUBLE"),
            ("class", "INT"),
        ]
        self.assertTrue(np.array_equal(expect, schema))
    elif driver == "hive":
        schema = get_table_schema(conn, "iris.train")
        expect = (
            ('sepal_length', 'FLOAT'),
            ('sepal_width', 'FLOAT'),
            ('petal_length', 'FLOAT'),
            ('petal_width', 'FLOAT'),
            ('class', 'INT'),
        )
        self.assertTrue(np.array_equal(expect, schema))
        # Hive keeps FLOAT for the computed column (note: needs AS).
        schema = selected_columns_and_types(
            conn,
            "SELECT sepal_length, petal_width * 2.3 AS new_petal_width, "
            "class FROM iris.train")
        expect = [
            ("sepal_length", "FLOAT"),
            ("new_petal_width", "FLOAT"),
            ("class", "INT"),
        ]
        self.assertTrue(np.array_equal(expect, schema))
    elif driver == "maxcompute":
        # The MaxCompute test table lives in a project taken from the
        # environment rather than a fixed database name.
        case_db = os.getenv("SQLFLOW_TEST_DB_MAXCOMPUTE_PROJECT")
        table = "%s.sqlflow_test_iris_train" % case_db
        schema = get_table_schema(conn, table)
        expect = [
            ('sepal_length', 'DOUBLE'),
            ('sepal_width', 'DOUBLE'),
            ('petal_length', 'DOUBLE'),
            ('petal_width', 'DOUBLE'),
            ('class', 'BIGINT'),
        ]
        self.assertTrue(np.array_equal(expect, schema))
        schema = selected_columns_and_types(
            conn,
            "SELECT sepal_length, petal_width * 2.3 new_petal_width, "
            "class FROM %s" % table)
        expect = [
            ("sepal_length", "DOUBLE"),
            ("new_petal_width", "DOUBLE"),
            ("class", "BIGINT"),
        ]
        self.assertTrue(np.array_equal(expect, schema))