Example #1
    def test_numeric_column(self):
        nc = NumericColumn(FieldDesc(name='c1', shape=(2, 3)))

        for model_type in [TENSORFLOW, XGBOOST]:
            compiled_nc = self.compile_fc(nc, model_type)
            self.assertEqual(compiled_nc.key, 'c1')
            self.assertEqual(compiled_nc.shape, (2, 3))
Example #2
def derive_label(label, fd_map):
    """
    Derive the feature column of the label.

    Args:
        label (FeatureColumn): the FeatureColumn object of the label.
        fd_map (dict[str -> FieldDesc]): a FieldDesc map, where the key is the
            field name.

    Returns:
        A derived NumericColumn of the label, or None if no label is
        specified (e.g. for clustering models).
    """
    label_name = label.get_field_desc()[0].name if label is not None else None
    if not label_name:
        return  # NOTE: a clustering model may not specify a LABEL clause

    label_field_desc = fd_map.get(label_name)
    assert label_field_desc is not None, \
        "derive_label: LABEL COLUMN '%s' not found" % label_name

    # Use shape [] if the label shape is None or [1]: a TensorFlow scalar
    # label should have shape [].
    shape = label_field_desc.shape
    if shape is None or (len(shape) == 1 and shape[0] == 1):
        label_field_desc.shape = []

    return NumericColumn(label_field_desc)
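
A minimal usage sketch of derive_label follows. The field names, dtypes and the import paths noted in the comments are illustrative assumptions, not part of the example above:

# Assumed imports (module paths may differ in the actual SQLFlow runtime):
#   from runtime.feature.field_desc import FieldDesc, DataType
#   from runtime.feature.column import NumericColumn

label_fc = NumericColumn(FieldDesc(name="class", shape=[1]))
fd_map = {"class": FieldDesc(name="class", dtype=DataType.INT64, shape=[1])}
derived = derive_label(label_fc, fd_map)
# The [1] shape is rewritten to [] so TensorFlow treats the label as a scalar.
assert derived.get_field_desc()[0].shape == []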
Example #3
    def test_train(self):
        ds = testing.get_datasource()
        original_sql = """SELECT * FROM iris.train
        TO TRAIN xgboost.gbtree
        WITH
            objective="multi:softmax",
            num_boost_round=20,
            num_class=3,
            validation.select="SELECT * FROM iris.test"
        INTO iris.xgboost_train_model_test;
        """

        select = "SELECT * FROM iris.train"
        val_select = "SELECT * FROM iris.test"
        train_params = {
            "num_boost_round": 20,
            "original_sql": original_sql,
            "feature_column_map": None,
            "label_column": NumericColumn(FieldDesc(name="class")),
            "model_image": "sqlflow:step"
        }
        model_params = {"num_class": 3, "objective": "multi:softmax"}
        eval_result = train(ds, "xgboost.gbtree", select, val_select,
                            model_params, "iris.xgboost_train_model_test",
                            None, train_params)
        self.assertLess(eval_result['train']['merror'][-1], 0.01)
        self.assertLess(eval_result['validate']['merror'][-1], 0.01)
Example #4
    def test_cross_column(self):
        cc = CrossColumn(['c1', NumericColumn(FieldDesc(name='c2'))], 4096)
        compiled_cc = self.compile_fc(cc, TENSORFLOW)
        self.assertEqual(list(compiled_cc.keys), ['c1', 'c2'])
        self.assertEqual(compiled_cc.hash_bucket_size, 4096)

        with self.assertRaises(AssertionError):
            self.compile_fc(cc, XGBOOST)
Example #5
    def test_bucket_column(self):
        nc = NumericColumn(FieldDesc(name='c1', shape=(1, )))
        bc = BucketColumn(nc, (-10, -5, 3, 7))

        for model_type in [TENSORFLOW, XGBOOST]:
            compiled_bc = self.compile_fc(bc, model_type)
            self.assertEqual(compiled_bc.source_column.key, 'c1')
            self.assertEqual(compiled_bc.boundaries, (-10, -5, 3, 7))
Example #6
    def test_no_column_clause(self):
        # Without a COLUMN clause, infer_feature_columns should derive a
        # dense FLOAT32 NumericColumn for every selected feature column.
        columns = [
            "sepal_length",
            "sepal_width",
            "petal_length",
            "petal_width",
        ]

        select = "select %s, class from iris.train" % ",".join(columns)

        conn = testing.get_singleton_db_connection()
        features = None
        label = NumericColumn(
            FieldDesc(name='class', dtype=DataType.INT64, shape=[1]))
        features, label = fd.infer_feature_columns(conn, select, features,
                                                   label)

        self.check_json_dump(features)
        self.check_json_dump(label)

        self.assertEqual(len(features), 1)
        self.assertTrue("feature_columns" in features)
        features = features["feature_columns"]
        self.assertEqual(len(features), 4)

        for i, f in enumerate(features):
            self.assertTrue(isinstance(f, NumericColumn))
            self.assertEqual(len(f.get_field_desc()), 1)
            field_desc = f.get_field_desc()[0]
            self.assertEqual(field_desc.name, columns[i])
            self.assertEqual(field_desc.dtype, DataType.FLOAT32)
            self.assertEqual(field_desc.format, DataFormat.PLAIN)
            self.assertFalse(field_desc.is_sparse)
            self.assertEqual(field_desc.shape, [1])

        self.assertTrue(isinstance(label, NumericColumn))
        self.assertEqual(len(label.get_field_desc()), 1)
        field_desc = label.get_field_desc()[0]
        self.assertEqual(field_desc.name, "class")
        self.assertEqual(field_desc.dtype, DataType.INT64)
        self.assertEqual(field_desc.format, DataFormat.PLAIN)
        self.assertFalse(field_desc.is_sparse)
        self.assertEqual(field_desc.shape, [])
Example #7
    def test_train(self):
        # Train an XGBoost model on iris, then run predict, explain and
        # evaluate against the saved model in separate temporary directories.
        ds = testing.get_datasource()
        original_sql = """SELECT * FROM iris.train
        TO TRAIN xgboost.gbtree
        WITH
            objective="multi:softmax",
            num_boost_round=20,
            num_class=3,
            validation.select="SELECT * FROM iris.test"
        INTO iris.xgboost_train_model_test;
        """
        select = "SELECT * FROM iris.train"
        val_select = "SELECT * FROM iris.test"
        train_params = {
            "num_boost_round": 20,
        }
        model_params = {"num_class": 3, "objective": "multi:softmax"}
        with temp_file.TemporaryDirectory(as_cwd=True):
            eval_result = train(ds, original_sql, select, val_select,
                                "xgboost.gbtree", "", None,
                                NumericColumn(FieldDesc(name="class")),
                                model_params, train_params, None,
                                "iris.xgboost_train_model_test", None)
            self.assertLess(eval_result['train']['merror'][-1], 0.01)
            self.assertLess(eval_result['validate']['merror'][-1], 0.01)

        with temp_file.TemporaryDirectory(as_cwd=True):
            pred_original_sql = """SELECT * FROM iris.test
            TO PREDICT iris.xgboost_pred_result.pred_val
            USING iris.xgboost_train_model_test;"""
            pred(ds, pred_original_sql, "SELECT * FROM iris.test",
                 "iris.xgboost_train_model_test", "pred_val", model_params,
                 "iris.xgboost_pred_result")

        with temp_file.TemporaryDirectory(as_cwd=True):
            explain_original_sql = """SELECT * FROM iris.test
            TO EXPLAIN iris.xgboost_train_model_test
            INTO iris.xgboost_explain_result;"""
            explain(ds, explain_original_sql, "SELECT * FROM iris.test",
                    "iris.xgboost_train_model_test", model_params,
                    "iris.xgboost_explain_result")

        with temp_file.TemporaryDirectory(as_cwd=True):
            evaluate_original_sql = """SELECT * FROM iris.test
            TO EVALUATE iris.xgboost_train_model_test
            WITH label_col=class
            INTO iris.xgboost_evaluate_result;"""
            evaluate(ds, evaluate_original_sql, "SELECT * FROM iris.test",
                     "class", "iris.xgboost_train_model_test", model_params,
                     "iris.xgboost_evaluate_result")
Example #8
def new_feature_column(field_desc):
    """
    Create a new FeatureColumn object by the given FieldDesc object.

    Args:
        field_desc (FieldDesc): a given FieldDesc object.

    Returns:
        If field_desc.dtype is STRING, return an EmbeddingColumn object.
        Otherwise, return a NumericColumn object.
    """
    if field_desc.dtype != DataType.STRING:
        return NumericColumn(field_desc)
    else:
        category_column = CategoryIDColumn(field_desc,
                                           len(field_desc.vocabulary))
        # NOTE(typhoonzero): a default embedding size of 128 is enough
        # for most cases.
        embedding = EmbeddingColumn(category_column=category_column,
                                    dimension=128,
                                    combiner="sum")
        embedding.name = field_desc.name
        return embedding
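
A minimal usage sketch of new_feature_column follows. The field descriptions are hypothetical, and the vocabulary attribute is assigned directly, mirroring how the function reads it above:

numeric_fd = FieldDesc(name="age", dtype=DataType.INT64, shape=[1])
fc1 = new_feature_column(numeric_fd)  # a plain NumericColumn

string_fd = FieldDesc(name="gender", dtype=DataType.STRING, shape=[1])
string_fd.vocabulary = set(["MALE", "FEMALE"])
fc2 = new_feature_column(string_fd)
# fc2 is an EmbeddingColumn(dimension=128, combiner="sum") wrapping a
# CategoryIDColumn whose bucket size equals the vocabulary size (2).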
Example #9
    def test_with_cross(self):
        # Check that infer_feature_columns keeps the user-defined
        # CrossColumns, inserts the missing column c3, and fills in the field
        # metadata of the crossed columns from the database.
        c1 = NumericColumn(
            FieldDesc(name='c1', dtype=DataType.INT64, shape=[1]))
        c2 = NumericColumn(
            FieldDesc(name='c2', dtype=DataType.INT64, shape=[1]))
        c4 = NumericColumn(
            FieldDesc(name='c4', dtype=DataType.INT64, shape=[1]))
        c5 = NumericColumn(
            FieldDesc(name='c5',
                      dtype=DataType.INT64,
                      shape=[1],
                      is_sparse=True))

        features = {
            'feature_columns': [
                c1,
                c2,
                CrossColumn([c4, c5], 128),
                CrossColumn([c1, c2], 256),
            ]
        }

        label = NumericColumn(
            FieldDesc(name='class', dtype=DataType.INT64, shape=[1]))
        select = "select c1, c2, c3, c4, c5, class " \
                 "from feature_derivation_case.train"

        conn = testing.get_singleton_db_connection()
        features, label = fd.infer_feature_columns(conn, select, features,
                                                   label)

        self.check_json_dump(features)
        self.check_json_dump(label)

        self.assertEqual(len(features), 1)
        self.assertTrue("feature_columns" in features)
        features = features["feature_columns"]
        self.assertEqual(len(features), 5)

        fc1 = features[0]
        self.assertTrue(isinstance(fc1, NumericColumn))
        self.assertEqual(len(fc1.get_field_desc()), 1)
        field_desc = fc1.get_field_desc()[0]
        self.assertEqual(field_desc.name, "c1")
        self.assertEqual(field_desc.dtype, DataType.FLOAT32)
        self.assertEqual(field_desc.format, DataFormat.PLAIN)
        self.assertFalse(field_desc.is_sparse)
        self.assertEqual(field_desc.shape, [1])

        fc2 = features[1]
        self.assertTrue(isinstance(fc2, NumericColumn))
        self.assertEqual(len(fc2.get_field_desc()), 1)
        field_desc = fc2.get_field_desc()[0]
        self.assertEqual(field_desc.name, "c2")
        self.assertEqual(field_desc.dtype, DataType.FLOAT32)
        self.assertEqual(field_desc.format, DataFormat.PLAIN)
        self.assertFalse(field_desc.is_sparse)
        self.assertEqual(field_desc.shape, [1])

        fc3 = features[2]
        self.assertTrue(isinstance(fc3, NumericColumn))
        self.assertEqual(len(fc3.get_field_desc()), 1)
        field_desc = fc3.get_field_desc()[0]
        self.assertEqual(field_desc.name, "c3")
        self.assertEqual(field_desc.dtype, DataType.INT64)
        self.assertEqual(field_desc.format, DataFormat.CSV)
        self.assertFalse(field_desc.is_sparse)
        self.assertEqual(field_desc.shape, [4])

        fc4 = features[3]
        self.assertTrue(isinstance(fc4, CrossColumn))
        self.assertEqual(len(fc4.get_field_desc()), 2)
        field_desc1 = fc4.get_field_desc()[0]
        self.assertEqual(field_desc1.name, "c4")
        self.assertEqual(field_desc1.dtype, DataType.FLOAT32)
        self.assertEqual(field_desc1.format, DataFormat.CSV)
        self.assertEqual(field_desc1.shape, [4])
        self.assertFalse(field_desc1.is_sparse)
        field_desc2 = fc4.get_field_desc()[1]
        self.assertEqual(field_desc2.name, "c5")
        self.assertEqual(field_desc2.dtype, DataType.INT64)
        self.assertEqual(field_desc2.format, DataFormat.CSV)
        self.assertTrue(field_desc2.is_sparse)

        fc5 = features[4]
        self.assertTrue(isinstance(fc5, CrossColumn))
        self.assertEqual(len(fc5.get_field_desc()), 2)
        field_desc1 = fc5.get_field_desc()[0]
        self.assertEqual(field_desc1.name, "c1")
        self.assertEqual(field_desc1.dtype, DataType.FLOAT32)
        self.assertEqual(field_desc1.format, DataFormat.PLAIN)
        self.assertEqual(field_desc1.shape, [1])
        self.assertFalse(field_desc1.is_sparse)
        field_desc2 = fc5.get_field_desc()[1]
        self.assertEqual(field_desc2.name, "c2")
        self.assertEqual(field_desc2.dtype, DataType.FLOAT32)
        self.assertEqual(field_desc2.format, DataFormat.PLAIN)
        self.assertEqual(field_desc2.shape, [1])
        self.assertFalse(field_desc2.is_sparse)

        self.assertTrue(isinstance(label, NumericColumn))
        self.assertEqual(len(label.get_field_desc()), 1)
        field_desc = label.get_field_desc()[0]
        self.assertEqual(field_desc.name, "class")
        self.assertEqual(field_desc.dtype, DataType.INT64)
        self.assertEqual(field_desc.format, DataFormat.PLAIN)
        self.assertFalse(field_desc.is_sparse)
        self.assertEqual(field_desc.shape, [])
Example #10
    def test_without_cross(self):
        # Check that infer_feature_columns completes the partially specified
        # EmbeddingColumns (c3, c5), derives plain numeric columns (c1, c2,
        # c4), and builds an embedding over the string column c6's vocabulary.
        features = {
            'feature_columns': [
                EmbeddingColumn(dimension=256, combiner="mean", name="c3"),
                EmbeddingColumn(category_column=CategoryIDColumn(
                    FieldDesc(name="c5",
                              dtype=DataType.INT64,
                              shape=[10000],
                              delimiter=",",
                              is_sparse=True),
                    bucket_size=5000),
                                dimension=64,
                                combiner="sqrtn",
                                name="c5"),
            ]
        }

        label = NumericColumn(
            FieldDesc(name="class", dtype=DataType.INT64, shape=[1]))

        select = "select c1, c2, c3, c4, c5, c6, class " \
                 "from feature_derivation_case.train"
        conn = testing.get_singleton_db_connection()
        features, label = fd.infer_feature_columns(conn, select, features,
                                                   label)

        self.check_json_dump(features)
        self.check_json_dump(label)

        self.assertEqual(len(features), 1)
        self.assertTrue("feature_columns" in features)
        features = features["feature_columns"]
        self.assertEqual(len(features), 6)

        fc1 = features[0]
        self.assertTrue(isinstance(fc1, NumericColumn))
        self.assertEqual(len(fc1.get_field_desc()), 1)
        field_desc = fc1.get_field_desc()[0]
        self.assertEqual(field_desc.name, "c1")
        self.assertEqual(field_desc.dtype, DataType.FLOAT32)
        self.assertEqual(field_desc.format, DataFormat.PLAIN)
        self.assertFalse(field_desc.is_sparse)
        self.assertEqual(field_desc.shape, [1])

        fc2 = features[1]
        self.assertTrue(isinstance(fc2, NumericColumn))
        self.assertEqual(len(fc2.get_field_desc()), 1)
        field_desc = fc2.get_field_desc()[0]
        self.assertEqual(field_desc.name, "c2")
        self.assertEqual(field_desc.dtype, DataType.FLOAT32)
        self.assertEqual(field_desc.format, DataFormat.PLAIN)
        self.assertFalse(field_desc.is_sparse)
        self.assertEqual(field_desc.shape, [1])

        fc3 = features[2]
        self.assertTrue(isinstance(fc3, EmbeddingColumn))
        self.assertEqual(len(fc3.get_field_desc()), 1)
        field_desc = fc3.get_field_desc()[0]
        self.assertEqual(field_desc.name, "c3")
        self.assertEqual(field_desc.dtype, DataType.INT64)
        self.assertEqual(field_desc.format, DataFormat.CSV)
        self.assertFalse(field_desc.is_sparse)
        self.assertEqual(field_desc.shape, [4])
        self.assertEqual(fc3.dimension, 256)
        self.assertEqual(fc3.combiner, "mean")
        self.assertEqual(fc3.name, "c3")
        self.assertTrue(isinstance(fc3.category_column, CategoryIDColumn))
        self.assertEqual(fc3.category_column.bucket_size, 10)

        fc4 = features[3]
        self.assertTrue(isinstance(fc4, NumericColumn))
        self.assertEqual(len(fc4.get_field_desc()), 1)
        field_desc = fc4.get_field_desc()[0]
        self.assertEqual(field_desc.name, "c4")
        self.assertEqual(field_desc.dtype, DataType.FLOAT32)
        self.assertEqual(field_desc.format, DataFormat.CSV)
        self.assertFalse(field_desc.is_sparse)
        self.assertEqual(field_desc.shape, [4])

        fc5 = features[4]
        self.assertTrue(isinstance(fc5, EmbeddingColumn))
        self.assertEqual(len(fc5.get_field_desc()), 1)
        field_desc = fc5.get_field_desc()[0]
        self.assertEqual(field_desc.name, "c5")
        self.assertEqual(field_desc.dtype, DataType.INT64)
        self.assertEqual(field_desc.format, DataFormat.CSV)
        self.assertTrue(field_desc.is_sparse)
        self.assertEqual(field_desc.shape, [10000])
        self.assertEqual(fc5.dimension, 64)
        self.assertEqual(fc5.combiner, "sqrtn")
        self.assertEqual(fc5.name, "c5")
        self.assertTrue(isinstance(fc5.category_column, CategoryIDColumn))
        self.assertEqual(fc5.category_column.bucket_size, 5000)

        fc6 = features[5]
        self.assertTrue(isinstance(fc6, EmbeddingColumn))
        self.assertEqual(len(fc6.get_field_desc()), 1)
        field_desc = fc6.get_field_desc()[0]
        self.assertEqual(field_desc.name, "c6")
        self.assertEqual(field_desc.dtype, DataType.STRING)
        self.assertEqual(field_desc.format, DataFormat.PLAIN)
        self.assertFalse(field_desc.is_sparse)
        self.assertEqual(field_desc.shape, [1])
        self.assertEqual(field_desc.vocabulary, set(['FEMALE', 'MALE',
                                                     'NULL']))
        self.assertEqual(fc6.dimension, 128)
        self.assertEqual(fc6.combiner, "sum")
        self.assertEqual(fc6.name, "c6")
        self.assertTrue(isinstance(fc6.category_column, CategoryIDColumn))
        self.assertEqual(fc6.category_column.bucket_size, 3)

        self.assertTrue(isinstance(label, NumericColumn))
        self.assertEqual(len(label.get_field_desc()), 1)
        field_desc = label.get_field_desc()[0]
        self.assertEqual(field_desc.name, "class")
        self.assertEqual(field_desc.dtype, DataType.INT64)
        self.assertEqual(field_desc.format, DataFormat.PLAIN)
        self.assertFalse(field_desc.is_sparse)
        self.assertEqual(field_desc.shape, [])
Example #11
    def test_main(self):
        # End-to-end XGBoost run: train, predict, evaluate and explain on the
        # iris dataset, checking the result tables written by each step.
        ds = testing.get_datasource()
        original_sql = """SELECT * FROM iris.train
        TO TRAIN xgboost.gbtree
        WITH
            objective="multi:softprob",
            num_boost_round=20,
            num_class=3,
            validation.select="SELECT * FROM iris.test"
        LABEL class
        INTO iris.xgboost_train_model_test;
        """

        select = "SELECT * FROM iris.train"
        val_select = "SELECT * FROM iris.test"
        train_params = {"num_boost_round": 20}
        model_params = {"num_class": 3, "objective": "multi:softprob"}
        save_name = "iris.xgboost_train_model_test"
        class_name = "class"

        with temp_file.TemporaryDirectory(as_cwd=True):
            eval_result = train(datasource=ds,
                                original_sql=original_sql,
                                select=select,
                                validation_select=val_select,
                                estimator_string="xgboost.gbtree",
                                model_image="sqlflow:step",
                                feature_column_map=None,
                                label_column=NumericColumn(
                                    FieldDesc(name=class_name)),
                                model_params=model_params,
                                train_params=train_params,
                                validation_params=None,
                                save=save_name,
                                load=None)

        self.assertLess(eval_result['train']['merror'][-1], 0.01)
        self.assertLess(eval_result['validate']['merror'][-1], 0.01)

        conn = db.connect_with_data_source(ds)
        pred_select = "SELECT * FROM iris.test"

        with temp_file.TemporaryDirectory(as_cwd=True):
            result_column_names, train_label_idx = create_predict_table(
                conn, select, "iris.predict_result_table",
                FieldDesc(name=class_name), "class")
            predict(ds, pred_select, "iris.predict_result_table",
                    result_column_names, train_label_idx, save_name)

        self.assertEqual(
            self.get_table_row_count(conn, "iris.test"),
            self.get_table_row_count(conn, "iris.predict_result_table"))

        schema1 = self.get_table_schema(conn, "iris.test")
        schema2 = self.get_table_schema(conn, "iris.predict_result_table")
        self.assertEqual(len(schema1), len(schema2))
        for name in schema1:
            if name == 'class':
                self.assertEqual(schema2[name], "BIGINT")
                continue

            self.assertTrue(name in schema2)
            self.assertEqual(schema1[name], schema2[name])

        diff_schema = schema2.keys() - schema1.keys()
        self.assertEqual(len(diff_schema), 0)

        with temp_file.TemporaryDirectory(as_cwd=True):
            result_column_names = create_evaluate_table(
                conn, "iris.evaluate_result_table", ["accuracy_score"])
            evaluate(ds,
                     pred_select,
                     "iris.evaluate_result_table",
                     save_name,
                     label_name='class',
                     model_params={'validation.metrics': 'accuracy_score'},
                     result_column_names=result_column_names)

        eval_schema = self.get_table_schema(conn, "iris.evaluate_result_table")
        self.assertEqual(eval_schema.keys(), set(['loss', 'accuracy_score']))

        with temp_file.TemporaryDirectory(as_cwd=True):
            feature_column_names = [
                "petal_width", "petal_length", "sepal_width", "sepal_length"
            ]
            create_explain_table(conn, EstimatorType.XGBOOST, "TreeExplainer",
                                 "xgboost.gbtree", "iris.explain_result_table",
                                 feature_column_names)
            explain(ds, select, "TreeExplainer", {"plot_type": "decision"},
                    "iris.explain_result_table", save_name)

        explain_schema = self.get_table_schema(conn,
                                               "iris.explain_result_table")
        self.assertEqual(explain_schema.keys(), set(feature_column_names))

        with temp_file.TemporaryDirectory(as_cwd=True):
            create_explain_table(conn, EstimatorType.XGBOOST,
                                 "XGBoostExplainer", "xgboost.gbtree",
                                 "iris.explain_result_table_2",
                                 feature_column_names)
            explain(ds, select, "XGBoostExplainer", {},
                    "iris.explain_result_table_2", save_name)

        explain_schema = self.get_table_schema(conn,
                                               "iris.explain_result_table_2")
        self.assertEqual(explain_schema.keys(),
                         set(['feature', 'fscore', 'gain']))
        conn.close()
Example #12
    def check_main_impl(self, estimator):
        # End-to-end TensorFlow run for the given estimator: train, predict,
        # evaluate and explain on the iris dataset (MySQL only).
        if testing.get_driver() != "mysql":
            return

        ds = testing.get_datasource()
        original_sql = """SELECT * FROM iris.train
        TO TRAIN %s
        WITH
            model.hidden_units=[32,64],
            model.n_classes=3,
            validation.select="SELECT * FROM iris.test"
        LABEL class
        INTO iris.tensorflow_train_model_test;
        """ % estimator

        select = "SELECT * FROM iris.train"
        val_select = "SELECT * FROM iris.test"
        train_params = {"batch_size": 10}
        model_params = {"n_classes": 3, "hidden_units": [32, 64]}
        save_name = "iris.tensorflow_train_model_test"
        class_name = "class"

        with temp_file.TemporaryDirectory(as_cwd=True):
            train(original_sql=original_sql,
                  model_image="sqlflow:step",
                  estimator_string=estimator,
                  datasource=ds,
                  select=select,
                  validation_select=val_select,
                  model_params=model_params,
                  train_params=train_params,
                  validation_params=None,
                  feature_column_map=None,
                  label_column=NumericColumn(
                      FieldDesc(name=class_name, shape=[])),
                  save=save_name,
                  load=None)

        conn = db.connect_with_data_source(ds)

        pred_select = "SELECT * FROM iris.test"

        with temp_file.TemporaryDirectory(as_cwd=True):
            pred(ds, pred_select, "iris.predict_result_table", class_name,
                 save_name)

        self.assertEqual(
            self.get_table_row_count(conn, "iris.test"),
            self.get_table_row_count(conn, "iris.predict_result_table"))

        schema1 = self.get_table_schema(conn, "iris.test")
        schema2 = self.get_table_schema(conn, "iris.predict_result_table")
        self.assertEqual(len(schema1), len(schema2))
        for name in schema1:
            if name == 'class':
                self.assertEqual(schema2[name], "BIGINT")
                continue

            self.assertTrue(name in schema2)
            self.assertEqual(schema1[name], schema2[name])

        diff_schema = schema2.keys() - schema1.keys()
        self.assertEqual(len(diff_schema), 0)

        with temp_file.TemporaryDirectory(as_cwd=True):
            evaluate(ds, select, "iris.evaluate_result_table", save_name,
                     class_name, {'validation.metrics': 'Accuracy'})

        eval_schema = self.get_table_schema(conn, "iris.evaluate_result_table")
        eval_schema = set([k.lower() for k in eval_schema.keys()])
        self.assertEqual(eval_schema, set(['loss', 'accuracy']))

        with temp_file.TemporaryDirectory(as_cwd=True):
            explain(ds, select, None, {"plot_type": "bar"},
                    "iris.explain_result_table", save_name)

        explain_schema = self.get_table_schema(conn,
                                               "iris.explain_result_table")
        self.assertEqual(
            explain_schema.keys(),
            set(['petal_length', 'petal_width', 'sepal_length',
                 'sepal_width']))
        conn.close()
Example #13
    def test_metadata(self):
        # Collect training metadata, verify its fields, then round-trip it
        # through save_metadata/load_metadata.
        original_sql = '''
        SELECT c1, c2, class FROM my_db.train_table
        TO TRAIN my_docker_image:latest/DNNClassifier
        WITH
            model.n_classes = 3,
            model.hidden_units = [16, 32],
            validation.select="SELECT c1, c2, class FROM my_db.val_table"
        INTO my_db.my_dnn_model;
        '''

        select = "SELECT c1, c2, class FROM my_db.train_table"
        validation_select = "SELECT c1, c2, class FROM my_db.val_table"
        model_repo_image = "my_docker_image:latest"
        estimator = "DNNClassifier"
        attributes = {
            'n_classes': 3,
            'hidden_units': [16, 32],
        }

        features = {
            'feature_columns': [
                NumericColumn(FieldDesc(name='c1', shape=[3], delimiter=",")),
                NumericColumn(FieldDesc(name='c2', shape=[1])),
            ],
        }

        label = NumericColumn(FieldDesc(name='class', shape=[5],
                                        delimiter=','))

        def check_metadata(meta):
            self.assertEqual(meta['original_sql'], original_sql)
            self.assertEqual(meta['select'], select)
            self.assertEqual(meta['validation_select'], validation_select)
            self.assertEqual(meta['model_repo_image'], model_repo_image)
            self.assertEqual(meta['class_name'], estimator)
            self.assertEqual(meta['attributes'], attributes)
            meta_features = meta['features']
            meta_label = meta['label']
            self.assertEqual(len(meta_features), 1)
            self.assertEqual(len(meta_features['feature_columns']), 2)
            meta_features = meta_features['feature_columns']
            self.assertEqual(type(meta_features[0]), NumericColumn)
            self.assertEqual(type(meta_features[1]), NumericColumn)
            field_desc = meta_features[0].get_field_desc()[0]
            self.assertEqual(field_desc.name, 'c1')
            self.assertEqual(field_desc.shape, [3])
            self.assertEqual(field_desc.delimiter, ',')
            field_desc = meta_features[1].get_field_desc()[0]
            self.assertEqual(field_desc.name, 'c2')
            self.assertEqual(field_desc.shape, [1])
            self.assertEqual(type(meta_label), NumericColumn)
            field_desc = meta_label.get_field_desc()[0]
            self.assertEqual(field_desc.name, 'class')
            self.assertEqual(field_desc.shape, [5])
            self.assertEqual(field_desc.delimiter, ',')
            self.assertEqual(meta['evaluation'], {'accuracy': 0.5})
            self.assertEqual(meta['my_data'], 0.25)

        meta = collect_metadata(original_sql,
                                select,
                                validation_select,
                                model_repo_image,
                                estimator,
                                attributes,
                                features,
                                label, {'accuracy': 0.5},
                                my_data=0.25)

        check_metadata(meta)

        save_metadata(self.file_name, meta)
        meta = load_metadata(self.file_name)
        check_metadata(meta)
Example #14
    def test_main(self):
        # Train an XGBoost model on iris, then predict and evaluate, comparing
        # the result table schemas against the source table.
        ds = testing.get_datasource()
        original_sql = """SELECT * FROM iris.train
        TO TRAIN xgboost.gbtree
        WITH
            objective="multi:softmax",
            num_boost_round=20,
            num_class=3,
            validation.select="SELECT * FROM iris.test"
        INTO iris.xgboost_train_model_test;
        """

        select = "SELECT * FROM iris.train"
        val_select = "SELECT * FROM iris.test"
        train_params = {"num_boost_round": 20}
        model_params = {"num_class": 3, "objective": "multi:softmax"}
        save_name = "iris.xgboost_train_model_test"
        class_name = "class"

        with temp_file.TemporaryDirectory(as_cwd=True):
            eval_result = train(original_sql=original_sql,
                                model_image="sqlflow:step",
                                estimator_string="xgboost.gbtree",
                                datasource=ds,
                                select=select,
                                validation_select=val_select,
                                model_params=model_params,
                                train_params=train_params,
                                feature_column_map=None,
                                label_column=NumericColumn(
                                    FieldDesc(name=class_name)),
                                save=save_name)
            self.assertLess(eval_result['train']['merror'][-1], 0.01)
            self.assertLess(eval_result['validate']['merror'][-1], 0.01)

            conn = db.connect_with_data_source(ds)

            pred_select = "SELECT * FROM iris.test"
            pred(ds, pred_select, "iris.predict_result_table", class_name,
                 save_name)

            self.assertEqual(
                self.get_table_row_count(conn, "iris.test"),
                self.get_table_row_count(conn, "iris.predict_result_table"))

            schema1 = self.get_table_schema(conn, "iris.test")
            schema2 = self.get_table_schema(conn, "iris.predict_result_table")
            self.assertEqual(len(schema1), len(schema2))
            for name in schema1:
                if name == 'class':
                    self.assertEqual(schema2[name], "BIGINT")
                    continue

                self.assertTrue(name in schema2)
                self.assertEqual(schema1[name], schema2[name])

            diff_schema = schema2.keys() - schema1.keys()
            self.assertEqual(len(diff_schema), 0)

            evaluate(ds, pred_select, "iris.evaluate_result_table", save_name,
                     'class', ['accuracy_score'])
            eval_schema = self.get_table_schema(conn,
                                                "iris.evaluate_result_table")
            self.assertEqual(eval_schema.keys(),
                             set(['loss', 'accuracy_score']))