Ejemplo n.º 1
0
    def test_category_id_column(self):
        """CategoryIDColumn compiles for both model types, keeping the
        field key, bucket count, and (when set) the vocabulary."""
        column = CategoryIDColumn(FieldDesc(name='c1'), 128)
        for target in (TENSORFLOW, XGBOOST):
            compiled = self.compile_fc(column, target)
            self.assertEqual(compiled.key, 'c1')
            self.assertEqual(compiled.num_buckets, 128)

        # A vocabulary on the field descriptor must survive compilation.
        column = CategoryIDColumn(
            FieldDesc(name='c1', vocabulary=set(['a', 'b'])), 128)
        for target in (TENSORFLOW, XGBOOST):
            compiled = self.compile_fc(column, target)
            self.assertEqual(sorted(compiled.vocabulary_list), ['a', 'b'])
Ejemplo n.º 2
0
 def test_category_hash_column(self):
     """CategoryHashColumn compiles for both model types, keeping the
     field key and hash bucket size."""
     column = CategoryHashColumn(
         FieldDesc(name='c1', dtype=DataType.STRING), 32)
     for target in (TENSORFLOW, XGBOOST):
         compiled = self.compile_fc(column, target)
         self.assertEqual(compiled.key, 'c1')
         self.assertEqual(compiled.hash_bucket_size, 32)
Ejemplo n.º 3
0
    def test_numeric_column(self):
        """NumericColumn compiles for both model types, keeping the
        field key and shape."""
        column = NumericColumn(FieldDesc(name='c1', shape=(2, 3)))
        for target in (TENSORFLOW, XGBOOST):
            compiled = self.compile_fc(column, target)
            self.assertEqual(compiled.key, 'c1')
            self.assertEqual(compiled.shape, (2, 3))
Ejemplo n.º 4
0
    def test_train(self):
        """Train an XGBoost model on the iris dataset and check that the
        final training/validation multiclass error is near zero.

        NOTE(review): requires a live datasource with the iris.train and
        iris.test tables — this is an integration test, not a unit test.
        """
        ds = testing.get_datasource()
        # The original SQLFlow statement is passed through as training
        # metadata (see train_params below).
        original_sql = """SELECT * FROM iris.train
        TO TRAIN xgboost.gbtree
        WITH
            objective="multi:softmax",
            num_boost_round=20,
            num_class=3,
            validation.select="SELECT * FROM iris.test"
        INTO iris.xgboost_train_model_test;
        """

        select = "SELECT * FROM iris.train"
        val_select = "SELECT * FROM iris.test"
        train_params = {
            "num_boost_round": 20,
            "original_sql": original_sql,
            "feature_column_map": None,
            "label_column": NumericColumn(FieldDesc(name="class")),
            "model_image": "sqlflow:step"
        }
        model_params = {"num_class": 3, "objective": "multi:softmax"}
        eval_result = train(ds, "xgboost.gbtree", select, val_select,
                            model_params, "iris.xgboost_train_model_test",
                            None, train_params)
        # merror is XGBoost's multiclass error rate; the last entry is the
        # value after the final boosting round.
        self.assertLess(eval_result['train']['merror'][-1], 0.01)
        self.assertLess(eval_result['validate']['merror'][-1], 0.01)
Ejemplo n.º 5
0
    def test_bucket_column(self):
        """BucketColumn compiles for both model types, keeping the source
        column key and the bucket boundaries."""
        source = NumericColumn(FieldDesc(name='c1', shape=(1, )))
        bucketized = BucketColumn(source, (-10, -5, 3, 7))
        for target in (TENSORFLOW, XGBOOST):
            compiled = self.compile_fc(bucketized, target)
            self.assertEqual(compiled.source_column.key, 'c1')
            self.assertEqual(compiled.boundaries, (-10, -5, 3, 7))
Ejemplo n.º 6
0
    def test_cross_column(self):
        """CrossColumn compiles for TensorFlow only; compiling it for
        XGBoost must raise an AssertionError."""
        crossed = CrossColumn(['c1', NumericColumn(FieldDesc(name='c2'))],
                              4096)
        compiled = self.compile_fc(crossed, TENSORFLOW)
        self.assertEqual(list(compiled.keys), ['c1', 'c2'])
        self.assertEqual(compiled.hash_bucket_size, 4096)

        with self.assertRaises(AssertionError):
            self.compile_fc(crossed, XGBOOST)
Ejemplo n.º 7
0
    def test_indicator_column(self):
        """IndicatorColumn compiles for both model types; the wrapped
        category column keeps its key and bucket count."""
        category = CategoryIDColumn(FieldDesc(name='c1'), 128)
        indicator = IndicatorColumn(category_column=category)

        for target in (TENSORFLOW, XGBOOST):
            compiled = self.compile_fc(indicator, target)
            inner = compiled.categorical_column
            self.assertEqual(inner.key, 'c1')
            self.assertEqual(inner.num_buckets, 128)
Ejemplo n.º 8
0
    def test_seq_category_id_column(self):
        """SeqCategoryIDColumn compiles for TensorFlow only; compiling it
        for XGBoost must raise an AssertionError."""
        seq_column = SeqCategoryIDColumn(FieldDesc(name='c1'), 64)
        compiled = self.compile_fc(seq_column, TENSORFLOW)
        # NOTE: TensorFlow SeqCategoryIDColumn does not have key
        # attribute
        # self.assertEqual(compiled.key, 'c1')
        self.assertEqual(compiled.num_buckets, 64)

        with self.assertRaises(AssertionError):
            self.compile_fc(seq_column, XGBOOST)
Ejemplo n.º 9
0
    def get_field_desc(self):
        """Return the FieldDesc list derived from each key of this column.

        A plain string key becomes a default STRING-typed FieldDesc of
        shape [1]; a NumericColumn key contributes its own descriptors.

        Raises:
            ValueError: if a key is neither a string nor a NumericColumn.
        """
        result = []
        for key in self.keys:
            if isinstance(key, NumericColumn):
                result.extend(key.get_field_desc())
            elif isinstance(key, six.string_types):
                result.append(
                    FieldDesc(name=key, dtype=DataType.STRING, shape=[1]))
            else:
                raise ValueError("unsupported type %s" % type(key))
        return result
Ejemplo n.º 10
0
def new_default_field_desc(name):
    """
    Create a new default FieldDesc object.

    Args:
        name: the FieldDesc name.

    Returns:
        A FieldDesc object whose name is the given name,
        and the data type is INT64.
    """
    return FieldDesc(name=name, dtype=DataType.INT64)
Ejemplo n.º 11
0
    def test_train(self):
        """End-to-end XGBoost workflow: train, predict, explain, evaluate.

        NOTE(review): requires a live datasource with the iris.train and
        iris.test tables — this is an integration test, not a unit test.
        """
        ds = testing.get_datasource()
        original_sql = """SELECT * FROM iris.train
        TO TRAIN xgboost.gbtree
        WITH
            objective="multi:softmax",
            num_boost_round=20,
            num_class=3,
            validation.select="SELECT * FROM iris.test"
        INTO iris.xgboost_train_model_test;
        """
        select = "SELECT * FROM iris.train"
        val_select = "SELECT * FROM iris.test"
        train_params = {
            "num_boost_round": 20,
        }
        model_params = {"num_class": 3, "objective": "multi:softmax"}
        # Each step runs in its own scratch working directory so temporary
        # model files do not leak between steps.
        with temp_file.TemporaryDirectory(as_cwd=True):
            eval_result = train(ds, original_sql, select, val_select,
                                "xgboost.gbtree", "", None,
                                NumericColumn(FieldDesc(name="class")),
                                model_params, train_params, None,
                                "iris.xgboost_train_model_test", None)
            # merror is XGBoost's multiclass error rate after the final
            # boosting round.
            self.assertLess(eval_result['train']['merror'][-1], 0.01)
            self.assertLess(eval_result['validate']['merror'][-1], 0.01)

        with temp_file.TemporaryDirectory(as_cwd=True):
            pred_original_sql = """SELECT * FROM iris.test
            TO PREDICT iris.xgboost_pred_result.pred_val
            USING iris.xgboost_train_model_test;"""
            pred(ds, pred_original_sql, "SELECT * FROM iris.test",
                 "iris.xgboost_train_model_test", "pred_val", model_params,
                 "iris.xgboost_pred_result")

        with temp_file.TemporaryDirectory(as_cwd=True):
            explain_original_sql = """SELECT * FROM iris.test
            TO EXPLAIN iris.xgboost_train_model_test
            INTO iris.xgboost_explain_result;"""
            explain(ds, explain_original_sql, "SELECT * FROM iris.test",
                    "iris.xgboost_train_model_test", model_params,
                    "iris.xgboost_explain_result")

        with temp_file.TemporaryDirectory(as_cwd=True):
            evaluate_original_sql = """SELECT * FROM iris.test
            TO EVALUATE iris.xgboost_train_model_test
            WITH label_col=class
            INTO iris.xgboost_evaluate_result;"""
            evaluate(ds, evaluate_original_sql, "SELECT * FROM iris.test",
                     "class", "iris.xgboost_train_model_test", model_params,
                     "iris.xgboost_evaluate_result")
Ejemplo n.º 12
0
    def test_embedding_column(self):
        """EmbeddingColumn compiles for TensorFlow only; its combiner,
        dimension, and wrapped category column are preserved."""
        hashed = CategoryHashColumn(
            FieldDesc(name='c1', dtype=DataType.STRING), 32)
        embedding = EmbeddingColumn(category_column=hashed,
                                    combiner='sum',
                                    dimension=23)

        compiled = self.compile_fc(embedding, TENSORFLOW)
        self.assertEqual(compiled.combiner, 'sum')
        self.assertEqual(compiled.dimension, 23)

        inner = compiled.categorical_column
        self.assertEqual(inner.key, 'c1')
        self.assertEqual(inner.hash_bucket_size, 32)

        # Embedding columns cannot be compiled for XGBoost.
        with self.assertRaises(AssertionError):
            self.compile_fc(embedding, XGBOOST)
Ejemplo n.º 13
0
    def test_no_column_clause(self):
        """Feature derivation when no explicit COLUMN clause is given.

        All four iris feature columns should be inferred as dense FLOAT32
        NumericColumns of shape [1]; the caller-supplied INT64 label is
        kept, with its shape normalized to [].

        NOTE(review): requires a live database connection with the
        iris.train table.
        """
        columns = [
            "sepal_length",
            "sepal_width",
            "petal_length",
            "petal_width",
        ]

        select = "select %s, class from iris.train" % ",".join(columns)

        conn = testing.get_singleton_db_connection()
        # features=None: everything must be derived from the data itself.
        features = None
        label = NumericColumn(
            FieldDesc(name='class', dtype=DataType.INT64, shape=[1]))
        features, label = fd.infer_feature_columns(conn, select, features,
                                                   label)

        # Derived columns must survive a JSON round trip.
        self.check_json_dump(features)
        self.check_json_dump(label)

        self.assertEqual(len(features), 1)
        self.assertTrue("feature_columns" in features)
        features = features["feature_columns"]
        self.assertEqual(len(features), 4)

        # Derived features keep the SELECT column order.
        for i, f in enumerate(features):
            self.assertTrue(isinstance(f, NumericColumn))
            self.assertEqual(len(f.get_field_desc()), 1)
            field_desc = f.get_field_desc()[0]
            self.assertEqual(field_desc.name, columns[i])
            self.assertEqual(field_desc.dtype, DataType.FLOAT32)
            self.assertEqual(field_desc.format, DataFormat.PLAIN)
            self.assertFalse(field_desc.is_sparse)
            self.assertEqual(field_desc.shape, [1])

        self.assertTrue(isinstance(label, NumericColumn))
        self.assertEqual(len(label.get_field_desc()), 1)
        field_desc = label.get_field_desc()[0]
        self.assertEqual(field_desc.name, "class")
        self.assertEqual(field_desc.dtype, DataType.INT64)
        self.assertEqual(field_desc.format, DataFormat.PLAIN)
        self.assertFalse(field_desc.is_sparse)
        self.assertEqual(field_desc.shape, [])
Ejemplo n.º 14
0
 def _from_dict(cls, d):
     """Rebuild a SeqCategoryIDColumn from its dict representation."""
     return SeqCategoryIDColumn(FieldDesc.from_dict(d["field_desc"]),
                                d["bucket_size"])
Ejemplo n.º 15
0
 def _from_dict(cls, d):
     """Rebuild a NumericColumn from its dict representation."""
     desc = FieldDesc.from_dict(d["field_desc"])
     return NumericColumn(desc)
Ejemplo n.º 16
0
    def test_main(self):
        """End-to-end XGBoost workflow against the iris database: train,
        predict, evaluate, and explain (with two explainers), verifying
        result-table row counts and schemas along the way.

        NOTE(review): requires a live datasource with the iris.train and
        iris.test tables — this is an integration test.
        """
        ds = testing.get_datasource()
        original_sql = """SELECT * FROM iris.train
        TO TRAIN xgboost.gbtree
        WITH
            objective="multi:softprob",
            num_boost_round=20,
            num_class=3,
            validation.select="SELECT * FROM iris.test"
        LABEL class
        INTO iris.xgboost_train_model_test;
        """

        select = "SELECT * FROM iris.train"
        val_select = "SELECT * FROM iris.test"
        train_params = {"num_boost_round": 20}
        model_params = {"num_class": 3, "objective": "multi:softprob"}
        save_name = "iris.xgboost_train_model_test"
        class_name = "class"

        # Each step runs in a scratch working directory so temporary
        # model files do not leak into the tree.
        with temp_file.TemporaryDirectory(as_cwd=True):
            eval_result = train(datasource=ds,
                                original_sql=original_sql,
                                select=select,
                                validation_select=val_select,
                                estimator_string="xgboost.gbtree",
                                model_image="sqlflow:step",
                                feature_column_map=None,
                                label_column=NumericColumn(
                                    FieldDesc(name=class_name)),
                                model_params=model_params,
                                train_params=train_params,
                                validation_params=None,
                                save=save_name,
                                load=None)

        # merror is XGBoost's multiclass error rate after the final round.
        self.assertLess(eval_result['train']['merror'][-1], 0.01)
        self.assertLess(eval_result['validate']['merror'][-1], 0.01)

        conn = db.connect_with_data_source(ds)
        pred_select = "SELECT * FROM iris.test"

        with temp_file.TemporaryDirectory(as_cwd=True):
            result_column_names, train_label_idx = create_predict_table(
                conn, select, "iris.predict_result_table",
                FieldDesc(name=class_name), "class")
            predict(ds, pred_select, "iris.predict_result_table",
                    result_column_names, train_label_idx, save_name)

        # Prediction must produce one output row per input row.
        self.assertEqual(
            self.get_table_row_count(conn, "iris.test"),
            self.get_table_row_count(conn, "iris.predict_result_table"))

        # The prediction table mirrors the input schema, except that the
        # label column is rewritten as BIGINT.
        schema1 = self.get_table_schema(conn, "iris.test")
        schema2 = self.get_table_schema(conn, "iris.predict_result_table")
        self.assertEqual(len(schema1), len(schema2))
        for name in schema1:
            if name == 'class':
                self.assertEqual(schema2[name], "BIGINT")
                continue

            self.assertTrue(name in schema2)
            self.assertEqual(schema1[name], schema2[name])

        diff_schema = schema2.keys() - schema1.keys()
        self.assertEqual(len(diff_schema), 0)

        with temp_file.TemporaryDirectory(as_cwd=True):
            result_column_names = create_evaluate_table(
                conn, "iris.evaluate_result_table", ["accuracy_score"])
            evaluate(ds,
                     pred_select,
                     "iris.evaluate_result_table",
                     save_name,
                     label_name='class',
                     model_params={'validation.metrics': 'accuracy_score'},
                     result_column_names=result_column_names)

        eval_schema = self.get_table_schema(conn, "iris.evaluate_result_table")
        self.assertEqual(eval_schema.keys(), set(['loss', 'accuracy_score']))

        # TreeExplainer writes one result column per input feature.
        with temp_file.TemporaryDirectory(as_cwd=True):
            feature_column_names = [
                "petal_width", "petal_length", "sepal_width", "sepal_length"
            ]
            create_explain_table(conn, EstimatorType.XGBOOST, "TreeExplainer",
                                 "xgboost.gbtree", "iris.explain_result_table",
                                 feature_column_names)
            explain(ds, select, "TreeExplainer", {"plot_type": "decision"},
                    "iris.explain_result_table", save_name)

        explain_schema = self.get_table_schema(conn,
                                               "iris.explain_result_table")
        self.assertEqual(explain_schema.keys(), set(feature_column_names))

        # XGBoostExplainer writes fixed (feature, fscore, gain) columns.
        with temp_file.TemporaryDirectory(as_cwd=True):
            create_explain_table(conn, EstimatorType.XGBOOST,
                                 "XGBoostExplainer", "xgboost.gbtree",
                                 "iris.explain_result_table_2",
                                 feature_column_names)
            explain(ds, select, "XGBoostExplainer", {},
                    "iris.explain_result_table_2", save_name)

        explain_schema = self.get_table_schema(conn,
                                               "iris.explain_result_table_2")
        self.assertEqual(explain_schema.keys(),
                         set(['feature', 'fscore', 'gain']))
        conn.close()
Ejemplo n.º 17
0
    def test_main(self):
        """End-to-end XGBoost workflow: train, predict, evaluate, checking
        result-table row counts and schemas.

        NOTE(review): requires a live datasource with the iris.train and
        iris.test tables — this is an integration test.
        """
        ds = testing.get_datasource()
        original_sql = """SELECT * FROM iris.train
        TO TRAIN xgboost.gbtree
        WITH
            objective="multi:softmax",
            num_boost_round=20,
            num_class=3,
            validation.select="SELECT * FROM iris.test"
        INTO iris.xgboost_train_model_test;
        """

        select = "SELECT * FROM iris.train"
        val_select = "SELECT * FROM iris.test"
        train_params = {"num_boost_round": 20}
        model_params = {"num_class": 3, "objective": "multi:softmax"}
        save_name = "iris.xgboost_train_model_test"
        class_name = "class"

        # The whole workflow runs inside one scratch working directory so
        # temporary model files do not leak into the tree.
        with temp_file.TemporaryDirectory(as_cwd=True):
            eval_result = train(original_sql=original_sql,
                                model_image="sqlflow:step",
                                estimator_string="xgboost.gbtree",
                                datasource=ds,
                                select=select,
                                validation_select=val_select,
                                model_params=model_params,
                                train_params=train_params,
                                feature_column_map=None,
                                label_column=NumericColumn(
                                    FieldDesc(name=class_name)),
                                save=save_name)
            # merror is XGBoost's multiclass error rate after the final
            # boosting round.
            self.assertLess(eval_result['train']['merror'][-1], 0.01)
            self.assertLess(eval_result['validate']['merror'][-1], 0.01)

            conn = db.connect_with_data_source(ds)

            pred_select = "SELECT * FROM iris.test"
            pred(ds, pred_select, "iris.predict_result_table", class_name,
                 save_name)

            # Prediction must produce one output row per input row.
            self.assertEqual(
                self.get_table_row_count(conn, "iris.test"),
                self.get_table_row_count(conn, "iris.predict_result_table"))

            # The prediction table mirrors the input schema, except that
            # the label column is rewritten as BIGINT.
            schema1 = self.get_table_schema(conn, "iris.test")
            schema2 = self.get_table_schema(conn, "iris.predict_result_table")
            self.assertEqual(len(schema1), len(schema2))
            for name in schema1:
                if name == 'class':
                    self.assertEqual(schema2[name], "BIGINT")
                    continue

                self.assertTrue(name in schema2)
                self.assertEqual(schema1[name], schema2[name])

            diff_schema = schema2.keys() - schema1.keys()
            self.assertEqual(len(diff_schema), 0)

            evaluate(ds, pred_select, "iris.evaluate_result_table", save_name,
                     'class', ['accuracy_score'])
            eval_schema = self.get_table_schema(conn,
                                                "iris.evaluate_result_table")
            self.assertEqual(eval_schema.keys(),
                             set(['loss', 'accuracy_score']))
Ejemplo n.º 18
0
    def test_without_cross(self):
        """Feature derivation when the COLUMN clause has no CROSS columns.

        c3's embedding has no category column and must get one derived
        from the data; c5's embedding is fully specified and must pass
        through; c1/c2/c4/c6 are absent from the clause and are derived
        entirely from the data.

        NOTE(review): requires a live database connection with the
        feature_derivation_case.train table.
        """
        features = {
            'feature_columns': [
                # No category_column: derivation must fill it in.
                EmbeddingColumn(dimension=256, combiner="mean", name="c3"),
                # Fully specified sparse embedding: must pass through.
                EmbeddingColumn(category_column=CategoryIDColumn(
                    FieldDesc(name="c5",
                              dtype=DataType.INT64,
                              shape=[10000],
                              delimiter=",",
                              is_sparse=True),
                    bucket_size=5000),
                                dimension=64,
                                combiner="sqrtn",
                                name="c5"),
            ]
        }

        label = NumericColumn(
            FieldDesc(name="class", dtype=DataType.INT64, shape=[1]))

        select = "select c1, c2, c3, c4, c5, c6, class " \
                 "from feature_derivation_case.train"
        conn = testing.get_singleton_db_connection()
        features, label = fd.infer_feature_columns(conn, select, features,
                                                   label)

        # Derived columns must survive a JSON round trip.
        self.check_json_dump(features)
        self.check_json_dump(label)

        self.assertEqual(len(features), 1)
        self.assertTrue("feature_columns" in features)
        features = features["feature_columns"]
        self.assertEqual(len(features), 6)

        # c1: derived dense FLOAT32 scalar.
        fc1 = features[0]
        self.assertTrue(isinstance(fc1, NumericColumn))
        self.assertEqual(len(fc1.get_field_desc()), 1)
        field_desc = fc1.get_field_desc()[0]
        self.assertEqual(field_desc.name, "c1")
        self.assertEqual(field_desc.dtype, DataType.FLOAT32)
        self.assertEqual(field_desc.format, DataFormat.PLAIN)
        self.assertFalse(field_desc.is_sparse)
        self.assertEqual(field_desc.shape, [1])

        # c2: derived dense FLOAT32 scalar.
        fc2 = features[1]
        self.assertTrue(isinstance(fc2, NumericColumn))
        self.assertEqual(len(fc2.get_field_desc()), 1)
        field_desc = fc2.get_field_desc()[0]
        self.assertEqual(field_desc.name, "c2")
        self.assertEqual(field_desc.dtype, DataType.FLOAT32)
        self.assertEqual(field_desc.format, DataFormat.PLAIN)
        self.assertFalse(field_desc.is_sparse)
        self.assertEqual(field_desc.shape, [1])

        # c3: embedding keeps its dimension/combiner and gains a derived
        # CategoryIDColumn over CSV-formatted INT64 data.
        fc3 = features[2]
        self.assertTrue(isinstance(fc3, EmbeddingColumn))
        self.assertEqual(len(fc3.get_field_desc()), 1)
        field_desc = fc3.get_field_desc()[0]
        self.assertEqual(field_desc.name, "c3")
        self.assertEqual(field_desc.dtype, DataType.INT64)
        self.assertEqual(field_desc.format, DataFormat.CSV)
        self.assertFalse(field_desc.is_sparse)
        self.assertEqual(field_desc.shape, [4])
        self.assertEqual(fc3.dimension, 256)
        self.assertEqual(fc3.combiner, "mean")
        self.assertEqual(fc3.name, "c3")
        self.assertTrue(isinstance(fc3.category_column, CategoryIDColumn))
        self.assertEqual(fc3.category_column.bucket_size, 10)

        # c4: derived dense FLOAT32 CSV column of length 4.
        fc4 = features[3]
        self.assertTrue(isinstance(fc4, NumericColumn))
        self.assertEqual(len(fc4.get_field_desc()), 1)
        field_desc = fc4.get_field_desc()[0]
        self.assertEqual(field_desc.name, "c4")
        self.assertEqual(field_desc.dtype, DataType.FLOAT32)
        self.assertEqual(field_desc.format, DataFormat.CSV)
        self.assertFalse(field_desc.is_sparse)
        self.assertEqual(field_desc.shape, [4])

        # c5: the fully specified sparse embedding passes through intact.
        fc5 = features[4]
        self.assertTrue(isinstance(fc5, EmbeddingColumn))
        self.assertEqual(len(fc5.get_field_desc()), 1)
        field_desc = fc5.get_field_desc()[0]
        self.assertEqual(field_desc.name, "c5")
        self.assertEqual(field_desc.dtype, DataType.INT64)
        self.assertEqual(field_desc.format, DataFormat.CSV)
        self.assertTrue(field_desc.is_sparse)
        self.assertEqual(field_desc.shape, [10000])
        self.assertEqual(fc5.dimension, 64)
        self.assertEqual(fc5.combiner, "sqrtn")
        self.assertEqual(fc5.name, "c5")
        self.assertTrue(isinstance(fc5.category_column, CategoryIDColumn))
        self.assertEqual(fc5.category_column.bucket_size, 5000)

        # c6: derived string column with a vocabulary, wrapped into an
        # embedding over a CategoryIDColumn.
        fc6 = features[5]
        self.assertTrue(isinstance(fc6, EmbeddingColumn))
        self.assertEqual(len(fc6.get_field_desc()), 1)
        field_desc = fc6.get_field_desc()[0]
        self.assertEqual(field_desc.name, "c6")
        self.assertEqual(field_desc.dtype, DataType.STRING)
        self.assertEqual(field_desc.format, DataFormat.PLAIN)
        self.assertFalse(field_desc.is_sparse)
        self.assertEqual(field_desc.shape, [1])
        self.assertEqual(field_desc.vocabulary, set(['FEMALE', 'MALE',
                                                     'NULL']))
        self.assertEqual(fc6.dimension, 128)
        self.assertEqual(fc6.combiner, "sum")
        self.assertEqual(fc6.name, "c6")
        self.assertTrue(isinstance(fc6.category_column, CategoryIDColumn))
        self.assertEqual(fc6.category_column.bucket_size, 3)

        # The label keeps its INT64 dtype; its shape is normalized to [].
        self.assertTrue(isinstance(label, NumericColumn))
        self.assertEqual(len(label.get_field_desc()), 1)
        field_desc = label.get_field_desc()[0]
        self.assertEqual(field_desc.name, "class")
        self.assertEqual(field_desc.dtype, DataType.INT64)
        self.assertEqual(field_desc.format, DataFormat.PLAIN)
        self.assertFalse(field_desc.is_sparse)
        self.assertEqual(field_desc.shape, [])
Ejemplo n.º 19
0
    def test_with_cross(self):
        """Feature derivation when the COLUMN clause contains CROSS columns.

        c1/c2/c4/c5 are given explicitly (c4 and c5 only inside crosses);
        c3 is absent from the clause and must be derived from the data.
        Verifies the types, dtypes, formats, and shapes of every resulting
        feature column and of the label.

        NOTE(review): requires a live database connection with the
        feature_derivation_case.train table.
        """
        c1 = NumericColumn(
            FieldDesc(name='c1', dtype=DataType.INT64, shape=[1]))
        c2 = NumericColumn(
            FieldDesc(name='c2', dtype=DataType.INT64, shape=[1]))
        c4 = NumericColumn(
            FieldDesc(name='c4', dtype=DataType.INT64, shape=[1]))
        c5 = NumericColumn(
            FieldDesc(name='c5',
                      dtype=DataType.INT64,
                      shape=[1],
                      is_sparse=True))

        features = {
            'feature_columns': [
                c1,
                c2,
                CrossColumn([c4, c5], 128),
                CrossColumn([c1, c2], 256),
            ]
        }

        label = NumericColumn(
            FieldDesc(name='class', dtype=DataType.INT64, shape=[1]))
        select = "select c1, c2, c3, c4, c5, class " \
                 "from feature_derivation_case.train"

        conn = testing.get_singleton_db_connection()
        features, label = fd.infer_feature_columns(conn, select, features,
                                                   label)

        # Derived columns must survive a JSON round trip.
        self.check_json_dump(features)
        self.check_json_dump(label)

        self.assertEqual(len(features), 1)
        self.assertTrue("feature_columns" in features)
        features = features["feature_columns"]
        self.assertEqual(len(features), 5)

        # c1: dense scalar, promoted from INT64 to FLOAT32.
        fc1 = features[0]
        self.assertTrue(isinstance(fc1, NumericColumn))
        self.assertEqual(len(fc1.get_field_desc()), 1)
        field_desc = fc1.get_field_desc()[0]
        self.assertEqual(field_desc.name, "c1")
        self.assertEqual(field_desc.dtype, DataType.FLOAT32)
        self.assertEqual(field_desc.format, DataFormat.PLAIN)
        self.assertFalse(field_desc.is_sparse)
        self.assertEqual(field_desc.shape, [1])

        # c2: dense scalar, promoted from INT64 to FLOAT32.
        fc2 = features[1]
        self.assertTrue(isinstance(fc2, NumericColumn))
        self.assertEqual(len(fc2.get_field_desc()), 1)
        field_desc = fc2.get_field_desc()[0]
        self.assertEqual(field_desc.name, "c2")
        self.assertEqual(field_desc.dtype, DataType.FLOAT32)
        self.assertEqual(field_desc.format, DataFormat.PLAIN)
        self.assertFalse(field_desc.is_sparse)
        self.assertEqual(field_desc.shape, [1])

        # c3: not in the COLUMN clause, derived as an INT64 CSV column.
        fc3 = features[2]
        self.assertTrue(isinstance(fc3, NumericColumn))
        self.assertEqual(len(fc3.get_field_desc()), 1)
        field_desc = fc3.get_field_desc()[0]
        self.assertEqual(field_desc.name, "c3")
        self.assertEqual(field_desc.dtype, DataType.INT64)
        self.assertEqual(field_desc.format, DataFormat.CSV)
        self.assertFalse(field_desc.is_sparse)
        self.assertEqual(field_desc.shape, [4])

        # CROSS(c4, c5): two field descriptors, one per crossed column.
        fc4 = features[3]
        self.assertTrue(isinstance(fc4, CrossColumn))
        self.assertEqual(len(fc4.get_field_desc()), 2)
        field_desc1 = fc4.get_field_desc()[0]
        self.assertEqual(field_desc1.name, "c4")
        self.assertEqual(field_desc1.dtype, DataType.FLOAT32)
        self.assertEqual(field_desc1.format, DataFormat.CSV)
        self.assertEqual(field_desc1.shape, [4])
        self.assertFalse(field_desc1.is_sparse)
        field_desc2 = fc4.get_field_desc()[1]
        self.assertEqual(field_desc2.name, "c5")
        self.assertEqual(field_desc2.dtype, DataType.INT64)
        self.assertEqual(field_desc2.format, DataFormat.CSV)
        self.assertTrue(field_desc2.is_sparse)

        # CROSS(c1, c2): two dense FLOAT32 scalar descriptors.
        fc5 = features[4]
        self.assertTrue(isinstance(fc5, CrossColumn))
        # BUGFIX: this assertion previously re-checked fc4 instead of fc5.
        self.assertEqual(len(fc5.get_field_desc()), 2)
        field_desc1 = fc5.get_field_desc()[0]
        self.assertEqual(field_desc1.name, "c1")
        self.assertEqual(field_desc1.dtype, DataType.FLOAT32)
        self.assertEqual(field_desc1.format, DataFormat.PLAIN)
        self.assertEqual(field_desc1.shape, [1])
        self.assertFalse(field_desc1.is_sparse)
        field_desc2 = fc5.get_field_desc()[1]
        self.assertEqual(field_desc2.name, "c2")
        self.assertEqual(field_desc2.dtype, DataType.FLOAT32)
        self.assertEqual(field_desc2.format, DataFormat.PLAIN)
        self.assertEqual(field_desc2.shape, [1])
        self.assertFalse(field_desc2.is_sparse)

        # The label keeps its INT64 dtype; its shape is normalized to [].
        self.assertTrue(isinstance(label, NumericColumn))
        self.assertEqual(len(label.get_field_desc()), 1)
        field_desc = label.get_field_desc()[0]
        self.assertEqual(field_desc.name, "class")
        self.assertEqual(field_desc.dtype, DataType.INT64)
        self.assertEqual(field_desc.format, DataFormat.PLAIN)
        self.assertFalse(field_desc.is_sparse)
        self.assertEqual(field_desc.shape, [])
Ejemplo n.º 20
0
    def test_metadata(self):
        """Round-trip training metadata through collect_metadata /
        save_metadata / load_metadata and verify every recorded field,
        including extra keyword metadata (my_data)."""
        original_sql = '''
        SELECT c1, c2, class FROM my_db.train_table
        TO TRAIN my_docker_image:latest/DNNClassifier
        WITH
            model.n_classes = 3,
            model.hidden_units = [16, 32],
            validation.select="SELECT c1, c2, class FROM my_db.val_table"
        INTO my_db.my_dnn_model;
        '''

        select = "SELECT c1, c2, class FROM my_db.train_table"
        validation_select = "SELECT c1, c2, class FROM my_db.val_table"
        model_repo_image = "my_docker_image:latest"
        estimator = "DNNClassifier"
        attributes = {
            'n_classes': 3,
            'hidden_units': [16, 32],
        }

        features = {
            'feature_columns': [
                NumericColumn(FieldDesc(name='c1', shape=[3], delimiter=",")),
                NumericColumn(FieldDesc(name='c2', shape=[1])),
            ],
        }

        label = NumericColumn(FieldDesc(name='class', shape=[5],
                                        delimiter=','))

        def check_metadata(meta):
            # Asserts that every field recorded above survived collection
            # (and, on the second call, a save/load cycle).
            self.assertEqual(meta['original_sql'], original_sql)
            self.assertEqual(meta['select'], select)
            self.assertEqual(meta['validation_select'], validation_select)
            self.assertEqual(meta['model_repo_image'], model_repo_image)
            self.assertEqual(meta['class_name'], estimator)
            self.assertEqual(meta['attributes'], attributes)
            meta_features = meta['features']
            meta_label = meta['label']
            self.assertEqual(len(meta_features), 1)
            self.assertEqual(len(meta_features['feature_columns']), 2)
            meta_features = meta_features['feature_columns']
            self.assertEqual(type(meta_features[0]), NumericColumn)
            self.assertEqual(type(meta_features[1]), NumericColumn)
            field_desc = meta_features[0].get_field_desc()[0]
            self.assertEqual(field_desc.name, 'c1')
            self.assertEqual(field_desc.shape, [3])
            self.assertEqual(field_desc.delimiter, ',')
            field_desc = meta_features[1].get_field_desc()[0]
            self.assertEqual(field_desc.name, 'c2')
            self.assertEqual(field_desc.shape, [1])
            self.assertEqual(type(meta_label), NumericColumn)
            field_desc = meta_label.get_field_desc()[0]
            self.assertEqual(field_desc.name, 'class')
            self.assertEqual(field_desc.shape, [5])
            self.assertEqual(field_desc.delimiter, ',')
            self.assertEqual(meta['evaluation'], {'accuracy': 0.5})
            # Extra keyword arguments are stored as top-level keys.
            self.assertEqual(meta['my_data'], 0.25)

        meta = collect_metadata(original_sql,
                                select,
                                validation_select,
                                model_repo_image,
                                estimator,
                                attributes,
                                features,
                                label, {'accuracy': 0.5},
                                my_data=0.25)

        check_metadata(meta)

        # The metadata must survive a save/load round trip unchanged.
        save_metadata(self.file_name, meta)
        meta = load_metadata(self.file_name)
        check_metadata(meta)
Ejemplo n.º 21
0
 def infer_index(self, string):
     """Return the first shape dimension that fd.fill_kv_field_desc
     infers from *string* when filling a zero-shaped FieldDesc."""
     desc = FieldDesc(shape=[0])
     fd.fill_kv_field_desc(string, desc)
     return desc.shape[0]
Ejemplo n.º 22
0
    def check_main_impl(self, estimator):
        """End-to-end TensorFlow workflow for *estimator*: train, predict,
        evaluate, explain, verifying result-table schemas.

        Silently skipped for non-MySQL drivers.

        NOTE(review): requires a live datasource with the iris.train and
        iris.test tables — this is an integration test.
        """
        if testing.get_driver() != "mysql":
            return

        ds = testing.get_datasource()
        original_sql = """SELECT * FROM iris.train
        TO TRAIN %s
        WITH
            model.hidden_units=[32,64],
            model.n_classes=3,
            validation.select="SELECT * FROM iris.test"
        LABEL class
        INTO iris.tensorflow_train_model_test;
        """ % estimator

        select = "SELECT * FROM iris.train"
        val_select = "SELECT * FROM iris.test"
        train_params = {"batch_size": 10}
        model_params = {"n_classes": 3, "hidden_units": [32, 64]}
        save_name = "iris.tensorflow_train_model_test"
        class_name = "class"

        # Each step runs in its own scratch working directory so temporary
        # model files do not leak between steps.
        with temp_file.TemporaryDirectory(as_cwd=True):
            train(original_sql=original_sql,
                  model_image="sqlflow:step",
                  estimator_string=estimator,
                  datasource=ds,
                  select=select,
                  validation_select=val_select,
                  model_params=model_params,
                  train_params=train_params,
                  validation_params=None,
                  feature_column_map=None,
                  label_column=NumericColumn(
                      FieldDesc(name=class_name, shape=[])),
                  save=save_name,
                  load=None)

        conn = db.connect_with_data_source(ds)

        pred_select = "SELECT * FROM iris.test"

        with temp_file.TemporaryDirectory(as_cwd=True):
            pred(ds, pred_select, "iris.predict_result_table", class_name,
                 save_name)

        # Prediction must produce one output row per input row.
        self.assertEqual(
            self.get_table_row_count(conn, "iris.test"),
            self.get_table_row_count(conn, "iris.predict_result_table"))

        # The prediction table mirrors the input schema, except that the
        # label column is rewritten as BIGINT.
        schema1 = self.get_table_schema(conn, "iris.test")
        schema2 = self.get_table_schema(conn, "iris.predict_result_table")
        self.assertEqual(len(schema1), len(schema2))
        for name in schema1:
            if name == 'class':
                self.assertEqual(schema2[name], "BIGINT")
                continue

            self.assertTrue(name in schema2)
            self.assertEqual(schema1[name], schema2[name])

        diff_schema = schema2.keys() - schema1.keys()
        self.assertEqual(len(diff_schema), 0)

        with temp_file.TemporaryDirectory(as_cwd=True):
            evaluate(ds, select, "iris.evaluate_result_table", save_name,
                     class_name, {'validation.metrics': 'Accuracy'})

        # Column-name casing varies by driver, so compare lower-cased.
        eval_schema = self.get_table_schema(conn, "iris.evaluate_result_table")
        eval_schema = set([k.lower() for k in eval_schema.keys()])
        self.assertEqual(eval_schema, set(['loss', 'accuracy']))

        with temp_file.TemporaryDirectory(as_cwd=True):
            explain(ds, select, None, {"plot_type": "bar"},
                    "iris.explain_result_table", save_name)

        # The explain result has one column per input feature.
        explain_schema = self.get_table_schema(conn,
                                               "iris.explain_result_table")
        self.assertEqual(
            explain_schema.keys(),
            set(['petal_length', 'petal_width', 'sepal_length',
                 'sepal_width']))
        conn.close()