Example 1
    def save_to_oss(self, oss_model_dir, local_dir=None):
        """
        This save function archives all the files under local_dir
        into a tarball and saves it to the OSS model directory.

        Args:
            oss_model_dir (str): the OSS model directory to save to.
                It is in the format of oss://bucket/path/to/dir/.
            local_dir (str): the local directory to archive.

        Returns:
            None.
        """
        if local_dir is None:
            local_dir = os.getcwd()

        with temp_file.TemporaryDirectory() as tmp_dir:
            tarball = os.path.join(tmp_dir, TARBALL_NAME)
            self._zip(local_dir, tarball)
            oss.save_file(oss_model_dir, tarball, TARBALL_NAME)

        with temp_file.TemporaryDirectory() as tmp_dir:
            model_obj_file = os.path.join(tmp_dir, MODEL_OBJ_FILE_NAME)
            with open(model_obj_file, "w") as f:
                f.write(
                    json.dumps(self._to_dict(),
                               cls=JSONEncoderWithFeatureColumn))
            oss.save_file(oss_model_dir, model_obj_file, MODEL_OBJ_FILE_NAME)
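
A minimal usage sketch of the method above. The metadata and the OSS path are hypothetical placeholders; Model and EstimatorType come from SQLFlow's model package (see Example 3):

# Hypothetical usage: with local_dir left as None, the current working
# directory is archived and uploaded to the given OSS directory.
m = Model(EstimatorType.XGBOOST, {"model_params": {"n_classes": 3}})
m.save_to_oss("oss://my-bucket/path/to/model/")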
Example 2
    def test_train(self):
        ds = testing.get_datasource()
        original_sql = """SELECT * FROM iris.train
        TO TRAIN xgboost.gbtree
        WITH
            objective="multi:softmax",
            num_boost_round=20,
            num_class=3,
            validation.select="SELECT * FROM iris.test"
        INTO iris.xgboost_train_model_test;
        """
        select = "SELECT * FROM iris.train"
        val_select = "SELECT * FROM iris.test"
        train_params = {
            "num_boost_round": 20,
        }
        model_params = {"num_class": 3, "objective": "multi:softmax"}
        with temp_file.TemporaryDirectory(as_cwd=True):
            eval_result = train(ds, original_sql, select, val_select,
                                "xgboost.gbtree", "", None,
                                NumericColumn(FieldDesc(name="class")),
                                model_params, train_params, None,
                                "iris.xgboost_train_model_test", None)
            self.assertLess(eval_result['train']['merror'][-1], 0.01)
            self.assertLess(eval_result['validate']['merror'][-1], 0.01)

        with temp_file.TemporaryDirectory(as_cwd=True):
            pred_original_sql = """SELECT * FROM iris.test
            TO PREDICT iris.xgboost_pred_result.pred_val
            USING iris.xgboost_train_model_test;"""
            pred(ds, pred_original_sql, "SELECT * FROM iris.test",
                 "iris.xgboost_train_model_test", "pred_val", model_params,
                 "iris.xgboost_pred_result")

        with temp_file.TemporaryDirectory(as_cwd=True):
            explain_original_sql = """SELECT * FROM iris.test
            TO EXPLAIN iris.xgboost_train_model_test
            INTO iris.xgboost_explain_result;"""
            explain(ds, explain_original_sql, "SELECT * FROM iris.test",
                    "iris.xgboost_train_model_test", model_params,
                    "iris.xgboost_explain_result")

        with temp_file.TemporaryDirectory(as_cwd=True):
            evaluate_original_sql = """SELECT * FROM iris.test
            TO EVALUATE iris.xgboost_train_model_test
            WITH label_col=class
            INTO iris.xgboost_evaluate_result;"""
            evaluate(ds, evaluate_original_sql, "SELECT * FROM iris.test",
                     "class", "iris.xgboost_train_model_test", model_params,
                     "iris.xgboost_evaluate_result")
Example 3
    def test_save_load_db(self):
        table = "sqlflow_models.test_model"
        meta = {"model_params": {"n_classes": 3}}
        m = Model(EstimatorType.XGBOOST, meta)
        datasource = get_datasource()

        # save model
        with temp_file.TemporaryDirectory() as d:
            m.save_to_db(datasource, table, d)

        # load model
        with temp_file.TemporaryDirectory() as d:
            m = Model.load_from_db(datasource, table, d)
            self.assertEqual(m._meta, meta)
Example 4
    def load_from_oss(oss_model_dir, local_dir=None):
        """
        Load the saved model from OSS and unzip it into local_dir.

        Args:
            oss_model_dir (str): the OSS model directory to load from.
                It is in the format of oss://bucket/path/to/dir/.
            local_dir (str): the local directory to unzip into.

        Returns:
            Model: a Model object representing the model type and meta
            information.
        """
        if local_dir is None:
            local_dir = os.getcwd()

        with temp_file.TemporaryDirectory() as tmp_dir:
            tarball = os.path.join(tmp_dir, TARBALL_NAME)
            oss.load_file(oss_model_dir, tarball, TARBALL_NAME)
            Model._unzip(local_dir, tarball)

            model_obj_file = os.path.join(tmp_dir, MODEL_OBJ_FILE_NAME)
            oss.load_file(oss_model_dir, model_obj_file, MODEL_OBJ_FILE_NAME)
            with open(model_obj_file, "r") as f:
                d = json.loads(f.read(), cls=JSONDecoderWithFeatureColumn)
                model = Model._from_dict(d)
            return model
Example 5
    def test_tar(self):
        with temp_file.TemporaryDirectory(as_cwd=True):
            # create the test file tree:
            #
            # |-sqlflow_tar
            #   |-sqlflow_sub_dir
            #     |-hello.py
            test_dir = "sqlflow_tar"
            test_sub_dir = "sqlflow_sub_dir"
            test_py_file = "hello.py"
            test_py_content = "print('hello SQLFlow!')"

            fullpath = os.path.join(test_dir, test_sub_dir)
            os.makedirs(fullpath)
            with open(os.path.join(fullpath, test_py_file), "w") as f:
                f.write(test_py_content)

            zip_dir(fullpath, "sqlflow.tar.gz")
            unzip_dir("sqlflow.tar.gz", "output")
            self.assertTrue(
                os.path.isdir("output/sqlflow_tar/sqlflow_sub_dir"))
            self.assertTrue(
                os.path.isfile("output/sqlflow_tar/sqlflow_sub_dir/hello.py"))
            # Read back the extracted copy to verify the round trip.
            with open("output/sqlflow_tar/sqlflow_sub_dir/hello.py",
                      "r") as f:
                self.assertEqual(f.read(), test_py_content)
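
zip_dir and unzip_dir are SQLFlow helpers that are not shown here. A plausible sketch of their behavior on top of the standard library, consistent with the assertions above (the extracted tree keeps the archived path prefix):

import tarfile


def zip_dir(src_dir, tarball):
    # Archive src_dir, keeping its relative path prefix in the tarball.
    with tarfile.open(tarball, "w:gz") as tar:
        tar.add(src_dir)


def unzip_dir(tarball, dest_dir):
    # Extract everything under dest_dir, preserving the stored paths.
    with tarfile.open(tarball, "r:gz") as tar:
        tar.extractall(dest_dir)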
Example 6
    def load_from_db(datasource, table, local_dir=None):
        """
        Load the saved model from the DBMS and unzip it into local_dir.

        Args:
            datasource (str): the connection string to the DBMS.
            table (str): the table name under which the model is saved
                in the DBMS.
            local_dir (str): the local directory to unzip into.

        Returns:
            Model: a Model object representing the model type and meta
            information.
        """
        if local_dir is None:
            local_dir = os.getcwd()

        model_zoo_addr, table, tag = _decompose_model_name(table)
        if model_zoo_addr:
            gen, metadata = load_model_from_model_zoo(model_zoo_addr, table,
                                                      tag)
        else:
            gen, metadata = read_with_generator_and_metadata(datasource, table)

        with temp_file.TemporaryDirectory() as tmp_dir:
            tarball = os.path.join(tmp_dir, TARBALL_NAME)
            with open(tarball, "wb") as f:
                for data in gen():
                    f.write(bytes(data))

            Model._unzip(local_dir, tarball, load_from_db=True)

        return Model._from_dict(metadata)
Example 7
    def save_to_db(self, datasource, table, local_dir=None):
        """
        This save function archives all the files under local_dir
        into a tarball and saves it into the DBMS under the specified
        table name.

        Args:
            datasource (str): the connection string to the DBMS.
            table (str): the table name to save under.
            local_dir (str): the local directory to archive.

        Returns:
            None.
        """
        if local_dir is None:
            local_dir = os.getcwd()

        with temp_file.TemporaryDirectory() as tmp_dir:
            tarball = os.path.join(tmp_dir, TARBALL_NAME)
            self._zip(local_dir, tarball)

            def _bytes_reader(filename, buf_size=8 * 32):
                def _gen():
                    with open(filename, "rb") as f:
                        while True:
                            data = f.read(buf_size)
                            if data:
                                yield data
                            else:
                                break

                return _gen

            write_with_generator(datasource, table, _bytes_reader(tarball))
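
The _bytes_reader closure above is a generic chunked-read pattern: it returns a generator factory, so the DB writer can restart iteration from the beginning if it needs to. A self-contained demonstration using only the standard library:

import os
import tempfile


def bytes_reader(filename, buf_size=8 * 32):
    def _gen():
        with open(filename, "rb") as f:
            while True:
                data = f.read(buf_size)
                if not data:
                    break
                yield data

    return _gen


# Round-trip check: the chunks reassemble to the original payload.
with tempfile.NamedTemporaryFile(delete=False) as f:
    f.write(b"x" * 1000)
    name = f.name
assert b"".join(bytes_reader(name)()) == b"x" * 1000
os.remove(name)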
Example 8
def print_oss_image(oss_dest, oss_ak, oss_sk, oss_endpoint, oss_bucket_name):
    auth = oss2.Auth(oss_ak, oss_sk)
    bucket = oss2.Bucket(auth, oss_endpoint, oss_bucket_name)

    with temp_file.TemporaryDirectory(as_cwd=True):
        local_file_name = "summary.png"
        bucket.get_object_to_file(oss_dest, local_file_name)
        print_image_as_base64_html(local_file_name)
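
print_image_as_base64_html is a SQLFlow helper; judging from Example 9 below, it presumably base64-encodes the PNG and prints it as an inline <img> tag. A minimal sketch under that assumption:

import base64


def print_image_as_base64_html(path):
    # Read the image, base64-encode it, and print it as inline HTML.
    with open(path, "rb") as f:
        img = base64.b64encode(f.read()).decode("utf-8")
    print("<div align='center'>"
          "<img src='data:image/png;base64,%s' /></div>" % img)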
Example 9
def shap_explain(booster, datasource, dataset, summary_params, result_table):

    tree_explainer = shap.TreeExplainer(booster)
    shap_values = tree_explainer.shap_values(dataset)
    if result_table:
        conn = db.connect_with_data_source(datasource)
        # TODO(typhoonzero): shap_values may be a list of shape
        # [3, num_samples, num_features]; we use the first dimension
        # here and should find out when to use the other two. When
        # shap_values is not a list it can be used directly.
        if isinstance(shap_values, list):
            to_write = shap_values[0]
        else:
            to_write = shap_values

        columns = list(dataset.columns)
        dtypes = [DataType.to_db_field_type(conn.driver, DataType.FLOAT32)
                  ] * len(columns)
        _create_table(conn, result_table, columns, dtypes)
        with db.buffered_db_writer(conn, result_table, columns) as w:
            for row in to_write:
                w.write(list(row))

        conn.close()

    if summary_params.get("plot_type") == "decision":
        shap_interaction_values = tree_explainer.shap_interaction_values(
            dataset)
        expected_value = tree_explainer.expected_value
        if isinstance(shap_interaction_values, list):
            shap_interaction_values = shap_interaction_values[0]
        if isinstance(expected_value, list):
            expected_value = expected_value[0]

        plot_func = lambda: shap.decision_plot(  # noqa: E731
            expected_value,
            shap_interaction_values,
            dataset,
            show=False,
            feature_display_range=slice(None, -40, -1),
            alpha=1)
    else:
        plot_func = lambda: shap.summary_plot(  # noqa: E731
            shap_values, dataset, show=False, **summary_params)

    filename = 'summary.png'
    with temp_file.TemporaryDirectory(as_cwd=True):
        explainer.plot_and_save(plot_func, filename=filename)
        with open(filename, 'rb') as f:
            img = f.read()

    img = base64.b64encode(img)
    if six.PY3:
        img = img.decode('utf-8')
    img = "<div align='center'><img src='data:image/png;base64,%s' /></div>" \
          % img
    print(img)
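
For context, the SHAP calls above can be exercised end to end on synthetic data. This sketch assumes only that xgboost, shap, numpy, and pandas are installed:

import numpy as np
import pandas as pd
import shap
import xgboost as xgb

# Train a tiny booster on random data.
X = pd.DataFrame(np.random.rand(100, 4), columns=list("abcd"))
y = np.random.randint(0, 2, size=100)
booster = xgb.train({"objective": "binary:logistic"},
                    xgb.DMatrix(X, label=y), num_boost_round=5)

# shap_values may be an ndarray, or a list with one entry per class.
tree_explainer = shap.TreeExplainer(booster)
shap_values = tree_explainer.shap_values(X)
shap.summary_plot(shap_values, X, show=False)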
Example 10
    def save_to_db(self,
                   datasource,
                   table,
                   local_dir=None,
                   oss_model_dir=None):
        """
        This save function archives all the files under local_dir
        into a tarball and saves it into the DBMS under the specified
        table name.

        Args:
            datasource (str): the connection string to the DBMS.
            table (str): the table name to save under.
            local_dir (str): the local directory to archive.
            oss_model_dir (str): an optional OSS directory; if set, its
                contents are downloaded into local_dir before archiving.

        Returns:
            str: the full "db.table" name the model was saved to.
        """
        if local_dir is None:
            local_dir = os.getcwd()

        conn = connect_with_data_source(datasource)

        if oss_model_dir:
            cur_dir = os.getcwd()
            os.chdir(local_dir)
            oss.load_dir(oss_model_dir)
            os.chdir(cur_dir)

        if "." not in table:
            project_name = conn.param("database")
            table = project_name + "." + table

        with temp_file.TemporaryDirectory() as tmp_dir:
            tarball = os.path.join(tmp_dir, TARBALL_NAME)
            self._zip(local_dir, tarball)

            def _bytes_reader(filename, buf_size=8 * 32):
                def _gen():
                    with open(filename, "rb") as f:
                        while True:
                            data = f.read(buf_size)
                            if data:
                                yield data
                            else:
                                break

                return _gen

            write_with_generator_and_metadata(datasource, table,
                                              _bytes_reader(tarball),
                                              self._to_dict())

        conn.persist_table(table)
        conn.close()
        return table
Example 11
    def save_to_oss(self, oss_model_dir, local_dir=None):
        """
        This save function archives all the files under local_dir
        into a tarball and saves it to the OSS model directory.

        Args:
            oss_model_dir (str): the OSS model directory to save to.
                It is in the format of oss://bucket/path/to/dir/.
            local_dir (str): the local directory to archive.

        Returns:
            None.
        """
        if local_dir is None:
            local_dir = os.getcwd()

        with temp_file.TemporaryDirectory() as tmp_dir:
            tarball = os.path.join(tmp_dir, TARBALL_NAME)
            self._zip(local_dir, tarball)
            oss.save_file(oss_model_dir, tarball, TARBALL_NAME)
Example 12
    def load_from_oss(oss_model_dir, local_dir=None):
        """
        Load the saved model from OSS and unzip it into local_dir.

        Args:
            oss_model_dir (str): the OSS model directory to load from.
                It is in the format of oss://bucket/path/to/dir/.
            local_dir (str): the local directory to unzip into.

        Returns:
            Model: a Model object representing the model type and meta
            information.
        """
        if local_dir is None:
            local_dir = os.getcwd()

        with temp_file.TemporaryDirectory() as tmp_dir:
            tarball = os.path.join(tmp_dir, TARBALL_NAME)
            oss.load_file(oss_model_dir, tarball, TARBALL_NAME)
            return Model._unzip(local_dir, tarball)
Example 13
 def load_from_oss():
     with temp_file.TemporaryDirectory() as d:
         return Model.load_from_oss(oss_model_path, d)
Example 14
def submit_pai_explain(datasource,
                       original_sql,
                       select,
                       model,
                       model_params,
                       result_table,
                       explainer="TreeExplainer",
                       user=""):
    """This function pack need params and resource to a tarball
    and submit a explain task to PAI

    Args:
        datasource: string
            Like: maxcompute://ak:[email protected]/api?
                  curr_project=test_ci&scheme=http
        original_sql: string
            Original "TO EXPLAIN" statement.
        select: string
            SQL statement to get prediction data set.
        model: string
            Model to load and do prediction.
        model_params: dict
            Params for the explain task, corresponding to the WITH
            clause.
        result_table: string
            The table name to save prediction result.
        user: string
            A string to identify the user, used to load model from the user's
            directory.
    """
    params = dict(locals())

    # format resultTable name to "db.table" to let the codegen form a
    # submitting argument of format "odps://project/tables/table_name"
    project = table_ops.get_project(datasource)
    if result_table:
        if result_table.count(".") == 0:
            result_table = "%s.%s" % (project, result_table)
        params["result_table"] = result_table

    # used to save the explain image
    timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    params["oss_dest"] = "explain_images/%s/%s" % (user, timestamp)
    add_env_to_params(params, "SQLFLOW_OSS_AK", "oss_ak")
    add_env_to_params(params, "SQLFLOW_OSS_SK", "oss_sk")
    add_env_to_params(params, "SQLFLOW_OSS_ALISA_ENDPOINT", "oss_endpoint")
    add_env_to_params(params, "SQLFLOW_OSS_ALISA_BUCKET", "oss_bucket_name")

    meta = Model.load_metadata_from_db(datasource, model)
    model_type = meta.get_type()
    estimator = meta.get_meta("class_name")
    label_name = model_params.get("label_col")
    if label_name is None:
        label_column = meta.get_meta("label")
        if label_column is not None:
            label_name = label_column.get_field_desc()[0].name

    setup_explain_entry(params, model_type)

    oss_model_path = pai_model.get_oss_model_save_path(datasource,
                                                       model,
                                                       user=user)

    # TODO(typhoonzero): Do **NOT** create tmp table when the select statement
    # is like: "SELECT fields,... FROM table"
    with table_ops.create_tmp_tables_guard(select, datasource) as data_table:
        params["pai_table"] = data_table

        # Create explain result table
        if result_table:
            conn = db.connect_with_data_source(datasource)
            feature_columns = meta.get_meta("features")
            estimator_string = meta.get_meta("class_name")
            field_descs = get_ordered_field_descs(feature_columns)
            feature_column_names = [fd.name for fd in field_descs]
            create_explain_table(conn, meta.get_type(), explainer,
                                 estimator_string, result_table,
                                 feature_column_names)
            conn.close()

        if not try_pai_local_run(params, oss_model_path):
            with temp_file.TemporaryDirectory(prefix="sqlflow",
                                              dir="/tmp") as cwd:
                prepare_archive(cwd, estimator, oss_model_path, params)
                cmd = get_pai_explain_cmd(
                    datasource, project, oss_model_path, model, data_table,
                    result_table, model_type, model_params,
                    "file://" + os.path.join(cwd, JOB_ARCHIVE_FILE),
                    "file://" + os.path.join(cwd, PARAMS_FILE), label_name)
                submit_pai_task(cmd, datasource)

    if result_table:
        print('Saved result into: {}'.format(result_table))
    else:
        print_oss_image(params["oss_dest"], params["oss_ak"], params["oss_sk"],
                        params["oss_endpoint"], params["oss_bucket_name"])
Example 15
    def test_main(self):
        ds = testing.get_datasource()
        original_sql = """SELECT * FROM iris.train
        TO TRAIN xgboost.gbtree
        WITH
            objective="multi:softprob",
            num_boost_round=20,
            num_class=3,
            validation.select="SELECT * FROM iris.test"
        LABEL class
        INTO iris.xgboost_train_model_test;
        """

        select = "SELECT * FROM iris.train"
        val_select = "SELECT * FROM iris.test"
        train_params = {"num_boost_round": 20}
        model_params = {"num_class": 3, "objective": "multi:softprob"}
        save_name = "iris.xgboost_train_model_test"
        class_name = "class"

        with temp_file.TemporaryDirectory(as_cwd=True):
            eval_result = train(datasource=ds,
                                original_sql=original_sql,
                                select=select,
                                validation_select=val_select,
                                estimator_string="xgboost.gbtree",
                                model_image="sqlflow:step",
                                feature_column_map=None,
                                label_column=NumericColumn(
                                    FieldDesc(name=class_name)),
                                model_params=model_params,
                                train_params=train_params,
                                validation_params=None,
                                save=save_name,
                                load=None)

        self.assertLess(eval_result['train']['merror'][-1], 0.01)
        self.assertLess(eval_result['validate']['merror'][-1], 0.01)

        conn = db.connect_with_data_source(ds)
        pred_select = "SELECT * FROM iris.test"

        with temp_file.TemporaryDirectory(as_cwd=True):
            result_column_names, train_label_idx = create_predict_table(
                conn, select, "iris.predict_result_table",
                FieldDesc(name=class_name), "class")
            predict(ds, pred_select, "iris.predict_result_table",
                    result_column_names, train_label_idx, save_name)

        self.assertEqual(
            self.get_table_row_count(conn, "iris.test"),
            self.get_table_row_count(conn, "iris.predict_result_table"))

        schema1 = self.get_table_schema(conn, "iris.test")
        schema2 = self.get_table_schema(conn, "iris.predict_result_table")
        self.assertEqual(len(schema1), len(schema2))
        for name in schema1:
            if name == 'class':
                self.assertEqual(schema2[name], "BIGINT")
                continue

            self.assertTrue(name in schema2)
            self.assertEqual(schema1[name], schema2[name])

        diff_schema = schema2.keys() - schema1.keys()
        self.assertEqual(len(diff_schema), 0)

        with temp_file.TemporaryDirectory(as_cwd=True):
            result_column_names = create_evaluate_table(
                conn, "iris.evaluate_result_table", ["accuracy_score"])
            evaluate(ds,
                     pred_select,
                     "iris.evaluate_result_table",
                     save_name,
                     label_name='class',
                     model_params={'validation.metrics': 'accuracy_score'},
                     result_column_names=result_column_names)

        eval_schema = self.get_table_schema(conn, "iris.evaluate_result_table")
        self.assertEqual(eval_schema.keys(), set(['loss', 'accuracy_score']))

        with temp_file.TemporaryDirectory(as_cwd=True):
            feature_column_names = [
                "petal_width", "petal_length", "sepal_width", "sepal_length"
            ]
            create_explain_table(conn, EstimatorType.XGBOOST, "TreeExplainer",
                                 "xgboost.gbtree", "iris.explain_result_table",
                                 feature_column_names)
            explain(ds, select, "TreeExplainer", {"plot_type": "decision"},
                    "iris.explain_result_table", save_name)

        explain_schema = self.get_table_schema(conn,
                                               "iris.explain_result_table")
        self.assertEqual(explain_schema.keys(), set(feature_column_names))

        with temp_file.TemporaryDirectory(as_cwd=True):
            create_explain_table(conn, EstimatorType.XGBOOST,
                                 "XGBoostExplainer", "xgboost.gbtree",
                                 "iris.explain_result_table_2",
                                 feature_column_names)
            explain(ds, select, "XGBoostExplainer", {},
                    "iris.explain_result_table_2", save_name)

        explain_schema = self.get_table_schema(conn,
                                               "iris.explain_result_table_2")
        self.assertEqual(explain_schema.keys(),
                         set(['feature', 'fscore', 'gain']))
        conn.close()
Example 16
def pred(datasource, select, result_table, pred_label_name, model):
    """
    Do prediction using a trained model.

    Args:
        datasource (str): the database connection string.
        select (str): the input data to predict.
        result_table (str): the output data table.
        pred_label_name (str): the output label name to predict.
        model (Model|str): the model object or where to load the model.

    Returns:
        None.
    """
    if isinstance(model, six.string_types):
        model = Model.load_from_db(datasource, model)
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)

    model_params = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    train_label_desc = model.get_meta("label").get_field_desc()[0]

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])

    # NOTE: in the current implementation, we are generating a transform_fn
    # from the COLUMN clause. The transform_fn is executed during the process
    # of dumping the original data into DMatrix SVM file.
    compiled_fc = compile_ir_feature_columns(train_fc_map, model.get_type())
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *compiled_fc["feature_columns"])

    bst = xgb.Booster()
    bst.load_model("my_model")

    conn = db.connect_with_data_source(datasource)
    result_column_names, train_label_idx = create_predict_table(
        conn, select, result_table, train_label_desc, pred_label_name)

    with temp_file.TemporaryDirectory() as tmp_dir_name:
        pred_fn = os.path.join(tmp_dir_name, "predict.txt")
        raw_data_dir = os.path.join(tmp_dir_name, "predict_raw_dir")

        dpred = xgb_dataset(
            datasource=datasource,
            fn=pred_fn,
            dataset_sql=select,
            feature_metas=feature_metas,
            feature_column_names=feature_column_names,
            label_meta=None,
            cache=True,
            batch_size=10000,
            transform_fn=transform_fn,
            raw_data_dir=raw_data_dir)  # NOTE: default to use external memory

        print("Start predicting XGBoost model...")
        for idx, pred_dmatrix in enumerate(dpred):
            feature_file_name = os.path.join(
                tmp_dir_name, "predict_raw_dir/predict.txt_%d" % idx)
            preds = _calc_predict_result(bst, pred_dmatrix, model_params)
            _store_predict_result(preds, result_table, result_column_names,
                                  train_label_idx, feature_file_name, conn)
        print("Done predicting. Predict table : %s" % result_table)

    conn.close()
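
_calc_predict_result is defined elsewhere in SQLFlow. A hypothetical reconstruction, assuming it reduces per-class probabilities to class ids for multi:softprob while passing multi:softmax output through unchanged:

import numpy as np


def calc_predict_result(bst, dmatrix, model_params):
    # Hypothetical sketch: booster.predict emits per-class
    # probabilities for multi:softprob, so take the argmax.
    preds = bst.predict(dmatrix)
    if model_params.get("objective") == "multi:softprob":
        preds = np.argmax(np.array(preds), axis=1)
    return preds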
Example 17
def submit_pai_evaluate(datasource,
                        original_sql,
                        select,
                        label_name,
                        model,
                        model_params,
                        result_table,
                        user=""):
    """Submit a PAI evaluation task

    Args:
        datasource: string
            Like: maxcompute://ak:[email protected]/api?
                  curr_project=test_ci&scheme=http
        original_sql: string
            Original "TO EVALUATE" statement.
        select: string
            SQL statement to get the evaluation data set.
        model: string
            Model to load and evaluate.
        label_name: string
            The label name to evaluate.
        model_params: dict
            Params for the evaluation, corresponding to the WITH
            clause.
        result_table: string
            The table name to save the evaluation result.
        user: string
            A string to identify the user, used to load model from the user's
            directory.
    """

    params = dict(locals())
    project = table_ops.get_project(datasource)
    if result_table.count(".") == 0:
        result_table = "%s.%s" % (project, result_table)
    params["result_table"] = result_table

    oss_model_path = pai_model.get_oss_model_save_path(datasource,
                                                       model,
                                                       user=user)

    model_type, estimator = pai_model.get_saved_model_type_and_estimator(
        datasource, model)
    if model_type == EstimatorType.PAIML:
        raise SQLFlowDiagnostic("PAI model evaluation is not supported yet.")

    if model_type == EstimatorType.XGBOOST:
        params["entry_type"] = "evaluate_xgb"
        validation_metrics = model_params.get("validation.metrics",
                                              "accuracy_score")
    else:
        params["entry_type"] = "evaluate_tf"
        validation_metrics = model_params.get("validation.metrics", "Accuracy")

    validation_metrics = [m.strip() for m in validation_metrics.split(",")]
    with db.connect_with_data_source(datasource) as conn:
        result_column_names = create_evaluate_table(conn, result_table,
                                                    validation_metrics)

    with table_ops.create_tmp_tables_guard(select, datasource) as data_table:
        params["pai_table"] = data_table
        params["result_column_names"] = result_column_names

        if try_pai_local_run(params, oss_model_path):
            return

        conf = cluster_conf.get_cluster_config(model_params)
        with temp_file.TemporaryDirectory(prefix="sqlflow", dir="/tmp") as cwd:
            prepare_archive(cwd, estimator, oss_model_path, params)
            cmd = get_pai_tf_cmd(
                conf, "file://" + os.path.join(cwd, JOB_ARCHIVE_FILE),
                "file://" + os.path.join(cwd, PARAMS_FILE), ENTRY_FILE, model,
                oss_model_path, data_table, "", result_table, project)
            submit_pai_task(cmd, datasource)
Example 18
def predict(datasource,
            select,
            result_table,
            result_column_names,
            train_label_idx,
            model,
            extra_result_cols=[],
            pai_table=None):
    """TBD
    """
    bst = xgb.Booster()
    if isinstance(model, six.string_types):
        # NOTE(typhoonzero): must run Model.load_from_db in a temp
        # directory, calling pyodps in current directory on PAI
        # workers will cause paiio fails.
        with temp_file.TemporaryDirectory(as_cwd=True):
            model = Model.load_from_db(datasource, model)
            bst.load_model("my_model")
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)
        bst.load_model("my_model")

    model_params = model.get_meta("attributes")
    fc_map_ir = model.get_meta("features")
    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.XGBOOST)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])

    transform_fn = ComposedColumnTransformer(
        feature_column_names, *feature_columns["feature_columns"])

    is_pai = True if pai_table else False
    if is_pai:
        conn = PaiIOConnection.from_table(pai_table)
    else:
        conn = db.connect_with_data_source(datasource)

    with temp_file.TemporaryDirectory() as tmp_dir_name:
        pred_fn = os.path.join(tmp_dir_name, "predict.txt")
        raw_data_dir = os.path.join(tmp_dir_name, "predict_raw_dir")

        dpred = xgb_dataset(datasource=datasource,
                            fn=pred_fn,
                            dataset_sql=select,
                            feature_metas=feature_metas,
                            feature_column_names=feature_column_names,
                            label_meta=None,
                            cache=True,
                            batch_size=10000,
                            transform_fn=transform_fn,
                            raw_data_dir=raw_data_dir,
                            is_pai=is_pai,
                            pai_table=pai_table,
                            pai_single_file=True,
                            feature_column_code=fc_map_ir)

        print("Start predicting XGBoost model...")
        for idx, pred_dmatrix in enumerate(dpred):
            if is_pai:
                feature_file_name = os.path.join(tmp_dir_name,
                                                 "predict.txt.raw")
            else:
                feature_file_name = os.path.join(
                    tmp_dir_name, "predict_raw_dir/predict.txt_%d" % idx)
            preds = _calc_predict_result(bst, pred_dmatrix, model_params)
            _store_predict_result(preds, result_table, result_column_names,
                                  train_label_idx, feature_file_name, conn)
        print("Done predicting. Predict table : %s" % result_table)

    conn.close()
Example 19
def submit_pai_predict(datasource,
                       original_sql,
                       select,
                       model,
                       label_name,
                       pred_params,
                       result_table,
                       user=""):
    """This function pack needed params and resource to a tarball
    and submit a prediction task to PAI

    Args:
        datasource: string
            Like: maxcompute://ak:[email protected]/api?
                  curr_project=test_ci&scheme=http
        original_sql: string
            Original "TO PREDICT" statement.
        select: string
            SQL statement to get prediction data set.
        model: string
            Model to load and do prediction.
        label_name: string
            Name of the label column, if it does not exist in the
            select statement.
        pred_params: dict
            Params for the prediction, corresponding to the WITH
            clause.
        result_table: string
            The table name to save prediction result.
        user: string
            A string to identify the user, used to load model from the user's
            directory.
    """
    params = dict(locals())

    # format resultTable name to "db.table" to let the codegen form a
    # submitting argument of format "odps://project/tables/table_name"
    project = table_ops.get_project(datasource)
    if result_table.count(".") == 0:
        result_table = "%s.%s" % (project, result_table)

    model_metas = Model.load_metadata_from_db(datasource, model)
    model_type = model_metas.get_type()
    estimator = model_metas.get_meta("class_name")
    setup_predict_entry(params, model_type)

    train_label = model_metas.get_meta("label")
    if train_label is not None:
        train_label_desc = train_label.get_field_desc()[0]
    else:
        train_label_desc = None

    if pred_params is None:
        extra_result_cols = []
    else:
        extra_result_cols = pred_params.get("predict.extra_outputs", "")
        extra_result_cols = [
            c.strip() for c in extra_result_cols.split(",") if c.strip()
        ]

    with db.connect_with_data_source(datasource) as conn:
        result_column_names, train_label_idx = create_predict_table(
            conn, select, result_table, train_label_desc, label_name,
            extra_result_cols)

    oss_model_path = pai_model.get_oss_model_save_path(datasource,
                                                       model,
                                                       user=user)

    # TODO(typhoonzero): Do **NOT** create tmp table when the select statement
    # is like: "SELECT fields,... FROM table"
    with table_ops.create_tmp_tables_guard(select, datasource) as data_table:
        del params["label_name"]
        params["pai_table"] = data_table
        params["result_column_names"] = result_column_names
        params["train_label_idx"] = train_label_idx
        params["extra_result_cols"] = extra_result_cols

        if try_pai_local_run(params, oss_model_path):
            return

        with temp_file.TemporaryDirectory(prefix="sqlflow", dir="/tmp") as cwd:
            prepare_archive(cwd, estimator, oss_model_path, params)

            cmd = get_pai_predict_cmd(
                datasource, project, oss_model_path, model, data_table,
                result_table, model_type, pred_params,
                "file://" + os.path.join(cwd, JOB_ARCHIVE_FILE),
                "file://" + os.path.join(cwd, PARAMS_FILE))
            submit_pai_task(cmd, datasource)
Example 20
 def save_to_oss():
     with temp_file.TemporaryDirectory() as d:
         m.save_to_oss(oss_model_path, d)
Example 21
def evaluate(datasource,
             select,
             result_table,
             model,
             pred_label_name=None,
             model_params=None):
    """
    Evaluate a trained XGBoost model.

    Args:
        datasource (str): the database connection string.
        select (str): the input data to evaluate.
        result_table (str): the output data table.
        model (Model|str): the model object or where to load the model.
        pred_label_name (str): the label column name.
        model_params (dict): the parameters for evaluation.

    Returns:
        None.
    """
    if isinstance(model, six.string_types):
        model = Model.load_from_db(datasource, model)
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)

    if model_params is None:
        model_params = {}

    validation_metrics = model_params.get("validation.metrics", "Accuracy")
    validation_metrics = [m.strip() for m in validation_metrics.split(",")]

    model_params = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    train_label_desc = model.get_meta("label").get_field_desc()[0]
    if pred_label_name:
        train_label_desc.name = pred_label_name

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])

    # NOTE: in the current implementation, we are generating a transform_fn
    # from the COLUMN clause. The transform_fn is executed during the process
    # of dumping the original data into DMatrix SVM file.
    compiled_fc = compile_ir_feature_columns(train_fc_map, model.get_type())
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *compiled_fc["feature_columns"])

    bst = xgb.Booster()
    bst.load_model("my_model")
    conn = db.connect_with_data_source(datasource)

    result_column_names = create_evaluate_table(conn, result_table,
                                                validation_metrics)

    with temp_file.TemporaryDirectory() as tmp_dir_name:
        pred_fn = os.path.join(tmp_dir_name, "predict.txt")

        dpred = xgb_dataset(
            datasource=datasource,
            fn=pred_fn,
            dataset_sql=select,
            feature_metas=feature_metas,
            feature_column_names=feature_column_names,
            label_meta=train_label_desc.to_dict(dtype_to_string=True),
            cache=True,
            batch_size=10000,
            transform_fn=transform_fn)

        for i, pred_dmatrix in enumerate(dpred):
            feature_file_name = pred_fn + "_%d" % i
            preds = _calc_predict_result(bst, pred_dmatrix, model_params)
            _store_evaluate_result(preds, feature_file_name, train_label_desc,
                                   result_table, result_column_names,
                                   validation_metrics, conn)

    conn.close()
Example 22
def local_train(original_sql,
                model_image,
                estimator_string,
                datasource,
                select,
                validation_select,
                model_params,
                train_params,
                feature_metas,
                feature_column_names,
                feature_column_map,
                label_column,
                transform_fn,
                save,
                load="",
                is_pai=False,
                oss_model_dir=""):
    disk_cache = train_params.pop("disk_cache", False)
    batch_size = train_params.pop("batch_size", None)
    if batch_size is not None and batch_size < 0:
        batch_size = None

    epoch = train_params.pop("epoch", 1)
    num_workers = train_params.pop("num_workers", 1)
    label_meta_dict = label_column.get_field_desc()[0].to_dict(
        dtype_to_string=True)

    def build_dataset(fn, slct):
        return xgb_dataset(datasource,
                           fn,
                           slct,
                           feature_metas,
                           feature_column_names,
                           label_meta_dict,
                           cache=disk_cache,
                           batch_size=batch_size,
                           epoch=epoch,
                           transform_fn=transform_fn)

    file_name = "my_model"
    if load:
        Model.load_from_db(datasource, load)
        bst = xgb.Booster()
        bst.load_model(file_name)
    else:
        bst = None

    with temp_file.TemporaryDirectory() as tmp_dir_name:
        train_fn = os.path.join(tmp_dir_name, 'train.txt')
        val_fn = os.path.join(tmp_dir_name, 'val.txt')
        train_dataset = build_dataset(train_fn, select)
        if validation_select:
            val_dataset = build_dataset(val_fn, validation_select)
        else:
            val_dataset = None

        eval_result = dict()
        watchlist = [None]
        if val_dataset:
            # The `xgboost.train` API only accepts the XGBoost DMatrix
            # object as the training or validation dataset, so we should
            # convert the generator to DMatrix.
            if isinstance(val_dataset, types.GeneratorType):
                val_dataset = list(val_dataset)[0]
            watchlist.append((val_dataset, "validate"))

        for per_batch_dmatrix in train_dataset:
            watchlist[0] = (per_batch_dmatrix, "train")
            bst = xgb.train(model_params,
                            per_batch_dmatrix,
                            evals=watchlist,
                            evals_result=eval_result,
                            xgb_model=bst,
                            **train_params)
            print("Evaluation result: %s" % eval_result)

    meta = collect_metadata(original_sql=original_sql,
                            select=select,
                            validation_select=validation_select,
                            model_repo_image=model_image,
                            class_name=estimator_string,
                            attributes=model_params,
                            features=feature_column_map,
                            label=label_column,
                            evaluation=eval_result,
                            num_workers=num_workers)

    save_model_to_local_file(bst, model_params, file_name)
    model = Model(EstimatorType.XGBOOST, meta)
    model.save_to_db(datasource, save)
    if is_pai and len(oss_model_dir) > 0:
        # TODO(typhoonzero): remove this since we are saving metas into db now.
        save_model(oss_model_dir, "my_model", model_params, train_params,
                   feature_metas, feature_column_names, label_meta_dict,
                   feature_column_map)

    return eval_result
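
The training loop above relies on xgboost's ability to continue from an existing Booster via the xgb_model argument, one DMatrix batch at a time. A self-contained illustration on synthetic data:

import numpy as np
import xgboost as xgb

bst = None
for _ in range(3):
    X = np.random.rand(64, 4)
    y = np.random.randint(0, 2, size=64)
    dtrain = xgb.DMatrix(X, label=y)
    eval_result = dict()
    # Passing the previous Booster as xgb_model continues training
    # instead of starting from scratch.
    bst = xgb.train({"objective": "binary:logistic"}, dtrain,
                    num_boost_round=5, evals=[(dtrain, "train")],
                    evals_result=eval_result, xgb_model=bst)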
Example 23
    def test_main(self):
        ds = testing.get_datasource()
        original_sql = """SELECT * FROM iris.train
        TO TRAIN xgboost.gbtree
        WITH
            objective="multi:softmax",
            num_boost_round=20,
            num_class=3,
            validation.select="SELECT * FROM iris.test"
        INTO iris.xgboost_train_model_test;
        """

        select = "SELECT * FROM iris.train"
        val_select = "SELECT * FROM iris.test"
        train_params = {"num_boost_round": 20}
        model_params = {"num_class": 3, "objective": "multi:softmax"}
        save_name = "iris.xgboost_train_model_test"
        class_name = "class"

        with temp_file.TemporaryDirectory(as_cwd=True):
            eval_result = train(original_sql=original_sql,
                                model_image="sqlflow:step",
                                estimator_string="xgboost.gbtree",
                                datasource=ds,
                                select=select,
                                validation_select=val_select,
                                model_params=model_params,
                                train_params=train_params,
                                feature_column_map=None,
                                label_column=NumericColumn(
                                    FieldDesc(name=class_name)),
                                save=save_name)
            self.assertLess(eval_result['train']['merror'][-1], 0.01)
            self.assertLess(eval_result['validate']['merror'][-1], 0.01)

            conn = db.connect_with_data_source(ds)

            pred_select = "SELECT * FROM iris.test"
            pred(ds, pred_select, "iris.predict_result_table", class_name,
                 save_name)

            self.assertEqual(
                self.get_table_row_count(conn, "iris.test"),
                self.get_table_row_count(conn, "iris.predict_result_table"))

            schema1 = self.get_table_schema(conn, "iris.test")
            schema2 = self.get_table_schema(conn, "iris.predict_result_table")
            self.assertEqual(len(schema1), len(schema2))
            for name in schema1:
                if name == 'class':
                    self.assertEqual(schema2[name], "BIGINT")
                    continue

                self.assertTrue(name in schema2)
                self.assertEqual(schema1[name], schema2[name])

            diff_schema = schema2.keys() - schema1.keys()
            self.assertEqual(len(diff_schema), 0)

            evaluate(ds, pred_select, "iris.evaluate_result_table", save_name,
                     'class', ['accuracy_score'])
            eval_schema = self.get_table_schema(conn,
                                                "iris.evaluate_result_table")
            self.assertEqual(eval_schema.keys(),
                             set(['loss', 'accuracy_score']))
Example 24
def submit_pai_predict(datasource,
                       original_sql,
                       select,
                       model,
                       label_name,
                       model_params,
                       result_table,
                       user=""):
    """This function pack needed params and resource to a tarball
    and submit a prediction task to PAI

    Args:
        datasource: string
            Like: maxcompute://ak:[email protected]/api?
                  curr_project=test_ci&scheme=http
        original_sql: string
            Original "TO PREDICT" statement.
        select: string
            SQL statement to get prediction data set.
        model: string
            Model to load and do prediction.
        label_name: string
            Name of the label column, if it does not exist in the
            select statement.
        model_params: dict
            Params for the prediction, corresponding to the WITH
            clause.
        result_table: string
            The table name to save prediction result.
        user: string
            A string to identify the user, used to load model from the user's
            directory.
    """
    params = dict(locals())

    # format resultTable name to "db.table" to let the codegen form a
    # submitting argument of format "odps://project/tables/table_name"
    project = table_ops.get_project(datasource)
    if result_table.count(".") == 0:
        result_table = "%s.%s" % (project, result_table)

    model_type, estimator = \
        pai_model.get_saved_model_type_and_estimator(
            datasource, model)
    setup_predict_entry(params, model_type)

    oss_model_path = pai_model.get_oss_model_save_path(datasource,
                                                       model,
                                                       user=user)

    # TODO(typhoonzero): Do **NOT** create tmp table when the select statement
    # is like: "SELECT fields,... FROM table"
    with table_ops.create_tmp_tables_guard(select, datasource) as data_table:
        params["pai_table"] = data_table
        params["oss_model_path"] = oss_model_path
        params["model"] = ""

        if try_pai_local_run(params, oss_model_path):
            return

        with temp_file.TemporaryDirectory(prefix="sqlflow", dir="/tmp") as cwd:
            prepare_archive(cwd, estimator, oss_model_path, params)

            cmd = get_pai_predict_cmd(
                datasource, project, oss_model_path, model, data_table,
                result_table, model_type, model_params,
                "file://" + os.path.join(cwd, JOB_ARCHIVE_FILE),
                "file://" + os.path.join(cwd, PARAMS_FILE))
            submit_pai_task(cmd, datasource)
Example 25
def shap_explain(booster, datasource, select, summary_params, result_table,
                 model):
    train_fc_map = model.get_meta("features")
    label_meta = model.get_meta("label").get_field_desc()[0].to_dict(
        dtype_to_string=True)

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])

    # NOTE: in the current implementation, we are generating a transform_fn
    # from the COLUMN clause. The transform_fn is executed during the process
    # of dumping the original data into DMatrix SVM file.
    compiled_fc = compile_ir_feature_columns(train_fc_map, model.get_type())
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *compiled_fc["feature_columns"])

    dataset = xgb_shap_dataset(datasource, select, feature_column_names,
                               label_meta, feature_metas, transform_fn)

    tree_explainer = shap.TreeExplainer(booster)
    shap_values = tree_explainer.shap_values(dataset)
    if result_table:
        conn = db.connect_with_data_source(datasource)
        # TODO(typhoonzero): shap_values may be a list of shape
        # [3, num_samples, num_features]; we use the first dimension
        # here and should find out when to use the other two. When
        # shap_values is not a list it can be used directly.
        if isinstance(shap_values, list):
            to_write = shap_values[0]
        else:
            to_write = shap_values

        columns = list(dataset.columns)
        dtypes = [DataType.to_db_field_type(conn.driver, DataType.FLOAT32)
                  ] * len(columns)
        _create_table(conn, result_table, columns, dtypes)
        with db.buffered_db_writer(conn, result_table, columns) as w:
            for row in to_write:
                w.write(list(row))

        conn.close()

    if summary_params.get("plot_type") == "decision":
        shap_interaction_values = tree_explainer.shap_interaction_values(
            dataset)
        expected_value = tree_explainer.expected_value
        if isinstance(shap_interaction_values, list):
            shap_interaction_values = shap_interaction_values[0]
        if isinstance(expected_value, list):
            expected_value = expected_value[0]

        plot_func = lambda: shap.decision_plot(  # noqa: E731
            expected_value,
            shap_interaction_values,
            dataset,
            show=False,
            feature_display_range=slice(None, -40, -1),
            alpha=1)
    else:
        plot_func = lambda: shap.summary_plot(  # noqa: E731
            shap_values, dataset, show=False, **summary_params)

    filename = 'summary.png'
    with temp_file.TemporaryDirectory(as_cwd=True):
        explainer.plot_and_save(plot_func, filename=filename)
        with open(filename, 'rb') as f:
            img = f.read()

    img = base64.b64encode(img)
    if six.PY3:
        img = img.decode('utf-8')
    img = "<div align='center'><img src='data:image/png;base64,%s' /></div>" \
          % img
    print(img)
Example 26
def evaluate(datasource,
             select,
             result_table,
             model,
             label_name=None,
             model_params=None,
             result_column_names=[],
             pai_table=None):
    """TBD
    """
    if model_params is None:
        model_params = {}
    validation_metrics = model_params.get("validation.metrics",
                                          "accuracy_score")
    validation_metrics = [m.strip() for m in validation_metrics.split(",")]

    bst = xgb.Booster()
    if isinstance(model, six.string_types):
        with temp_file.TemporaryDirectory(as_cwd=True):
            model = Model.load_from_db(datasource, model)
            bst.load_model("my_model")
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)
        bst.load_model("my_model")

    model_params = model.get_meta("attributes")
    fc_map_ir = model.get_meta("features")
    train_label = model.get_meta("label")
    train_label_desc = train_label.get_field_desc()[0]

    if label_name:
        train_label_desc.name = label_name

    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.XGBOOST)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])
    transform_fn = ComposedColumnTransformer(
        feature_column_names, *feature_columns["feature_columns"])

    is_pai = True if pai_table else False
    if is_pai:
        conn = PaiIOConnection.from_table(pai_table)
    else:
        conn = db.connect_with_data_source(datasource)

    with temp_file.TemporaryDirectory() as tmp_dir_name:
        pred_fn = os.path.join(tmp_dir_name, "predict.txt")

        dpred = xgb_dataset(
            datasource=datasource,
            fn=pred_fn,
            dataset_sql=select,
            feature_metas=feature_metas,
            feature_column_names=feature_column_names,
            label_meta=train_label_desc.to_dict(dtype_to_string=True),
            cache=True,
            batch_size=10000,
            transform_fn=transform_fn,
            is_pai=is_pai,
            pai_table=pai_table,
            pai_single_file=True,
            feature_column_code=fc_map_ir)

        for i, pred_dmatrix in enumerate(dpred):
            if is_pai:
                feature_file_name = pred_fn
            else:
                feature_file_name = pred_fn + "_%d" % i
            preds = _calc_predict_result(bst, pred_dmatrix, model_params)
            _store_evaluate_result(preds, feature_file_name, train_label_desc,
                                   result_table, result_column_names,
                                   validation_metrics, conn)

    conn.close()
Example 27
def explain(datasource,
            select,
            explainer,
            model_params,
            result_table,
            model,
            pai_table="",
            oss_model_path="",
            oss_dest=None,
            oss_ak=None,
            oss_sk=None,
            oss_endpoint=None,
            oss_bucket_name=None):
    """TBD
    """
    if model_params is None:
        model_params = {}

    summary_params = dict()
    for k in model_params:
        if k.startswith("summary."):
            summary_key = k.replace("summary.", "")
            summary_params[summary_key] = model_params[k]

    bst = xgb.Booster()
    if isinstance(model, six.string_types):
        with temp_file.TemporaryDirectory(as_cwd=True):
            model = Model.load_from_db(datasource, model)
            bst.load_model("my_model")
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)
        bst.load_model("my_model")

    fc_map_ir = model.get_meta("features")
    label_meta = model.get_meta("label").get_field_desc()[0].to_dict(
        dtype_to_string=True)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])

    is_pai = True if pai_table else False
    # NOTE: in the current implementation, we are generating a transform_fn
    # from the COLUMN clause. The transform_fn is executed during the process
    # of dumping the original data into DMatrix SVM file.
    compiled_fc = compile_ir_feature_columns(fc_map_ir, EstimatorType.XGBOOST)
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *compiled_fc["feature_columns"])

    dataset = xgb_shap_dataset(datasource, select, feature_column_names,
                               label_meta, feature_metas, is_pai, pai_table,
                               transform_fn)

    if explainer == "XGBoostExplainer":
        xgb_native_explain(bst, datasource, result_table)
    else:
        # When explainer is "" or "TreeExplainer", use SHAP by default.
        shap_explain(bst,
                     datasource,
                     dataset,
                     summary_params,
                     result_table,
                     is_pai=is_pai,
                     oss_dest=oss_dest,
                     oss_ak=oss_ak,
                     oss_sk=oss_sk,
                     oss_endpoint=oss_endpoint,
                     oss_bucket_name=oss_bucket_name)
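
shap_explain above is a SQLFlow helper; at its core, SHAP's TreeExplainer
works directly on a trained Booster. The following is a minimal sketch of
that idea, assuming the shap package is installed and the dataset is a
pandas DataFrame as presumably produced by xgb_shap_dataset; it is not the
SQLFlow implementation itself.

import shap

def shap_summary(bst, X, plot_type="bar"):
    # TreeExplainer computes SHAP values natively for tree ensembles
    # such as XGBoost boosters.
    explainer = shap.TreeExplainer(bst)
    shap_values = explainer.shap_values(X)
    # plot_type corresponds to the summary.plot_type attribute parsed
    # from model_params above.
    shap.summary_plot(shap_values, X, plot_type=plot_type)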
Example no. 28
    def check_main_impl(self, estimator):
        if testing.get_driver() != "mysql":
            return

        ds = testing.get_datasource()
        original_sql = """SELECT * FROM iris.train
        TO TRAIN %s
        WITH
            model.hidden_units=[32,64],
            model.n_classes=3,
            validation.select="SELECT * FROM iris.test"
        LABEL class
        INTO iris.tensorflow_train_model_test;
        """ % estimator

        select = "SELECT * FROM iris.train"
        val_select = "SELECT * FROM iris.test"
        train_params = {"batch_size": 10}
        model_params = {"n_classes": 3, "hidden_units": [32, 64]}
        save_name = "iris.tensorflow_train_model_test"
        class_name = "class"

        with temp_file.TemporaryDirectory(as_cwd=True):
            train(original_sql=original_sql,
                  model_image="sqlflow:step",
                  estimator_string=estimator,
                  datasource=ds,
                  select=select,
                  validation_select=val_select,
                  model_params=model_params,
                  train_params=train_params,
                  validation_params=None,
                  feature_column_map=None,
                  label_column=NumericColumn(
                      FieldDesc(name=class_name, shape=[])),
                  save=save_name,
                  load=None)

        conn = db.connect_with_data_source(ds)

        pred_select = "SELECT * FROM iris.test"

        with temp_file.TemporaryDirectory(as_cwd=True):
            pred(ds, pred_select, "iris.predict_result_table", class_name,
                 save_name)

        self.assertEqual(
            self.get_table_row_count(conn, "iris.test"),
            self.get_table_row_count(conn, "iris.predict_result_table"))

        schema1 = self.get_table_schema(conn, "iris.test")
        schema2 = self.get_table_schema(conn, "iris.predict_result_table")
        self.assertEqual(len(schema1), len(schema2))
        for name in schema1:
            if name == 'class':
                self.assertEqual(schema2[name], "BIGINT")
                continue

            self.assertIn(name, schema2)
            self.assertEqual(schema1[name], schema2[name])

        diff_schema = schema2.keys() - schema1.keys()
        self.assertEqual(len(diff_schema), 0)

        with temp_file.TemporaryDirectory(as_cwd=True):
            evaluate(ds, select, "iris.evaluate_result_table", save_name,
                     class_name, {'validation.metrics': 'Accuracy'})

        eval_schema = self.get_table_schema(conn, "iris.evaluate_result_table")
        eval_schema = {k.lower() for k in eval_schema}
        self.assertEqual(eval_schema, {'loss', 'accuracy'})

        with temp_file.TemporaryDirectory(as_cwd=True):
            explain(ds, select, None, {"plot_type": "bar"},
                    "iris.explain_result_table", save_name)

        explain_schema = self.get_table_schema(conn,
                                               "iris.explain_result_table")
        self.assertEqual(
            explain_schema.keys(),
            {'petal_length', 'petal_width', 'sepal_length', 'sepal_width'})
        conn.close()
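
get_table_row_count and get_table_schema are helpers of the test base class
and are not shown in this listing. Hypothetical MySQL-flavoured
implementations, assuming a DB-API style connection, might look like:

def get_table_row_count(conn, table):
    cursor = conn.cursor()
    cursor.execute("SELECT COUNT(*) FROM %s" % table)
    return cursor.fetchone()[0]

def get_table_schema(conn, table):
    cursor = conn.cursor()
    cursor.execute("DESCRIBE %s" % table)
    # Map column name -> column type, e.g. {"class": "BIGINT"}.
    return {row[0]: row[1] for row in cursor.fetchall()}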
Example no. 29
def predict(datasource,
            select,
            result_table,
            label_name,
            model,
            pai_table="",
            oss_model_path=""):
    """PAI XGBoost prediction wrapper
    This function do some preparation for the local prediction, say,
    download the model from OSS, extract metadata and so on.

    Args:
        datasource: the datasource from which to get data
        select: data selection SQL statement
        data_table: tmp table which holds the data from select
        result_table: table to save prediction result
        label_name: prediction label column
        oss_model_path: the model path on OSS
    """
    is_pai = bool(pai_table)
    if is_pai:
        # NOTE(typhoonzero): the xgboost model file "my_model" is hard coded
        # in xgboost/train.py
        oss.load_file(oss_model_path, "my_model")
        (estimator, model_params, train_params, feature_metas,
         feature_column_names, train_label_desc,
         fc_map_ir) = oss.load_metas(oss_model_path, "xgboost_model_desc")
    else:
        if isinstance(model, six.string_types):
            model = Model.load_from_db(datasource, model)
        else:
            assert isinstance(
                model, Model), "unsupported model type %s" % type(model)

        model_params = model.get_meta("attributes")
        fc_map_ir = model.get_meta("features")
        train_label_desc = model.get_meta("label").get_field_desc()[0]

    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.XGBOOST)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = {fd.name: fd.to_dict(dtype_to_string=True)
                     for fd in field_descs}

    transform_fn = ComposedColumnTransformer(
        feature_column_names, *feature_columns["feature_columns"])

    bst = xgb.Booster()
    bst.load_model("my_model")

    conn = db.connect_with_data_source(datasource)
    result_column_names, train_label_idx = create_predict_table(
        conn, select, result_table, train_label_desc, label_name)

    with temp_file.TemporaryDirectory() as tmp_dir_name:
        pred_fn = os.path.join(tmp_dir_name, "predict.txt")
        raw_data_dir = os.path.join(tmp_dir_name, "predict_raw_dir")

        dpred = xgb_dataset(
            datasource=datasource,
            fn=pred_fn,
            dataset_sql=select,
            feature_metas=feature_metas,
            feature_column_names=feature_column_names,
            label_meta=None,
            cache=True,
            batch_size=10000,
            transform_fn=transform_fn,
            raw_data_dir=raw_data_dir)  # NOTE: default to use external memory

        print("Start predicting XGBoost model...")
        for idx, pred_dmatrix in enumerate(dpred):
            feature_file_name = os.path.join(raw_data_dir,
                                             "predict.txt_%d" % idx)
            preds = _calc_predict_result(bst, pred_dmatrix, model_params)
            _store_predict_result(preds, result_table, result_column_names,
                                  train_label_idx, feature_file_name, conn)
        print("Done predicting. Predict table : %s" % result_table)

    conn.close()
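
_calc_predict_result is not shown in this listing. For a multi-class
objective the raw Booster output is a probability matrix, so a helper
along these lines is plausible; treat it as a sketch of the idea, not the
actual SQLFlow code.

import numpy as np

def calc_predict_result(bst, dmatrix, model_params):
    preds = bst.predict(dmatrix)
    objective = model_params.get("objective", "")
    if objective == "multi:softprob":
        # softprob yields an (n_samples, num_class) matrix; the
        # predicted class is the row-wise argmax.
        preds = np.argmax(preds, axis=1)
    elif objective == "binary:logistic":
        # logistic yields P(y=1); threshold at 0.5.
        preds = (preds > 0.5).astype(int)
    return preds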
Example no. 30
def train(original_sql,
          model_image,
          estimator_string,
          datasource,
          select,
          validation_select,
          model_params,
          train_params,
          feature_column_map,
          label_column,
          save,
          load=None):
    """
    Train, evaluate and save the XGBoost model locally.

    Args:
        original_sql (str): the original SQL statement.
        model_image (str): the model repo docker image.
        estimator_string (str): the XGBoost booster type like xgboost.gbtree.
        datasource (str): the database connection URI.
        select (str): the SQL statement for training.
        validation_select (str): the SQL statement for evaluation.
        model_params (dict): the XGBoost model parameters.
        train_params (dict): the training parameters; may contain
                             disk_cache (bool), batch_size (int) and
                             epoch (int) settings.
        feature_column_map (dict): the feature column map to do derivation.
        label_column (FeatureColumn): the label column.
        save (str): the table name to save the trained model and meta.
        load (str): the table name to load the pretrained model.

    Returns:
        A dict which indicates the evaluation result.
    """
    conn = db.connect_with_data_source(datasource)
    fc_map_ir, fc_label_ir = infer_feature_columns(conn,
                                                   select,
                                                   feature_column_map,
                                                   label_column,
                                                   n=1000)
    fc_map = compile_ir_feature_columns(fc_map_ir, EstimatorType.XGBOOST)

    feature_column_list = fc_map["feature_columns"]
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = {fd.name: fd.to_dict() for fd in field_descs}
    label_meta = label_column.get_field_desc()[0].to_dict()

    # NOTE: in the current implementation, we are generating a transform_fn
    # from the COLUMN clause. The transform_fn is executed during the process
    # of dumping the original data into DMatrix SVM file.
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *feature_column_list)

    disk_cache = train_params.pop("disk_cache", False)
    batch_size = train_params.pop("batch_size", None)
    epoch = train_params.pop("epoch", 1)
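    # These are SQLFlow-level controls; everything left in train_params
    # is forwarded verbatim to xgboost.train below.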

    def build_dataset(fn, slct):
        return xgb_dataset(datasource,
                           fn,
                           slct,
                           feature_metas,
                           feature_column_names,
                           label_meta,
                           cache=disk_cache,
                           batch_size=batch_size,
                           epoch=epoch,
                           transform_fn=transform_fn)

    file_name = "my_model"
    if load:
        Model.load_from_db(datasource, load)
        bst = xgb.Booster()
        bst.load_model(file_name)
    else:
        bst = None

    with temp_file.TemporaryDirectory() as tmp_dir_name:
        train_fn = os.path.join(tmp_dir_name, 'train.txt')
        val_fn = os.path.join(tmp_dir_name, 'val.txt')
        train_dataset = build_dataset(train_fn, select)
        if validation_select:
            val_dataset = build_dataset(val_fn, validation_select)
        else:
            val_dataset = None

        eval_result = dict()
        # The first watchlist slot is filled with the current training
        # batch inside the loop below.
        watchlist = [None]
        if val_dataset:
            # The `xgboost.train` API only accepts the XGBoost DMatrix
            # object as the training or validation dataset, so we should
            # convert the generator to DMatrix.
            if isinstance(val_dataset, types.GeneratorType):
                val_dataset = list(val_dataset)[0]
            watchlist.append((val_dataset, "validate"))

        for per_batch_dmatrix in train_dataset:
            watchlist[0] = (per_batch_dmatrix, "train")
            bst = xgb.train(model_params,
                            per_batch_dmatrix,
                            evals=watchlist,
                            evals_result=eval_result,
                            xgb_model=bst,
                            **train_params)
            print("Evaluation result: %s" % eval_result)

    meta = collect_metadata(original_sql=original_sql,
                            select=select,
                            validation_select=validation_select,
                            model_repo_image=model_image,
                            class_name=estimator_string,
                            attributes=model_params,
                            features=fc_map_ir,
                            label=fc_label_ir,
                            evaluation=eval_result,
                            num_workers=1)

    save_model_to_local_file(bst, model_params, file_name)
    model = Model(EstimatorType.XGBOOST, meta)
    model.save_to_db(datasource, save)
    return eval_result
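
The batch loop above keeps a single Booster and grows it across batches by
passing the previous model back through xgb.train's xgb_model argument. A
standalone sketch of the same pattern on synthetic data (names and data
are illustrative only):

import numpy as np
import xgboost as xgb

def incremental_train(batches, params, rounds_per_batch=10):
    bst = None
    for X, y in batches:
        dtrain = xgb.DMatrix(X, label=y)
        # xgb_model=bst continues boosting from the existing trees
        # instead of starting a fresh ensemble.
        bst = xgb.train(params, dtrain,
                        num_boost_round=rounds_per_batch,
                        xgb_model=bst)
    return bst

batches = [(np.random.rand(100, 4), np.random.randint(0, 2, 100))
           for _ in range(3)]
bst = incremental_train(batches, {"objective": "binary:logistic"})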