Beispiel #1
0
def submit_local_pred(datasource,
                      original_sql,
                      select,
                      model,
                      label_name,
                      pred_params,
                      result_table,
                      user=""):
    model = Model.load_from_db(datasource, model)
    if model.get_type() == EstimatorType.XGBOOST:
        pred_func = xgboost_pred
    else:
        pred_func = tf_pred

    if model.get_meta("label") is None:
        train_label_desc = None
    else:
        train_label_desc = model.get_meta("label").get_field_desc()[0]

    if pred_params is None:
        extra_result_cols = []
    else:
        extra_result_cols = pred_params.get("predict.extra_outputs", "")
        extra_result_cols = [
            c.strip() for c in extra_result_cols.split(",") if c.strip()
        ]

    with db.connect_with_data_source(datasource) as conn:
        result_column_names, train_label_idx = create_predict_table(
            conn, select, result_table, train_label_desc, label_name,
            extra_result_cols)

    pred_func(datasource=datasource,
              select=select,
              result_table=result_table,
              result_column_names=result_column_names,
              train_label_idx=train_label_idx,
              model=model,
              extra_result_cols=extra_result_cols)
Beispiel #2
0
def predict(datasource,
            select,
            result_table,
            label_name,
            model,
            pai_table="",
            oss_model_path=""):
    """PAI XGBoost prediction wrapper
    This function do some preparation for the local prediction, say,
    download the model from OSS, extract metadata and so on.

    Args:
        datasource: the datasource from which to get data
        select: data selection SQL statement
        data_table: tmp table which holds the data from select
        result_table: table to save prediction result
        label_name: prediction label column
        oss_model_path: the model path on OSS
    """
    is_pai = True if pai_table != "" else False
    if is_pai:
        # NOTE(typhoonzero): the xgboost model file "my_model" is hard coded
        # in xgboost/train.py
        oss.load_file(oss_model_path, "my_model")
        (estimator, model_params, train_params, feature_metas,
         feature_column_names, train_label_desc,
         fc_map_ir) = oss.load_metas(oss_model_path, "xgboost_model_desc")
    else:
        if isinstance(model, six.string_types):
            model = Model.load_from_db(datasource, model)
        else:
            assert isinstance(
                model, Model), "not supported model type %s" % type(model)

        model_params = model.get_meta("attributes")
        fc_map_ir = model.get_meta("features")
        train_label_desc = model.get_meta("label").get_field_desc()[0]

    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.XGBOOST)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])

    transform_fn = ComposedColumnTransformer(
        feature_column_names, *feature_columns["feature_columns"])

    bst = xgb.Booster()
    bst.load_model("my_model")

    conn = db.connect_with_data_source(datasource)
    result_column_names, train_label_idx = create_predict_table(
        conn, select, result_table, train_label_desc, label_name)

    with temp_file.TemporaryDirectory() as tmp_dir_name:
        pred_fn = os.path.join(tmp_dir_name, "predict.txt")
        raw_data_dir = os.path.join(tmp_dir_name, "predict_raw_dir")

        dpred = xgb_dataset(
            datasource=datasource,
            fn=pred_fn,
            dataset_sql=select,
            feature_metas=feature_metas,
            feature_column_names=feature_column_names,
            label_meta=None,
            cache=True,
            batch_size=10000,
            transform_fn=transform_fn,
            raw_data_dir=raw_data_dir)  # NOTE: default to use external memory

        print("Start predicting XGBoost model...")
        for idx, pred_dmatrix in enumerate(dpred):
            feature_file_name = os.path.join(
                tmp_dir_name, "predict_raw_dir/predict.txt_%d" % idx)
            preds = _calc_predict_result(bst, pred_dmatrix, model_params)
            _store_predict_result(preds, result_table, result_column_names,
                                  train_label_idx, feature_file_name, conn)
        print("Done predicting. Predict table : %s" % result_table)

    conn.close()
Beispiel #3
0
def predict_step(datasource,
                 select,
                 result_table,
                 label_name,
                 model,
                 pai_table=None):
    if isinstance(model, six.string_types):
        model = Model.load_from_db(datasource, model)
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)

    model_params = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    label_meta = model.get_meta("label")
    train_label_desc = label_meta.get_field_desc()[0] if label_meta else None
    train_label_name = train_label_desc.name if train_label_desc else None
    estimator_string = model.get_meta("class_name")
    save = "model_save"

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])
    feature_columns = compile_ir_feature_columns(train_fc_map,
                                                 model.get_type())

    is_pai = True if pai_table else False
    if is_pai:
        select = "SELECT * FROM %s" % pai_table

    conn = db.connect_with_data_source(datasource)
    result_column_names, train_label_idx = create_predict_table(
        conn, select, result_table, train_label_desc, label_name)

    if is_pai:
        conn.close()
        conn = PaiIOConnection.from_table(pai_table)
        select = None

    selected_cols = result_column_names[0:-1]
    if train_label_idx >= 0:
        selected_cols = selected_cols[0:train_label_idx] + [
            train_label_name
        ] + selected_cols[train_label_idx:]

    estimator = import_model(estimator_string)
    model_params.update(feature_columns)
    is_estimator = is_tf_estimator(estimator)
    predict_generator = db.db_generator(conn, select)

    pop_optimizer_and_loss(model_params)
    if not is_estimator:
        if not issubclass(estimator, tf.keras.Model):
            # functional model need field_metas parameter
            model_params["field_metas"] = feature_metas
        print("Start predicting using keras model...")
        keras_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_metas, train_label_name,
                      label_name, conn, predict_generator, selected_cols)
    else:
        model_params['model_dir'] = save
        print("Start predicting using estimator model...")
        estimator_predict(result_table, feature_column_names, feature_metas,
                          train_label_name, label_name, conn,
                          predict_generator, selected_cols)

    print("Done predicting. Predict table : %s" % result_table)
    conn.close()
Beispiel #4
0
    def test_main(self):
        ds = testing.get_datasource()
        original_sql = """SELECT * FROM iris.train
        TO TRAIN xgboost.gbtree
        WITH
            objective="multi:softprob",
            num_boost_round=20,
            num_class=3,
            validation.select="SELECT * FROM iris.test"
        LABEL class
        INTO iris.xgboost_train_model_test;
        """

        select = "SELECT * FROM iris.train"
        val_select = "SELECT * FROM iris.test"
        train_params = {"num_boost_round": 20}
        model_params = {"num_class": 3, "objective": "multi:softprob"}
        save_name = "iris.xgboost_train_model_test"
        class_name = "class"

        with temp_file.TemporaryDirectory(as_cwd=True):
            eval_result = train(datasource=ds,
                                original_sql=original_sql,
                                select=select,
                                validation_select=val_select,
                                estimator_string="xgboost.gbtree",
                                model_image="sqlflow:step",
                                feature_column_map=None,
                                label_column=NumericColumn(
                                    FieldDesc(name=class_name)),
                                model_params=model_params,
                                train_params=train_params,
                                validation_params=None,
                                save=save_name,
                                load=None)

        self.assertLess(eval_result['train']['merror'][-1], 0.01)
        self.assertLess(eval_result['validate']['merror'][-1], 0.01)

        conn = db.connect_with_data_source(ds)
        pred_select = "SELECT * FROM iris.test"

        with temp_file.TemporaryDirectory(as_cwd=True):
            result_column_names, train_label_idx = create_predict_table(
                conn, select, "iris.predict_result_table",
                FieldDesc(name=class_name), "class")
            predict(ds, pred_select, "iris.predict_result_table",
                    result_column_names, train_label_idx, save_name)

        self.assertEqual(
            self.get_table_row_count(conn, "iris.test"),
            self.get_table_row_count(conn, "iris.predict_result_table"))

        schema1 = self.get_table_schema(conn, "iris.test")
        schema2 = self.get_table_schema(conn, "iris.predict_result_table")
        self.assertEqual(len(schema1), len(schema2))
        for name in schema1:
            if name == 'class':
                self.assertEqual(schema2[name], "BIGINT")
                continue

            self.assertTrue(name in schema2)
            self.assertEqual(schema1[name], schema2[name])

        diff_schema = schema2.keys() - schema1.keys()
        self.assertEqual(len(diff_schema), 0)

        with temp_file.TemporaryDirectory(as_cwd=True):
            result_column_names = create_evaluate_table(
                conn, "iris.evaluate_result_table", ["accuracy_score"])
            evaluate(ds,
                     pred_select,
                     "iris.evaluate_result_table",
                     save_name,
                     label_name='class',
                     model_params={'validation.metrics': 'accuracy_score'},
                     result_column_names=result_column_names)

        eval_schema = self.get_table_schema(conn, "iris.evaluate_result_table")
        self.assertEqual(eval_schema.keys(), set(['loss', 'accuracy_score']))

        with temp_file.TemporaryDirectory(as_cwd=True):
            feature_column_names = [
                "petal_width", "petal_length", "sepal_width", "sepal_length"
            ]
            create_explain_table(conn, EstimatorType.XGBOOST, "TreeExplainer",
                                 "xgboost.gbtree", "iris.explain_result_table",
                                 feature_column_names)
            explain(ds, select, "TreeExplainer", {"plot_type": "decision"},
                    "iris.explain_result_table", save_name)

        explain_schema = self.get_table_schema(conn,
                                               "iris.explain_result_table")
        self.assertEqual(explain_schema.keys(), set(feature_column_names))

        with temp_file.TemporaryDirectory(as_cwd=True):
            create_explain_table(conn, EstimatorType.XGBOOST,
                                 "XGBoostExplainer", "xgboost.gbtree",
                                 "iris.explain_result_table_2",
                                 feature_column_names)
            explain(ds, select, "XGBoostExplainer", {},
                    "iris.explain_result_table_2", save_name)

        explain_schema = self.get_table_schema(conn,
                                               "iris.explain_result_table_2")
        self.assertEqual(explain_schema.keys(),
                         set(['feature', 'fscore', 'gain']))
        conn.close()
Beispiel #5
0
def submit_pai_predict(datasource,
                       original_sql,
                       select,
                       model,
                       label_name,
                       pred_params,
                       result_table,
                       user=""):
    """This function pack needed params and resource to a tarball
    and submit a prediction task to PAI

    Args:
        datasource: string
            Like: maxcompute://ak:[email protected]/api?
                  curr_project=test_ci&scheme=http
        original_sql: string
            Original "TO PREDICT" statement.
        select: string
            SQL statement to get prediction data set.
        model: string
            Model to load and do prediction.
        label_name: string
            Name of the label column, if not exist in select.
        pred_params: dict
            Params for training, crossponding to WITH clause.
        result_table: string
            The table name to save prediction result.
        user: string
            A string to identify the user, used to load model from the user's
            directory.
    """
    params = dict(locals())

    # format resultTable name to "db.table" to let the codegen form a
    # submitting argument of format "odps://project/tables/table_name"
    project = table_ops.get_project(datasource)
    if result_table.count(".") == 0:
        result_table = "%s.%s" % (project, result_table)

    model_metas = Model.load_metadata_from_db(datasource, model)
    model_type = model_metas.get_type()
    estimator = model_metas.get_meta("class_name")
    setup_predict_entry(params, model_type)

    train_label = model_metas.get_meta("label")
    if train_label is not None:
        train_label_desc = train_label.get_field_desc()[0]
    else:
        train_label_desc = None

    if pred_params is None:
        extra_result_cols = []
    else:
        extra_result_cols = pred_params.get("predict.extra_outputs", "")
        extra_result_cols = [
            c.strip() for c in extra_result_cols.split(",") if c.strip()
        ]

    with db.connect_with_data_source(datasource) as conn:
        result_column_names, train_label_idx = create_predict_table(
            conn, select, result_table, train_label_desc, label_name,
            extra_result_cols)

    oss_model_path = pai_model.get_oss_model_save_path(datasource,
                                                       model,
                                                       user=user)

    # TODO(typhoonzero): Do **NOT** create tmp table when the select statement
    # is like: "SELECT fields,... FROM table"
    with table_ops.create_tmp_tables_guard(select, datasource) as data_table:
        del params["label_name"]
        params["pai_table"] = data_table
        params["result_column_names"] = result_column_names
        params["train_label_idx"] = train_label_idx
        params["extra_result_cols"] = extra_result_cols

        if try_pai_local_run(params, oss_model_path):
            return

        with temp_file.TemporaryDirectory(prefix="sqlflow", dir="/tmp") as cwd:
            prepare_archive(cwd, estimator, oss_model_path, params)

            cmd = get_pai_predict_cmd(
                datasource, project, oss_model_path, model, data_table,
                result_table, model_type, pred_params,
                "file://" + os.path.join(cwd, JOB_ARCHIVE_FILE),
                "file://" + os.path.join(cwd, PARAMS_FILE))
            submit_pai_task(cmd, datasource)