Example #1
0
def train(original_sql,
          model_image,
          estimator_string,
          datasource,
          select,
          validation_select,
          model_params,
          train_params,
          validation_params,
          feature_column_map,
          label_column,
          save,
          load=None):
    """Run ``train_step``, optionally after fetching a saved model.

    When ``load`` is truthy it is handed to ``Model.load_from_db`` together
    with ``datasource`` and ``train_step`` then receives
    ``load="model_save"``; otherwise ``train_step`` receives ``load=None``.
    Every other argument is forwarded to ``train_step`` unchanged.
    """
    load_dir = None
    if load:
        Model.load_from_db(datasource, load)
        load_dir = "model_save"

    step_kwargs = dict(
        original_sql=original_sql,
        model_image=model_image,
        estimator_string=estimator_string,
        datasource=datasource,
        select=select,
        validation_select=validation_select,
        model_params=model_params,
        train_params=train_params,
        validation_params=validation_params,
        feature_column_map=feature_column_map,
        label_column=label_column,
        save=save,
        load=load_dir,
    )
    train_step(**step_kwargs)
Example #2
0
def explain(datasource, select, explainer, model_params, result_table, model):
    """Explain the XGBoost booster stored in the local file "my_model".

    Args:
        datasource: passed to ``Model.load_from_db`` and to the explainers.
        select: passed to ``shap_explain``.
        explainer: "XGBoostExplainer" selects ``xgb_native_explain``; any
            other value (including "" and "TreeExplainer") selects
            ``shap_explain``.
        model_params (dict|None): entries whose key starts with "summary."
            are collected, with "summary." removed from the key, and handed
            to ``shap_explain``. ``None`` is treated as an empty dict.
        result_table: passed to the chosen explainer.
        model (Model|str): a ``Model`` instance, or a string that is passed
            to ``Model.load_from_db``.
    """
    params = {} if model_params is None else model_params

    prefix = "summary."
    summary_params = {
        key.replace(prefix, ""): params[key]
        for key in params if key.startswith(prefix)
    }

    if not isinstance(model, six.string_types):
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)
    else:
        model = Model.load_from_db(datasource, model)

    booster = xgb.Booster()
    booster.load_model("my_model")

    if explainer == "XGBoostExplainer":
        xgb_native_explain(booster, datasource, result_table)
        return

    # Any other explainer value ("" or "TreeExplainer") falls back to SHAP.
    shap_explain(booster, datasource, select, summary_params, result_table,
                 model)
Example #3
0
def submit_local_explain(datasource,
                         original_sql,
                         select,
                         model,
                         model_params,
                         result_table,
                         explainer="TreeExplainer",
                         user=""):
    """Load a model and dispatch to the matching explain function.

    The model is loaded with ``Model.load_from_db(datasource, model)``.
    ``xgboost_explain`` is used when the model type equals
    ``EstimatorType.XGBOOST`` and ``tf_explain`` otherwise. When
    ``result_table`` is truthy the explain table is created first; when it
    is falsy "summary.png" is printed through
    ``print_image_as_base64_html`` after the explanation ran.

    ``original_sql`` and ``user`` are not referenced in this function.
    """
    model = Model.load_from_db(datasource, model)
    use_xgboost = model.get_type() == EstimatorType.XGBOOST
    explain_func = xgboost_explain if use_xgboost else tf_explain

    if result_table:
        features_ir = model.get_meta("features")
        class_name = model.get_meta("class_name")
        column_names = [
            desc.name for desc in get_ordered_field_descs(features_ir)
        ]
        with db.connect_with_data_source(datasource) as conn:
            create_explain_table(conn, model.get_type(), explainer,
                                 class_name, result_table, column_names)

    explain_kwargs = dict(datasource=datasource,
                          select=select,
                          explainer=explainer,
                          model_params=model_params,
                          result_table=result_table,
                          model=model)
    explain_func(**explain_kwargs)

    if result_table:
        return
    print_image_as_base64_html("summary.png")
Example #4
0
def submit_local_pred(datasource, select, result_table, pred_label_name, load):
    """Load the model named by ``load`` and predict with ``xgboost_pred``.

    Raises:
        NotImplementedError: if the loaded model's type does not equal
            ``EstimatorType.XGBOOST``.
    """
    model = Model.load_from_db(datasource, load)
    if model.get_type() == EstimatorType.XGBOOST:
        xgboost_pred(datasource, select, result_table, pred_label_name, model)
        return

    raise NotImplementedError("not implemented model type: %s" %
                              model.get_type())
Example #5
0
def submit_local_evaluate(datasource,
                          original_sql,
                          select,
                          label_name,
                          model,
                          model_params,
                          result_table,
                          user=""):
    """Load a model, create the evaluation table and run the evaluation.

    Args:
        datasource: used to load the model, to open the connection for
            creating the result table, and forwarded to the evaluate
            function.
        original_sql: not referenced in this function.
        select: forwarded to the evaluate function.
        label_name: forwarded to the evaluate function.
        model: passed to ``Model.load_from_db`` to obtain the model.
        model_params (dict|None): the "validation.metrics" entry (a
            comma-separated string) selects the metrics; it defaults to
            "accuracy_score" for XGBoost models and "Accuracy" otherwise.
            The caller's value is forwarded unchanged.
        result_table: the table created by ``create_evaluate_table`` and
            forwarded to the evaluate function.
        user: not referenced in this function.
    """
    # Only the local lookup needs a dict; the evaluate function still gets
    # exactly what the caller passed.
    params = {} if model_params is None else model_params

    model = Model.load_from_db(datasource, model)
    if model.get_type() == EstimatorType.XGBOOST:
        evaluate_func = xgboost_evaluate
        default_metrics = "accuracy_score"
    else:
        evaluate_func = tf_evaluate
        default_metrics = "Accuracy"

    validation_metrics = params.get("validation.metrics", default_metrics)
    validation_metrics = [m.strip() for m in validation_metrics.split(",")]

    conn = db.connect_with_data_source(datasource)
    try:
        result_column_names = create_evaluate_table(conn, result_table,
                                                    validation_metrics)
    finally:
        # Release the connection even when table creation fails.
        conn.close()

    evaluate_func(datasource=datasource,
                  select=select,
                  result_table=result_table,
                  model=model,
                  label_name=label_name,
                  model_params=model_params,
                  result_column_names=result_column_names)
Example #6
0
def evaluate_step(datasource,
                  select,
                  result_table,
                  model,
                  label_name,
                  model_params,
                  pai_table=None):
    """Create the evaluation table and evaluate a trained model.

    Args:
        datasource: used to load the model and to open the connection for
            creating the result table; forwarded to ``_evaluate``.
        select: forwarded to ``_evaluate``.
        result_table: the table created by ``create_evaluate_table``.
        model (Model|str): a ``Model`` instance, or a string passed to
            ``Model.load_from_db``.
        label_name: assigned as the name of the model's first label field
            description before it is converted to ``label_meta``.
        model_params (dict|None): evaluation options read here:
            "validation.metrics" (comma-separated, default "Accuracy"),
            "validation.steps" (default None),
            "validation.batch_size" (default 1) and
            "validation.verbose" (default 0).
        pai_table: forwarded to ``_evaluate``.

    Note:
        ``_evaluate`` receives the model's stored "attributes" meta as its
        ``model_params``, not the ``model_params`` argument of this
        function.
    """
    if isinstance(model, six.string_types):
        model = Model.load_from_db(datasource, model)
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)

    if model_params is None:
        model_params = {}

    validation_metrics = model_params.get("validation.metrics", "Accuracy")
    validation_metrics = [m.strip() for m in validation_metrics.split(',')]
    validation_steps = model_params.get("validation.steps", None)
    batch_size = model_params.get("validation.batch_size", 1)
    verbose = model_params.get("validation.verbose", 0)

    conn = db.connect_with_data_source(datasource)
    try:
        create_evaluate_table(conn, result_table, validation_metrics)
    finally:
        # Release the connection even when table creation fails.
        conn.close()

    # From here on the parameters stored with the trained model are used.
    model_params = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    train_label_desc = model.get_meta("label").get_field_desc()[0]
    estimator_string = model.get_meta("class_name")
    save = "model_save"

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = {
        fd.name: fd.to_dict(dtype_to_string=True)
        for fd in field_descs
    }
    feature_columns = compile_ir_feature_columns(train_fc_map,
                                                 model.get_type())
    train_label_desc.name = label_name
    label_meta = train_label_desc.to_dict(dtype_to_string=True)

    _evaluate(datasource=datasource,
              estimator_string=estimator_string,
              select=select,
              result_table=result_table,
              feature_columns=feature_columns,
              feature_column_names=feature_column_names,
              feature_metas=feature_metas,
              label_meta=label_meta,
              model_params=model_params,
              validation_metrics=validation_metrics,
              save=save,
              batch_size=batch_size,
              validation_steps=validation_steps,
              verbose=verbose,
              pai_table=pai_table)
Example #7
0
def submit_local_pred(datasource,
                      original_sql,
                      select,
                      model_name,
                      label_column,
                      model_params,
                      result_table,
                      user=""):
    """Load ``model_name`` and run the matching prediction function.

    ``xgboost_pred`` is chosen when the loaded model's type equals
    ``EstimatorType.XGBOOST``; ``tf_pred`` is chosen otherwise.
    ``label_column`` is forwarded as ``pred_label_name``.

    ``original_sql``, ``model_params`` and ``user`` are not referenced in
    this function.
    """
    model = Model.load_from_db(datasource, model_name)
    use_xgboost = model.get_type() == EstimatorType.XGBOOST
    pred_func = xgboost_pred if use_xgboost else tf_pred

    pred_kwargs = dict(datasource=datasource,
                       select=select,
                       result_table=result_table,
                       pred_label_name=label_column,
                       model=model)
    pred_func(**pred_kwargs)
Example #8
0
def submit_local_explain(datasource,
                         original_sql,
                         select,
                         model_name,
                         model_params,
                         result_table,
                         explainer="TreeExplainer",
                         user=""):
    """Load ``model_name`` and run the matching explain function.

    ``xgboost_explain`` is chosen when the loaded model's type equals
    ``EstimatorType.XGBOOST``; ``tf_explain`` is chosen otherwise.

    ``original_sql`` and ``user`` are not referenced in this function.
    """
    model = Model.load_from_db(datasource, model_name)
    use_xgboost = model.get_type() == EstimatorType.XGBOOST
    explain_func = xgboost_explain if use_xgboost else tf_explain

    explain_kwargs = dict(datasource=datasource,
                          select=select,
                          explainer=explainer,
                          model_params=model_params,
                          result_table=result_table,
                          model=model)
    explain_func(**explain_kwargs)
Example #9
0
def submit_local_evaluate(datasource,
                          original_sql,
                          select,
                          pred_label_name,
                          model_name,
                          model_params,
                          result_table,
                          user=""):
    """Load ``model_name`` and run the matching evaluate function.

    ``xgboost_evaluate`` is chosen when the loaded model's type equals
    ``EstimatorType.XGBOOST``; ``tf_evaluate`` is chosen otherwise.

    ``original_sql`` and ``user`` are not referenced in this function.
    """
    model = Model.load_from_db(datasource, model_name)
    use_xgboost = model.get_type() == EstimatorType.XGBOOST
    evaluate_func = xgboost_evaluate if use_xgboost else tf_evaluate

    evaluate_kwargs = dict(datasource=datasource,
                           select=select,
                           result_table=result_table,
                           model=model,
                           pred_label_name=pred_label_name,
                           model_params=model_params)
    evaluate_func(**evaluate_kwargs)
Example #10
0
def submit_local_pred(datasource,
                      original_sql,
                      select,
                      model,
                      label_name,
                      pred_params,
                      result_table,
                      user=""):
    """Load a model, create the prediction table and run the prediction.

    ``xgboost_pred`` is chosen when the loaded model's type equals
    ``EstimatorType.XGBOOST``; ``tf_pred`` is chosen otherwise. The
    "predict.extra_outputs" entry of ``pred_params`` (a comma-separated
    string) is split into non-empty, stripped column names; a ``None``
    ``pred_params`` yields no extra columns.

    ``original_sql`` and ``user`` are not referenced in this function.
    """
    model = Model.load_from_db(datasource, model)
    use_xgboost = model.get_type() == EstimatorType.XGBOOST
    pred_func = xgboost_pred if use_xgboost else tf_pred

    # A model may carry no "label" meta; then there is no label description.
    train_label_desc = None
    if model.get_meta("label") is not None:
        train_label_desc = model.get_meta("label").get_field_desc()[0]

    extra_result_cols = []
    if pred_params is not None:
        raw_extra = pred_params.get("predict.extra_outputs", "")
        stripped = (col.strip() for col in raw_extra.split(","))
        extra_result_cols = [col for col in stripped if col]

    with db.connect_with_data_source(datasource) as conn:
        table_info = create_predict_table(conn, select, result_table,
                                          train_label_desc, label_name,
                                          extra_result_cols)
        result_column_names, train_label_idx = table_info

    pred_kwargs = dict(datasource=datasource,
                       select=select,
                       result_table=result_table,
                       result_column_names=result_column_names,
                       train_label_idx=train_label_idx,
                       model=model,
                       extra_result_cols=extra_result_cols)
    pred_func(**pred_kwargs)
Example #11
0
def explain(datasource,
            select,
            explainer,
            model_params,
            result_table,
            model,
            pai_table="",
            oss_model_path="",
            oss_dest=None,
            oss_ak=None,
            oss_sk=None,
            oss_endpoint=None,
            oss_bucket_name=None):
    """Explain the XGBoost booster stored in the local file "my_model".

    Args:
        datasource: used to load the model and the data set; forwarded to
            the explainers.
        select: forwarded to ``xgb_shap_dataset``.
        explainer: "XGBoostExplainer" selects ``xgb_native_explain``; any
            other value (including "" and "TreeExplainer") selects
            ``shap_explain``.
        model_params (dict|None): entries whose key starts with "summary."
            are collected, with "summary." removed from the key, and handed
            to ``shap_explain``. ``None`` is treated as an empty dict.
        result_table: forwarded to the chosen explainer.
        model (Model|str): a ``Model`` instance, or a string passed to
            ``Model.load_from_db`` inside a temporary working directory.
        pai_table: a truthy value switches the data set and SHAP calls to
            ``is_pai=True``.
        oss_model_path: not referenced in this function.
        oss_dest, oss_ak, oss_sk, oss_endpoint, oss_bucket_name: forwarded
            to ``shap_explain``.
    """
    params = {} if model_params is None else model_params

    prefix = "summary."
    summary_params = {
        key.replace(prefix, ""): params[key]
        for key in params if key.startswith(prefix)
    }

    booster = xgb.Booster()
    if not isinstance(model, six.string_types):
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)
        booster.load_model("my_model")
    else:
        with temp_file.TemporaryDirectory(as_cwd=True):
            model = Model.load_from_db(datasource, model)
            booster.load_model("my_model")

    features_ir = model.get_meta("features")
    label_desc = model.get_meta("label").get_field_desc()[0]
    label_meta = label_desc.to_dict(dtype_to_string=True)

    ordered_descs = get_ordered_field_descs(features_ir)
    feature_column_names = [desc.name for desc in ordered_descs]
    feature_metas = {
        desc.name: desc.to_dict(dtype_to_string=True)
        for desc in ordered_descs
    }

    is_pai = bool(pai_table)

    # NOTE: in the current implementation, the transform_fn is generated
    # from the COLUMN clause and is executed while the original data is
    # dumped into the DMatrix SVM file.
    compiled_fc = compile_ir_feature_columns(features_ir,
                                             EstimatorType.XGBOOST)
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *compiled_fc["feature_columns"])

    dataset = xgb_shap_dataset(datasource, select, feature_column_names,
                               label_meta, feature_metas, is_pai, pai_table,
                               transform_fn)

    if explainer == "XGBoostExplainer":
        xgb_native_explain(booster, datasource, result_table)
        return

    # Any other explainer value ("" or "TreeExplainer") falls back to SHAP.
    shap_explain(booster,
                 datasource,
                 dataset,
                 summary_params,
                 result_table,
                 is_pai=is_pai,
                 oss_dest=oss_dest,
                 oss_ak=oss_ak,
                 oss_sk=oss_sk,
                 oss_endpoint=oss_endpoint,
                 oss_bucket_name=oss_bucket_name)
Example #12
0
def explain(datasource,
            select,
            explainer,
            model_params,
            result_table,
            model,
            pai_table="",
            oss_model_path=""):
    """Explain the XGBoost booster stored in the local file "my_model".

    Args:
        datasource: the datasource to load the explain data from.
        select: SQL statement to get the data set.
        explainer: "XGBoostExplainer" selects ``xgb_native_explain``; any
            other value (including "" and "TreeExplainer") selects
            ``shap_explain``.
        model_params (dict|None): entries whose key starts with "summary."
            are collected, with "summary." removed from the key, and handed
            to ``shap_explain``. ``None`` is treated as an empty dict.
        result_table: table to store the explanation result.
        model (Model|str): a ``Model`` instance, or a string passed to
            ``Model.load_from_db``; only loaded when ``pai_table`` is "".
        pai_table: any value other than "" selects the PAI path, which
            loads the model file and metas from ``oss_model_path``.
        oss_model_path: path to the model to be explained (PAI path only).
    """
    params = {} if model_params is None else model_params

    prefix = "summary."
    summary_params = {
        key.replace(prefix, ""): params[key]
        for key in params if key.startswith(prefix)
    }

    is_pai = bool(pai_table != "")
    if not is_pai:
        if not isinstance(model, six.string_types):
            assert isinstance(
                model, Model), "not supported model type %s" % type(model)
        else:
            model = Model.load_from_db(datasource, model)
        fc_map_ir = model.get_meta("features")
        label_desc = model.get_meta("label").get_field_desc()[0]
        label_meta = label_desc.to_dict(dtype_to_string=True)
    else:
        # NOTE(typhoonzero): the xgboost model file "my_model" is hard coded
        # in xgboost/train.py
        oss.load_file(oss_model_path, "my_model")

        # Only the label description and the feature column IR are used
        # below; the remaining metas are unpacked to keep the arity check.
        (_estimator, _model_params, _train_params, _feature_field_meta,
         _feature_column_names, label_desc,
         fc_map_ir) = oss.load_metas(oss_model_path, "xgboost_model_desc")
        label_meta = label_desc.to_dict(dtype_to_string=True)

    ordered_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [desc.name for desc in ordered_descs]
    feature_metas = {
        desc.name: desc.to_dict(dtype_to_string=True)
        for desc in ordered_descs
    }

    # NOTE: in the current implementation, the transform_fn is generated
    # from the COLUMN clause and is executed while the original data is
    # dumped into the DMatrix SVM file.
    # NOTE(review): on the PAI path `model` is never loaded here, so
    # `model.get_type()` relies on the caller passing a Model object -
    # confirm against callers.
    compiled_fc = compile_ir_feature_columns(fc_map_ir, model.get_type())
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *compiled_fc["feature_columns"])

    dataset = xgb_shap_dataset(datasource, select, feature_column_names,
                               label_meta, feature_metas, transform_fn)

    booster = xgb.Booster()
    booster.load_model("my_model")

    if explainer == "XGBoostExplainer":
        xgb_native_explain(booster, datasource, result_table)
        return

    # Any other explainer value ("" or "TreeExplainer") falls back to SHAP.
    shap_explain(booster, datasource, dataset, summary_params, result_table)
Example #13
0
def pred(datasource, select, result_table, pred_label_name, model):
    """
    Do prediction using a trained model.

    The booster is read from the local file "my_model". The database
    connection is always closed, even when creating the result table or
    predicting fails.

    Args:
        datasource (str): the database connection string.
        select (str): the input data to predict.
        result_table (str): the output data table.
        pred_label_name (str): the output label name to predict.
        model (Model|str): the model object or where to load the model.

    Returns:
        None.
    """
    if isinstance(model, six.string_types):
        model = Model.load_from_db(datasource, model)
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)

    model_params = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    train_label_desc = model.get_meta("label").get_field_desc()[0]

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = {
        fd.name: fd.to_dict(dtype_to_string=True)
        for fd in field_descs
    }

    # NOTE: in the current implementation, we are generating a transform_fn
    # from the COLUMN clause. The transform_fn is executed during the process
    # of dumping the original data into DMatrix SVM file.
    compiled_fc = compile_ir_feature_columns(train_fc_map, model.get_type())
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *compiled_fc["feature_columns"])

    bst = xgb.Booster()
    bst.load_model("my_model")

    conn = db.connect_with_data_source(datasource)
    try:
        result_column_names, train_label_idx = create_predict_table(
            conn, select, result_table, train_label_desc, pred_label_name)

        with temp_file.TemporaryDirectory() as tmp_dir_name:
            pred_fn = os.path.join(tmp_dir_name, "predict.txt")
            raw_data_dir = os.path.join(tmp_dir_name, "predict_raw_dir")

            dpred = xgb_dataset(
                datasource=datasource,
                fn=pred_fn,
                dataset_sql=select,
                feature_metas=feature_metas,
                feature_column_names=feature_column_names,
                label_meta=None,
                cache=True,
                batch_size=10000,
                transform_fn=transform_fn,
                raw_data_dir=raw_data_dir
            )  # NOTE: default to use external memory

            print("Start predicting XGBoost model...")
            for idx, pred_dmatrix in enumerate(dpred):
                # One raw feature file per batch index under raw_data_dir.
                feature_file_name = os.path.join(
                    tmp_dir_name, "predict_raw_dir/predict.txt_%d" % idx)
                preds = _calc_predict_result(bst, pred_dmatrix, model_params)
                _store_predict_result(preds, result_table,
                                      result_column_names, train_label_idx,
                                      feature_file_name, conn)
            print("Done predicting. Predict table : %s" % result_table)
    finally:
        # Release the connection on both the success and the failure path.
        conn.close()
Example #14
0
def explain_step(datasource,
                 select,
                 explainer,
                 model_params,
                 result_table,
                 model,
                 pai_table=None,
                 oss_dest=None,
                 oss_ak=None,
                 oss_sk=None,
                 oss_endpoint=None,
                 oss_bucket_name=None):
    """
    Do explanation to a trained TensorFlow model.

    Args:
        datasource (str): the database connection string.
        select (str): the input data to predict.
        explainer (str): the explainer to explain the model.
                         Not used in TensorFlow models.
        model_params (dict|None): the parameters for explanation. The keys
            read here are "summary.plot_type" (default "bar") and
            "label_col" (default: the name of the trained label).
            None is treated as an empty dict.
        result_table (str): the output data table.
        model (Model|str): the model object or where to load the model.
        pai_table: forwarded to `_explain`; when truthy, `oss_dest` must
            be given, otherwise `oss_dest` must be None.
        oss_dest, oss_ak, oss_sk, oss_endpoint, oss_bucket_name:
            forwarded to `_explain`.

    Returns:
        None.
    """
    if isinstance(model, six.string_types):
        model = Model.load_from_db(datasource, model)
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)

    # Accept a missing parameter dict instead of failing on `.get` below.
    if model_params is None:
        model_params = {}

    plot_type = model_params.get("summary.plot_type", "bar")

    train_attributes = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    train_label_desc = model.get_meta("label").get_field_desc()[0]
    estimator_string = model.get_meta("class_name")
    save = "model_save"

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = {
        fd.name: fd.to_dict(dtype_to_string=True)
        for fd in field_descs
    }
    feature_columns = compile_ir_feature_columns(train_fc_map,
                                                 model.get_type())

    label_name = model_params.get("label_col", train_label_desc.name)
    train_label_desc.name = label_name
    label_meta = train_label_desc.to_dict(dtype_to_string=True)

    if pai_table:
        assert oss_dest, "oss_dest must be given when submit to PAI"
    else:
        assert oss_dest is None

    # Without a DISPLAY, select the non-interactive Agg backend.
    if os.environ.get('DISPLAY', '') == '':
        print('no display found. Using non-interactive Agg backend')
        matplotlib.use('Agg')

    # `_explain` gets the attributes stored with the trained model, not
    # the `model_params` argument of this function.
    _explain(datasource=datasource,
             estimator_string=estimator_string,
             select=select,
             feature_columns=feature_columns,
             feature_column_names=feature_column_names,
             feature_metas=feature_metas,
             label_meta=label_meta,
             model_params=train_attributes,
             save=save,
             pai_table=pai_table,
             plot_type=plot_type,
             result_table=result_table,
             oss_dest=oss_dest,
             oss_ak=oss_ak,
             oss_sk=oss_sk,
             oss_endpoint=oss_endpoint,
             oss_bucket_name=oss_bucket_name)

    print_image_as_base64_html('summary.png')
Example #15
0
def explain(datasource, select, explainer, model_params, result_table, model):
    """
    Do explanation to a trained TensorFlow model.

    When `result_table` is truthy the table is dropped and re-created
    before the explanation runs. Afterwards the file 'summary.png'
    (presumably written by `_explain` - confirm) is printed as a
    base64-encoded HTML <img> element.

    Args:
        datasource (str): the database connection string.
        select (str): the input data to predict.
        explainer (str): the explainer to explain the model.
                         Not used in TensorFlow models.
        model_params (dict|None): the parameters for explanation. The keys
            read here are "summary.plot_type" (default "bar") and
            "label_col" (default: the name of the trained label).
            None is treated as an empty dict.
        result_table (str): the output data table.
        model (Model|str): the model object or where to load the model.

    Returns:
        None.

    Raises:
        ValueError: if a selected column (other than the label) has no
            matching feature field description.
    """
    if isinstance(model, six.string_types):
        model = Model.load_from_db(datasource, model)
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)

    # Accept a missing parameter dict instead of failing on `.get` below.
    if model_params is None:
        model_params = {}

    plot_type = model_params.get("summary.plot_type", "bar")

    train_attributes = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    train_label_desc = model.get_meta("label").get_field_desc()[0]
    estimator_string = model.get_meta("class_name")
    save = "model_save"

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = {
        fd.name: fd.to_dict(dtype_to_string=True)
        for fd in field_descs
    }
    feature_columns = compile_ir_feature_columns(train_fc_map,
                                                 model.get_type())

    label_name = model_params.get("label_col", train_label_desc.name)
    train_label_desc.name = label_name
    label_meta = train_label_desc.to_dict(dtype_to_string=True)

    if result_table:
        conn = db.connect_with_data_source(datasource)
        try:
            if estimator_string.startswith("BoostedTrees"):
                # Boosted trees models get a fixed three-column layout.
                column_defs = [
                    "feature %s" %
                    DataType.to_db_field_type(conn.driver, DataType.STRING),
                    "dfc %s" %
                    DataType.to_db_field_type(conn.driver, DataType.FLOAT32),
                    "gain %s" %
                    DataType.to_db_field_type(conn.driver, DataType.FLOAT32),
                ]
            else:
                selected_cols = db.selected_cols(conn, select)
                if label_name in selected_cols:
                    selected_cols.remove(label_name)

                name_to_shape = {fd.name: fd.shape for fd in field_descs}
                column_defs = []
                float_field_type = DataType.to_db_field_type(
                    conn.driver, DataType.FLOAT32)
                for name in selected_cols:
                    shape = name_to_shape.get(name, None)
                    if shape is None:
                        raise ValueError("cannot find column %s" % name)

                    # One float column per element of the feature's shape.
                    size = int(np.prod(shape))
                    if size == 1:
                        column_defs.append("%s %s" %
                                           (name, float_field_type))
                    else:
                        for i in six.moves.range(size):
                            column_defs.append("%s_%d %s" %
                                               (name, i, float_field_type))

            drop_sql = "DROP TABLE IF EXISTS %s;" % result_table
            create_sql = "CREATE TABLE %s (%s);" % (result_table,
                                                    ",".join(column_defs))
            conn.execute(drop_sql)
            conn.execute(create_sql)
        finally:
            # Release the connection even when column discovery or the DDL
            # statements fail.
            conn.close()

    # `_explain` gets the attributes stored with the trained model, not
    # the `model_params` argument of this function.
    _explain(datasource=datasource,
             estimator_string=estimator_string,
             select=select,
             feature_columns=feature_columns,
             feature_column_names=feature_column_names,
             feature_metas=feature_metas,
             label_meta=label_meta,
             model_params=train_attributes,
             save=save,
             plot_type=plot_type,
             result_table=result_table)

    with open('summary.png', 'rb') as f:
        img = f.read()

    img = base64.b64encode(img)
    if six.PY3:
        img = img.decode('utf-8')
    img = "<div align='center'><img src='data:image/png;base64,%s' /></div>" \
          % img
    print(img)
Example #16
0
def predict_step(datasource,
                 select,
                 result_table,
                 label_name,
                 model,
                 pai_table=None):
    """Create the prediction table and predict with a trained model.

    Args:
        datasource: used to load the model and to open the database
            connection.
        select: the statement whose rows are predicted; replaced by
            "SELECT * FROM <pai_table>" for table creation when
            ``pai_table`` is truthy.
        result_table: the table created by ``create_predict_table`` and
            filled by the predict function.
        label_name: forwarded to ``create_predict_table`` and to the
            predict function.
        model (Model|str): a ``Model`` instance, or a string passed to
            ``Model.load_from_db``.
        pai_table: when truthy, the prediction data is read through
            ``PaiIOConnection.from_table(pai_table)`` instead of the
            database connection.

    The connection in use is always closed, even when table creation or
    prediction raises.
    """
    if isinstance(model, six.string_types):
        model = Model.load_from_db(datasource, model)
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)

    model_params = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    label_meta = model.get_meta("label")
    train_label_desc = label_meta.get_field_desc()[0] if label_meta else None
    train_label_name = train_label_desc.name if train_label_desc else None
    estimator_string = model.get_meta("class_name")
    save = "model_save"

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = {
        fd.name: fd.to_dict(dtype_to_string=True)
        for fd in field_descs
    }
    feature_columns = compile_ir_feature_columns(train_fc_map,
                                                 model.get_type())

    is_pai = bool(pai_table)
    if is_pai:
        select = "SELECT * FROM %s" % pai_table

    conn = db.connect_with_data_source(datasource)
    table_created = False
    try:
        result_column_names, train_label_idx = create_predict_table(
            conn, select, result_table, train_label_desc, label_name)
        table_created = True
    finally:
        # The database connection is only kept for a successful non-PAI
        # run; in every other case it is released right here.
        if is_pai or not table_created:
            conn.close()

    if is_pai:
        conn = PaiIOConnection.from_table(pai_table)
        select = None

    try:
        # The last result column is dropped; the trained label name is
        # re-inserted at its original position when that index is >= 0.
        selected_cols = result_column_names[0:-1]
        if train_label_idx >= 0:
            selected_cols = selected_cols[0:train_label_idx] + [
                train_label_name
            ] + selected_cols[train_label_idx:]

        estimator = import_model(estimator_string)
        model_params.update(feature_columns)
        is_estimator = is_tf_estimator(estimator)
        predict_generator = db.db_generator(conn, select)

        pop_optimizer_and_loss(model_params)
        if not is_estimator:
            if not issubclass(estimator, tf.keras.Model):
                # functional model need field_metas parameter
                model_params["field_metas"] = feature_metas
            print("Start predicting using keras model...")
            keras_predict(estimator, model_params, save, result_table,
                          feature_column_names, feature_metas,
                          train_label_name, label_name, conn,
                          predict_generator, selected_cols)
        else:
            model_params['model_dir'] = save
            print("Start predicting using estimator model...")
            estimator_predict(result_table, feature_column_names,
                              feature_metas, train_label_name, label_name,
                              conn, predict_generator, selected_cols)

        print("Done predicting. Predict table : %s" % result_table)
    finally:
        conn.close()
Example #17
0
def evaluate(datasource,
             select,
             result_table,
             model,
             pred_label_name=None,
             model_params=None):
    """
    Do evaluation to a trained XGBoost model.

    The booster is read from the local file "my_model". The database
    connection is always closed, even when creating the result table or
    evaluating fails.

    Args:
        datasource (str): the database connection string.
        select (str): the input data to predict.
        result_table (str): the output data table.
        model (Model|str): the model object or where to load the model.
        pred_label_name (str): the label column name. When given it
            replaces the name of the trained label description.
        model_params (dict): the parameters for evaluation. Only
            "validation.metrics" (comma-separated, default "Accuracy") is
            read; prediction itself uses the attributes stored with the
            model.

    Returns:
        None.
    """
    if isinstance(model, six.string_types):
        model = Model.load_from_db(datasource, model)
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)

    if model_params is None:
        model_params = {}

    validation_metrics = model_params.get("validation.metrics", "Accuracy")
    validation_metrics = [m.strip() for m in validation_metrics.split(",")]

    # From here on the parameters stored with the trained model are used.
    model_params = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    train_label_desc = model.get_meta("label").get_field_desc()[0]
    if pred_label_name:
        train_label_desc.name = pred_label_name

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = {
        fd.name: fd.to_dict(dtype_to_string=True)
        for fd in field_descs
    }

    # NOTE: in the current implementation, we are generating a transform_fn
    # from the COLUMN clause. The transform_fn is executed during the process
    # of dumping the original data into DMatrix SVM file.
    compiled_fc = compile_ir_feature_columns(train_fc_map, model.get_type())
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *compiled_fc["feature_columns"])

    bst = xgb.Booster()
    bst.load_model("my_model")

    conn = db.connect_with_data_source(datasource)
    try:
        result_column_names = create_evaluate_table(conn, result_table,
                                                    validation_metrics)

        with temp_file.TemporaryDirectory() as tmp_dir_name:
            pred_fn = os.path.join(tmp_dir_name, "predict.txt")

            dpred = xgb_dataset(
                datasource=datasource,
                fn=pred_fn,
                dataset_sql=select,
                feature_metas=feature_metas,
                feature_column_names=feature_column_names,
                label_meta=train_label_desc.to_dict(dtype_to_string=True),
                cache=True,
                batch_size=10000,
                transform_fn=transform_fn)

            for i, pred_dmatrix in enumerate(dpred):
                # One feature file per batch index, named after pred_fn.
                feature_file_name = pred_fn + "_%d" % i
                preds = _calc_predict_result(bst, pred_dmatrix, model_params)
                _store_evaluate_result(preds, feature_file_name,
                                       train_label_desc, result_table,
                                       result_column_names,
                                       validation_metrics, conn)
    finally:
        # Release the connection on both the success and the failure path.
        conn.close()
Example #18
0
def evaluate(datasource,
             select,
             result_table,
             model,
             label_name=None,
             model_params=None,
             result_column_names=None,
             pai_table=None):
    """
    Do evaluation to a trained XGBoost model and store the metric results.

    Args:
        datasource (str): the database connection string.
        select (str): the SQL statement that selects the evaluation data.
        result_table (str): the output data table.
        model (Model|str): the model object or the name used to load the
            model from the database.
        label_name (str): if given, replaces the label column name recorded
            in the model metadata.
        model_params (dict): the parameters for evaluation. Only
            "validation.metrics" (comma separated metric names, default
            "accuracy_score") is read from it; the attributes stored with
            the model are used for the prediction itself.
        result_column_names (list[str]): forwarded to the routine that
            stores the evaluation result. Defaults to an empty list.
        pai_table (str): if set, the data is read through a
            PaiIOConnection built from this table instead of datasource.

    Returns:
        None.
    """
    if model_params is None:
        model_params = {}
    if result_column_names is None:
        # Build a fresh list per call instead of sharing one mutable
        # default object between all invocations.
        result_column_names = []

    metric_names = model_params.get("validation.metrics", "accuracy_score")
    validation_metrics = [m.strip() for m in metric_names.split(",")]

    bst = xgb.Booster()
    if isinstance(model, six.string_types):
        # "my_model" is a relative path, so it is resolved against the
        # temporary working directory entered here.
        # NOTE(review): presumably Model.load_from_db drops the booster
        # file into the current directory -- confirm.
        with temp_file.TemporaryDirectory(as_cwd=True):
            model = Model.load_from_db(datasource, model)
            bst.load_model("my_model")
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)
        bst.load_model("my_model")

    # From here on model_params refers to the attributes saved with the
    # model, not to the caller supplied evaluation parameters.
    model_params = model.get_meta("attributes")
    fc_map_ir = model.get_meta("features")
    train_label_desc = model.get_meta("label").get_field_desc()[0]
    if label_name:
        train_label_desc.name = label_name

    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.XGBOOST)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = {
        fd.name: fd.to_dict(dtype_to_string=True)
        for fd in field_descs
    }
    transform_fn = ComposedColumnTransformer(
        feature_column_names, *feature_columns["feature_columns"])

    is_pai = bool(pai_table)
    if is_pai:
        conn = PaiIOConnection.from_table(pai_table)
    else:
        conn = db.connect_with_data_source(datasource)

    # Close the connection even when reading data, predicting or storing
    # the result raises.
    try:
        with temp_file.TemporaryDirectory() as tmp_dir_name:
            pred_fn = os.path.join(tmp_dir_name, "predict.txt")

            dpred = xgb_dataset(
                datasource=datasource,
                fn=pred_fn,
                dataset_sql=select,
                feature_metas=feature_metas,
                feature_column_names=feature_column_names,
                label_meta=train_label_desc.to_dict(dtype_to_string=True),
                cache=True,
                batch_size=10000,
                transform_fn=transform_fn,
                is_pai=is_pai,
                pai_table=pai_table,
                pai_single_file=True,
                feature_column_code=fc_map_ir)

            for i, pred_dmatrix in enumerate(dpred):
                # With PAI a single file is requested
                # (pai_single_file=True), so the name carries no batch
                # suffix; otherwise each batch uses "<pred_fn>_<i>".
                if is_pai:
                    feature_file_name = pred_fn
                else:
                    feature_file_name = pred_fn + "_%d" % i
                preds = _calc_predict_result(bst, pred_dmatrix, model_params)
                _store_evaluate_result(preds, feature_file_name,
                                       train_label_desc, result_table,
                                       result_column_names,
                                       validation_metrics, conn)
    finally:
        conn.close()
Example #19
0
def evaluate(datasource,
             select,
             result_table,
             load,
             pred_label_name=None,
             validation_metrics=None):
    """
    Do evaluation to a trained XGBoost model.

    Args:
        datasource (str): the database connection string.
        select (str): the input data to predict.
        result_table (str): the output data table.
        load (str): where the trained model stores.
        pred_label_name (str): the label column name. If empty, the label
            name recorded in the model metadata is kept.
        validation_metrics (list[str]): the evaluation metric names.
            Defaults to ["accuracy_score"].

    Returns:
        None.
    """
    if validation_metrics is None:
        # Build the default per call instead of sharing one mutable
        # default list between all invocations.
        validation_metrics = ["accuracy_score"]

    model = Model.load_from_db(datasource, load)
    model_params = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    train_label_desc = model.get_meta("label").get_field_desc()[0]
    if pred_label_name:
        train_label_desc.name = pred_label_name

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = {fd.name: fd.to_dict() for fd in field_descs}

    # NOTE: in the current implementation, we are generating a transform_fn
    # from the COLUMN clause. The transform_fn is executed during the process
    # of dumping the original data into DMatrix SVM file.
    compiled_fc = compile_ir_feature_columns(train_fc_map, model.get_type())
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *compiled_fc["feature_columns"])

    # "my_model" is a relative path resolved against the current working
    # directory.
    # NOTE(review): presumably Model.load_from_db above placed the booster
    # file there -- confirm.
    bst = xgb.Booster()
    bst.load_model("my_model")

    conn = db.connect_with_data_source(datasource)
    # Close the connection even when table creation, prediction or result
    # storing raises.
    try:
        result_column_names = _create_evaluate_table(conn, result_table,
                                                     validation_metrics)

        with temp_file.TemporaryDirectory() as tmp_dir_name:
            pred_fn = os.path.join(tmp_dir_name, "predict.txt")

            dpred = xgb_dataset(datasource=datasource,
                                fn=pred_fn,
                                dataset_sql=select,
                                feature_metas=feature_metas,
                                feature_column_names=feature_column_names,
                                label_meta=train_label_desc.to_dict(),
                                cache=True,
                                batch_size=10000,
                                transform_fn=transform_fn)

            for i, pred_dmatrix in enumerate(dpred):
                # Each batch uses the file name "<pred_fn>_<i>".
                feature_file_name = pred_fn + "_%d" % i
                preds = _calc_predict_result(bst, pred_dmatrix, model_params)
                _store_evaluate_result(preds, feature_file_name,
                                       train_label_desc, result_table,
                                       result_column_names,
                                       validation_metrics, conn)
    finally:
        conn.close()
Example #20
0
def evaluate(datasource,
             select,
             result_table,
             model,
             pred_label_name=None,
             model_params=None):
    """
    Do evaluation to a trained TensorFlow model.

    Args:
        datasource (str): the database connection string.
        select (str): the input data to predict.
        result_table (str): the output data table.
        model (Model|str): the model object or where to load the model.
        pred_label_name (str): the label column name. If empty, the label
            name recorded in the model metadata is kept.
        model_params (dict): the parameters for evaluation. The keys read
            are "validation.metrics" (comma separated, default "Accuracy"),
            "validation.steps" (default None), "validation.batch_size"
            (default 1) and "validation.verbose" (default 0). May be None.

    Returns:
        None.
    """
    if model_params is None:
        # The signature defaults to None, so fall back to an empty dict
        # instead of failing on the .get() calls below.
        model_params = {}

    if isinstance(model, six.string_types):
        model = Model.load_from_db(datasource, model)
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)

    metric_names = model_params.get("validation.metrics", "Accuracy")
    validation_metrics = [m.strip() for m in metric_names.split(',')]
    validation_steps = model_params.get("validation.steps", None)
    batch_size = model_params.get("validation.batch_size", 1)
    verbose = model_params.get("validation.verbose", 0)

    conn = db.connect_with_data_source(datasource)
    # Close the connection even when creating the result table raises.
    try:
        create_evaluate_table(conn, result_table, validation_metrics)
    finally:
        conn.close()

    # From here on model_params refers to the attributes saved with the
    # model, not to the caller supplied evaluation parameters.
    model_params = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    train_label_desc = model.get_meta("label").get_field_desc()[0]
    estimator_string = model.get_meta("class_name")
    save = "model_save"

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = {
        fd.name: fd.to_dict(dtype_to_string=True)
        for fd in field_descs
    }
    feature_columns = compile_ir_feature_columns(train_fc_map,
                                                 model.get_type())

    # Only override the stored label name when one was actually given;
    # otherwise the name would be reset to None.
    if pred_label_name:
        train_label_desc.name = pred_label_name
    label_meta = train_label_desc.to_dict(dtype_to_string=True)

    _evaluate(datasource=datasource,
              estimator_string=estimator_string,
              select=select,
              result_table=result_table,
              feature_columns=feature_columns,
              feature_column_names=feature_column_names,
              feature_metas=feature_metas,
              label_meta=label_meta,
              model_params=model_params,
              validation_metrics=validation_metrics,
              save=save,
              batch_size=batch_size,
              validation_steps=validation_steps,
              verbose=verbose)