Example #1
    def load_from_oss(oss_model_dir, local_dir=None):
        """
        Load the saved model from OSS and extract it into local_dir.

        Args:
            oss_model_dir (str): the OSS model directory to load.
                It is in the format of oss://bucket/path/to/dir/.
            local_dir (str): the local directory to extract the model
                into. Defaults to the current working directory.

        Returns:
            Model: a Model object representing the model type and meta
            information.
        """
        if local_dir is None:
            local_dir = os.getcwd()

        with temp_file.TemporaryDirectory() as tmp_dir:
            tarball = os.path.join(tmp_dir, TARBALL_NAME)
            oss.load_file(oss_model_dir, tarball, TARBALL_NAME)
            Model._unzip(local_dir, tarball)

            model_obj_file = os.path.join(tmp_dir, MODEL_OBJ_FILE_NAME)
            oss.load_file(oss_model_dir, model_obj_file, MODEL_OBJ_FILE_NAME)
            with open(model_obj_file, "r") as f:
                d = json.loads(f.read(), cls=JSONDecoderWithFeatureColumn)
                model = Model._from_dict(d)
            return model
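
The download-then-unpack flow above goes through SQLFlow's oss helpers. As a minimal, self-contained sketch of the same pattern, assuming only standard-library modules and hypothetical file names:

import json
import os
import tarfile

def load_bundle(tarball_path, meta_path, local_dir=None):
    # Extract a model tarball into local_dir and read its JSON metadata,
    # mirroring the unpack-and-describe flow of load_from_oss above.
    if local_dir is None:
        local_dir = os.getcwd()
    with tarfile.open(tarball_path) as tar:
        tar.extractall(local_dir)
    with open(meta_path, "r") as f:
        return json.load(f)
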
Example #2
def predict_step(datasource, select, data_table, result_table, label_column,
                 oss_model_path):
    """PAI TensorFlow prediction wrapper
    This function do some preparation for the local prediction, say,
    download the model from OSS, extract metadata and so on.

    Args:
        datasource: the datasource from which to get data
        select: data selection SQL statement
        data_table: tmp table which holds the data from select
        result_table: table to save prediction result
        label_column: prediction label column
        oss_model_path: the model path on OSS
    """

    try:
        tf.enable_eager_execution()
    except:  # noqa: E722
        pass

    (estimator, feature_column_names, feature_column_names_map, feature_metas,
     label_meta, model_params,
     feature_columns_code) = oss.load_metas(oss_model_path,
                                            "tensorflow_model_desc")

    fc_map_ir = feature_columns_code
    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.TENSORFLOW)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict()) for fd in field_descs])

    is_estimator = is_tf_estimator(import_model(estimator))

    # Keras single-node mode saves the model in h5 format, so there is no
    # need to deal with the exported model format. Keras distributed mode
    # uses an estimator, so this step is also needed there.
    model_local_dir = oss_model_path.split("/")[-1]
    if is_estimator:
        oss.load_file(oss_model_path, "exported_path")
        # NOTE(typhoonzero): directory "model_save" is hardcoded in
        # codegen/tensorflow/codegen.go
        oss.load_dir("%s/%s" % (oss_model_path, model_local_dir))
    else:
        oss.load_dir(os.path.join(oss_model_path, "model_save"))

    _predict(datasource=datasource,
             estimator_string=estimator,
             select=select,
             result_table=result_table,
             feature_columns=feature_columns,
             feature_column_names=feature_column_names,
             feature_column_names_map=feature_column_names_map,
             train_label_name=label_meta["feature_name"],
             result_col_name=label_column,
             feature_metas=feature_metas,
             model_params=model_params,
             save=model_local_dir,
             batch_size=1,
             pai_table=data_table)
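
The three lines rebuilding feature_column_names and feature_metas from field_descs recur in most examples below. A self-contained sketch, with a hypothetical stand-in for SQLFlow's field descriptor, shows the idiom; a dict comprehension is the modern spelling of dict([(k, v) for ...]):

from dataclasses import dataclass

@dataclass
class FieldDesc:  # hypothetical stand-in for SQLFlow's FieldDesc
    name: str
    dtype: str

    def to_dict(self):
        return {"name": self.name, "dtype": self.dtype}

field_descs = [FieldDesc("age", "int"), FieldDesc("income", "float")]
feature_column_names = [fd.name for fd in field_descs]
feature_metas = {fd.name: fd.to_dict() for fd in field_descs}
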
Example #3
def evaluate_step(datasource, select, data_table, result_table, oss_model_path,
                  metrics):
    """PAI TensorFlow evaluate wrapper
    This function do some preparation for the local evaluation, say,
    download the model from OSS, extract metadata and so on.

    Args:
        datasource: the datasource from which to get data
        select: data selection SQL statement
        data_table: tmp table which holds the data from select
        result_table: table to save the evaluation result
        oss_model_path: the model path on OSS
        metrics: metrics to evaluate
    """

    (estimator, feature_column_names, feature_column_names_map, feature_metas,
     label_meta, model_params,
     feature_columns_code) = oss.load_metas(oss_model_path,
                                            "tensorflow_model_desc")

    fc_map_ir = feature_columns_code
    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.TENSORFLOW)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict()) for fd in field_descs])

    # NOTE(typhoonzero): No need to eval model_params["optimizer"] and
    # model_params["loss"] because prediction does not need these parameters.

    is_estimator = is_tf_estimator(import_model(estimator))

    # Keras single-node mode saves the model in h5 format, so there is no
    # need to deal with the exported model format. Keras distributed mode
    # uses an estimator, so this step is also needed there.
    model_name = oss_model_path.split("/")[-1]
    if is_estimator:
        oss.load_file(oss_model_path, "exported_path")
        # NOTE(typhoonzero): directory "model_save" is hardcoded in
        # codegen/tensorflow/codegen.go
        oss.load_dir("%s/%s" % (oss_model_path, model_name))
    else:
        oss.load_dir(os.path.join(oss_model_path, "model_save"))

    _evaluate(datasource=datasource,
              estimator_string=estimator,
              select=select,
              result_table=result_table,
              feature_columns=feature_columns,
              feature_column_names=feature_column_names,
              feature_metas=feature_metas,
              label_meta=label_meta,
              model_params=model_params,
              validation_metrics=metrics,
              save="model_save",
              batch_size=1,
              validation_steps=None,
              verbose=0,
              pai_table=data_table)
Example #4
def predict(datasource, select, data_table, result_table, label_column,
            oss_model_path):
    """PAI TensorFlow prediction wrapper
    This function do some preparation for the local prediction, say,
    download the model from OSS, extract metadata and so on.

    Args:
        datasource: the datasource from which to get data
        select: data selection SQL statement
        data_table: tmp table which holds the data from select
        result_table: table to save prediction result
        label_column: prediction label column
        oss_model_path: the model path on OSS
    """

    try:
        tf.enable_eager_execution()
    except:  # noqa: E722
        pass

    (estimator, feature_column_names, feature_column_names_map, feature_metas,
     label_meta, model_params,
     feature_columns_code) = oss.load_metas(oss_model_path,
                                            "tensorflow_model_desc")

    feature_columns = eval(feature_columns_code)

    # NOTE(typhoonzero): No need to eval model_params["optimizer"] and
    # model_params["loss"] because prediction does not need these parameters.

    is_estimator = is_tf_estimator(import_model(estimator))

    # Keras single-node mode saves the model in h5 format, so there is no
    # need to deal with the exported model format. Keras distributed mode
    # uses an estimator, so this step is also needed there.
    if is_estimator:
        oss.load_file(oss_model_path, "exported_path")
        # NOTE(typhoonzero): directory "model_save" is hardcoded in
        # codegen/tensorflow/codegen.go
        oss.load_dir("%s/model_save" % oss_model_path)
    else:
        oss.load_file(oss_model_path, "model_save")

    _predict(datasource=datasource,
             estimator_string=estimator,
             select=select,
             result_table=result_table,
             feature_columns=feature_columns,
             feature_column_names=feature_column_names,
             feature_column_names_map=feature_column_names_map,
             train_label_name=label_meta["feature_name"],
             result_col_name=label_column,
             feature_metas=feature_metas,
             model_params=model_params,
             save="model_save",
             batch_size=1,
             pai_table=data_table)
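
Unlike Examples #2 and #3, this older variant rebuilds the feature columns with eval(feature_columns_code). A short illustration, with a hypothetical stored string, of why the IR-based compile_ir_feature_columns path is preferable: eval executes whatever Python the string contains.

feature_columns_code = "[{'feature_columns': ['x1', 'x2']}]"  # hypothetical
feature_columns = eval(feature_columns_code)  # works, but runs arbitrary code
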
Example #5
def explain_step(datasource, select, data_table, result_table, label_column,
                 oss_model_path):
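    """PAI TensorFlow explain wrapper.
    Prepares the local explain run: downloads the model from OSS,
    rebuilds the feature columns and metadata, and calls _explain.
    """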
    try:
        tf.enable_eager_execution()
    except Exception as e:
        sys.stderr.write("warning: failed to enable_eager_execution: %s\n" % e)

    (estimator, feature_column_names, feature_column_names_map, feature_metas,
     label_meta, model_params,
     feature_columns_code) = oss.load_metas(oss_model_path,
                                            "tensorflow_model_desc")

    fc_map_ir = feature_columns_code
    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.TENSORFLOW)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict()) for fd in field_descs])

    # NOTE(typhoonzero): No need to eval model_params["optimizer"] and
    # model_params["loss"] because prediction does not need these parameters.

    is_estimator = is_tf_estimator(import_model(estimator))

    # Keras single-node mode saves the model in h5 format, so there is no
    # need to deal with the exported model format. Keras distributed mode
    # uses an estimator, so this step is also needed there.
    model_name = oss_model_path.split("/")[-1]
    if is_estimator:
        oss.load_file(oss_model_path, "exported_path")
        # NOTE(typhoonzero): directory "model_save" is hardcoded in
        # codegen/tensorflow/codegen.go
        oss.load_dir("%s/%s" % (oss_model_path, model_name))
    else:
        oss.load_dir(os.path.join(oss_model_path, "model_save"))

    # (TODO: lhw) use oss to store result image
    _explain(datasource=datasource,
             estimator_string=estimator,
             select=select,
             feature_columns=feature_columns,
             feature_column_names=feature_column_names,
             feature_metas=feature_metas,
             label_meta=label_meta,
             model_params=model_params,
             save="model_save",
             result_table=result_table,
             pai_table=data_table,
             oss_dest=None,
             oss_ak=None,
             oss_sk=None,
             oss_endpoint=None,
             oss_bucket_name=None)
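
The guarded tf.enable_eager_execution() call at the top of this and several other examples deserves a note: the function exists only in TensorFlow 1.x, while 2.x is eager by default and removes the symbol, raising AttributeError. A standalone sketch of the guard, assuming TensorFlow is installed:

import sys
import tensorflow as tf

try:
    tf.enable_eager_execution()  # TF 1.x only
except Exception as e:
    sys.stderr.write("warning: failed to enable_eager_execution: %s\n" % e)
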
Example #6
def explain_step(datasource, select, data_table, explainer, result_table,
                 label_column, oss_model_path):
    """Do XGBoost model explanation, this function use selected data to
    explain the model stored at oss_model_path

    Args:
        datasource: The datasource to load explain data
        select: SQL statement to get the data set
        data_table: tmp table to save the explain data
        result_table: table to store the explanation result
        label_column: name of the label column
        oss_model_path: path to the model to be explained
    """
    # NOTE(typhoonzero): the xgboost model file "my_model" is hard coded
    # in xgboost/train.py
    oss.load_file(oss_model_path, "my_model")

    (estimator, model_params, train_params, feature_field_meta,
     feature_column_names, label_field_meta,
     fc_map_ir) = oss.load_metas(oss_model_path, "xgboost_model_desc")

    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.XGBOOST)

    transform_fn = ComposedColumnTransformer(
        feature_column_names, *feature_columns["feature_columns"])

    summary_params = dict()
    for k in model_params:
        if k.startswith("summary."):
            summary_key = k.replace("summary.", "")
            summary_params[summary_key] = model_params[k]
    explain_xgb(
        datasource=datasource,
        select=select,
        feature_field_meta=feature_field_meta,
        feature_column_names=feature_column_names,
        label_meta=label_field_meta,
        summary_params=summary_params,
        explainer=explainer,
        result_table=result_table,
        is_pai=True,
        pai_explain_table=data_table,
        # (TODO:lhw) save/load explain result storage info into/from FLAGS
        oss_dest="",
        oss_ak="",
        oss_sk="",
        oss_endpoint="",
        oss_bucket_name="",
        transform_fn=transform_fn,
        feature_column_code=fc_map_ir)
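
The loop that strips the "summary." prefix from model_params can also be read as a single dict comprehension; a self-contained sketch with hypothetical keys:

model_params = {"summary.plot_type": "bar", "summary.alpha": 0.8, "eta": 0.3}
summary_params = {k[len("summary."):]: v
                  for k, v in model_params.items()
                  if k.startswith("summary.")}
assert summary_params == {"plot_type": "bar", "alpha": 0.8}
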
Example #7
def explain(datasource, select, data_table, result_table, label_column,
            oss_model_path):
    """Do XGBoost model explanation, this function use selected data to
    explain the model stored at oss_model_path

    Args:
        datasource: The datasource to load explain data
        select: SQL statement to get the data set
        data_table: tmp table to save the explain data
        result_table: table to store the explanation result
        label_column: name of the label column
        oss_model_path: path to the model to be explained
    """
    # NOTE(typhoonzero): the xgboost model file "my_model" is hard coded
    # in xgboost/train.py
    oss.load_file(oss_model_path, "my_model")

    (estimator, model_params, train_params, feature_field_meta,
     feature_column_names, label_field_meta,
     feature_column_code) = oss.load_metas(oss_model_path,
                                           "xgboost_model_desc")

    feature_column_transformers = eval('[{}]'.format(feature_column_code))
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *feature_column_transformers)

    explain_xgb(
        datasource=datasource,
        select=select,
        feature_field_meta=feature_field_meta,
        feature_column_names=feature_column_names,
        label_meta=label_field_meta,
        summary_params={},
        result_table=result_table,
        is_pai=True,
        pai_explain_table=data_table,
        hdfs_namenode_addr="",
        hive_location="",
        hdfs_user="",
        hdfs_pass="",
        # (TODO:lhw) save/load explain result storage info into/from FLAGS
        oss_dest="",
        oss_ak="",
        oss_sk="",
        oss_endpoint="",
        oss_bucket_name="",
        transform_fn=transform_fn,
        feature_column_code=feature_column_code)
Example #8
def predict(datasource, select, data_table, result_table, label_column,
            oss_model_path):
    """PAI XGBoost prediction wrapper
    This function do some preparation for the local prediction, say,
    download the model from OSS, extract metadata and so on.

    Args:
        datasource: the datasource from which to get data
        select: data selection SQL statement
        data_table: tmp table which holds the data from select
        result_table: table to save prediction result
        label_column: prediction label column
        oss_model_path: the model path on OSS
    """
    # NOTE(typhoonzero): the xgboost model file "my_model" is hard coded
    # in xgboost/train.py
    oss.load_file(oss_model_path, "my_model")
    (estimator, model_params, train_params, feature_metas,
     feature_column_names, label_meta,
     feature_column_code) = oss.load_metas(oss_model_path,
                                           "xgboost_model_desc")

    pred_label_meta = copy.copy(label_meta)
    pred_label_meta["feature_name"] = label_column

    feature_column_transformers = eval('[{}]'.format(feature_column_code))
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *feature_column_transformers)

    pred(datasource=datasource,
         select=select,
         feature_metas=feature_metas,
         feature_column_names=feature_column_names,
         train_label_meta=label_meta,
         pred_label_meta=pred_label_meta,
         result_table=result_table,
         is_pai=True,
         hdfs_namenode_addr="",
         hive_location="",
         hdfs_user="",
         hdfs_pass="",
         pai_table=data_table,
         model_params=model_params,
         train_params=train_params,
         transform_fn=transform_fn,
         feature_column_code=feature_column_code)
Example #9
def predict_step(datasource, select, data_table, result_table, label_column,
                 oss_model_path):
    """PAI XGBoost prediction wrapper
    This function do some preparation for the local prediction, say,
    download the model from OSS, extract metadata and so on.

    Args:
        datasource: the datasource from which to get data
        select: data selection SQL statement
        data_table: tmp table which holds the data from select
        result_table: table to save prediction result
        label_column: prediction label column
        oss_model_path: the model path on OSS
    """
    # NOTE(typhoonzero): the xgboost model file "my_model" is hard coded
    # in xgboost/train.py
    oss.load_file(oss_model_path, "my_model")
    (estimator, model_params, train_params, feature_metas,
     feature_column_names, label_meta,
     fc_map_ir) = oss.load_metas(oss_model_path, "xgboost_model_desc")

    pred_label_meta = copy.copy(label_meta)
    pred_label_meta["feature_name"] = label_column

    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.XGBOOST)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict()) for fd in field_descs])

    transform_fn = ComposedColumnTransformer(
        feature_column_names, *feature_columns["feature_columns"])

    pred(datasource=datasource,
         select=select,
         feature_metas=feature_metas,
         feature_column_names=feature_column_names,
         train_label_meta=label_meta,
         pred_label_meta=pred_label_meta,
         result_table=result_table,
         is_pai=True,
         pai_table=data_table,
         model_params=model_params,
         train_params=train_params,
         transform_fn=transform_fn,
         feature_column_code=fc_map_ir)
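
The copy.copy step before renaming the label keeps the training-time metadata intact while the prediction output column gets its own name. Shown standalone with a hypothetical metadata dict:

import copy

label_meta = {"feature_name": "class", "dtype": "int"}
pred_label_meta = copy.copy(label_meta)
pred_label_meta["feature_name"] = "predicted_class"
assert label_meta["feature_name"] == "class"  # the original is untouched
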
Example #10
    def load_from_oss(oss_model_dir, local_dir=None):
        """
        Load the saved model from OSS and extract it into local_dir.

        Args:
            oss_model_dir (str): the OSS model directory to load.
                It is in the format of oss://bucket/path/to/dir/.
            local_dir (str): the local directory to extract the model
                into. Defaults to the current working directory.

        Returns:
            Model: a Model object representing the model type and meta
            information.
        """
        if local_dir is None:
            local_dir = os.getcwd()

        with temp_file.TemporaryDirectory() as tmp_dir:
            tarball = os.path.join(tmp_dir, TARBALL_NAME)
            oss.load_file(oss_model_dir, tarball, TARBALL_NAME)
            return Model._unzip(local_dir, tarball)
Example #11
def predict(datasource,
            select,
            result_table,
            label_name,
            model,
            pai_table="",
            oss_model_path=""):
    """PAI XGBoost prediction wrapper
    This function do some preparation for the local prediction, say,
    download the model from OSS, extract metadata and so on.

    Args:
        datasource: the datasource from which to get data
        select: data selection SQL statement
        data_table: tmp table which holds the data from select
        result_table: table to save prediction result
        label_name: prediction label column
        oss_model_path: the model path on OSS
    """
    is_pai = pai_table != ""
    if is_pai:
        # NOTE(typhoonzero): the xgboost model file "my_model" is hard coded
        # in xgboost/train.py
        oss.load_file(oss_model_path, "my_model")
        (estimator, model_params, train_params, feature_metas,
         feature_column_names, train_label_desc,
         fc_map_ir) = oss.load_metas(oss_model_path, "xgboost_model_desc")
    else:
        if isinstance(model, six.string_types):
            model = Model.load_from_db(datasource, model)
        else:
            assert isinstance(
                model, Model), "not supported model type %s" % type(model)

        model_params = model.get_meta("attributes")
        fc_map_ir = model.get_meta("features")
        train_label_desc = model.get_meta("label").get_field_desc()[0]

    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.XGBOOST)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])

    transform_fn = ComposedColumnTransformer(
        feature_column_names, *feature_columns["feature_columns"])

    bst = xgb.Booster()
    bst.load_model("my_model")

    conn = db.connect_with_data_source(datasource)
    result_column_names, train_label_idx = create_predict_table(
        conn, select, result_table, train_label_desc, label_name)

    with temp_file.TemporaryDirectory() as tmp_dir_name:
        pred_fn = os.path.join(tmp_dir_name, "predict.txt")
        raw_data_dir = os.path.join(tmp_dir_name, "predict_raw_dir")

        dpred = xgb_dataset(
            datasource=datasource,
            fn=pred_fn,
            dataset_sql=select,
            feature_metas=feature_metas,
            feature_column_names=feature_column_names,
            label_meta=None,
            cache=True,
            batch_size=10000,
            transform_fn=transform_fn,
            raw_data_dir=raw_data_dir)  # NOTE: default to use external memory

        print("Start predicting XGBoost model...")
        for idx, pred_dmatrix in enumerate(dpred):
            feature_file_name = os.path.join(
                tmp_dir_name, "predict_raw_dir/predict.txt_%d" % idx)
            preds = _calc_predict_result(bst, pred_dmatrix, model_params)
            _store_predict_result(preds, result_table, result_column_names,
                                  train_label_idx, feature_file_name, conn)
        print("Done predicting. Predict table : %s" % result_table)

    conn.close()
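
The Booster load/predict cycle in the middle of this example uses the plain XGBoost API. A minimal sketch with random data; "my_model" matches the hard-coded file name above, everything else is hypothetical:

import numpy as np
import xgboost as xgb

bst = xgb.Booster()
bst.load_model("my_model")  # file written earlier by XGBoost training
dpred = xgb.DMatrix(np.random.rand(4, 3))  # 4 rows, 3 features
preds = bst.predict(dpred)
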
Example #12
def train(original_sql,
          model_image,
          estimator_string,
          datasource,
          select,
          validation_select,
          model_params,
          train_params,
          validation_params,
          feature_column_map,
          label_column,
          save,
          load=None,
          pai_table="",
          pai_val_table=""):
    is_pai = pai_table != ""
    is_dist_train = False
    FLAGS = None
    oss_model_dir = ""

    if is_pai:
        FLAGS = define_tf_flags()
        num_workers = len(FLAGS.worker_hosts.split(","))
        is_dist_train = num_workers > 1
        oss_model_dir = FLAGS.sqlflow_oss_modeldir
        # Tolerate a missing "oss_path_to_load" key in train_params.
        oss_path_to_load = train_params.pop("oss_path_to_load", None)
        if load and oss_path_to_load:
            oss.load_file(oss_path_to_load, "my_model")

    feature_columns = compile_ir_feature_columns(feature_column_map,
                                                 EstimatorType.XGBOOST)
    field_descs = get_ordered_field_descs(feature_column_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])
    label_meta = label_column.get_field_desc()[0].to_dict(dtype_to_string=True)

    transform_fn = ComposedColumnTransformer(
        feature_column_names, *feature_columns["feature_columns"])

    batch_size = train_params.pop("batch_size", None)
    epoch = train_params.pop("epoch", 1)
    load_pretrained_model = bool(load)
    disk_cache = train_params.pop("disk_cache", False)

    if is_dist_train:
        # NOTE(typhoonzero): dist_train returns None
        dist_train(flags=FLAGS,
                   datasource=datasource,
                   select=select,
                   model_params=model_params,
                   train_params=train_params,
                   feature_metas=feature_metas,
                   feature_column_names=feature_column_names,
                   label_meta=label_meta,
                   validation_select=validation_select,
                   disk_cache=disk_cache,
                   batch_size=batch_size,
                   epoch=epoch,
                   load_pretrained_model=load_pretrained_model,
                   is_pai=True,
                   pai_train_table=pai_table,
                   pai_validate_table=pai_val_table,
                   oss_model_dir=oss_model_dir,
                   transform_fn=transform_fn,
                   feature_column_code=feature_column_map,
                   model_repo_image=model_image,
                   original_sql=original_sql)
    else:
        return local_train(original_sql,
                           model_image,
                           estimator_string,
                           datasource,
                           select,
                           validation_select,
                           model_params,
                           train_params,
                           feature_metas,
                           feature_column_names,
                           feature_column_map,
                           label_column,
                           transform_fn,
                           save,
                           load=load,
                           is_pai=is_pai,
                           oss_model_dir=oss_model_dir)
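
The pop-with-default calls separate runner options from the parameters forwarded to the training backend; a self-contained sketch with hypothetical values:

train_params = {"batch_size": 32, "disk_cache": True, "num_boost_round": 50}
batch_size = train_params.pop("batch_size", None)   # runner option
epoch = train_params.pop("epoch", 1)                # absent, so default 1
disk_cache = train_params.pop("disk_cache", False)  # runner option
assert train_params == {"num_boost_round": 50}      # backend params remain
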
Example #13
def explain(datasource,
            select,
            explainer,
            model_params,
            result_table,
            model,
            pai_table="",
            oss_model_path=""):
    """Do XGBoost model explanation, this function use selected data to
    explain the model stored at oss_model_path

    Args:
        datasource: The datasource to load explain data
        select: SQL statement to get the data set
        data_table: tmp table to save the explain data
        result_table: table to store the explanation result
        label_column: name of the label column
        oss_model_path: path to the model to be explained
    """
    if model_params is None:
        model_params = {}

    summary_params = dict()
    for k in model_params:
        if k.startswith("summary."):
            summary_key = k.replace("summary.", "")
            summary_params[summary_key] = model_params[k]

    is_pai = pai_table != ""
    if is_pai:
        # NOTE(typhoonzero): the xgboost model file "my_model" is hard coded
        # in xgboost/train.py
        oss.load_file(oss_model_path, "my_model")

        (estimator, model_params, train_params, feature_field_meta,
         feature_column_names, label_desc,
         fc_map_ir) = oss.load_metas(oss_model_path, "xgboost_model_desc")
        label_meta = label_desc.to_dict(dtype_to_string=True)
    else:
        if isinstance(model, six.string_types):
            model = Model.load_from_db(datasource, model)
        else:
            assert isinstance(
                model, Model), "not supported model type %s" % type(model)
        fc_map_ir = model.get_meta("features")
        label_meta = model.get_meta("label").get_field_desc()[0].to_dict(
            dtype_to_string=True)

    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])

    # NOTE: in the current implementation, we are generating a transform_fn
    # from the COLUMN clause. The transform_fn is executed during the process
    # of dumping the original data into DMatrix SVM file.
    compiled_fc = compile_ir_feature_columns(fc_map_ir, model.get_type())
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *compiled_fc["feature_columns"])

    dataset = xgb_shap_dataset(datasource, select, feature_column_names,
                               label_meta, feature_metas, transform_fn)

    bst = xgb.Booster()
    bst.load_model("my_model")

    if explainer == "XGBoostExplainer":
        xgb_native_explain(bst, datasource, result_table)
    else:
        # when explainer is "" or "TreeExplainer" use SHAP by default.
        shap_explain(bst, datasource, dataset, summary_params, result_table)
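
The dispatch at the end treats "" and "TreeExplainer" alike, both falling through to SHAP; a tiny standalone restatement of that branch:

def choose_explainer(name):
    # Only the native XGBoost explainer is special-cased.
    return "native" if name == "XGBoostExplainer" else "shap"

assert choose_explainer("XGBoostExplainer") == "native"
assert choose_explainer("") == "shap"
assert choose_explainer("TreeExplainer") == "shap"
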
Example #14
def train(datasource,
          estimator_string,
          select,
          validation_select,
          feature_columns,
          feature_column_names,
          feature_metas={},
          label_meta={},
          model_params={},
          train_params={},
          validation_metrics=["Accuracy"],
          disk_cache=False,
          save="",
          batch_size=None,
          epoch=1,
          validation_steps=1,
          verbose=0,
          max_steps=None,
          validation_start_delay_secs=0,
          validation_throttle_secs=0,
          save_checkpoints_steps=100,
          log_every_n_iter=10,
          load_pretrained_model=False,
          is_pai=True,
          pai_table="",
          pai_val_table="",
          feature_columns_code="",
          model_repo_image="",
          original_sql="",
          oss_model_dir_to_load="",
          feature_column_names_map=None):

    FLAGS = define_tf_flags()
    num_workers = len(FLAGS.worker_hosts.split(","))
    is_dist_train = num_workers > 1
    oss_model_dir = FLAGS.sqlflow_oss_modeldir

    if load_pretrained_model:
        oss.load_file(oss_model_dir_to_load, "my_model")

    # NOTE: in the current implementation, we are generating a transform_fn
    # from COLUMN clause. The transform_fn is executed during the process of
    # dumping the original data into DMatrix SVM file.
    transform_fn = ComposedColumnTransformer(feature_column_names,
                                             *feature_columns)

    if is_dist_train:
        dist_train(flags=FLAGS,
                   datasource=datasource,
                   select=select,
                   model_params=model_params,
                   train_params=train_params,
                   feature_metas=feature_metas,
                   feature_column_names=feature_column_names,
                   label_meta=label_meta,
                   validation_select=validation_select,
                   disk_cache=disk_cache,
                   batch_size=batch_size,
                   epoch=epoch,
                   load_pretrained_model=load_pretrained_model,
                   is_pai=True,
                   pai_train_table=pai_table,
                   pai_validate_table=pai_val_table,
                   oss_model_dir=oss_model_dir,
                   transform_fn=transform_fn,
                   feature_column_code=feature_columns_code,
                   model_repo_image=model_repo_image,
                   original_sql=original_sql)
    else:
        local_train(datasource=datasource,
                    select=select,
                    model_params=model_params,
                    train_params=train_params,
                    feature_metas=feature_metas,
                    feature_column_names=feature_column_names,
                    label_meta=label_meta,
                    validation_select=validation_select,
                    disk_cache=disk_cache,
                    batch_size=batch_size,
                    epoch=epoch,
                    load_pretrained_model=load_pretrained_model,
                    is_pai=True,
                    pai_train_table=pai_table,
                    pai_validate_table=pai_val_table,
                    rank=0,
                    nworkers=1,
                    oss_model_dir=oss_model_dir,
                    transform_fn=transform_fn,
                    feature_column_code=feature_columns_code,
                    model_repo_image=model_repo_image,
                    original_sql=original_sql)
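
The distributed-mode check at the top derives the worker count from a comma-separated host list; a sketch with a hypothetical hosts string in place of the PAI-provided FLAGS.worker_hosts:

worker_hosts = "worker-0:2222,worker-1:2222"
num_workers = len(worker_hosts.split(","))
is_dist_train = num_workers > 1  # two workers, so distributed training
assert is_dist_train
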
Example #15
def train_step(original_sql,
               model_image,
               estimator_string,
               datasource,
               select,
               validation_select,
               pai_table,
               pai_val_table,
               model_params,
               train_params,
               feature_column_map,
               label_column,
               save,
               load=None):
    FLAGS = define_tf_flags()
    num_workers = len(FLAGS.worker_hosts.split(","))
    is_dist_train = num_workers > 1
    oss_model_dir = FLAGS.sqlflow_oss_modeldir

    oss_path_to_load = train_params.pop("oss_path_to_load")
    if load:
        oss.load_file(oss_path_to_load, "my_model")

    conn = db.connect_with_data_source(datasource)
    fc_map_ir, fc_label_ir = infer_feature_columns(conn,
                                                   select,
                                                   feature_column_map,
                                                   label_column,
                                                   n=1000)
    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.XGBOOST)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict()) for fd in field_descs])
    label_meta = label_column.get_field_desc()[0].to_dict()

    transform_fn = ComposedColumnTransformer(
        feature_column_names, *feature_columns["feature_columns"])

    batch_size = train_params.pop("batch_size", None)
    epoch = train_params.pop("epoch", 1)
    load_pretrained_model = bool(load)
    disk_cache = train_params.pop("disk_cache", False)

    if is_dist_train:
        dist_train(flags=FLAGS,
                   datasource=datasource,
                   select=select,
                   model_params=model_params,
                   train_params=train_params,
                   feature_metas=feature_metas,
                   feature_column_names=feature_column_names,
                   label_meta=label_meta,
                   validation_select=validation_select,
                   disk_cache=disk_cache,
                   batch_size=batch_size,
                   epoch=epoch,
                   load_pretrained_model=load_pretrained_model,
                   is_pai=True,
                   pai_train_table=pai_table,
                   pai_validate_table=pai_val_table,
                   oss_model_dir=oss_model_dir,
                   transform_fn=transform_fn,
                   feature_column_code=fc_map_ir,
                   model_repo_image=model_image,
                   original_sql=original_sql)
    else:
        local_train(datasource=datasource,
                    select=select,
                    model_params=model_params,
                    train_params=train_params,
                    feature_metas=feature_metas,
                    feature_column_names=feature_column_names,
                    label_meta=label_meta,
                    validation_select=validation_select,
                    disk_cache=disk_cache,
                    batch_size=batch_size,
                    epoch=epoch,
                    load_pretrained_model=load_pretrained_model,
                    is_pai=True,
                    pai_train_table=pai_table,
                    pai_validate_table=pai_val_table,
                    rank=0,
                    nworkers=1,
                    oss_model_dir=oss_model_dir,
                    transform_fn=transform_fn,
                    feature_column_code=fc_map_ir,
                    model_repo_image=model_image,
                    original_sql=original_sql)
Example #16
def evaluate(datasource,
             select,
             result_table,
             model,
             label_name=None,
             model_params=None,
             pai_table="",
             oss_model_path=""):
    """TBD
    """
    if model_params is None:
        model_params = {}
    validation_metrics = model_params.get("validation.metrics",
                                          "accuracy_score")
    validation_metrics = [m.strip() for m in validation_metrics.split(",")]

    is_pai = pai_table != ""
    if is_pai:
        assert oss_model_path != ""
        # NOTE(typhoonzero): the xgboost model file "my_model" is hard coded
        # in xgboost/train.py
        oss.load_file(oss_model_path, "my_model")
        (estimator, model_params, train_params, feature_metas,
         feature_column_names, train_label_desc,
         fc_map_ir) = oss.load_metas(oss_model_path, "xgboost_model_desc")
    else:
        if isinstance(model, six.string_types):
            model = Model.load_from_db(datasource, model)
        else:
            assert isinstance(
                model, Model), "not supported model type %s" % type(model)

        model_params = model.get_meta("attributes")
        fc_map_ir = model.get_meta("features")
        train_label_desc = model.get_meta("label").get_field_desc()[0]

    if label_name:
        train_label_desc.name = label_name

    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.XGBOOST)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])
    transform_fn = ComposedColumnTransformer(
        feature_column_names, *feature_columns["feature_columns"])

    bst = xgb.Booster()
    bst.load_model("my_model")
    conn = db.connect_with_data_source(datasource)

    result_column_names = create_evaluate_table(conn, result_table,
                                                validation_metrics)

    with temp_file.TemporaryDirectory() as tmp_dir_name:
        pred_fn = os.path.join(tmp_dir_name, "predict.txt")

        dpred = xgb_dataset(
            datasource=datasource,
            fn=pred_fn,
            dataset_sql=select,
            feature_metas=feature_metas,
            feature_column_names=feature_column_names,
            label_meta=train_label_desc.to_dict(dtype_to_string=True),
            cache=True,
            batch_size=10000,
            transform_fn=transform_fn)

        for i, pred_dmatrix in enumerate(dpred):
            feature_file_name = pred_fn + "_%d" % i
            preds = _calc_predict_result(bst, pred_dmatrix, model_params)
            _store_evaluate_result(preds, feature_file_name, train_label_desc,
                                   result_table, result_column_names,
                                   validation_metrics, conn)

    conn.close()
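
The metric list comes from a single comma-separated entry in model_params; a self-contained sketch of that parsing with hypothetical values:

model_params = {"validation.metrics": "accuracy_score, f1_score"}
validation_metrics = model_params.get("validation.metrics", "accuracy_score")
validation_metrics = [m.strip() for m in validation_metrics.split(",")]
assert validation_metrics == ["accuracy_score", "f1_score"]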