Example #1
def pred(datasource,
         select,
         feature_metas,
         feature_column_names,
         train_label_meta,
         pred_label_meta,
         result_table,
         is_pai=False,
         hdfs_namenode_addr="",
         hive_location="",
         hdfs_user="",
         hdfs_pass="",
         pai_table="",
         model_params=None,
         train_params=None,
         transform_fn=None,
         feature_column_code=""):
    if not is_pai:
        conn = db.connect_with_data_source(datasource)
    else:
        conn = None
    dpred = xgb_dataset(
        datasource=datasource,
        fn='predict.txt',
        dataset_sql=select,
        feature_metas=feature_metas,
        feature_column_names=feature_column_names,
        label_meta=None,
        is_pai=is_pai,
        pai_table=pai_table,
        pai_single_file=True,
        cache=True,
        batch_size=DEFAULT_PREDICT_BATCH_SIZE,
        transform_fn=transform_fn,
        feature_column_code=feature_column_code,
        raw_data_dir="predict.raw.dir")  # NOTE: defaults to using external memory
    bst = xgb.Booster({'nthread': 4})  # init model
    bst.load_model("my_model")  # load model
    print("Start predicting XGBoost model...")

    if is_pai:
        pai_table = "odps://{}/tables/{}".format(*pai_table.split("."))
        selected_cols = db.pai_selected_cols(pai_table)
    else:
        selected_cols = db.selected_cols(conn, select)

    feature_file_id = 0
    train_label_name = train_label_meta["feature_name"]
    pred_label_name = pred_label_meta["feature_name"]
    for pred_dmatrix in dpred:
        predict_and_store_result(bst, pred_dmatrix, feature_file_id,
                                 model_params, selected_cols, train_label_name,
                                 pred_label_name, feature_column_names,
                                 feature_metas, is_pai, conn, result_table,
                                 hdfs_namenode_addr, hive_location, hdfs_user,
                                 hdfs_pass)
        feature_file_id += 1
    print("Done predicting. Predict table : %s" % result_table)
Example #2
def pred_imp(datasource,
             select,
             feature_metas,
             feature_column_names,
             train_label_meta,
             pred_label_meta,
             result_table,
             is_pai=False,
             pai_table="",
             model_params=None,
             train_params=None,
             transform_fn=None,
             feature_column_code="",
             rank=0,
             nworkers=1):
    print("rank={} nworkers={}".format(rank, nworkers))
    if not is_pai:
        conn = db.connect_with_data_source(datasource)
    else:
        conn = PaiIOConnection.from_table(pai_table)
    dpred = xgb_dataset(
        datasource=datasource,
        fn='predict.txt',
        dataset_sql=select,
        feature_metas=feature_metas,
        feature_column_names=feature_column_names,
        label_meta=None,
        is_pai=is_pai,
        pai_table=pai_table,
        pai_single_file=True,
        cache=True,
        batch_size=DEFAULT_PREDICT_BATCH_SIZE,
        rank=rank,
        nworkers=nworkers,
        transform_fn=transform_fn,
        feature_column_code=feature_column_code,
        raw_data_dir="predict.raw.dir")  # NOTE: defaults to using external memory
    bst = xgb.Booster({'nthread': 4})  # init model
    bst.load_model("my_model")  # load model
    print("{} Start predicting XGBoost model...".format(datetime.now()))
    if not model_params:
        model_params = load_metadata("model_meta.json")["attributes"]

    selected_cols = db.selected_cols(conn, select)

    feature_file_id = 0
    train_label_name = train_label_meta["feature_name"]
    pred_label_name = pred_label_meta["feature_name"]
    for pred_dmatrix in dpred:
        predict_and_store_result(bst, pred_dmatrix, feature_file_id,
                                 model_params, selected_cols, train_label_name,
                                 pred_label_name, feature_column_names,
                                 feature_metas, is_pai, conn, result_table,
                                 rank)
        feature_file_id += 1
    print("{} Done predicting. Predict table: {}".format(
        datetime.now(), result_table))
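The rank/nworkers pair shards the input rows across prediction workers. A hedged sketch of a per-worker launch; the environment variable names are hypothetical stand-ins for whatever the job scheduler actually provides:

import os

pred_imp(datasource="mysql://root:root@tcp(127.0.0.1:3306)/?maxAllowedPacket=0",
         select="SELECT * FROM iris.test",
         feature_metas=feature_metas,  # as in the sketch after Example #1
         feature_column_names=list(feature_metas.keys()),
         train_label_meta={"feature_name": "class"},
         pred_label_meta={"feature_name": "class"},
         result_table="iris.predict_result",
         rank=int(os.environ.get("WORKER_RANK", "0")),       # hypothetical variable
         nworkers=int(os.environ.get("WORKER_COUNT", "1")))  # hypothetical variable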
Example #3
def build_dataset(fn, slct):
    return xgb_dataset(datasource,
                       fn,
                       slct,
                       feature_metas,
                       feature_column_names,
                       label_meta_dict,
                       cache=disk_cache,
                       batch_size=batch_size,
                       epoch=epoch,
                       transform_fn=transform_fn)
Example #4
def test_train(self):
    ds = testing.get_datasource()
    select = "SELECT * FROM iris.train"
    val_select = "SELECT * FROM iris.test"
    feature_column_names = [
        feature_metas[k]["feature_name"] for k in feature_metas
    ]
    is_pai = False
    pai_train_table = ""
    train_params = {"num_boost_round": 20}
    model_params = {"num_classes": 3}
    with tempfile.TemporaryDirectory() as tmp_dir_name:
        train_fn = os.path.join(tmp_dir_name, 'train.txt')
        val_fn = os.path.join(tmp_dir_name, 'val.txt')
        dtrain = xgb_dataset(ds, train_fn, select, feature_metas,
                             feature_column_names, label_meta, is_pai,
                             pai_train_table)
        dval = xgb_dataset(ds, val_fn, val_select, feature_metas,
                           feature_column_names, label_meta, is_pai,
                           pai_train_table)
        eval_result = train(dtrain, train_params, model_params, dval)
        self.assertLess(eval_result['train']['rmse'][-1], 0.01)
        self.assertLess(eval_result['validate']['rmse'][-1], 0.01)
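This test reads module-level feature_metas and label_meta fixtures that the snippet does not show. A hedged reconstruction for the Iris schema; the extra keys are assumptions, since the test itself only dereferences "feature_name":

feature_metas = {
    name: {
        "feature_name": name,
        "dtype": "float32",  # assumed
        "delimiter": "",     # assumed
        "shape": [1],        # assumed
        "is_sparse": False,  # assumed
    }
    for name in ("sepal_length", "sepal_width", "petal_length", "petal_width")
}
label_meta = {"feature_name": "class", "dtype": "int64", "shape": [1]}  # assumed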
Example #5
def build_dataset(fn, slct, pai_table):
    return xgb_dataset(datasource,
                       fn,
                       slct,
                       feature_metas,
                       feature_column_names,
                       label_meta_dict,
                       cache=disk_cache,
                       batch_size=batch_size,
                       epoch=epoch,
                       transform_fn=transform_fn,
                       is_pai=is_pai,
                       pai_table=pai_table,
                       feature_column_code=feature_column_map)
Example #6
def evaluate(datasource,
             select,
             feature_metas,
             feature_column_names,
             label_meta,
             result_table,
             validation_metrics=["accuracy_score"],
             is_pai=False,
             hdfs_namenode_addr="",
             hive_location="",
             hdfs_user="",
             hdfs_pass="",
             pai_table="",
             model_params=None,
             transform_fn=None,
             feature_column_code=""):
    if not is_pai:
        conn = db.connect_with_data_source(datasource)
    else:
        conn = None
    dpred = xgb_dataset(datasource,
                        'predict.txt',
                        select,
                        feature_metas,
                        feature_column_names,
                        label_meta,
                        is_pai,
                        pai_table,
                        pai_single_file=True,
                        cache=True,
                        batch_size=DEFAULT_PREDICT_BATCH_SIZE,
                        transform_fn=transform_fn,
                        feature_column_code=feature_column_code
                        )  # NOTE: defaults to using external memory
    bst = xgb.Booster({'nthread': 4})  # init model
    bst.load_model("my_model")  # load model
    print("Start evaluating XGBoost model...")
    feature_file_id = 0
    for pred_dmatrix in dpred:
        evaluate_and_store_result(bst, pred_dmatrix, feature_file_id,
                                  validation_metrics, model_params,
                                  feature_column_names, label_meta, is_pai,
                                  conn, result_table, hdfs_namenode_addr,
                                  hive_location, hdfs_user, hdfs_pass)
        feature_file_id += 1
    print("Done evaluating. Result table : %s" % result_table)
Example #7
def train(datasource,
          select,
          model_params,
          train_params,
          feature_metas,
          feature_column_names,
          label_meta,
          validation_select,
          disk_cache=False,
          batch_size=None,
          epoch=1,
          load_pretrained_model=False,
          is_pai=False,
          pai_train_table="",
          pai_validate_table="",
          rank=0,
          nworkers=1,
          oss_model_dir="",
          transform_fn=None,
          feature_column_code="",
          model_repo_image="",
          original_sql=""):
    if batch_size == -1:
        batch_size = None
    print("Start training XGBoost model...")
    dtrain = xgb_dataset(datasource,
                         'train.txt',
                         select,
                         feature_metas,
                         feature_column_names,
                         label_meta,
                         is_pai,
                         pai_train_table,
                         cache=disk_cache,
                         batch_size=batch_size,
                         epoch=epoch,
                         rank=rank,
                         nworkers=nworkers,
                         transform_fn=transform_fn,
                         feature_column_code=feature_column_code)
    if len(validation_select.strip()) > 0:
        dvalidate = list(
            xgb_dataset(datasource,
                        'validate.txt',
                        validation_select,
                        feature_metas,
                        feature_column_names,
                        label_meta,
                        is_pai,
                        pai_validate_table,
                        rank=rank,
                        nworkers=nworkers,
                        transform_fn=transform_fn,
                        feature_column_code=feature_column_code))[0]

    filename = "my_model"
    if load_pretrained_model:
        bst = xgb.Booster()
        bst.load_model(filename)
    else:
        bst = None

    re = None
    for per_batch_dmatrix in dtrain:
        watchlist = [(per_batch_dmatrix, "train")]
        if len(validation_select.strip()) > 0:
            watchlist.append((dvalidate, "validate"))

        re = dict()
        bst = xgb.train(model_params,
                        per_batch_dmatrix,
                        evals=watchlist,
                        evals_result=re,
                        xgb_model=bst,
                        **train_params)
        print("Evaluation result: %s" % re)

    if rank == 0:
        # TODO(sneaxiy): collect features and label
        metadata = collect_metadata(original_sql=original_sql,
                                    select=select,
                                    validation_select=validation_select,
                                    model_repo_image=model_repo_image,
                                    class_name=model_params.get("booster"),
                                    attributes=model_params,
                                    features=None,
                                    label=None,
                                    evaluation=re)
        save_model_to_local_file(bst, model_params, filename)
        save_metadata("model_meta.json", metadata)
        if is_pai and len(oss_model_dir) > 0:
            save_model(oss_model_dir, filename, model_params, train_params,
                       feature_metas, feature_column_names, label_meta,
                       feature_column_code)
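A hedged invocation sketch. model_params goes straight into xgb.train as the Booster parameters (the code also reads model_params.get("booster") for metadata), while train_params is expanded as keyword arguments to xgb.train, so num_boost_round belongs there. Table names, the URI, and the objective setup are illustrative assumptions:

train(datasource="mysql://root:root@tcp(127.0.0.1:3306)/?maxAllowedPacket=0",
      select="SELECT * FROM iris.train",
      model_params={"booster": "gbtree",
                    "objective": "multi:softprob",  # assumed task setup
                    "num_class": 3},
      train_params={"num_boost_round": 20},
      feature_metas=feature_metas,
      feature_column_names=list(feature_metas.keys()),
      label_meta={"feature_name": "class"},
      validation_select="SELECT * FROM iris.test")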
Example #8
def predict(datasource,
            select,
            result_table,
            label_name,
            model,
            pai_table="",
            oss_model_path=""):
    """PAI XGBoost prediction wrapper
    This function do some preparation for the local prediction, say,
    download the model from OSS, extract metadata and so on.

    Args:
        datasource: the datasource from which to get data
        select: data selection SQL statement
        data_table: tmp table which holds the data from select
        result_table: table to save prediction result
        label_name: prediction label column
        oss_model_path: the model path on OSS
    """
    is_pai = pai_table != ""
    if is_pai:
        # NOTE(typhoonzero): the xgboost model file "my_model" is hard coded
        # in xgboost/train.py
        oss.load_file(oss_model_path, "my_model")
        (estimator, model_params, train_params, feature_metas,
         feature_column_names, train_label_desc,
         fc_map_ir) = oss.load_metas(oss_model_path, "xgboost_model_desc")
    else:
        if isinstance(model, six.string_types):
            model = Model.load_from_db(datasource, model)
        else:
            assert isinstance(
                model, Model), "not supported model type %s" % type(model)

        model_params = model.get_meta("attributes")
        fc_map_ir = model.get_meta("features")
        train_label_desc = model.get_meta("label").get_field_desc()[0]

    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.XGBOOST)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])

    transform_fn = ComposedColumnTransformer(
        feature_column_names, *feature_columns["feature_columns"])

    bst = xgb.Booster()
    bst.load_model("my_model")

    conn = db.connect_with_data_source(datasource)
    result_column_names, train_label_idx = create_predict_table(
        conn, select, result_table, train_label_desc, label_name)

    with temp_file.TemporaryDirectory() as tmp_dir_name:
        pred_fn = os.path.join(tmp_dir_name, "predict.txt")
        raw_data_dir = os.path.join(tmp_dir_name, "predict_raw_dir")

        dpred = xgb_dataset(
            datasource=datasource,
            fn=pred_fn,
            dataset_sql=select,
            feature_metas=feature_metas,
            feature_column_names=feature_column_names,
            label_meta=None,
            cache=True,
            batch_size=10000,
            transform_fn=transform_fn,
            raw_data_dir=raw_data_dir)  # NOTE: defaults to using external memory

        print("Start predicting XGBoost model...")
        for idx, pred_dmatrix in enumerate(dpred):
            feature_file_name = os.path.join(
                tmp_dir_name, "predict_raw_dir/predict.txt_%d" % idx)
            preds = _calc_predict_result(bst, pred_dmatrix, model_params)
            _store_predict_result(preds, result_table, result_column_names,
                                  train_label_idx, feature_file_name, conn)
        print("Done predicting. Predict table : %s" % result_table)

    conn.close()
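Hedged usage sketch: model may be a Model instance or the name it was saved under (the code accepts either), and leaving pai_table empty selects the local, non-PAI path. The saved-model name below is illustrative:

predict(datasource="mysql://root:root@tcp(127.0.0.1:3306)/?maxAllowedPacket=0",
        select="SELECT * FROM iris.test",
        result_table="iris.predict_result",
        label_name="class",
        model="sqlflow_models.my_xgb_model")  # assumed saved-model name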
Example #9
def evaluate(datasource,
             select,
             result_table,
             model,
             pred_label_name=None,
             model_params=None):
    """
    Do evaluation on a trained XGBoost model.

    Args:
        datasource (str): the database connection string.
        select (str): the input data to predict.
        result_table (str): the output data table.
        model (Model|str): the model object or where to load the model.
        pred_label_name (str): the label column name.
        model_params (dict): the parameters for evaluation.

    Returns:
        None.
    """
    if isinstance(model, six.string_types):
        model = Model.load_from_db(datasource, model)
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)

    if model_params is None:
        model_params = {}

    validation_metrics = model_params.get("validation.metrics", "Accuracy")
    validation_metrics = [m.strip() for m in validation_metrics.split(",")]

    model_params = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    train_label_desc = model.get_meta("label").get_field_desc()[0]
    if pred_label_name:
        train_label_desc.name = pred_label_name

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])

    # NOTE: in the current implementation, we are generating a transform_fn
    # from the COLUMN clause. The transform_fn is executed during the process
    # of dumping the original data into DMatrix SVM file.
    compiled_fc = compile_ir_feature_columns(train_fc_map, model.get_type())
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *compiled_fc["feature_columns"])

    bst = xgb.Booster()
    bst.load_model("my_model")
    conn = db.connect_with_data_source(datasource)

    result_column_names = create_evaluate_table(conn, result_table,
                                                validation_metrics)

    with temp_file.TemporaryDirectory() as tmp_dir_name:
        pred_fn = os.path.join(tmp_dir_name, "predict.txt")

        dpred = xgb_dataset(
            datasource=datasource,
            fn=pred_fn,
            dataset_sql=select,
            feature_metas=feature_metas,
            feature_column_names=feature_column_names,
            label_meta=train_label_desc.to_dict(dtype_to_string=True),
            cache=True,
            batch_size=10000,
            transform_fn=transform_fn)

        for i, pred_dmatrix in enumerate(dpred):
            feature_file_name = pred_fn + "_%d" % i
            preds = _calc_predict_result(bst, pred_dmatrix, model_params)
            _store_evaluate_result(preds, feature_file_name, train_label_desc,
                                   result_table, result_column_names,
                                   validation_metrics, conn)

    conn.close()
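Here the metric list arrives packed inside model_params: "validation.metrics" is a comma-separated string that the code splits and strips. A hedged call sketch; the metric names are assumed to follow the sklearn-style naming seen in the other examples:

evaluate(datasource="mysql://root:root@tcp(127.0.0.1:3306)/?maxAllowedPacket=0",
         select="SELECT * FROM iris.test",
         result_table="iris.evaluate_result",
         model="sqlflow_models.my_xgb_model",  # assumed saved-model name
         model_params={"validation.metrics": "accuracy_score,recall_score"})  # assumed metric names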
Example #10
def pred(datasource, select, result_table, pred_label_name, model):
    """
    Do prediction using a trained model.

    Args:
        datasource (str): the database connection string.
        select (str): the input data to predict.
        result_table (str): the output data table.
        pred_label_name (str): the output label name to predict.
        model (Model|str): the model object or where to load the model.

    Returns:
        None.
    """
    if isinstance(model, six.string_types):
        model = Model.load_from_db(datasource, model)
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)

    model_params = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    train_label_desc = model.get_meta("label").get_field_desc()[0]

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])

    # NOTE: in the current implementation, we are generating a transform_fn
    # from the COLUMN clause. The transform_fn is executed during the process
    # of dumping the original data into DMatrix SVM file.
    compiled_fc = compile_ir_feature_columns(train_fc_map, model.get_type())
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *compiled_fc["feature_columns"])

    bst = xgb.Booster()
    bst.load_model("my_model")

    conn = db.connect_with_data_source(datasource)
    result_column_names, train_label_idx = create_predict_table(
        conn, select, result_table, train_label_desc, pred_label_name)

    with temp_file.TemporaryDirectory() as tmp_dir_name:
        pred_fn = os.path.join(tmp_dir_name, "predict.txt")
        raw_data_dir = os.path.join(tmp_dir_name, "predict_raw_dir")

        dpred = xgb_dataset(
            datasource=datasource,
            fn=pred_fn,
            dataset_sql=select,
            feature_metas=feature_metas,
            feature_column_names=feature_column_names,
            label_meta=None,
            cache=True,
            batch_size=10000,
            transform_fn=transform_fn,
            raw_data_dir=raw_data_dir)  # NOTE: defaults to using external memory

        print("Start predicting XGBoost model...")
        for idx, pred_dmatrix in enumerate(dpred):
            feature_file_name = os.path.join(
                tmp_dir_name, "predict_raw_dir/predict.txt_%d" % idx)
            preds = _calc_predict_result(bst, pred_dmatrix, model_params)
            _store_predict_result(preds, result_table, result_column_names,
                                  train_label_idx, feature_file_name, conn)
        print("Done predicting. Predict table : %s" % result_table)

    conn.close()
Example #11
def evaluate(datasource,
             select,
             result_table,
             model,
             label_name=None,
             model_params=None,
             result_column_names=[],
             pai_table=None):
    """TBD
    """
    if model_params is None:
        model_params = {}
    validation_metrics = model_params.get("validation.metrics",
                                          "accuracy_score")
    validation_metrics = [m.strip() for m in validation_metrics.split(",")]

    bst = xgb.Booster()
    if isinstance(model, six.string_types):
        with temp_file.TemporaryDirectory(as_cwd=True):
            model = Model.load_from_db(datasource, model)
            bst.load_model("my_model")
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)
        bst.load_model("my_model")

    model_params = model.get_meta("attributes")
    fc_map_ir = model.get_meta("features")
    train_label = model.get_meta("label")
    train_label_desc = train_label.get_field_desc()[0]

    if label_name:
        train_label_desc.name = label_name

    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.XGBOOST)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])
    transform_fn = ComposedColumnTransformer(
        feature_column_names, *feature_columns["feature_columns"])

    is_pai = bool(pai_table)
    if is_pai:
        conn = PaiIOConnection.from_table(pai_table)
    else:
        conn = db.connect_with_data_source(datasource)

    with temp_file.TemporaryDirectory() as tmp_dir_name:
        pred_fn = os.path.join(tmp_dir_name, "predict.txt")

        dpred = xgb_dataset(
            datasource=datasource,
            fn=pred_fn,
            dataset_sql=select,
            feature_metas=feature_metas,
            feature_column_names=feature_column_names,
            label_meta=train_label_desc.to_dict(dtype_to_string=True),
            cache=True,
            batch_size=10000,
            transform_fn=transform_fn,
            is_pai=is_pai,
            pai_table=pai_table,
            pai_single_file=True,
            feature_column_code=fc_map_ir)

        for i, pred_dmatrix in enumerate(dpred):
            if is_pai:
                feature_file_name = pred_fn
            else:
                feature_file_name = pred_fn + "_%d" % i
            preds = _calc_predict_result(bst, pred_dmatrix, model_params)
            _store_evaluate_result(preds, feature_file_name, train_label_desc,
                                   result_table, result_column_names,
                                   validation_metrics, conn)

    conn.close()
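On PAI, pai_table takes the "project.table" form; Example #1 shows the expansion to an "odps://{project}/tables/{table}" path, and here it backs a PaiIOConnection. A hedged PAI-side sketch with illustrative names:

evaluate(datasource=datasource,  # still forwarded to xgb_dataset on PAI
         select="SELECT * FROM my_project.iris_test",
         result_table="my_project.evaluate_result",
         model="sqlflow_models.my_xgb_model",  # assumed saved-model name
         pai_table="my_project.iris_test")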
Example #12
def predict(datasource,
            select,
            result_table,
            result_column_names,
            train_label_idx,
            model,
            extra_result_cols=[],
            pai_table=None):
    """TBD
    """
    bst = xgb.Booster()
    if isinstance(model, six.string_types):
        # NOTE(typhoonzero): must run Model.load_from_db in a temp
        # directory, calling pyodps in current directory on PAI
        # workers will cause paiio fails.
        with temp_file.TemporaryDirectory(as_cwd=True):
            model = Model.load_from_db(datasource, model)
            bst.load_model("my_model")
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)
        bst.load_model("my_model")

    model_params = model.get_meta("attributes")
    fc_map_ir = model.get_meta("features")
    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.XGBOOST)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])

    transform_fn = ComposedColumnTransformer(
        feature_column_names, *feature_columns["feature_columns"])

    is_pai = bool(pai_table)
    if is_pai:
        conn = PaiIOConnection.from_table(pai_table)
    else:
        conn = db.connect_with_data_source(datasource)

    with temp_file.TemporaryDirectory() as tmp_dir_name:
        pred_fn = os.path.join(tmp_dir_name, "predict.txt")
        raw_data_dir = os.path.join(tmp_dir_name, "predict_raw_dir")

        dpred = xgb_dataset(datasource=datasource,
                            fn=pred_fn,
                            dataset_sql=select,
                            feature_metas=feature_metas,
                            feature_column_names=feature_column_names,
                            label_meta=None,
                            cache=True,
                            batch_size=10000,
                            transform_fn=transform_fn,
                            raw_data_dir=raw_data_dir,
                            is_pai=is_pai,
                            pai_table=pai_table,
                            pai_single_file=True,
                            feature_column_code=fc_map_ir)

        print("Start predicting XGBoost model...")
        for idx, pred_dmatrix in enumerate(dpred):
            if is_pai:
                feature_file_name = os.path.join(tmp_dir_name,
                                                 "predict.txt.raw")
            else:
                feature_file_name = os.path.join(
                    tmp_dir_name, "predict_raw_dir/predict.txt_%d" % idx)
            preds = _calc_predict_result(bst, pred_dmatrix, model_params)
            _store_predict_result(preds, result_table, result_column_names,
                                  train_label_idx, feature_file_name, conn)
        print("Done predicting. Predict table : %s" % result_table)

    conn.close()
Example #13
def evaluate(datasource,
             select,
             result_table,
             load,
             pred_label_name=None,
             validation_metrics=["accuracy_score"]):
    """
    Do evaluation on a trained XGBoost model.

    Args:
        datasource (str): the database connection string.
        select (str): the input data to predict.
        result_table (str): the output data table.
        load (str): where the trained model is stored.
        pred_label_name (str): the label column name.
        validation_metrics (list[str]): the evaluation metric names.

    Returns:
        None.
    """
    model = Model.load_from_db(datasource, load)
    model_params = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    train_label_desc = model.get_meta("label").get_field_desc()[0]
    if pred_label_name:
        train_label_desc.name = pred_label_name

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict()) for fd in field_descs])

    # NOTE: in the current implementation, we are generating a transform_fn
    # from the COLUMN clause. The transform_fn is executed during the process
    # of dumping the original data into DMatrix SVM file.
    compiled_fc = compile_ir_feature_columns(train_fc_map, model.get_type())
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *compiled_fc["feature_columns"])

    bst = xgb.Booster()
    bst.load_model("my_model")
    conn = db.connect_with_data_source(datasource)

    result_column_names = _create_evaluate_table(conn, result_table,
                                                 validation_metrics)

    with temp_file.TemporaryDirectory() as tmp_dir_name:
        pred_fn = os.path.join(tmp_dir_name, "predict.txt")

        dpred = xgb_dataset(datasource=datasource,
                            fn=pred_fn,
                            dataset_sql=select,
                            feature_metas=feature_metas,
                            feature_column_names=feature_column_names,
                            label_meta=train_label_desc.to_dict(),
                            cache=True,
                            batch_size=10000,
                            transform_fn=transform_fn)

        for i, pred_dmatrix in enumerate(dpred):
            feature_file_name = pred_fn + "_%d" % i
            preds = _calc_predict_result(bst, pred_dmatrix, model_params)
            _store_evaluate_result(preds, feature_file_name, train_label_desc,
                                   result_table, result_column_names,
                                   validation_metrics, conn)

    conn.close()