def pred(datasource,
         select,
         feature_metas,
         feature_column_names,
         train_label_meta,
         pred_label_meta,
         result_table,
         is_pai=False,
         hdfs_namenode_addr="",
         hive_location="",
         hdfs_user="",
         hdfs_pass="",
         pai_table="",
         model_params=None,
         train_params=None,
         transform_fn=None,
         feature_column_code=""):
    """Predict with the XGBoost model stored in "my_model" and save results.

    The input data is converted into DMatrix batches by `xgb_dataset` and
    every batch is handed to `predict_and_store_result` together with a
    running batch index.

    Args:
        datasource: data source string; used to open the database
            connection when `is_pai` is false and forwarded to
            `xgb_dataset`.
        select: SQL statement selecting the data to predict.
        feature_metas: feature metadata forwarded to `xgb_dataset` and
            `predict_and_store_result`.
        feature_column_names: feature column names, forwarded likewise.
        train_label_meta: mapping whose "feature_name" entry is the label
            name used at training time.
        pred_label_meta: mapping whose "feature_name" entry is the label
            name of the prediction output.
        result_table: name of the table receiving the prediction result.
        is_pai: when true no database connection is opened and the
            selected columns are read from `pai_table`.
        hdfs_namenode_addr: forwarded to `predict_and_store_result`.
        hive_location: forwarded to `predict_and_store_result`.
        hdfs_user: forwarded to `predict_and_store_result`.
        hdfs_pass: forwarded to `predict_and_store_result`.
        pai_table: table name in "project.table" form; only split and
            converted to an "odps://" URL when `is_pai` is true.
        model_params: forwarded to `predict_and_store_result`.
        train_params: not used by this function.
        transform_fn: forwarded to `xgb_dataset`.
        feature_column_code: forwarded to `xgb_dataset`.

    Returns:
        None.
    """
    conn = None if is_pai else db.connect_with_data_source(datasource)
    try:
        dpred = xgb_dataset(
            datasource=datasource,
            fn='predict.txt',
            dataset_sql=select,
            feature_metas=feature_metas,
            feature_column_names=feature_column_names,
            label_meta=None,
            is_pai=is_pai,
            pai_table=pai_table,
            pai_single_file=True,
            cache=True,
            batch_size=DEFAULT_PREDICT_BATCH_SIZE,
            transform_fn=transform_fn,
            feature_column_code=feature_column_code,
            raw_data_dir="predict.raw.dir"
        )  # NOTE: default to use external memory
        bst = xgb.Booster({'nthread': 4})  # init model
        bst.load_model("my_model")  # load model
        print("Start predicting XGBoost model...")

        if is_pai:
            # Keep the caller-supplied name intact; only the column lookup
            # needs the odps:// form.
            odps_table = "odps://{}/tables/{}".format(*pai_table.split("."))
            selected_cols = db.pai_selected_cols(odps_table)
        else:
            selected_cols = db.selected_cols(conn, select)

        train_label_name = train_label_meta["feature_name"]
        pred_label_name = pred_label_meta["feature_name"]

        for feature_file_id, pred_dmatrix in enumerate(dpred):
            predict_and_store_result(bst, pred_dmatrix, feature_file_id,
                                     model_params, selected_cols,
                                     train_label_name, pred_label_name,
                                     feature_column_names, feature_metas,
                                     is_pai, conn, result_table,
                                     hdfs_namenode_addr, hive_location,
                                     hdfs_user, hdfs_pass)
        print("Done predicting. Predict table : %s" % result_table)
    finally:
        # The connection was previously never released.
        if conn is not None:
            conn.close()
def pred_imp(datasource,
             select,
             feature_metas,
             feature_column_names,
             train_label_meta,
             pred_label_meta,
             result_table,
             is_pai=False,
             pai_table="",
             model_params=None,
             train_params=None,
             transform_fn=None,
             feature_column_code="",
             rank=0,
             nworkers=1):
    """Run prediction for one worker and store the result.

    The booster is loaded from the file "my_model". Input data is turned
    into DMatrix batches by `xgb_dataset` (which also receives `rank` and
    `nworkers`), and each batch is passed to `predict_and_store_result`
    with a running batch index.

    Args:
        datasource: data source string; used to open the database
            connection when `is_pai` is false and forwarded to
            `xgb_dataset`.
        select: SQL statement selecting the data to predict.
        feature_metas: feature metadata forwarded to `xgb_dataset` and
            `predict_and_store_result`.
        feature_column_names: feature column names, forwarded likewise.
        train_label_meta: mapping whose "feature_name" entry is the label
            name used at training time.
        pred_label_meta: mapping whose "feature_name" entry is the label
            name of the prediction output.
        result_table: name of the table receiving the prediction result.
        is_pai: when true the connection is built with
            `PaiIOConnection.from_table(pai_table)`.
        pai_table: PAI table name, see `is_pai`.
        model_params: model attributes; when falsy they are read from the
            "attributes" entry of "model_meta.json".
        train_params: not used by this function.
        transform_fn: forwarded to `xgb_dataset`.
        feature_column_code: forwarded to `xgb_dataset`.
        rank: index of this worker.
        nworkers: total number of workers.

    Returns:
        None.
    """
    print("rank={} nworkers={}".format(rank, nworkers))
    if is_pai:
        conn = PaiIOConnection.from_table(pai_table)
    else:
        conn = db.connect_with_data_source(datasource)

    try:
        dpred = xgb_dataset(
            datasource=datasource,
            fn='predict.txt',
            dataset_sql=select,
            feature_metas=feature_metas,
            feature_column_names=feature_column_names,
            label_meta=None,
            is_pai=is_pai,
            pai_table=pai_table,
            pai_single_file=True,
            cache=True,
            batch_size=DEFAULT_PREDICT_BATCH_SIZE,
            rank=rank,
            nworkers=nworkers,
            transform_fn=transform_fn,
            feature_column_code=feature_column_code,
            raw_data_dir="predict.raw.dir"
        )  # NOTE: default to use external memory
        bst = xgb.Booster({'nthread': 4})  # init model
        bst.load_model("my_model")  # load model
        print("{} Start predicting XGBoost model...".format(datetime.now()))

        if not model_params:
            model_params = load_metadata("model_meta.json")["attributes"]

        selected_cols = db.selected_cols(conn, select)
        train_label_name = train_label_meta["feature_name"]
        pred_label_name = pred_label_meta["feature_name"]

        for feature_file_id, pred_dmatrix in enumerate(dpred):
            predict_and_store_result(bst, pred_dmatrix, feature_file_id,
                                     model_params, selected_cols,
                                     train_label_name, pred_label_name,
                                     feature_column_names, feature_metas,
                                     is_pai, conn, result_table, rank)
        print("{} Done predicting. Predict table: {}".format(
            datetime.now(), result_table))
    finally:
        # The connection was previously never released.
        conn.close()
def build_dataset(fn, slct):
    """Create the XGBoost dataset for one file name / SQL statement pair.

    All remaining settings (`datasource`, `feature_metas`,
    `feature_column_names`, `label_meta_dict`, `disk_cache`, `batch_size`,
    `epoch`, `transform_fn`) are taken from the enclosing scope.

    Args:
        fn: file name handed to `xgb_dataset` as its second argument.
        slct: SQL statement handed to `xgb_dataset` as its third argument.

    Returns:
        The value returned by `xgb_dataset`.
    """
    dataset_options = {
        "cache": disk_cache,
        "batch_size": batch_size,
        "epoch": epoch,
        "transform_fn": transform_fn,
    }
    return xgb_dataset(datasource, fn, slct, feature_metas,
                       feature_column_names, label_meta_dict,
                       **dataset_options)
def test_train(self):
    """Train on iris.train, validate on iris.test and check the final RMSE.

    Both the last training RMSE and the last validation RMSE reported by
    `train` must be below 0.01.
    """
    ds = testing.get_datasource()
    feature_column_names = [
        feature_metas[key]["feature_name"] for key in feature_metas
    ]

    with tempfile.TemporaryDirectory() as work_dir:

        def make_dataset(file_name, statement):
            # Same positional call for both datasets: not PAI, no PAI table.
            return xgb_dataset(ds, os.path.join(work_dir, file_name),
                               statement, feature_metas,
                               feature_column_names, label_meta, False, "")

        dtrain = make_dataset('train.txt', "SELECT * FROM iris.train")
        dval = make_dataset('val.txt', "SELECT * FROM iris.test")
        eval_result = train(dtrain, {"num_boost_round": 20},
                            {"num_classes": 3}, dval)
        for split in ('train', 'validate'):
            self.assertLess(eval_result[split]['rmse'][-1], 0.01)
def build_dataset(fn, slct, pai_table):
    """Create the XGBoost dataset for one file / SQL statement / PAI table.

    All remaining settings (`datasource`, `feature_metas`,
    `feature_column_names`, `label_meta_dict`, `disk_cache`, `batch_size`,
    `epoch`, `transform_fn`, `is_pai`, `feature_column_map`) are taken from
    the enclosing scope.

    Args:
        fn: file name handed to `xgb_dataset` as its second argument.
        slct: SQL statement handed to `xgb_dataset` as its third argument.
        pai_table: forwarded as the `pai_table` keyword argument.

    Returns:
        The value returned by `xgb_dataset`.
    """
    dataset_options = {
        "cache": disk_cache,
        "batch_size": batch_size,
        "epoch": epoch,
        "transform_fn": transform_fn,
        "is_pai": is_pai,
        "pai_table": pai_table,
        "feature_column_code": feature_column_map,
    }
    return xgb_dataset(datasource, fn, slct, feature_metas,
                       feature_column_names, label_meta_dict,
                       **dataset_options)
def evaluate(datasource,
             select,
             feature_metas,
             feature_column_names,
             label_meta,
             result_table,
             validation_metrics=["accuracy_score"],
             is_pai=False,
             hdfs_namenode_addr="",
             hive_location="",
             hdfs_user="",
             hdfs_pass="",
             pai_table="",
             model_params=None,
             transform_fn=None,
             feature_column_code=""):
    """Evaluate the XGBoost model stored in "my_model" and save the metrics.

    The evaluation data is converted into DMatrix batches by `xgb_dataset`
    and every batch is handed to `evaluate_and_store_result` together with
    a running batch index.

    Args:
        datasource: data source string; used to open the database
            connection when `is_pai` is false and forwarded to
            `xgb_dataset`.
        select: SQL statement selecting the evaluation data.
        feature_metas: feature metadata forwarded to `xgb_dataset`.
        feature_column_names: feature column names forwarded to
            `xgb_dataset` and `evaluate_and_store_result`.
        label_meta: label metadata, forwarded likewise.
        result_table: name of the table receiving the evaluation result.
        validation_metrics: metric names forwarded to
            `evaluate_and_store_result`. The default list object is shared
            between calls; this function never mutates it.
        is_pai: when true no database connection is opened.
        hdfs_namenode_addr: forwarded to `evaluate_and_store_result`.
        hive_location: forwarded to `evaluate_and_store_result`.
        hdfs_user: forwarded to `evaluate_and_store_result`.
        hdfs_pass: forwarded to `evaluate_and_store_result`.
        pai_table: forwarded to `xgb_dataset`.
        model_params: forwarded to `evaluate_and_store_result`.
        transform_fn: forwarded to `xgb_dataset`.
        feature_column_code: forwarded to `xgb_dataset`.

    Returns:
        None.
    """
    conn = None if is_pai else db.connect_with_data_source(datasource)
    try:
        # NOTE(review): the two positional True values presumably map to
        # pai_single_file and cache -- confirm against xgb_dataset's
        # signature before converting them to keywords.
        dpred = xgb_dataset(
            datasource,
            'predict.txt',
            select,
            feature_metas,
            feature_column_names,
            label_meta,
            is_pai,
            pai_table,
            True,
            True,
            batch_size=DEFAULT_PREDICT_BATCH_SIZE,
            transform_fn=transform_fn,
            feature_column_code=feature_column_code
        )  # NOTE: default to use external memory
        bst = xgb.Booster({'nthread': 4})  # init model
        bst.load_model("my_model")  # load model
        print("Start evaluating XGBoost model...")

        for feature_file_id, pred_dmatrix in enumerate(dpred):
            evaluate_and_store_result(bst, pred_dmatrix, feature_file_id,
                                      validation_metrics, model_params,
                                      feature_column_names, label_meta,
                                      is_pai, conn, result_table,
                                      hdfs_namenode_addr, hive_location,
                                      hdfs_user, hdfs_pass)
        print("Done evaluating. Result table : %s" % result_table)
    finally:
        # The connection was previously never released.
        if conn is not None:
            conn.close()
def train(datasource,
          select,
          model_params,
          train_params,
          feature_metas,
          feature_column_names,
          label_meta,
          validation_select,
          disk_cache=False,
          batch_size=None,
          epoch=1,
          load_pretrained_model=False,
          is_pai=False,
          pai_train_table="",
          pai_validate_table="",
          rank=0,
          nworkers=1,
          oss_model_dir="",
          transform_fn=None,
          feature_column_code="",
          model_repo_image="",
          original_sql=""):
    """Train an XGBoost model batch by batch and save it as "my_model".

    Every DMatrix produced by `xgb_dataset` for `select` is fed to
    `xgb.train`, continuing from the booster of the previous batch. Only
    the worker with `rank == 0` collects metadata and writes the model.

    Args:
        datasource: data source string forwarded to `xgb_dataset`.
        select: SQL statement selecting the training data.
        model_params: parameters passed to `xgb.train` as its first
            argument; also stored as the model attributes.
        train_params: extra keyword arguments for `xgb.train`.
        feature_metas: feature metadata forwarded to `xgb_dataset`.
        feature_column_names: feature column names, forwarded likewise.
        label_meta: label metadata, forwarded likewise.
        validation_select: SQL statement selecting the validation data;
            validation is skipped when it is blank.
        disk_cache: forwarded to `xgb_dataset` as `cache`.
        batch_size: batch size for the training dataset; -1 is treated
            as None.
        epoch: forwarded to `xgb_dataset`.
        load_pretrained_model: when true, training starts from the booster
            loaded from "my_model".
        is_pai: forwarded to `xgb_dataset`; also required for saving the
            model to `oss_model_dir`.
        pai_train_table: PAI table for the training dataset.
        pai_validate_table: PAI table for the validation dataset.
        rank: index of this worker.
        nworkers: total number of workers.
        oss_model_dir: when non-empty (and `is_pai` is true) the model is
            additionally saved there with `save_model`.
        transform_fn: forwarded to `xgb_dataset`.
        feature_column_code: forwarded to `xgb_dataset` and `save_model`.
        model_repo_image: stored in the collected metadata.
        original_sql: stored in the collected metadata.

    Returns:
        None.
    """
    if batch_size == -1:
        batch_size = None
    print("Start training XGBoost model...")

    # Options shared by the training and the validation dataset.
    shared_options = dict(rank=rank,
                          nworkers=nworkers,
                          transform_fn=transform_fn,
                          feature_column_code=feature_column_code)
    dtrain = xgb_dataset(datasource,
                         'train.txt',
                         select,
                         feature_metas,
                         feature_column_names,
                         label_meta,
                         is_pai,
                         pai_train_table,
                         cache=disk_cache,
                         batch_size=batch_size,
                         epoch=epoch,
                         **shared_options)

    has_validation = len(validation_select.strip()) > 0
    if has_validation:
        validate_batches = list(
            xgb_dataset(datasource, 'validate.txt', validation_select,
                        feature_metas, feature_column_names, label_meta,
                        is_pai, pai_validate_table, **shared_options))
        # Only the first validation DMatrix is used.
        dvalidate = validate_batches[0]

    filename = "my_model"
    bst = None
    if load_pretrained_model:
        bst = xgb.Booster()
        bst.load_model(filename)

    eval_result = None
    for per_batch_dmatrix in dtrain:
        watchlist = [(per_batch_dmatrix, "train")]
        if has_validation:
            watchlist.append((dvalidate, "validate"))

        eval_result = {}
        bst = xgb.train(model_params,
                        per_batch_dmatrix,
                        evals=watchlist,
                        evals_result=eval_result,
                        xgb_model=bst,
                        **train_params)
        print("Evaluation result: %s" % eval_result)

    # Only rank 0 persists the model and its metadata.
    if rank != 0:
        return

    # TODO(sneaxiy): collect features and label
    metadata = collect_metadata(original_sql=original_sql,
                                select=select,
                                validation_select=validation_select,
                                model_repo_image=model_repo_image,
                                class_name=model_params.get("booster"),
                                attributes=model_params,
                                features=None,
                                label=None,
                                evaluation=eval_result)
    save_model_to_local_file(bst, model_params, filename)
    save_metadata("model_meta.json", metadata)
    if is_pai and len(oss_model_dir) > 0:
        save_model(oss_model_dir, filename, model_params, train_params,
                   feature_metas, feature_column_names, label_meta,
                   feature_column_code)
def predict(datasource,
            select,
            result_table,
            label_name,
            model,
            pai_table="",
            oss_model_path=""):
    """PAI XGBoost prediction wrapper

    This function does some preparation for the local prediction, say,
    downloading the model from OSS, extracting metadata and so on, then
    predicts batch by batch and stores the result in `result_table`.

    Args:
        datasource: the datasource from which to get data
        select: data selection SQL statement
        result_table: table to save prediction result
        label_name: prediction label column
        model: a `Model` object or the name to load one from the database;
            only used when `pai_table` is empty
        pai_table: when non-empty, the model file and its metadata are
            loaded from `oss_model_path` instead of from `model`
        oss_model_path: the model path on OSS

    Raises:
        AssertionError: if `pai_table` is empty and `model` is neither a
            string nor a `Model`.
    """
    is_pai = pai_table != ""
    if is_pai:
        # NOTE(typhoonzero): the xgboost model file "my_model" is hard coded
        # in xgboost/train.py
        oss.load_file(oss_model_path, "my_model")
        # Only the attributes, the label description and the feature column
        # IR are needed; feature metas and names are rebuilt below.
        (_, model_params, _, _, _, train_label_desc,
         fc_map_ir) = oss.load_metas(oss_model_path, "xgboost_model_desc")
    else:
        if isinstance(model, six.string_types):
            model = Model.load_from_db(datasource, model)
        else:
            assert isinstance(
                model, Model), "not supported model type %s" % type(model)

        model_params = model.get_meta("attributes")
        fc_map_ir = model.get_meta("features")
        train_label_desc = model.get_meta("label").get_field_desc()[0]

    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.XGBOOST)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = {
        fd.name: fd.to_dict(dtype_to_string=True)
        for fd in field_descs
    }
    transform_fn = ComposedColumnTransformer(
        feature_column_names, *feature_columns["feature_columns"])

    bst = xgb.Booster()
    bst.load_model("my_model")

    conn = db.connect_with_data_source(datasource)
    try:
        result_column_names, train_label_idx = create_predict_table(
            conn, select, result_table, train_label_desc, label_name)

        with temp_file.TemporaryDirectory() as tmp_dir_name:
            pred_fn = os.path.join(tmp_dir_name, "predict.txt")
            raw_data_dir = os.path.join(tmp_dir_name, "predict_raw_dir")

            dpred = xgb_dataset(
                datasource=datasource,
                fn=pred_fn,
                dataset_sql=select,
                feature_metas=feature_metas,
                feature_column_names=feature_column_names,
                label_meta=None,
                cache=True,
                batch_size=10000,
                transform_fn=transform_fn,
                raw_data_dir=raw_data_dir
            )  # NOTE: default to use external memory

            print("Start predicting XGBoost model...")
            for idx, pred_dmatrix in enumerate(dpred):
                feature_file_name = os.path.join(
                    tmp_dir_name, "predict_raw_dir/predict.txt_%d" % idx)
                preds = _calc_predict_result(bst, pred_dmatrix,
                                             model_params)
                _store_predict_result(preds, result_table,
                                      result_column_names, train_label_idx,
                                      feature_file_name, conn)
            print("Done predicting. Predict table : %s" % result_table)
    finally:
        # Release the connection even if prediction fails midway.
        conn.close()
def evaluate(datasource,
             select,
             result_table,
             model,
             pred_label_name=None,
             model_params=None):
    """
    Do evaluation to a trained XGBoost model.

    Args:
        datasource (str): the database connection string.
        select (str): the input data to predict.
        result_table (str): the output data table.
        model (Model|str): the model object or where to load the model.
        pred_label_name (str): the label column name.
        model_params (dict): the parameters for evaluation. Only the
            "validation.metrics" entry (comma separated metric names,
            default "Accuracy") is read; prediction itself uses the
            attributes stored in the model.

    Returns:
        None.

    Raises:
        AssertionError: if `model` is neither a string nor a `Model`.
    """
    if isinstance(model, six.string_types):
        model = Model.load_from_db(datasource, model)
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)

    if model_params is None:
        model_params = {}

    validation_metrics = model_params.get("validation.metrics", "Accuracy")
    validation_metrics = [m.strip() for m in validation_metrics.split(",")]

    # From here on model_params refers to the attributes saved at training
    # time, not to the evaluation parameters passed by the caller.
    model_params = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    train_label_desc = model.get_meta("label").get_field_desc()[0]
    if pred_label_name:
        train_label_desc.name = pred_label_name

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = {
        fd.name: fd.to_dict(dtype_to_string=True)
        for fd in field_descs
    }

    # NOTE: in the current implementation, we are generating a transform_fn
    # from the COLUMN clause. The transform_fn is executed during the process
    # of dumping the original data into DMatrix SVM file.
    compiled_fc = compile_ir_feature_columns(train_fc_map, model.get_type())
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *compiled_fc["feature_columns"])

    bst = xgb.Booster()
    bst.load_model("my_model")

    conn = db.connect_with_data_source(datasource)
    try:
        result_column_names = create_evaluate_table(conn, result_table,
                                                    validation_metrics)

        with temp_file.TemporaryDirectory() as tmp_dir_name:
            pred_fn = os.path.join(tmp_dir_name, "predict.txt")

            dpred = xgb_dataset(
                datasource=datasource,
                fn=pred_fn,
                dataset_sql=select,
                feature_metas=feature_metas,
                feature_column_names=feature_column_names,
                label_meta=train_label_desc.to_dict(dtype_to_string=True),
                cache=True,
                batch_size=10000,
                transform_fn=transform_fn)

            for i, pred_dmatrix in enumerate(dpred):
                feature_file_name = pred_fn + "_%d" % i
                preds = _calc_predict_result(bst, pred_dmatrix,
                                             model_params)
                _store_evaluate_result(preds, feature_file_name,
                                       train_label_desc, result_table,
                                       result_column_names,
                                       validation_metrics, conn)
    finally:
        # Release the connection even if evaluation fails midway.
        conn.close()
def pred(datasource, select, result_table, pred_label_name, model):
    """
    Do prediction using a trained model.

    Args:
        datasource (str): the database connection string.
        select (str): the input data to predict.
        result_table (str): the output data table.
        pred_label_name (str): the output label name to predict.
        model (Model|str): the model object or where to load the model.

    Returns:
        None.

    Raises:
        AssertionError: if `model` is neither a string nor a `Model`.
    """
    if isinstance(model, six.string_types):
        model = Model.load_from_db(datasource, model)
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)

    model_params = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    train_label_desc = model.get_meta("label").get_field_desc()[0]

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = {
        fd.name: fd.to_dict(dtype_to_string=True)
        for fd in field_descs
    }

    # NOTE: in the current implementation, we are generating a transform_fn
    # from the COLUMN clause. The transform_fn is executed during the process
    # of dumping the original data into DMatrix SVM file.
    compiled_fc = compile_ir_feature_columns(train_fc_map, model.get_type())
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *compiled_fc["feature_columns"])

    bst = xgb.Booster()
    bst.load_model("my_model")

    conn = db.connect_with_data_source(datasource)
    try:
        result_column_names, train_label_idx = create_predict_table(
            conn, select, result_table, train_label_desc, pred_label_name)

        with temp_file.TemporaryDirectory() as tmp_dir_name:
            pred_fn = os.path.join(tmp_dir_name, "predict.txt")
            raw_data_dir = os.path.join(tmp_dir_name, "predict_raw_dir")

            dpred = xgb_dataset(
                datasource=datasource,
                fn=pred_fn,
                dataset_sql=select,
                feature_metas=feature_metas,
                feature_column_names=feature_column_names,
                label_meta=None,
                cache=True,
                batch_size=10000,
                transform_fn=transform_fn,
                raw_data_dir=raw_data_dir
            )  # NOTE: default to use external memory

            print("Start predicting XGBoost model...")
            for idx, pred_dmatrix in enumerate(dpred):
                feature_file_name = os.path.join(
                    tmp_dir_name, "predict_raw_dir/predict.txt_%d" % idx)
                preds = _calc_predict_result(bst, pred_dmatrix,
                                             model_params)
                _store_predict_result(preds, result_table,
                                      result_column_names, train_label_idx,
                                      feature_file_name, conn)
            print("Done predicting. Predict table : %s" % result_table)
    finally:
        # Release the connection even if prediction fails midway.
        conn.close()
def evaluate(datasource,
             select,
             result_table,
             model,
             label_name=None,
             model_params=None,
             result_column_names=[],
             pai_table=None):
    """Evaluate a trained XGBoost model and store the metrics.

    Args:
        datasource: the database connection string.
        select: SQL statement selecting the evaluation data.
        result_table: the table receiving the evaluation result.
        model (Model|str): the model object, or the name used to load it
            with `Model.load_from_db`.
        label_name: when truthy, overrides the name of the label
            description stored in the model.
        model_params (dict): evaluation parameters. Only the
            "validation.metrics" entry (comma separated metric names,
            default "accuracy_score") is read; prediction itself uses the
            attributes stored in the model.
        result_column_names: forwarded to `_store_evaluate_result`. The
            default list object is shared between calls; this function
            never mutates it.
        pai_table: when truthy, data is read through a `PaiIOConnection`
            on this table instead of a database connection.

    Returns:
        None.

    Raises:
        AssertionError: if `model` is neither a string nor a `Model`.
    """
    if model_params is None:
        model_params = {}
    validation_metrics = model_params.get("validation.metrics",
                                          "accuracy_score")
    validation_metrics = [m.strip() for m in validation_metrics.split(",")]

    bst = xgb.Booster()
    if isinstance(model, six.string_types):
        # Load inside a temporary working directory; the booster file
        # "my_model" is read while that directory is still current.
        with temp_file.TemporaryDirectory(as_cwd=True):
            model = Model.load_from_db(datasource, model)
            bst.load_model("my_model")
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)
        bst.load_model("my_model")

    # From here on model_params refers to the attributes saved at training
    # time, not to the evaluation parameters passed by the caller.
    model_params = model.get_meta("attributes")
    fc_map_ir = model.get_meta("features")
    train_label = model.get_meta("label")
    train_label_desc = train_label.get_field_desc()[0]
    if label_name:
        train_label_desc.name = label_name

    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.XGBOOST)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = {
        fd.name: fd.to_dict(dtype_to_string=True)
        for fd in field_descs
    }
    transform_fn = ComposedColumnTransformer(
        feature_column_names, *feature_columns["feature_columns"])

    is_pai = bool(pai_table)
    if is_pai:
        conn = PaiIOConnection.from_table(pai_table)
    else:
        conn = db.connect_with_data_source(datasource)

    try:
        with temp_file.TemporaryDirectory() as tmp_dir_name:
            pred_fn = os.path.join(tmp_dir_name, "predict.txt")

            dpred = xgb_dataset(
                datasource=datasource,
                fn=pred_fn,
                dataset_sql=select,
                feature_metas=feature_metas,
                feature_column_names=feature_column_names,
                label_meta=train_label_desc.to_dict(dtype_to_string=True),
                cache=True,
                batch_size=10000,
                transform_fn=transform_fn,
                is_pai=is_pai,
                pai_table=pai_table,
                pai_single_file=True,
                feature_column_code=fc_map_ir)

            for i, pred_dmatrix in enumerate(dpred):
                # On PAI a single file is requested, so there is no
                # per-batch suffix.
                if is_pai:
                    feature_file_name = pred_fn
                else:
                    feature_file_name = pred_fn + "_%d" % i
                preds = _calc_predict_result(bst, pred_dmatrix,
                                             model_params)
                _store_evaluate_result(preds, feature_file_name,
                                       train_label_desc, result_table,
                                       result_column_names,
                                       validation_metrics, conn)
    finally:
        # Release the connection even if evaluation fails midway.
        conn.close()
def predict(datasource,
            select,
            result_table,
            result_column_names,
            train_label_idx,
            model,
            extra_result_cols=[],
            pai_table=None):
    """Predict with a trained XGBoost model and store the result.

    Args:
        datasource: the database connection string.
        select: SQL statement selecting the data to predict.
        result_table: the table receiving the prediction result.
        result_column_names: forwarded to `_store_predict_result`.
        train_label_idx: forwarded to `_store_predict_result`.
        model (Model|str): the model object, or the name used to load it
            with `Model.load_from_db`.
        extra_result_cols: not used by this function.
        pai_table: when truthy, data is read through a `PaiIOConnection`
            on this table instead of a database connection.

    Returns:
        None.

    Raises:
        AssertionError: if `model` is neither a string nor a `Model`.
    """
    bst = xgb.Booster()
    if isinstance(model, six.string_types):
        # NOTE(typhoonzero): must run Model.load_from_db in a temp
        # directory, calling pyodps in current directory on PAI
        # workers will cause paiio fails.
        with temp_file.TemporaryDirectory(as_cwd=True):
            model = Model.load_from_db(datasource, model)
            bst.load_model("my_model")
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)
        bst.load_model("my_model")

    model_params = model.get_meta("attributes")
    fc_map_ir = model.get_meta("features")

    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.XGBOOST)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = {
        fd.name: fd.to_dict(dtype_to_string=True)
        for fd in field_descs
    }
    transform_fn = ComposedColumnTransformer(
        feature_column_names, *feature_columns["feature_columns"])

    is_pai = bool(pai_table)
    if is_pai:
        conn = PaiIOConnection.from_table(pai_table)
    else:
        conn = db.connect_with_data_source(datasource)

    try:
        with temp_file.TemporaryDirectory() as tmp_dir_name:
            pred_fn = os.path.join(tmp_dir_name, "predict.txt")
            raw_data_dir = os.path.join(tmp_dir_name, "predict_raw_dir")

            dpred = xgb_dataset(datasource=datasource,
                                fn=pred_fn,
                                dataset_sql=select,
                                feature_metas=feature_metas,
                                feature_column_names=feature_column_names,
                                label_meta=None,
                                cache=True,
                                batch_size=10000,
                                transform_fn=transform_fn,
                                raw_data_dir=raw_data_dir,
                                is_pai=is_pai,
                                pai_table=pai_table,
                                pai_single_file=True,
                                feature_column_code=fc_map_ir)

            print("Start predicting XGBoost model...")
            for idx, pred_dmatrix in enumerate(dpred):
                # On PAI a single raw file is used for every batch; the
                # database path has one raw file per batch.
                if is_pai:
                    feature_file_name = os.path.join(tmp_dir_name,
                                                     "predict.txt.raw")
                else:
                    feature_file_name = os.path.join(
                        tmp_dir_name,
                        "predict_raw_dir/predict.txt_%d" % idx)
                preds = _calc_predict_result(bst, pred_dmatrix,
                                             model_params)
                _store_predict_result(preds, result_table,
                                      result_column_names, train_label_idx,
                                      feature_file_name, conn)
            print("Done predicting. Predict table : %s" % result_table)
    finally:
        # Release the connection even if prediction fails midway.
        conn.close()
def evaluate(datasource,
             select,
             result_table,
             load,
             pred_label_name=None,
             validation_metrics=["accuracy_score"]):
    """
    Do evaluation to a trained XGBoost model.

    Args:
        datasource (str): the database connection string.
        select (str): the input data to predict.
        result_table (str): the output data table.
        load (str): where the trained model stores.
        pred_label_name (str): the label column name.
        validation_metrics (list[str]): the evaluation metric names. The
            default list object is shared between calls; this function
            never mutates it.

    Returns:
        None.
    """
    model = Model.load_from_db(datasource, load)
    model_params = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    train_label_desc = model.get_meta("label").get_field_desc()[0]
    if pred_label_name:
        train_label_desc.name = pred_label_name

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = {fd.name: fd.to_dict() for fd in field_descs}

    # NOTE: in the current implementation, we are generating a transform_fn
    # from the COLUMN clause. The transform_fn is executed during the process
    # of dumping the original data into DMatrix SVM file.
    compiled_fc = compile_ir_feature_columns(train_fc_map, model.get_type())
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *compiled_fc["feature_columns"])

    bst = xgb.Booster()
    bst.load_model("my_model")

    conn = db.connect_with_data_source(datasource)
    try:
        result_column_names = _create_evaluate_table(conn, result_table,
                                                     validation_metrics)

        with temp_file.TemporaryDirectory() as tmp_dir_name:
            pred_fn = os.path.join(tmp_dir_name, "predict.txt")

            dpred = xgb_dataset(datasource=datasource,
                                fn=pred_fn,
                                dataset_sql=select,
                                feature_metas=feature_metas,
                                feature_column_names=feature_column_names,
                                label_meta=train_label_desc.to_dict(),
                                cache=True,
                                batch_size=10000,
                                transform_fn=transform_fn)

            for i, pred_dmatrix in enumerate(dpred):
                feature_file_name = pred_fn + "_%d" % i
                preds = _calc_predict_result(bst, pred_dmatrix,
                                             model_params)
                _store_evaluate_result(preds, feature_file_name,
                                       train_label_desc, result_table,
                                       result_column_names,
                                       validation_metrics, conn)
    finally:
        # Release the connection even if evaluation fails midway.
        conn.close()