def train(datasource, select, model_params, train_params, feature_metas,
          feature_column_names, label_meta, validation_select,
          disk_cache=False, batch_size=None, epoch=1,
          load_pretrained_model=False, is_pai=False, pai_train_table="",
          pai_validate_table="", rank=0, nworkers=1, oss_model_dir="",
          transform_fn=None, feature_column_code="", model_repo_image="",
          original_sql=""):
    if batch_size == -1:
        batch_size = None
    print("Start training XGBoost model...")
    dtrain = xgb_dataset(datasource, 'train.txt', select, feature_metas,
                         feature_column_names, label_meta, is_pai,
                         pai_train_table, cache=disk_cache,
                         batch_size=batch_size, epoch=epoch, rank=rank,
                         nworkers=nworkers, transform_fn=transform_fn,
                         feature_column_code=feature_column_code)
    if len(validation_select.strip()) > 0:
        dvalidate = list(
            xgb_dataset(datasource, 'validate.txt', validation_select,
                        feature_metas, feature_column_names, label_meta,
                        is_pai, pai_validate_table, rank=rank,
                        nworkers=nworkers, transform_fn=transform_fn,
                        feature_column_code=feature_column_code))[0]

    filename = "my_model"
    if load_pretrained_model:
        bst = xgb.Booster()
        bst.load_model(filename)
    else:
        bst = None

    re = None
    for per_batch_dmatrix in dtrain:
        watchlist = [(per_batch_dmatrix, "train")]
        if len(validation_select.strip()) > 0:
            watchlist.append((dvalidate, "validate"))

        re = dict()
        bst = xgb.train(model_params, per_batch_dmatrix, evals=watchlist,
                        evals_result=re, xgb_model=bst, **train_params)
        print("Evaluation result: %s" % re)

    if rank == 0:
        # TODO(sneaxiy): collect features and label
        metadata = collect_metadata(original_sql=original_sql,
                                    select=select,
                                    validation_select=validation_select,
                                    model_repo_image=model_repo_image,
                                    class_name=model_params.get("booster"),
                                    attributes=model_params,
                                    features=None,
                                    label=None,
                                    evaluation=re)
        save_model_to_local_file(bst, model_params, filename)
        save_metadata("model_meta.json", metadata)
        if is_pai and len(oss_model_dir) > 0:
            save_model(oss_model_dir, filename, model_params, train_params,
                       feature_metas, feature_column_names, label_meta,
                       feature_column_code)
def train(datasource, estimator_string, select, validation_select,
          feature_columns, feature_column_names, feature_metas={},
          label_meta={}, model_params={}, validation_metrics=["Accuracy"],
          save="", batch_size=1, epoch=1, validation_steps=1, verbose=0,
          max_steps=None, validation_start_delay_secs=0,
          validation_throttle_secs=0, save_checkpoints_steps=100,
          log_every_n_iter=10, load_pretrained_model=False, is_pai=True,
          pai_table="", pai_val_table="", feature_columns_code="",
          model_params_code_map={}, model_repo_image="", original_sql="",
          feature_column_names_map=None):
    # TODO(sneaxiy): collect features and label
    model_meta = collect_metadata(original_sql=original_sql,
                                  select=select,
                                  validation_select=validation_select,
                                  model_repo_image=model_repo_image,
                                  class_name=estimator_string,
                                  attributes=model_params,
                                  features=None,
                                  label=None)
    estimator = import_model(estimator_string)
    is_estimator = is_tf_estimator(estimator)
    if verbose < 1:  # always use verbose == 1 when using PAI to get more logs
        verbose = 1
    set_log_level(verbose, is_estimator)
    model_params.update(feature_columns)

    FLAGS = define_tf_flags()
    set_oss_environs(FLAGS)
    num_workers = len(FLAGS.worker_hosts.split(","))
    worker_id = FLAGS.task_index

    train_dataset_fn = get_dataset_fn(select, datasource,
                                      feature_column_names, feature_metas,
                                      label_meta, is_pai, pai_table,
                                      batch_size, epochs=epoch,
                                      shuffle_size=1000,
                                      num_workers=num_workers,
                                      worker_id=worker_id)
    val_dataset_fn = None
    if validation_select:
        val_dataset_fn = get_dataset_fn(validation_select, datasource,
                                        feature_column_names, feature_metas,
                                        label_meta, is_pai, pai_val_table,
                                        batch_size)

    if not is_estimator:
        if isinstance(estimator, types.FunctionType):
            # functional models need the field_metas parameter
            model_params["field_metas"] = feature_metas
        keras_train_and_save(estimator, model_params, save, FLAGS,
                             train_dataset_fn, val_dataset_fn, label_meta,
                             epoch, verbose, validation_metrics,
                             validation_steps, load_pretrained_model,
                             model_meta)
    else:
        estimator_train_and_save(estimator, model_params, save, FLAGS,
                                 train_dataset_fn, val_dataset_fn,
                                 log_every_n_iter, max_steps,
                                 validation_start_delay_secs,
                                 validation_throttle_secs,
                                 save_checkpoints_steps, validation_metrics,
                                 load_pretrained_model, model_meta)

    # save model to OSS
    if num_workers == 1 or worker_id == 0:
        oss_model_dir = FLAGS.sqlflow_oss_modeldir
        oss.save_oss_model(oss_model_dir, estimator_string, is_estimator,
                           feature_column_names, feature_column_names_map,
                           feature_metas, label_meta, model_params_code_map,
                           feature_columns_code, num_workers)
        print("Model saved to oss: %s" % oss_model_dir)
    print("Done training")
def train(datasource, estimator_string, select, validation_select,
          feature_columns, feature_column_names, feature_metas={},
          label_meta={}, model_params={}, validation_metrics=["Accuracy"],
          save="", batch_size=1, epoch=1, validation_steps=1, verbose=0,
          max_steps=None, validation_start_delay_secs=0,
          validation_throttle_secs=0, save_checkpoints_steps=100,
          log_every_n_iter=10, load_pretrained_model=False, is_pai=False,
          pai_table="", pai_val_table="", feature_columns_code="",
          model_params_code_map={}, model_repo_image="", original_sql="",
          feature_column_names_map=None):
    # NOTE(typhoonzero): feature_column_names_map is used only for the PAI
    # submitter API.
    # TODO(sneaxiy): collect features and label
    model_meta = collect_metadata(original_sql=original_sql,
                                  select=select,
                                  validation_select=validation_select,
                                  model_repo_image=model_repo_image,
                                  class_name=estimator_string,
                                  attributes=model_params,
                                  features=None,
                                  label=None)
    estimator = import_model(estimator_string)
    is_estimator = is_tf_estimator(estimator)
    set_log_level(verbose, is_estimator)
    model_params.update(feature_columns)

    train_dataset_fn = get_dataset_fn(select, datasource,
                                      feature_column_names, feature_metas,
                                      label_meta, is_pai, pai_table,
                                      batch_size, epochs=epoch,
                                      shuffle_size=1000)
    val_dataset_fn = None
    if validation_select:
        val_dataset_fn = get_dataset_fn(validation_select, datasource,
                                        feature_column_names, feature_metas,
                                        label_meta, is_pai, pai_val_table,
                                        batch_size)

    if not is_estimator:  # Keras model
        if isinstance(estimator, types.FunctionType):
            # functional models need the field_metas parameter
            model_params["field_metas"] = feature_metas
        keras_train_and_save(estimator, model_params, save, is_pai,
                             train_dataset_fn, val_dataset_fn, label_meta,
                             epoch, verbose, validation_metrics,
                             validation_steps, load_pretrained_model,
                             model_meta)
    else:
        estimator_train_and_save(estimator, model_params, save,
                                 train_dataset_fn, val_dataset_fn, max_steps,
                                 validation_start_delay_secs,
                                 validation_throttle_secs,
                                 save_checkpoints_steps, validation_metrics,
                                 load_pretrained_model, model_meta)

    # remove cache files
    any(map(os.remove, glob.glob('cache_train.*')))
    any(map(os.remove, glob.glob('cache_validation.*')))
    print("Done training")
def local_train(original_sql, model_image, estimator_string, datasource,
                select, validation_select, model_params, train_params,
                feature_metas, feature_column_names, feature_column_map,
                label_column, transform_fn, save, load="", is_pai=False,
                oss_model_dir=""):
    disk_cache = train_params.pop("disk_cache", False)
    batch_size = train_params.pop("batch_size", None)
    if batch_size is not None and batch_size < 0:
        batch_size = None
    epoch = train_params.pop("epoch", 1)
    num_workers = train_params.pop("num_workers", 1)
    label_meta_dict = label_column.get_field_desc()[0].to_dict(
        dtype_to_string=True)

    def build_dataset(fn, slct):
        return xgb_dataset(datasource, fn, slct, feature_metas,
                           feature_column_names, label_meta_dict,
                           cache=disk_cache, batch_size=batch_size,
                           epoch=epoch, transform_fn=transform_fn)

    file_name = "my_model"
    if load:
        Model.load_from_db(datasource, load)
        bst = xgb.Booster()
        bst.load_model(file_name)
    else:
        bst = None

    with temp_file.TemporaryDirectory() as tmp_dir_name:
        train_fn = os.path.join(tmp_dir_name, 'train.txt')
        val_fn = os.path.join(tmp_dir_name, 'val.txt')
        train_dataset = build_dataset(train_fn, select)
        if validation_select:
            val_dataset = build_dataset(val_fn, validation_select)
        else:
            val_dataset = None

        eval_result = dict()
        watchlist = [None]
        if val_dataset:
            # The `xgboost.train` API only accepts the XGBoost DMatrix
            # object as the training or validation dataset, so we should
            # convert the generator to DMatrix.
            if isinstance(val_dataset, types.GeneratorType):
                val_dataset = list(val_dataset)[0]
            watchlist.append((val_dataset, "validate"))

        for per_batch_dmatrix in train_dataset:
            watchlist[0] = (per_batch_dmatrix, "train")
            bst = xgb.train(model_params, per_batch_dmatrix, evals=watchlist,
                            evals_result=eval_result, xgb_model=bst,
                            **train_params)
            print("Evaluation result: %s" % eval_result)

    meta = collect_metadata(original_sql=original_sql,
                            select=select,
                            validation_select=validation_select,
                            model_repo_image=model_image,
                            class_name=estimator_string,
                            attributes=model_params,
                            features=feature_column_map,
                            label=label_column,
                            evaluation=eval_result,
                            num_workers=num_workers)

    save_model_to_local_file(bst, model_params, file_name)
    model = Model(EstimatorType.XGBOOST, meta)
    model.save_to_db(datasource, save)

    if is_pai and len(oss_model_dir) > 0:
        # TODO(typhoonzero): remove this since we are saving metas into db
        # now.
        save_model(oss_model_dir, "my_model", model_params, train_params,
                   feature_metas, feature_column_names, label_meta_dict,
                   feature_column_map)
    return eval_result
def train(original_sql, model_image, estimator_string, datasource, select,
          validation_select, model_params, train_params, feature_column_map,
          label_column, save, load=None):
    """
    Train, evaluate and save the XGBoost model locally.

    Args:
        original_sql (str): the original SQL statement.
        model_image (str): the model repo docker image.
        estimator_string (str): the XGBoost booster type, e.g. xgboost.gbtree.
        datasource (str): the database connection URI.
        select (str): the SQL statement for training.
        validation_select (str): the SQL statement for evaluation.
        model_params (dict): the XGBoost model parameters.
        train_params (dict): the training parameters; may contain
            disk_cache (bool), batch_size (int) and epoch (int) settings.
        feature_column_map (dict): the feature column map used for derivation.
        label_column (FeatureColumn): the label column.
        save (str): the table name to save the trained model and meta.
        load (str): the table name to load the pretrained model.

    Returns:
        A dict of the evaluation result.
    """
    conn = db.connect_with_data_source(datasource)
    fc_map_ir, fc_label_ir = infer_feature_columns(conn,
                                                   select,
                                                   feature_column_map,
                                                   label_column,
                                                   n=1000)
    fc_map = compile_ir_feature_columns(fc_map_ir, EstimatorType.XGBOOST)
    feature_column_list = fc_map["feature_columns"]
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict()) for fd in field_descs])
    label_meta = label_column.get_field_desc()[0].to_dict()

    # NOTE: in the current implementation, we are generating a transform_fn
    # from the COLUMN clause. The transform_fn is executed during the process
    # of dumping the original data into the DMatrix SVM file.
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *feature_column_list)

    disk_cache = False
    batch_size = None
    epoch = 1
    if "disk_cache" in train_params:
        disk_cache = train_params.pop("disk_cache")
    if "batch_size" in train_params:
        batch_size = train_params.pop("batch_size")
    if "epoch" in train_params:
        epoch = train_params.pop("epoch")

    def build_dataset(fn, slct):
        return xgb_dataset(datasource, fn, slct, feature_metas,
                           feature_column_names, label_meta,
                           cache=disk_cache, batch_size=batch_size,
                           epoch=epoch, transform_fn=transform_fn)

    file_name = "my_model"
    if load:
        Model.load_from_db(datasource, load)
        bst = xgb.Booster()
        bst.load_model(file_name)
    else:
        bst = None

    with temp_file.TemporaryDirectory() as tmp_dir_name:
        train_fn = os.path.join(tmp_dir_name, 'train.txt')
        val_fn = os.path.join(tmp_dir_name, 'val.txt')
        train_dataset = build_dataset(train_fn, select)
        if validation_select:
            val_dataset = build_dataset(val_fn, validation_select)
        else:
            val_dataset = None

        eval_result = dict()
        watchlist = [None]
        if val_dataset:
            # The `xgboost.train` API only accepts the XGBoost DMatrix
            # object as the training or validation dataset, so we should
            # convert the generator to DMatrix.
            if isinstance(val_dataset, types.GeneratorType):
                val_dataset = list(val_dataset)[0]
            watchlist.append((val_dataset, "validate"))

        for per_batch_dmatrix in train_dataset:
            watchlist[0] = (per_batch_dmatrix, "train")
            bst = xgb.train(model_params, per_batch_dmatrix, evals=watchlist,
                            evals_result=eval_result, xgb_model=bst,
                            **train_params)
            print("Evaluation result: %s" % eval_result)

    meta = collect_metadata(original_sql=original_sql,
                            select=select,
                            validation_select=validation_select,
                            model_repo_image=model_image,
                            class_name=estimator_string,
                            attributes=model_params,
                            features=fc_map_ir,
                            label=fc_label_ir,
                            evaluation=eval_result,
                            num_workers=1)

    save_model_to_local_file(bst, model_params, file_name)
    model = Model(EstimatorType.XGBOOST, meta)
    model.save_to_db(datasource, save)
    return eval_result
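# The XGBoost train() variants above share one core pattern: xgb.train() is
# called once per DMatrix batch, and the booster from the previous call is
# passed back in via `xgb_model` so each call continues training instead of
# starting over. Below is a minimal, self-contained sketch of that pattern on
# synthetic data; the helper name `make_batches` is illustrative only and is
# not part of the SQLFlow code above.
import numpy as np
import xgboost as xgb


def make_batches(n_batches=3, rows=256, cols=4):
    # Yield one DMatrix per "batch", mimicking what xgb_dataset() produces.
    rng = np.random.RandomState(0)
    for _ in range(n_batches):
        x = rng.rand(rows, cols)
        y = (x.sum(axis=1) > cols / 2.0).astype(int)
        yield xgb.DMatrix(x, label=y)


bst = None
eval_result = dict()
for per_batch_dmatrix in make_batches():
    watchlist = [(per_batch_dmatrix, "train")]
    # xgb_model=bst resumes training from the booster of the previous batch.
    bst = xgb.train({"objective": "binary:logistic"},
                    per_batch_dmatrix,
                    num_boost_round=10,
                    evals=watchlist,
                    evals_result=eval_result,
                    xgb_model=bst)
print("Evaluation result: %s" % eval_result)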
def train_step(original_sql, model_image, estimator_string, datasource,
               select, validation_select, model_params, train_params,
               validation_params, feature_column_map, label_column, save,
               load=None, pai_table=None, pai_val_table=None):
    if model_params is None:
        model_params = {}
    if train_params is None:
        train_params = {}
    if validation_params is None:
        validation_params = {}

    if load:
        Model.load_from_db(datasource, load)
        load = "model_save"
    else:
        load = None

    is_pai = True if pai_table else False

    fc_map = compile_ir_feature_columns(feature_column_map,
                                        EstimatorType.TENSORFLOW)
    field_descs = get_ordered_field_descs(feature_column_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])

    # no label for clustering models
    label_meta = None
    if label_column:
        label_meta = label_column.get_field_desc()[0].to_dict(
            dtype_to_string=True)

    feature_column_names_map = dict()
    for target in feature_column_map:
        fclist = feature_column_map[target]
        feature_column_names_map[target] = [
            fc.get_field_desc()[0].name for fc in fclist
        ]

    # Construct optimizer objects to pass to the model initializer.
    # The original model_params is serializable (it does not contain
    # tf.xxx objects).
    model_params_constructed = copy.deepcopy(model_params)
    for optimizer_arg in ["optimizer", "dnn_optimizer", "linear_optimizer"]:
        if optimizer_arg in model_params_constructed:
            model_params_constructed[optimizer_arg] = get_tf_optimizer(
                model_params_constructed[optimizer_arg])

    if "loss" in model_params_constructed:
        model_params_constructed["loss"] = get_tf_loss(
            model_params_constructed["loss"])

    # extract params for training.
    verbose = train_params.get("verbose", 1)
    batch_size = train_params.get("batch_size", 1)
    epoch = train_params.get("epoch", 1)
    save_checkpoints_steps = train_params.get("save_checkpoints_steps", 100)
    max_steps = train_params.get("max_steps", None)
    if max_steps is not None and max_steps <= 0:
        max_steps = None

    validation_metrics = validation_params.get("metrics", "Accuracy")
    validation_metrics = [v.strip() for v in validation_metrics.split(",")]
    validation_steps = validation_params.get("steps", 1)
    validation_start_delay_secs = validation_params.get("start_delay_secs", 0)
    validation_throttle_secs = validation_params.get("throttle_secs", 0)

    estimator = import_model(estimator_string)
    is_estimator = is_tf_estimator(estimator)

    # always use verbose == 1 when using PAI to get more logs
    if verbose < 1:
        verbose = 1
    set_log_level(verbose, is_estimator)

    model_params_constructed.update(fc_map)

    FLAGS = define_tf_flags()
    set_oss_environs(FLAGS)
    num_workers = len(FLAGS.worker_hosts.split(","))
    worker_id = FLAGS.task_index

    train_dataset_fn = get_dataset_fn(select, datasource,
                                      feature_column_names, feature_metas,
                                      label_meta, is_pai, pai_table,
                                      batch_size, epochs=epoch,
                                      shuffle_size=1000,
                                      num_workers=num_workers,
                                      worker_id=worker_id)
    val_dataset_fn = None
    if validation_select or pai_val_table:
        val_dataset_fn = get_dataset_fn(validation_select, datasource,
                                        feature_column_names, feature_metas,
                                        label_meta, is_pai, pai_val_table,
                                        batch_size)

    model_meta = collect_metadata(original_sql=original_sql,
                                  select=select,
                                  validation_select=validation_select,
                                  model_repo_image=model_image,
                                  class_name=estimator_string,
                                  attributes=model_params,
                                  features=feature_column_map,
                                  label=label_column)

    # FIXME(typhoonzero): avoid saving model_meta twice: keras_train_and_save
    # and estimator_train_and_save also dump model_meta to a file under the
    # cwd. We should only keep the model.save_to_db part.
    save_dir = "model_save"
    if not is_estimator:
        if isinstance(estimator, types.FunctionType):
            # functional models need the field_metas parameter
            model_params_constructed["field_metas"] = feature_metas
        keras_train_and_save(estimator, model_params_constructed, save_dir,
                             FLAGS, train_dataset_fn, val_dataset_fn,
                             label_meta, epoch, verbose, validation_metrics,
                             validation_steps, load, model_meta, is_pai)
    else:
        estimator_train_and_save(estimator, model_params_constructed,
                                 save_dir, FLAGS, train_dataset_fn,
                                 val_dataset_fn, max_steps,
                                 validation_start_delay_secs,
                                 validation_throttle_secs,
                                 save_checkpoints_steps, validation_metrics,
                                 load, model_meta)

    # save model to DB/OSS
    model = Model(EstimatorType.TENSORFLOW, model_meta)
    if num_workers == 1 or worker_id == 0:
        saved = model.save_to_db(datasource,
                                 save,
                                 oss_model_dir=FLAGS.sqlflow_oss_modeldir)
        print("Model saved to DB: %s" % saved)

    print("Done training")
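# train_step() keeps model_params serializable and only turns optimizer and
# loss names into TensorFlow objects (via get_tf_optimizer/get_tf_loss, not
# shown here) right before building the model. Below is a minimal sketch of
# what such a mapping might look like, assuming the attributes arrive as
# plain strings such as "Adam" or "mean_squared_error"; `resolve_tf_objects`
# is a hypothetical helper, not the SQLFlow implementation.
import copy

import tensorflow as tf


def resolve_tf_objects(model_params):
    # Return a deep copy with optimizer/loss identifiers replaced by TF
    # objects, leaving the original dict untouched (and thus serializable).
    constructed = copy.deepcopy(model_params)
    for key in ("optimizer", "dnn_optimizer", "linear_optimizer"):
        if key in constructed:
            constructed[key] = tf.keras.optimizers.get(constructed[key])
    if "loss" in constructed:
        constructed["loss"] = tf.keras.losses.get(constructed["loss"])
    return constructed


params = resolve_tf_objects({"optimizer": "Adam",
                             "loss": "mean_squared_error"})
print(params["optimizer"], params["loss"])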