def test_pai_train_step(self):
    """End-to-end smoke test: run the TensorFlow train_step against a
    PAI iris table and save the resulting DNNClassifier model to OSS."""
    from runtime.step.tensorflow.train import train_step

    original_sql = """
SELECT * FROM alifin_jtest_dev.sqlflow_test_iris_train
TO TRAIN DNNClassifier
WITH model.n_classes = 3, model.hidden_units = [10, 20]
LABEL class
INTO e2etest_pai_dnn;"""

    # Model attributes mirroring the WITH clause above.
    model_params = {"hidden_units": [10, 20], "n_classes": 3}

    datasource = testing.get_datasource()
    save = "e2etest_pai_dnn"

    # OSS credentials come from the environment; the model dir is derived
    # from the datasource and the model name.
    FLAGS = define_tf_flags()
    FLAGS.sqlflow_oss_ak = os.getenv("SQLFLOW_OSS_AK")
    FLAGS.sqlflow_oss_sk = os.getenv("SQLFLOW_OSS_SK")
    FLAGS.sqlflow_oss_ep = os.getenv("SQLFLOW_OSS_MODEL_ENDPOINT")
    oss_path_to_save = pai_model.get_oss_model_save_path(datasource,
                                                         save,
                                                         user="")
    FLAGS.sqlflow_oss_modeldir = pai_model.get_oss_model_url(oss_path_to_save)

    train_step(original_sql, "", "DNNClassifier", datasource,
               "SELECT * FROM alifin_jtest_dev.sqlflow_iris_train", "",
               "alifin_jtest_dev.sqlflow_iris_train", "", model_params, {},
               feature_column_map, label_column, save, None)
def evaluate(datasource,
             estimator_string,
             select,
             result_table,
             feature_columns,
             feature_column_names,
             feature_metas=None,
             label_meta=None,
             model_params=None,
             validation_metrics=None,
             save="",
             batch_size=1,
             validation_steps=None,
             verbose=0,
             pai_table=""):
    """Evaluate a trained TensorFlow model and optionally persist metrics.

    Args:
        datasource: database connection string for input data and results.
        estimator_string: import path of the model class/function.
        select: SQL statement selecting the evaluation data.
        result_table: table to write ["loss"] + metrics into; skipped when
            empty.
        feature_columns: dict of TF feature-column objects merged into the
            model constructor parameters.
        feature_column_names: ordered list of feature names.
        feature_metas: per-feature metadata (default: empty dict).
        label_meta: label metadata (default: empty dict).
        model_params: model constructor parameters (default: empty dict).
        validation_metrics: metric names (default: ["Accuracy"]).
        save: directory holding the saved Keras model.
        batch_size: evaluation batch size.
        validation_steps: unused here; kept for interface compatibility.
        verbose: logging verbosity.
        pai_table: when non-empty, read from this PAI table instead of
            running `select` through the datasource.
    """
    # FIX: the previous signature used shared mutable defaults ({} / []),
    # and model_params was mutated below (update + pop_optimizer_and_loss),
    # so state leaked across calls. Normalize per call; copy model_params
    # so the caller's dict is left untouched.
    feature_metas = {} if feature_metas is None else feature_metas
    label_meta = {} if label_meta is None else label_meta
    model_params = dict(model_params) if model_params else {}
    if validation_metrics is None:
        validation_metrics = ["Accuracy"]

    FLAGS = define_tf_flags()
    set_oss_environs(FLAGS)

    estimator_cls = import_model(estimator_string)
    is_estimator = is_tf_estimator(estimator_cls)
    set_log_level(verbose, is_estimator)

    is_pai = bool(pai_table)
    eval_dataset = get_dataset_fn(select,
                                  datasource,
                                  feature_column_names,
                                  feature_metas,
                                  label_meta,
                                  is_pai=is_pai,
                                  pai_table=pai_table,
                                  batch_size=batch_size)

    model_params.update(feature_columns)
    pop_optimizer_and_loss(model_params)

    if is_estimator:
        # Estimators are restored via warm start from the path exported at
        # training time (file "exported_path" written next to the model).
        with open("exported_path", "r") as fid:
            exported_path = str(fid.read())
        model_params["warm_start_from"] = exported_path
        estimator = estimator_cls(**model_params)
        result_metrics = estimator_evaluate(estimator, eval_dataset,
                                            validation_metrics)
    else:
        keras_model = init_model_with_feature_column(estimator_cls,
                                                     model_params)
        keras_model_pkg = sys.modules[estimator_cls.__module__]
        result_metrics = keras_evaluate(keras_model, eval_dataset, save,
                                        keras_model_pkg, validation_metrics)

    if result_table:
        metric_name_list = ["loss"] + validation_metrics
        if is_pai:
            conn = PaiIOConnection.from_table(result_table)
        else:
            conn = db.connect_with_data_source(datasource)
        # FIX: close the connection even when writing the metrics fails.
        try:
            write_result_metrics(result_metrics, metric_name_list,
                                 result_table, conn)
        finally:
            conn.close()
def init_pai_local_tf_flags_and_envs(oss_model_dir):
    """Prepare TensorFlow flags and OSS environment for a local PAI run.

    Reads OSS credentials from the environment, normalizes the model
    directory to a full oss:// URL, and exports everything via
    set_oss_environs.
    """
    flags = define_tf_flags()
    flags.sqlflow_oss_ak = os.getenv("SQLFLOW_OSS_AK")
    flags.sqlflow_oss_sk = os.getenv("SQLFLOW_OSS_SK")
    flags.sqlflow_oss_ep = os.getenv("SQLFLOW_OSS_MODEL_ENDPOINT")
    # Accept either a bare model path or an already-qualified oss:// URL.
    if oss_model_dir.startswith("oss://"):
        flags.sqlflow_oss_modeldir = oss_model_dir
    else:
        flags.sqlflow_oss_modeldir = pai_model.get_oss_model_url(
            oss_model_dir)
    flags.checkpointDir = os.getcwd()
    set_oss_environs(flags)
        # NOTE(review): this is the tail of a dispatch helper whose `def`
        # line lies outside this view; it copies only the parameters the
        # target function accepts. Indentation of this fragment is
        # reconstructed — confirm against the full file.
        if name in params:
            dict_args[name] = params[name]
    return func(**dict_args)


def entrypoint():
    """Entry point executed on the PAI worker.

    The submitter stages a pickled parameter dict (train_params.pkl) next
    to this script; `entry_type` selects which step function to run.
    """
    # NOTE(review): pickle.load on a file staged by the SQLFlow submitter —
    # assumed trusted input; never feed untrusted data through this path.
    with open("train_params.pkl", "rb") as file:
        params = pickle.load(file)
    # Dispatch on the requested step (TensorFlow or XGBoost variants).
    if params["entry_type"] == "train_tf":
        call_fun(train_tf, params)
    elif params["entry_type"] == "train_xgb":
        call_fun(train_xgb, params)
    elif params["entry_type"] == "predict_tf":
        call_fun(predict_tf, params)
    elif params["entry_type"] == "predict_xgb":
        call_fun(predict_xgb, params)
    elif params["entry_type"] == "explain_tf":
        call_fun(explain_tf, params)
    elif params["entry_type"] == "explain_xgb":
        call_fun(explain_xgb, params)
    elif params["entry_type"] == "evaluate_tf":
        call_fun(evaluate_tf, params)
    elif params["entry_type"] == "evaluate_xgb":
        call_fun(evaluate_xgb, params)


if __name__ == "__main__":
    # OSS credentials/env must be in place before any step touches OSS.
    FLAGS = define_tf_flags()
    set_oss_environs(FLAGS)
    entrypoint()
def train(datasource,
          estimator_string,
          select,
          validation_select,
          feature_columns,
          feature_column_names,
          feature_metas=None,
          label_meta=None,
          model_params=None,
          validation_metrics=None,
          save="",
          batch_size=1,
          epoch=1,
          validation_steps=1,
          verbose=0,
          max_steps=None,
          validation_start_delay_secs=0,
          validation_throttle_secs=0,
          save_checkpoints_steps=100,
          log_every_n_iter=10,
          load_pretrained_model=False,
          is_pai=True,
          pai_table="",
          pai_val_table="",
          feature_columns_code="",
          model_params_code_map=None,
          model_repo_image="",
          original_sql="",
          feature_column_names_map=None):
    """Train a TensorFlow estimator or Keras model on PAI and save to OSS.

    Builds train/validation dataset functions from the PAI tables, runs
    the appropriate training loop, and (on the chief worker only) saves
    the model plus its metadata to OSS.
    """
    # FIX: the previous signature used shared mutable defaults ({} / []),
    # and model_params was mutated below via update(); normalize per call.
    feature_metas = {} if feature_metas is None else feature_metas
    label_meta = {} if label_meta is None else label_meta
    model_params = {} if model_params is None else model_params
    if validation_metrics is None:
        validation_metrics = ["Accuracy"]
    if model_params_code_map is None:
        model_params_code_map = {}

    # TODO(sneaxiy): collect features and label
    model_meta = collect_metadata(original_sql=original_sql,
                                  select=select,
                                  validation_select=validation_select,
                                  model_repo_image=model_repo_image,
                                  class_name=estimator_string,
                                  attributes=model_params,
                                  features=None,
                                  label=None)

    estimator = import_model(estimator_string)
    is_estimator = is_tf_estimator(estimator)
    # always use verbose == 1 when using PAI to get more logs
    if verbose < 1:
        verbose = 1
    set_log_level(verbose, is_estimator)
    model_params.update(feature_columns)

    FLAGS = define_tf_flags()
    set_oss_environs(FLAGS)
    num_workers = len(FLAGS.worker_hosts.split(","))
    worker_id = FLAGS.task_index

    train_dataset_fn = get_dataset_fn(select,
                                      datasource,
                                      feature_column_names,
                                      feature_metas,
                                      label_meta,
                                      is_pai,
                                      pai_table,
                                      batch_size,
                                      epochs=epoch,
                                      shuffle_size=1000,
                                      num_workers=num_workers,
                                      worker_id=worker_id)
    val_dataset_fn = None
    if validation_select:
        val_dataset_fn = get_dataset_fn(validation_select, datasource,
                                        feature_column_names, feature_metas,
                                        label_meta, is_pai, pai_val_table,
                                        batch_size)

    if not is_estimator:
        # functional model need field_metas parameter
        if isinstance(estimator, types.FunctionType):
            model_params["field_metas"] = feature_metas
        keras_train_and_save(estimator, model_params, save, FLAGS,
                             train_dataset_fn, val_dataset_fn, label_meta,
                             epoch, verbose, validation_metrics,
                             validation_steps, load_pretrained_model,
                             model_meta)
    else:
        estimator_train_and_save(estimator, model_params, save, FLAGS,
                                 train_dataset_fn, val_dataset_fn,
                                 log_every_n_iter, max_steps,
                                 validation_start_delay_secs,
                                 validation_throttle_secs,
                                 save_checkpoints_steps, validation_metrics,
                                 load_pretrained_model, model_meta)

    # save model to OSS; only the chief worker (or a single worker) writes.
    if num_workers == 1 or worker_id == 0:
        oss_model_dir = FLAGS.sqlflow_oss_modeldir
        oss.save_oss_model(oss_model_dir, estimator_string, is_estimator,
                           feature_column_names, feature_column_names_map,
                           feature_metas, label_meta, model_params_code_map,
                           feature_columns_code, num_workers)
        print("Model saved to oss: %s" % oss_model_dir)
    print("Done training")
def train(original_sql,
          model_image,
          estimator_string,
          datasource,
          select,
          validation_select,
          model_params,
          train_params,
          validation_params,
          feature_column_map,
          label_column,
          save,
          load=None,
          pai_table="",
          pai_val_table=""):
    """XGBoost train step: dispatch to distributed or local training.

    Compiles the IR feature columns, prepares metadata and the column
    transform function, then runs dist_train on PAI (multi-worker) or
    local_train otherwise. Returns local_train's result in the local case;
    dist_train returns None.
    """
    is_pai = pai_table != ""

    is_dist_train = False
    FLAGS = None
    oss_model_dir = ""
    if is_pai:
        FLAGS = define_tf_flags()
        num_workers = len(FLAGS.worker_hosts.split(","))
        is_dist_train = num_workers > 1
        oss_model_dir = FLAGS.sqlflow_oss_modeldir

    # FIX: a bare `except: pass` previously swallowed every error here,
    # including the KeyError from a missing "oss_path_to_load" key. Pop
    # with a default instead, and keep only the OSS download best-effort.
    oss_path_to_load = train_params.pop("oss_path_to_load", None)
    if load and oss_path_to_load:
        try:
            oss.load_file(oss_path_to_load, "my_model")
        except Exception:
            # best-effort: fall back to training from scratch
            pass

    feature_columns = compile_ir_feature_columns(feature_column_map,
                                                 EstimatorType.XGBOOST)
    field_descs = get_ordered_field_descs(feature_column_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])
    label_meta = label_column.get_field_desc()[0].to_dict(
        dtype_to_string=True)

    transform_fn = ComposedColumnTransformer(
        feature_column_names, *feature_columns["feature_columns"])

    batch_size = train_params.pop("batch_size", None)
    epoch = train_params.pop("epoch", 1)
    load_pretrained_model = bool(load)
    disk_cache = train_params.pop("disk_cache", False)

    if is_dist_train:
        # NOTE(typhoonzero): dist_train returns None
        dist_train(flags=FLAGS,
                   datasource=datasource,
                   select=select,
                   model_params=model_params,
                   train_params=train_params,
                   feature_metas=feature_metas,
                   feature_column_names=feature_column_names,
                   label_meta=label_meta,
                   validation_select=validation_select,
                   disk_cache=disk_cache,
                   batch_size=batch_size,
                   epoch=epoch,
                   load_pretrained_model=load_pretrained_model,
                   is_pai=True,
                   pai_train_table=pai_table,
                   pai_validate_table=pai_val_table,
                   oss_model_dir=oss_model_dir,
                   transform_fn=transform_fn,
                   feature_column_code=feature_column_map,
                   model_repo_image=model_image,
                   original_sql=original_sql)
    else:
        return local_train(original_sql, model_image, estimator_string,
                           datasource, select, validation_select,
                           model_params, train_params, feature_metas,
                           feature_column_names, feature_column_map,
                           label_column, transform_fn, save,
                           load=load,
                           is_pai=is_pai,
                           oss_model_dir=oss_model_dir)
def train(datasource,
          estimator_string,
          select,
          validation_select,
          feature_columns,
          feature_column_names,
          feature_metas=None,
          label_meta=None,
          model_params=None,
          train_params=None,
          validation_metrics=None,
          disk_cache=False,
          save="",
          batch_size=None,
          epoch=1,
          validation_steps=1,
          verbose=0,
          max_steps=None,
          validation_start_delay_secs=0,
          validation_throttle_secs=0,
          save_checkpoints_steps=100,
          log_every_n_iter=10,
          load_pretrained_model=False,
          is_pai=True,
          pai_table="",
          pai_val_table="",
          feature_columns_code="",
          model_repo_image="",
          original_sql="",
          oss_model_dir_to_load="",
          feature_column_names_map=None):
    """Train an XGBoost model on PAI, distributed when multiple workers
    are configured, otherwise locally as a single worker (rank 0).
    """
    # FIX: mutable default arguments ({} / []) are shared across calls in
    # Python; build fresh containers per call instead.
    feature_metas = {} if feature_metas is None else feature_metas
    label_meta = {} if label_meta is None else label_meta
    model_params = {} if model_params is None else model_params
    train_params = {} if train_params is None else train_params
    if validation_metrics is None:
        validation_metrics = ["Accuracy"]

    FLAGS = define_tf_flags()
    num_workers = len(FLAGS.worker_hosts.split(","))
    is_dist_train = num_workers > 1
    oss_model_dir = FLAGS.sqlflow_oss_modeldir

    if load_pretrained_model:
        oss.load_file(oss_model_dir_to_load, "my_model")

    # NOTE: in the current implementation, we are generating a transform_fn
    # from COLUMN clause. The transform_fn is executed during the process of
    # dumping the original data into DMatrix SVM file.
    transform_fn = ComposedColumnTransformer(feature_column_names,
                                             *feature_columns)

    if is_dist_train:
        dist_train(flags=FLAGS,
                   datasource=datasource,
                   select=select,
                   model_params=model_params,
                   train_params=train_params,
                   feature_metas=feature_metas,
                   feature_column_names=feature_column_names,
                   label_meta=label_meta,
                   validation_select=validation_select,
                   disk_cache=disk_cache,
                   batch_size=batch_size,
                   epoch=epoch,
                   load_pretrained_model=load_pretrained_model,
                   is_pai=True,
                   pai_train_table=pai_table,
                   pai_validate_table=pai_val_table,
                   oss_model_dir=oss_model_dir,
                   transform_fn=transform_fn,
                   feature_column_code=feature_columns_code,
                   model_repo_image=model_repo_image,
                   original_sql=original_sql)
    else:
        local_train(datasource=datasource,
                    select=select,
                    model_params=model_params,
                    train_params=train_params,
                    feature_metas=feature_metas,
                    feature_column_names=feature_column_names,
                    label_meta=label_meta,
                    validation_select=validation_select,
                    disk_cache=disk_cache,
                    batch_size=batch_size,
                    epoch=epoch,
                    load_pretrained_model=load_pretrained_model,
                    is_pai=True,
                    pai_train_table=pai_table,
                    pai_validate_table=pai_val_table,
                    rank=0,
                    nworkers=1,
                    oss_model_dir=oss_model_dir,
                    transform_fn=transform_fn,
                    feature_column_code=feature_columns_code,
                    model_repo_image=model_repo_image,
                    original_sql=original_sql)
def train_step(original_sql,
               model_image,
               estimator_string,
               datasource,
               select,
               validation_select,
               pai_table,
               pai_val_table,
               model_params,
               train_params,
               feature_column_map,
               label_column,
               save,
               load=None):
    """XGBoost train step: infer feature columns from the data, then run
    distributed training (multiple PAI workers) or local training."""
    FLAGS = define_tf_flags()
    num_workers = len(FLAGS.worker_hosts.split(","))
    is_dist_train = num_workers > 1
    oss_model_dir = FLAGS.sqlflow_oss_modeldir

    # NOTE(review): pop without a default raises KeyError when
    # "oss_path_to_load" is absent — presumably the submitter always sets
    # it; confirm against the caller.
    oss_path_to_load = train_params.pop("oss_path_to_load")
    if load:
        oss.load_file(oss_path_to_load, "my_model")

    # Derive feature columns by sampling up to 1000 rows from the data.
    conn = db.connect_with_data_source(datasource)
    fc_map_ir, fc_label_ir = infer_feature_columns(conn,
                                                   select,
                                                   feature_column_map,
                                                   label_column,
                                                   n=1000)
    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.XGBOOST)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict()) for fd in field_descs])
    label_meta = label_column.get_field_desc()[0].to_dict()

    transform_fn = ComposedColumnTransformer(
        feature_column_names, *feature_columns["feature_columns"])

    # Training hyper-parameters are popped so the remaining train_params
    # can be forwarded untouched to the training backends.
    batch_size = train_params.pop("batch_size", None)
    epoch = train_params.pop("epoch", 1)
    load_pretrained_model = True if load else False
    disk_cache = train_params.pop("disk_cache", False)

    if is_dist_train:
        dist_train(flags=FLAGS,
                   datasource=datasource,
                   select=select,
                   model_params=model_params,
                   train_params=train_params,
                   feature_metas=feature_metas,
                   feature_column_names=feature_column_names,
                   label_meta=label_meta,
                   validation_select=validation_select,
                   disk_cache=disk_cache,
                   batch_size=batch_size,
                   epoch=epoch,
                   load_pretrained_model=load_pretrained_model,
                   is_pai=True,
                   pai_train_table=pai_table,
                   pai_validate_table=pai_val_table,
                   oss_model_dir=oss_model_dir,
                   transform_fn=transform_fn,
                   feature_column_code=fc_map_ir,
                   model_repo_image=model_image,
                   original_sql=original_sql)
    else:
        # Single worker: train locally as rank 0 of 1.
        local_train(datasource=datasource,
                    select=select,
                    model_params=model_params,
                    train_params=train_params,
                    feature_metas=feature_metas,
                    feature_column_names=feature_column_names,
                    label_meta=label_meta,
                    validation_select=validation_select,
                    disk_cache=disk_cache,
                    batch_size=batch_size,
                    epoch=epoch,
                    load_pretrained_model=load_pretrained_model,
                    is_pai=True,
                    pai_train_table=pai_table,
                    pai_validate_table=pai_val_table,
                    rank=0,
                    nworkers=1,
                    oss_model_dir=oss_model_dir,
                    transform_fn=transform_fn,
                    feature_column_code=fc_map_ir,
                    model_repo_image=model_image,
                    original_sql=original_sql)
def train_step(original_sql,
               model_image,
               estimator_string,
               datasource,
               select,
               validation_select,
               model_params,
               train_params,
               validation_params,
               feature_column_map,
               label_column,
               save,
               load=None,
               pai_table=None,
               pai_val_table=None):
    """TensorFlow train step: compile feature columns, construct optimizer
    and loss objects, train the model, and save it to the database/OSS."""
    # Normalize optional parameter dicts.
    if model_params is None:
        model_params = {}
    if train_params is None:
        train_params = {}
    if validation_params is None:
        validation_params = {}

    # When warm-starting, pull the pretrained model from the DB into the
    # local "model_save" directory and point `load` at it.
    if load:
        Model.load_from_db(datasource, load)
        load = "model_save"
    else:
        load = None

    is_pai = True if pai_table else False

    fc_map = compile_ir_feature_columns(feature_column_map,
                                        EstimatorType.TENSORFLOW)
    field_descs = get_ordered_field_descs(feature_column_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])

    # no label for clustering model
    label_meta = None
    if label_column:
        label_meta = label_column.get_field_desc()[0].to_dict(
            dtype_to_string=True)

    # Map each feature-column target to the underlying field names.
    feature_column_names_map = dict()
    for target in feature_column_map:
        fclist = feature_column_map[target]
        feature_column_names_map[target] = [
            fc.get_field_desc()[0].name for fc in fclist
        ]

    # Construct optimizer objects to pass to model initializer.
    # The original model_params is serializable (do not have tf.xxx objects).
    model_params_constructed = copy.deepcopy(model_params)
    for optimizer_arg in ["optimizer", "dnn_optimizer", "linear_optimizer"]:
        if optimizer_arg in model_params_constructed:
            model_params_constructed[optimizer_arg] = get_tf_optimizer(
                model_params_constructed[optimizer_arg])

    if "loss" in model_params_constructed:
        model_params_constructed["loss"] = get_tf_loss(
            model_params_constructed["loss"])

    # extract params for training.
    verbose = train_params.get("verbose", 1)
    batch_size = train_params.get("batch_size", 1)
    epoch = train_params.get("epoch", 1)
    save_checkpoints_steps = train_params.get("save_checkpoints_steps", 100)
    max_steps = train_params.get("max_steps", None)
    # Non-positive max_steps means "no limit".
    if max_steps is not None and max_steps <= 0:
        max_steps = None

    # Validation settings; "metrics" is a comma-separated string.
    validation_metrics = validation_params.get("metrics", "Accuracy")
    validation_metrics = [v.strip() for v in validation_metrics.split(",")]
    validation_steps = validation_params.get("steps", 1)
    validation_start_delay_secs = validation_params.get("start_delay_secs", 0)
    validation_throttle_secs = validation_params.get("throttle_secs", 0)

    estimator = import_model(estimator_string)
    is_estimator = is_tf_estimator(estimator)

    # always use verbose == 1 when using PAI to get more logs
    if verbose < 1:
        verbose = 1
    set_log_level(verbose, is_estimator)

    model_params_constructed.update(fc_map)

    FLAGS = define_tf_flags()
    set_oss_environs(FLAGS)
    num_workers = len(FLAGS.worker_hosts.split(","))
    worker_id = FLAGS.task_index

    train_dataset_fn = get_dataset_fn(select,
                                      datasource,
                                      feature_column_names,
                                      feature_metas,
                                      label_meta,
                                      is_pai,
                                      pai_table,
                                      batch_size,
                                      epochs=epoch,
                                      shuffle_size=1000,
                                      num_workers=num_workers,
                                      worker_id=worker_id)
    val_dataset_fn = None
    if validation_select or pai_val_table:
        val_dataset_fn = get_dataset_fn(validation_select, datasource,
                                        feature_column_names, feature_metas,
                                        label_meta, is_pai, pai_val_table,
                                        batch_size)

    model_meta = collect_metadata(original_sql=original_sql,
                                  select=select,
                                  validation_select=validation_select,
                                  model_repo_image=model_image,
                                  class_name=estimator_string,
                                  attributes=model_params,
                                  features=feature_column_map,
                                  label=label_column)

    # FIXME(typhoonzero): avoid save model_meta twice, keras_train_and_save,
    # estimator_train_and_save also dumps model_meta to a file under cwd.
    # should only keep the model.save_to_db part.
    save_dir = "model_save"
    if not is_estimator:
        # functional model need field_metas parameter
        if isinstance(estimator, types.FunctionType):
            model_params_constructed["field_metas"] = feature_metas
        keras_train_and_save(estimator, model_params_constructed, save_dir,
                             FLAGS, train_dataset_fn, val_dataset_fn,
                             label_meta, epoch, verbose, validation_metrics,
                             validation_steps, load, model_meta, is_pai)
    else:
        estimator_train_and_save(estimator, model_params_constructed,
                                 save_dir, FLAGS, train_dataset_fn,
                                 val_dataset_fn, max_steps,
                                 validation_start_delay_secs,
                                 validation_throttle_secs,
                                 save_checkpoints_steps, validation_metrics,
                                 load, model_meta)

    # save model to DB/OSS; only the chief worker (or single worker) saves.
    model = Model(EstimatorType.TENSORFLOW, model_meta)
    if num_workers == 1 or worker_id == 0:
        saved = model.save_to_db(datasource,
                                 save,
                                 oss_model_dir=FLAGS.sqlflow_oss_modeldir)
        print("Model saved to DB: %s" % saved)
    print("Done training")