def submit_pai_explain(datasource,
                       select,
                       result_table,
                       model_name,
                       model_params,
                       user=""):
    """This function packs the needed params and resources into a tarball and
    submits an explain task to PAI.

    Args:
        datasource: current datasource
        select: SQL statement to get the explain data set
        result_table: the table name to save the result
        model_name: the model to load and explain
        model_params: dict, params for training, corresponding to the WITH
            clause
        user: a string to identify the user, used to load the model from the
            user's directory
    """
    params = dict(locals())
    cwd = tempfile.mkdtemp(prefix="sqlflow", dir="/tmp")
    # TODO(typhoonzero): Do **NOT** create tmp table when the select statement
    # is like: "SELECT fields,... FROM table"
    data_table = table_ops.create_tmp_table_from_select(select, datasource)
    params["data_table"] = data_table

    # format resultTable name to "db.table" to let the codegen form a
    # submitting argument of format "odps://project/tables/table_name"
    project = table_ops.get_project(datasource)
    if result_table.count(".") == 0:
        result_table = "%s.%s" % (project, result_table)

    oss_model_path = pai_model.get_oss_model_save_path(datasource,
                                                       model_name,
                                                       user=user)
    model_type, estimator = pai_model.get_oss_saved_model_type_and_estimator(
        oss_model_path, project)
    params["oss_model_path"] = oss_model_path

    label_column = model_params.get("label_col")
    params["label_column"] = label_column
    create_explain_result_table(datasource, data_table, result_table,
                                model_type, estimator, label_column)

    setup_explain_entry(params, model_type)
    prepare_archive(cwd, estimator, oss_model_path, params)

    cmd = get_pai_explain_cmd(datasource, project, oss_model_path, model_name,
                              data_table, result_table, model_type,
                              model_params,
                              "file://" + os.path.join(cwd, JOB_ARCHIVE_FILE),
                              "file://" + os.path.join(cwd, PARAMS_FILE),
                              label_column, cwd)

    submit_pai_task(cmd, datasource)
    table_ops.drop_tables([data_table], datasource)

def submit_pai_evaluate(datasource,
                        model_name,
                        select,
                        result_table,
                        model_attrs,
                        user=""):
    """Submit a PAI evaluation task

    Args:
        datasource: current datasource
        model_name: model used to do the evaluation
        select: SQL statement to get the evaluation data set
        result_table: the table name to save the result
        model_attrs: dict, params for training, corresponding to the WITH
            clause
        user: a string to identify the user, used to load the model from the
            user's directory
    """
    params = dict(locals())
    cwd = tempfile.mkdtemp(prefix="sqlflow", dir="/tmp")

    project = table_ops.get_project(datasource)
    if result_table.count(".") == 0:
        result_table = "%s.%s" % (project, result_table)

    oss_model_path = pai_model.get_oss_model_save_path(datasource,
                                                       model_name,
                                                       user=user)
    params["oss_model_path"] = oss_model_path

    model_type, estimator = pai_model.get_oss_saved_model_type_and_estimator(
        oss_model_path, project)
    if model_type == EstimatorType.PAIML:
        raise SQLFlowDiagnostic("PAI model evaluation is not supported yet.")

    data_table = table_ops.create_tmp_table_from_select(select, datasource)
    params["data_table"] = data_table

    metrics = get_evaluate_metrics(model_type, model_attrs)
    params["metrics"] = metrics
    create_evaluate_result_table(datasource, result_table, metrics)

    conf = cluster_conf.get_cluster_config(model_attrs)

    if model_type == EstimatorType.XGBOOST:
        params["entry_type"] = "evaluate_xgb"
    else:
        params["entry_type"] = "evaluate_tf"
    prepare_archive(cwd, estimator, oss_model_path, params)

    cmd = get_pai_tf_cmd(conf,
                         "file://" + os.path.join(cwd, JOB_ARCHIVE_FILE),
                         "file://" + os.path.join(cwd, PARAMS_FILE),
                         ENTRY_FILE, model_name, oss_model_path, data_table,
                         "", result_table, project)
    submit_pai_task(cmd, datasource)
    table_ops.drop_tables([data_table], datasource)

def submit_pai_explain(datasource,
                       original_sql,
                       select,
                       model_name,
                       model_params,
                       result_table,
                       explainer="TreeExplainer",
                       user=""):
    """This function packs the needed params and resources into a tarball and
    submits an explain task to PAI.

    Args:
        datasource: string
            Like: maxcompute://ak:[email protected]/api?
                  curr_project=test_ci&scheme=http
        original_sql: string
            Original "TO EXPLAIN" statement.
        select: string
            SQL statement to get the explain data set.
        model_name: string
            Model to load and explain.
        model_params: dict
            Params for training, corresponding to the WITH clause.
        result_table: string
            The table name to save the explain result.
        explainer: string
            The explainer to use, default "TreeExplainer".
        user: string
            A string to identify the user, used to load the model from the
            user's directory.
    """
    params = dict(locals())
    cwd = tempfile.mkdtemp(prefix="sqlflow", dir="/tmp")
    # TODO(typhoonzero): Do **NOT** create tmp table when the select statement
    # is like: "SELECT fields,... FROM table"
    data_table = table_ops.create_tmp_table_from_select(select, datasource)
    params["data_table"] = data_table
    params["explainer"] = explainer

    # format resultTable name to "db.table" to let the codegen form a
    # submitting argument of format "odps://project/tables/table_name"
    project = table_ops.get_project(datasource)
    if result_table.count(".") == 0:
        result_table = "%s.%s" % (project, result_table)
    params["result_table"] = result_table

    oss_model_path = pai_model.get_oss_model_save_path(datasource,
                                                       model_name,
                                                       user=user)
    params["oss_model_path"] = oss_model_path
    model_type, estimator = pai_model.get_oss_saved_model_type_and_estimator(
        oss_model_path, project)
    params["load"] = model_name

    label_column = model_params.get("label_col")
    params["label_column"] = label_column
    create_explain_result_table(datasource, data_table, result_table,
                                model_type, estimator, label_column)

    setup_explain_entry(params, model_type)
    prepare_archive(cwd, estimator, oss_model_path, params)

    cmd = get_pai_explain_cmd(datasource, project, oss_model_path, model_name,
                              data_table, result_table, model_type,
                              model_params,
                              "file://" + os.path.join(cwd, JOB_ARCHIVE_FILE),
                              "file://" + os.path.join(cwd, PARAMS_FILE),
                              label_column, cwd)

    submit_pai_task(cmd, datasource)
    table_ops.drop_tables([data_table], datasource)

def submit_pai_train(datasource,
                     original_sql,
                     select,
                     validation_select,
                     estimator_string,
                     model_image,
                     feature_column_map,
                     label_column,
                     model_params,
                     train_params,
                     save,
                     load,
                     user=""):
    """This function submits a PAI-TF train task to the PAI platform.

    Args:
        datasource: string
            Like: maxcompute://ak:[email protected]/api?
                  curr_project=test_ci&scheme=http
        original_sql: string
            Original statement used to generate the train code.
        select: string
            The SQL statement for selecting the training data.
        validation_select: string
            The SQL statement for selecting the validation data.
        estimator_string: string
            TensorFlow estimator name, Keras class name, or XGBoost.
        model_image: string
            Docker image used to train the model. If empty, the default image
            sqlflow/sqlflow:step is used.
        feature_column_map: dict
            A dict whose key is the Estimator/Keras Model param name and whose
            value is a runtime.feature.column.
        label_column: runtime.feature.column.FeatureColumn
            FeatureColumn describing the label.
        model_params: dict
            Params to construct the estimator/Keras Model.
        train_params: dict
            Params used to run the training.
        save: string
            Model name to save.
        load: string
            The pre-trained model name to load before training.
        user: string
            A string to identify the user, used to store models in the user's
            directory.
    """
    # prepare params for the call to runtime.pai.xxx_submitter.train_step(...);
    # the params will be pickled into train_params.pkl
    params = dict(locals())

    if estimator_string.lower().startswith("xgboost"):
        params["entry_type"] = "train_xgb"
    else:
        params["entry_type"] = "train_tf"

    cwd = tempfile.mkdtemp(prefix="sqlflow", dir="/tmp")

    train_table, val_table = table_ops.create_train_and_eval_tmp_table(
        select, validation_select, datasource)
    params["pai_table"], params["pai_val_table"] = train_table, val_table

    # clean target dir
    oss_path_to_save = pai_model.get_oss_model_save_path(datasource,
                                                         save,
                                                         user=user)
    oss_path_to_load = pai_model.get_oss_model_save_path(datasource,
                                                         load,
                                                         user=user)
    if oss_path_to_load == "" or oss_path_to_load != oss_path_to_save:
        pai_model.clean_oss_model_path(oss_path_to_save + "/")
    train_params["oss_path_to_load"] = oss_path_to_load

    # zip all required resources into a tarball
    prepare_archive(cwd, estimator_string, oss_path_to_save, params)

    # submit the PAI task to execute the training
    cmd = get_pai_train_cmd(datasource, estimator_string, save, train_table,
                            val_table, model_params, train_params,
                            oss_path_to_save,
                            "file://" + os.path.join(cwd, JOB_ARCHIVE_FILE),
                            "file://" + os.path.join(cwd, PARAMS_FILE), cwd)

    submit_pai_task(cmd, datasource)
    table_ops.drop_tables([train_table, val_table], datasource)

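# --- Hypothetical usage sketch (not part of the original source). It only
# illustrates how the submit_pai_train variant above might be invoked; the
# datasource string, SQL statements, and model parameters are made-up
# placeholders, and `feature_column_map` / `label_fc` stand for the
# runtime.feature.column objects the caller would build elsewhere.
def example_submit_train(feature_column_map, label_fc):
    submit_pai_train(
        datasource=("maxcompute://ak:[email protected]/api?"
                    "curr_project=test_ci&scheme=http"),
        original_sql="SELECT * FROM iris.train TO TRAIN DNNClassifier ...",
        select="SELECT * FROM iris.train",
        validation_select="SELECT * FROM iris.test",
        estimator_string="DNNClassifier",
        model_image="",  # empty: fall back to the default step image
        feature_column_map=feature_column_map,
        label_column=label_fc,
        model_params={"n_classes": 3, "hidden_units": [10, 10]},
        train_params={"epoch": 10},
        save="my_iris_dnn_model",
        load="",  # no pre-trained model to load
        user="")
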
def submit_pai_evaluate(datasource,
                        original_sql,
                        select,
                        label_name,
                        model,
                        model_params,
                        result_table,
                        user=""):
    """Submit a PAI evaluation task

    Args:
        datasource: string
            Like: maxcompute://ak:[email protected]/api?
                  curr_project=test_ci&scheme=http
        original_sql: string
            Original "TO EVALUATE" statement.
        select: string
            SQL statement to get the evaluation data set.
        label_name: string
            The label name to evaluate.
        model: string
            Model to load and evaluate.
        model_params: dict
            Params for training, corresponding to the WITH clause.
        result_table: string
            The table name to save the evaluation result.
        user: string
            A string to identify the user, used to load the model from the
            user's directory.
    """
    params = dict(locals())

    project = table_ops.get_project(datasource)
    if result_table.count(".") == 0:
        result_table = "%s.%s" % (project, result_table)
    params["result_table"] = result_table

    oss_model_path = pai_model.get_oss_model_save_path(datasource,
                                                       model,
                                                       user=user)

    model_type, estimator = pai_model.get_saved_model_type_and_estimator(
        datasource, model)
    if model_type == EstimatorType.PAIML:
        raise SQLFlowDiagnostic("PAI model evaluation is not supported yet.")

    if model_type == EstimatorType.XGBOOST:
        params["entry_type"] = "evaluate_xgb"
        validation_metrics = model_params.get("validation.metrics",
                                              "accuracy_score")
    else:
        params["entry_type"] = "evaluate_tf"
        validation_metrics = model_params.get("validation.metrics",
                                              "Accuracy")

    validation_metrics = [m.strip() for m in validation_metrics.split(",")]
    with db.connect_with_data_source(datasource) as conn:
        result_column_names = create_evaluate_table(conn, result_table,
                                                    validation_metrics)

    with table_ops.create_tmp_tables_guard(select, datasource) as data_table:
        params["pai_table"] = data_table
        params["result_column_names"] = result_column_names

        if try_pai_local_run(params, oss_model_path):
            return

        conf = cluster_conf.get_cluster_config(model_params)
        with temp_file.TemporaryDirectory(prefix="sqlflow", dir="/tmp") as cwd:
            prepare_archive(cwd, estimator, oss_model_path, params)
            cmd = get_pai_tf_cmd(
                conf, "file://" + os.path.join(cwd, JOB_ARCHIVE_FILE),
                "file://" + os.path.join(cwd, PARAMS_FILE), ENTRY_FILE, model,
                oss_model_path, data_table, "", result_table, project)
            submit_pai_task(cmd, datasource)

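# --- Hypothetical usage sketch (not part of the original source). Shows one
# plausible call to the submit_pai_evaluate variant above; the datasource
# string, table names, model name, and metric list are placeholders.
def example_submit_evaluate():
    submit_pai_evaluate(
        datasource=("maxcompute://ak:[email protected]/api?"
                    "curr_project=test_ci&scheme=http"),
        original_sql=("SELECT * FROM iris.test "
                      "TO EVALUATE my_iris_dnn_model ..."),
        select="SELECT * FROM iris.test",
        label_name="class",
        model="my_iris_dnn_model",
        model_params={"validation.metrics": "Accuracy"},
        result_table="iris.evaluate_result",
        user="")
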
def submit_pai_train(datasource, estimator_string, select, validation_select,
                     model_params, save, load, **train_params):
    """This function submits a PAI-TF train task to the PAI platform.

    Args:
        datasource: string
            Like: odps://access_id:[email protected]/api?
                  curr_project=test_ci&scheme=http
        estimator_string: string
            TensorFlow estimator name, Keras class name, or XGBoost
        select: string
            The SQL statement for selecting the training data
        validation_select: string
            The SQL statement for selecting the validation data
        model_params: dict
            Params for training, corresponding to the WITH clause
        save: string
            Model name to save
        load: string
            The pre-trained model name to load
        train_params: dict
            Extra train params, will be passed to runtime.tensorflow.train.
    """
    # prepare params for the TensorFlow train entry;
    # the params will be pickled into train_params.pkl
    params = dict(locals())
    del params["train_params"]
    params.update(train_params)

    if estimator_string.lower().startswith("xgboost"):
        params["entry_type"] = "train_xgb"
    else:
        params["entry_type"] = "train_tf"

    cwd = tempfile.mkdtemp(prefix="sqlflow", dir="/tmp")

    train_table, val_table = table_ops.create_train_and_eval_tmp_table(
        select, validation_select, datasource)
    params["pai_table"], params["pai_val_table"] = train_table, val_table

    # FIXME(typhoonzero): get user from session
    user = ""
    if "user" in params:
        user = params["user"]

    # clean target dir
    path_to_save = pai_model.get_oss_model_save_path(datasource,
                                                     save,
                                                     user=user)
    path_to_load = pai_model.get_oss_model_save_path(datasource,
                                                     load,
                                                     user=user)
    params["oss_model_dir"] = path_to_save

    if path_to_load == "" or path_to_load != path_to_save:
        pai_model.clean_oss_model_path(path_to_save + "/")

    # zip all required resources into a tarball
    prepare_archive(cwd, estimator_string, path_to_save, params)

    # submit the PAI task to execute the training
    cmd = get_pai_train_cmd(datasource, estimator_string, save, train_table,
                            val_table, model_params, train_params,
                            path_to_save,
                            "file://" + os.path.join(cwd, JOB_ARCHIVE_FILE),
                            "file://" + os.path.join(cwd, PARAMS_FILE), cwd)

    submit_pai_task(cmd, datasource)

    # save the trained model to sqlfs
    pai_model.save_model_to_sqlfs(datasource, path_to_save, save)
    table_ops.drop_tables([train_table, val_table], datasource)

def submit_pai_evaluate(datasource,
                        original_sql,
                        select,
                        model_name,
                        model_params,
                        result_table,
                        user=""):
    """Submit a PAI evaluation task

    Args:
        datasource: string
            Like: maxcompute://ak:[email protected]/api?
                  curr_project=test_ci&scheme=http
        original_sql: string
            Original "TO EVALUATE" statement.
        select: string
            SQL statement to get the evaluation data set.
        model_name: string
            Model to load and evaluate.
        model_params: dict
            Params for training, corresponding to the WITH clause.
        result_table: string
            The table name to save the evaluation result.
        user: string
            A string to identify the user, used to load the model from the
            user's directory.
    """
    params = dict(locals())
    cwd = tempfile.mkdtemp(prefix="sqlflow", dir="/tmp")

    project = table_ops.get_project(datasource)
    if result_table.count(".") == 0:
        result_table = "%s.%s" % (project, result_table)
    params["result_table"] = result_table

    oss_model_path = pai_model.get_oss_model_save_path(datasource,
                                                       model_name,
                                                       user=user)
    params["oss_model_path"] = oss_model_path

    model_type, estimator = pai_model.get_oss_saved_model_type_and_estimator(
        oss_model_path, project)
    if model_type == EstimatorType.PAIML:
        raise SQLFlowDiagnostic("PAI model evaluation is not supported yet.")

    data_table = table_ops.create_tmp_table_from_select(select, datasource)
    params["data_table"] = data_table

    metrics = get_evaluate_metrics(model_type, model_params)
    params["metrics"] = metrics
    create_evaluate_result_table(datasource, result_table, metrics)

    conf = cluster_conf.get_cluster_config(model_params)

    if model_type == EstimatorType.XGBOOST:
        params["entry_type"] = "evaluate_xgb"
    else:
        params["entry_type"] = "evaluate_tf"
    prepare_archive(cwd, estimator, oss_model_path, params)

    cmd = get_pai_tf_cmd(conf,
                         "file://" + os.path.join(cwd, JOB_ARCHIVE_FILE),
                         "file://" + os.path.join(cwd, PARAMS_FILE),
                         ENTRY_FILE, model_name, oss_model_path, data_table,
                         "", result_table, project)
    submit_pai_task(cmd, datasource)
    table_ops.drop_tables([data_table], datasource)

def submit_pai_predict(datasource, original_sql, select, model, label_name,
                       model_params, result_table, user=""):
    """This function packs the needed params and resources into a tarball and
    submits a prediction task to PAI.

    Args:
        datasource: string
            Like: maxcompute://ak:[email protected]/api?
                  curr_project=test_ci&scheme=http
        original_sql: string
            Original "TO PREDICT" statement.
        select: string
            SQL statement to get the prediction data set.
        model: string
            Model to load and do prediction.
        label_name: string
            Name of the label column, if it does not exist in the select
            statement.
        model_params: dict
            Params for training, corresponding to the WITH clause.
        result_table: string
            The table name to save the prediction result.
        user: string
            A string to identify the user, used to load the model from the
            user's directory.
    """
    params = dict(locals())

    # format resultTable name to "db.table" to let the codegen form a
    # submitting argument of format "odps://project/tables/table_name"
    project = table_ops.get_project(datasource)
    if result_table.count(".") == 0:
        result_table = "%s.%s" % (project, result_table)

    model_type, estimator = \
        pai_model.get_saved_model_type_and_estimator(datasource, model)
    setup_predict_entry(params, model_type)

    oss_model_path = pai_model.get_oss_model_save_path(datasource,
                                                       model,
                                                       user=user)

    # TODO(typhoonzero): Do **NOT** create tmp table when the select statement
    # is like: "SELECT fields,... FROM table"
    with table_ops.create_tmp_tables_guard(select, datasource) as data_table:
        params["pai_table"] = data_table
        params["oss_model_path"] = oss_model_path
        params["model"] = ""

        if try_pai_local_run(params, oss_model_path):
            return

        with temp_file.TemporaryDirectory(prefix="sqlflow", dir="/tmp") as cwd:
            prepare_archive(cwd, estimator, oss_model_path, params)
            cmd = get_pai_predict_cmd(
                datasource, project, oss_model_path, model, data_table,
                result_table, model_type, model_params,
                "file://" + os.path.join(cwd, JOB_ARCHIVE_FILE),
                "file://" + os.path.join(cwd, PARAMS_FILE))
            submit_pai_task(cmd, datasource)

def submit_pai_predict(datasource, original_sql, select, model, label_name,
                       pred_params, result_table, user=""):
    """This function packs the needed params and resources into a tarball and
    submits a prediction task to PAI.

    Args:
        datasource: string
            Like: maxcompute://ak:[email protected]/api?
                  curr_project=test_ci&scheme=http
        original_sql: string
            Original "TO PREDICT" statement.
        select: string
            SQL statement to get the prediction data set.
        model: string
            Model to load and do prediction.
        label_name: string
            Name of the label column, if it does not exist in the select
            statement.
        pred_params: dict
            Params for the prediction, corresponding to the WITH clause.
        result_table: string
            The table name to save the prediction result.
        user: string
            A string to identify the user, used to load the model from the
            user's directory.
    """
    params = dict(locals())

    # format resultTable name to "db.table" to let the codegen form a
    # submitting argument of format "odps://project/tables/table_name"
    project = table_ops.get_project(datasource)
    if result_table.count(".") == 0:
        result_table = "%s.%s" % (project, result_table)

    model_metas = Model.load_metadata_from_db(datasource, model)
    model_type = model_metas.get_type()
    estimator = model_metas.get_meta("class_name")
    setup_predict_entry(params, model_type)

    train_label = model_metas.get_meta("label")
    if train_label is not None:
        train_label_desc = train_label.get_field_desc()[0]
    else:
        train_label_desc = None

    if pred_params is None:
        extra_result_cols = []
    else:
        extra_result_cols = pred_params.get("predict.extra_outputs", "")
        extra_result_cols = [
            c.strip() for c in extra_result_cols.split(",") if c.strip()
        ]

    with db.connect_with_data_source(datasource) as conn:
        result_column_names, train_label_idx = create_predict_table(
            conn, select, result_table, train_label_desc, label_name,
            extra_result_cols)

    oss_model_path = pai_model.get_oss_model_save_path(datasource,
                                                       model,
                                                       user=user)

    # TODO(typhoonzero): Do **NOT** create tmp table when the select statement
    # is like: "SELECT fields,... FROM table"
    with table_ops.create_tmp_tables_guard(select, datasource) as data_table:
        del params["label_name"]
        params["pai_table"] = data_table
        params["result_column_names"] = result_column_names
        params["train_label_idx"] = train_label_idx
        params["extra_result_cols"] = extra_result_cols

        if try_pai_local_run(params, oss_model_path):
            return

        with temp_file.TemporaryDirectory(prefix="sqlflow", dir="/tmp") as cwd:
            prepare_archive(cwd, estimator, oss_model_path, params)
            cmd = get_pai_predict_cmd(
                datasource, project, oss_model_path, model, data_table,
                result_table, model_type, pred_params,
                "file://" + os.path.join(cwd, JOB_ARCHIVE_FILE),
                "file://" + os.path.join(cwd, PARAMS_FILE))
            submit_pai_task(cmd, datasource)

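# --- Hypothetical usage sketch (not part of the original source). Illustrates
# a call to the metadata-driven submit_pai_predict variant above; all literal
# values are placeholders, and "predict.extra_outputs" is set only to show how
# extra result columns would be requested through pred_params.
def example_submit_predict():
    submit_pai_predict(
        datasource=("maxcompute://ak:[email protected]/api?"
                    "curr_project=test_ci&scheme=http"),
        original_sql=("SELECT * FROM iris.test "
                      "TO PREDICT iris.predict_result.class "
                      "USING my_iris_dnn_model"),
        select="SELECT * FROM iris.test",
        model="my_iris_dnn_model",
        label_name="class",
        pred_params={"predict.extra_outputs": ""},
        result_table="iris.predict_result",
        user="")
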
def submit_pai_explain(datasource,
                       original_sql,
                       select,
                       model,
                       model_params,
                       result_table,
                       explainer="TreeExplainer",
                       user=""):
    """This function packs the needed params and resources into a tarball and
    submits an explain task to PAI.

    Args:
        datasource: string
            Like: maxcompute://ak:[email protected]/api?
                  curr_project=test_ci&scheme=http
        original_sql: string
            Original "TO EXPLAIN" statement.
        select: string
            SQL statement to get the explain data set.
        model: string
            Model to load and explain.
        model_params: dict
            Params for training, corresponding to the WITH clause.
        result_table: string
            The table name to save the explain result.
        explainer: string
            The explainer to use, default "TreeExplainer".
        user: string
            A string to identify the user, used to load the model from the
            user's directory.
    """
    params = dict(locals())

    # format resultTable name to "db.table" to let the codegen form a
    # submitting argument of format "odps://project/tables/table_name"
    project = table_ops.get_project(datasource)
    if result_table:
        if result_table.count(".") == 0:
            result_table = "%s.%s" % (project, result_table)
        params["result_table"] = result_table

    # used to save the explain image
    timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    params["oss_dest"] = "explain_images/%s/%s" % (user, timestamp)
    add_env_to_params(params, "SQLFLOW_OSS_AK", "oss_ak")
    add_env_to_params(params, "SQLFLOW_OSS_SK", "oss_sk")
    add_env_to_params(params, "SQLFLOW_OSS_ALISA_ENDPOINT", "oss_endpoint")
    add_env_to_params(params, "SQLFLOW_OSS_ALISA_BUCKET", "oss_bucket_name")

    meta = Model.load_metadata_from_db(datasource, model)
    model_type = meta.get_type()
    estimator = meta.get_meta("class_name")
    label_name = model_params.get("label_col")
    if label_name is None:
        label_column = meta.get_meta("label")
        if label_column is not None:
            label_name = label_column.get_field_desc()[0].name

    setup_explain_entry(params, model_type)

    oss_model_path = pai_model.get_oss_model_save_path(datasource,
                                                       model,
                                                       user=user)

    # TODO(typhoonzero): Do **NOT** create tmp table when the select statement
    # is like: "SELECT fields,... FROM table"
    with table_ops.create_tmp_tables_guard(select, datasource) as data_table:
        params["pai_table"] = data_table

        # Create the explain result table
        if result_table:
            conn = db.connect_with_data_source(datasource)
            feature_columns = meta.get_meta("features")
            estimator_string = meta.get_meta("class_name")
            field_descs = get_ordered_field_descs(feature_columns)
            feature_column_names = [fd.name for fd in field_descs]
            create_explain_table(conn, meta.get_type(), explainer,
                                 estimator_string, result_table,
                                 feature_column_names)
            conn.close()

        if not try_pai_local_run(params, oss_model_path):
            with temp_file.TemporaryDirectory(prefix="sqlflow",
                                              dir="/tmp") as cwd:
                prepare_archive(cwd, estimator, oss_model_path, params)
                cmd = get_pai_explain_cmd(
                    datasource, project, oss_model_path, model, data_table,
                    result_table, model_type, model_params,
                    "file://" + os.path.join(cwd, JOB_ARCHIVE_FILE),
                    "file://" + os.path.join(cwd, PARAMS_FILE), label_name)
                submit_pai_task(cmd, datasource)

    if result_table:
        print('Saved result into: {}'.format(result_table))
    else:
        print_oss_image(params["oss_dest"], params["oss_ak"],
                        params["oss_sk"], params["oss_endpoint"],
                        params["oss_bucket_name"])

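# --- Hypothetical usage sketch (not part of the original source). Shows how
# the final submit_pai_explain variant might be called; when result_table is
# left empty, the code above falls back to printing the explain image stored
# on OSS, so both call styles are sketched. All literal values (datasource,
# tables, model name, label_col) are placeholders.
def example_submit_explain(save_to_table=True):
    submit_pai_explain(
        datasource=("maxcompute://ak:[email protected]/api?"
                    "curr_project=test_ci&scheme=http"),
        original_sql=("SELECT * FROM boston.train "
                      "TO EXPLAIN my_xgb_model WITH label_col=price"),
        select="SELECT * FROM boston.train",
        model="my_xgb_model",
        model_params={"label_col": "price"},
        result_table="boston.explain_result" if save_to_table else "",
        explainer="TreeExplainer",
        user="")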