def build_predict_config(self, flags):
    input_table = FLAGS.tables if "PAI" in tf.__version__ else flags.inputTable
    output_table = FLAGS.outputs if "PAI" in tf.__version__ else flags.outputTable
    ckp_dir = os.path.dirname(flags.checkpointPath) \
        if '/' in flags.checkpointPath else flags.checkpointPath
    train_config_path = os.path.join(ckp_dir, "train_config.json")
    if tf.gfile.Exists(train_config_path):
        predict_checkpoint_path = flags.checkpointPath
    else:
        raise RuntimeError(
            "train_config.json not found in {}".format(ckp_dir))
    with tf.gfile.Open(train_config_path, "r") as f:
        tf.logging.info("config file is {}".format(train_config_path))
        train_config_dict = json.load(f)

    first_sequence = flags.firstSequence
    second_sequence = flags.secondSequence
    append_columns = flags.appendCols.split(",") if flags.appendCols else []
    if flags.inputSchema:
        input_schema = flags.inputSchema
    elif "PAI" in tf.__version__:
        # On PAI, derive the schema of the selected columns from the table
        input_schema = get_selected_columns_schema(
            input_table, set([first_sequence, second_sequence] + append_columns))
    else:
        input_schema = flags.inputSchema
    output_schema = flags.outputSchema
    for column_name in append_columns:
        output_schema += "," + column_name

    config_json = {
        'preprocess_config': {
            'input_schema': input_schema,
            'first_sequence': first_sequence,
            'second_sequence': second_sequence,
            'output_schema': output_schema,
            'sequence_length': train_config_dict["sequence_length"],
            'label_name': flags.labelName,
            'label_enumerate_values': flags.labelEnumerateValues
            if flags.labelEnumerateValues
            else train_config_dict["label_enumerate_values"]
        },
        'model_config': train_config_dict['_config_json']['model_config'],
        'predict_config': {
            'predict_checkpoint_path': predict_checkpoint_path,
            'predict_input_fp': input_table,
            'predict_output_fp': output_table,
            'predict_batch_size': flags.batchSize
        }
    }
    # Non-BERT models keep their vocabulary next to the checkpoint
    if 'bert' not in config_json['model_config']['model_name']:
        config_json['model_config']['vocab_path'] = os.path.join(
            os.path.dirname(predict_checkpoint_path), "train_vocab.txt")
    user_param_dict = get_user_defined_prams_dict(flags.advancedParameters)
    for key, val in user_param_dict.items():
        setattr(self, key, val)
    return config_json
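# For orientation, an illustrative shape of the dict returned by
# build_predict_config (the values below are made-up examples, not defaults;
# real values come from the flags and from train_config.json):
#
#   {
#       "preprocess_config": {
#           "input_schema": "query:str:1,title:str:1",
#           "first_sequence": "query",
#           "second_sequence": "title",
#           "output_schema": "predictions,probabilities",
#           "sequence_length": 128,
#           "label_name": "label",
#           "label_enumerate_values": "0,1"
#       },
#       "model_config": {...},  # copied verbatim from train_config.json
#       "predict_config": {
#           "predict_checkpoint_path": "oss://bucket/exp/model.ckpt-1000",
#           "predict_input_fp": "odps://project/tables/input",
#           "predict_output_fp": "odps://project/tables/output",
#           "predict_batch_size": 32
#       }
#   }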
def build_preprocess_config(self, flags):
    first_sequence, second_sequence, label_name = \
        flags.firstSequence, flags.secondSequence, flags.labelName
    input_table = FLAGS.tables if "PAI" in tf.__version__ else flags.inputTable
    output_table = FLAGS.outputs if "PAI" in tf.__version__ else flags.outputTable
    append_columns = flags.appendCols.split(",") if flags.appendCols else []
    if "PAI" in tf.__version__:
        input_schema = get_selected_columns_schema(
            input_table,
            set([first_sequence, second_sequence, label_name] + append_columns))
    else:
        input_schema = flags.inputSchema
    output_schema = flags.outputSchema
    for column_name in append_columns:
        output_schema += "," + column_name

    user_param_dict = get_user_defined_prams_dict(flags.advancedParameters)
    if flags.modelName in _name_to_app_model:
        # A registered app model: allow a tokenizer override via user params
        tokenizer_name_or_path = user_param_dict.get(
            "tokenizer_name_or_path", "google-bert-base-zh")
        setattr(self, "model_name", "serialization")
        setattr(self, "app_model_name", flags.modelName)
    else:
        # Otherwise treat modelName itself as the tokenizer name/path
        tokenizer_name_or_path = flags.modelName
        setattr(self, "model_name", "serialization")
        setattr(self, "app_model_name", "text_classify_bert")

    config_json = {
        "preprocess_config": {
            "preprocess_input_fp": input_table,
            "preprocess_output_fp": output_table,
            "preprocess_batch_size": flags.batchSize,
            "sequence_length": flags.sequenceLength,
            "tokenizer_name_or_path": tokenizer_name_or_path,
            "input_schema": input_schema,
            "first_sequence": first_sequence,
            "second_sequence": second_sequence,
            "label_name": label_name,
            "label_enumerate_values":
                get_label_enumerate_values(flags.labelEnumerateValues),
            "output_schema": output_schema
        }
    }
    for key, val in user_param_dict.items():
        setattr(self, key, val)
    return config_json
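# How the model-name branch in build_preprocess_config resolves, sketched
# with assumed flag values (membership of the first name in _name_to_app_model
# is an assumption for illustration):
#
#   flags.modelName = "text_classify_bert"    # a registered app model
#     -> app_model_name = "text_classify_bert"
#     -> tokenizer_name_or_path = user override, else "google-bert-base-zh"
#
#   flags.modelName = "google-bert-base-en"   # not a registered app model
#     -> app_model_name = "text_classify_bert" (the fallback)
#     -> tokenizer_name_or_path = "google-bert-base-en"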
def build_train_config(self, flags):
    # Parse input table/csv schema
    first_sequence, second_sequence, label_name = \
        flags.firstSequence, flags.secondSequence, flags.labelName
    label_enumerate_values = get_label_enumerate_values(
        flags.labelEnumerateValues)
    if "PAI" in tf.__version__:
        train_input_fp, eval_input_fp = FLAGS.tables.split(",")
        if first_sequence is None:
            # No explicit text columns: the app model defines its own
            # input tensor schema from the sequence length
            assert flags.sequenceLength is not None
            input_schema = _name_to_app_model[
                flags.modelName].get_input_tensor_schema(
                    sequence_length=flags.sequenceLength)
        else:
            input_schema = get_selected_columns_schema(
                train_input_fp, [first_sequence, second_sequence, label_name])
    else:
        train_input_fp, eval_input_fp = flags.inputTable.split(",")
        input_schema = flags.inputSchema
    train_input_fp, eval_input_fp = \
        train_input_fp.strip(), eval_input_fp.strip()

    # Parse args from APP's FLAGS
    config_json = {
        "preprocess_config": {
            "input_schema": input_schema,
            "first_sequence": first_sequence,
            "second_sequence": second_sequence,
            "sequence_length": flags.sequenceLength,
            "label_name": label_name,
            "label_enumerate_values": label_enumerate_values
        },
        "model_config": {
            "model_name": flags.modelName,
        },
        "train_config": {
            "train_input_fp": train_input_fp,
            "num_epochs": flags.numEpochs,
            "save_steps": flags.saveCheckpointSteps,
            "train_batch_size": flags.batchSize,
            "model_dir": flags.checkpointDir,
            "optimizer_config": {
                "optimizer": flags.optimizerType,
                "learning_rate": flags.learningRate
            },
            "distribution_config": {
                "distribution_strategy": flags.distributionStrategy,
            }
        },
        "evaluate_config": {
            "eval_input_fp": eval_input_fp,
            "eval_batch_size": 32,
            "num_eval_steps": None
        }
    }

    tf.logging.info(flags.advancedParameters)
    user_param_dict = get_user_defined_prams_dict(flags.advancedParameters)
    tf.logging.info(user_param_dict)
    if flags.modelName in _name_to_app_model:
        default_model_params = _name_to_app_model[
            flags.modelName].default_model_params()
    else:
        raise NotImplementedError(
            "Unsupported modelName: {}".format(flags.modelName))
    # Override model defaults with user-supplied values, casting each
    # string to the type of its default
    for key, default_val in default_model_params.items():
        if key in user_param_dict:
            if isinstance(default_val, bool):
                tmp_val = (user_param_dict[key].lower() == "true")
            else:
                tmp_val = type(default_val)(user_param_dict[key])
            config_json["model_config"][key] = tmp_val
        else:
            config_json["model_config"][key] = default_val
    config_json["model_config"]["num_labels"] = len(
        label_enumerate_values.split(","))

    if "pretrain_model_name_or_path" in config_json["model_config"]:
        pretrain_model_name_or_path = config_json["model_config"][
            "pretrain_model_name_or_path"]
        contrib_models_path = os.path.join(FLAGS.modelZooBasePath,
                                           "contrib_models.json")
        if "PAI" not in tf.__version__ and "oss://" in contrib_models_path:
            # Skip the contrib lookup when a local run would have to read OSS
            pass
        elif tf.gfile.Exists(contrib_models_path):
            with tf.gfile.Open(contrib_models_path) as f:
                contrib_models = json.load(f)
            if pretrain_model_name_or_path in contrib_models:
                pretrain_model_name_or_path = contrib_models[
                    pretrain_model_name_or_path]
        config_json["model_config"][
            "pretrain_model_name_or_path"] = pretrain_model_name_or_path
        config_json["preprocess_config"][
            "tokenizer_name_or_path"] = pretrain_model_name_or_path
    else:
        config_json["preprocess_config"]["tokenizer_name_or_path"] = ""

    if "num_accumulated_batches" in user_param_dict:
        config_json["train_config"]["distribution_config"]["num_accumulated_batches"] = \
            user_param_dict["num_accumulated_batches"]
    if "pull_evaluation_in_multiworkers_training" in user_param_dict:
        config_json["train_config"]["distribution_config"]["pull_evaluation_in_multiworkers_training"] = \
            (user_param_dict["pull_evaluation_in_multiworkers_training"].lower() == "true")

    other_param_keys = {
        "train_config": [
            "throttle_secs", "keep_checkpoint_max", "log_step_count_steps"
        ],
        "optimizer_config": [
            "weight_decay_ratio", "lr_decay", "warmup_ratio",
            "gradient_clip", "clip_norm_value"
        ],
        "evaluate_config": ["eval_batch_size", "num_eval_steps"],
    }
    for first_key, second_key_list in other_param_keys.items():
        for second_key in second_key_list:
            if second_key in user_param_dict:
                obj = config_json["train_config"][first_key] \
                    if first_key == "optimizer_config" else config_json[first_key]
                obj[second_key] = user_param_dict[second_key]

    if "shuffle_buffer_size" in user_param_dict:
        setattr(self, "shuffle_buffer_size",
                int(user_param_dict["shuffle_buffer_size"]))
    else:
        setattr(self, "shuffle_buffer_size", None)
    if "init_checkpoint_path" in user_param_dict:
        setattr(self, "init_checkpoint_path",
                user_param_dict["init_checkpoint_path"])
    if "export_best_checkpoint" in user_param_dict:
        assert user_param_dict["export_best_checkpoint"].lower() in [
            "true", "false"
        ]
        setattr(self, "export_best_checkpoint",
                user_param_dict["export_best_checkpoint"].lower() == "true")
    if "export_best_checkpoint_metric" in user_param_dict:
        setattr(self, "export_best_checkpoint_metric",
                user_param_dict["export_best_checkpoint_metric"])
    else:
        # Pick a sensible default metric for best-checkpoint export
        if flags.modelName.startswith("text_classify"):
            setattr(self, "export_best_checkpoint_metric", "py_accuracy")
        elif flags.modelName.startswith(
                "text_match") and label_enumerate_values is None:
            setattr(self, "export_best_checkpoint_metric", "mse")
        else:
            setattr(self, "export_best_checkpoint_metric", "accuracy")
    return config_json
def __init__(self):
    """ Configuration adapter for `ez_bert_feat`

        It adapts user command-line args to the configuration protocol
        of the `ez_transfer` engine
    """
    self.mode = "predict_on_the_fly"
    if FLAGS.usePAI:
        input_table = FLAGS.tables
        output_table = FLAGS.outputs
    else:
        input_table = _APP_FLAGS.inputTable
        output_table = _APP_FLAGS.outputTable

    # Merge the archive maps of all supported pretrained model families
    all_pretrain_model_archive_map = dict()
    all_pretrain_model_archive_map.update(ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
    all_pretrain_model_archive_map.update(BERT_PRETRAINED_MODEL_ARCHIVE_MAP)
    all_pretrain_model_archive_map.update(ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)

    if _APP_FLAGS.modelName not in all_pretrain_model_archive_map:
        # Not a known pretrained model name: treat it as a checkpoint path
        predict_checkpoint_path = _APP_FLAGS.modelName
    else:
        predict_checkpoint_path = os.path.join(
            FLAGS.modelZooBasePath,
            os.path.dirname(
                all_pretrain_model_archive_map[_APP_FLAGS.modelName]))

    # If the checkpoint comes from finetuning, recover its model config
    predict_checkpoint_dir = os.path.dirname(predict_checkpoint_path)
    finetune_model_name = None
    train_model_config = None
    train_config_path = os.path.join(predict_checkpoint_dir,
                                     "train_config.json")
    if tf.gfile.Exists(train_config_path):
        with tf.gfile.Open(train_config_path) as f:
            train_config_json = json.load(f)
        if "model_name" in train_config_json:
            finetune_model_name = train_config_json["model_name"]
        if "_config_json" in train_config_json:
            train_model_config = train_config_json["_config_json"][
                "model_config"]

    if FLAGS.usePAI:
        all_input_col_names = get_all_columns_name(input_table)
    else:
        all_input_col_names = set(
            [t.split(":")[0] for t in _APP_FLAGS.inputSchema.split(",")])
    first_sequence = _APP_FLAGS.firstSequence
    assert first_sequence in all_input_col_names, \
        "The first sequence should be in the input schema"
    second_sequence = _APP_FLAGS.secondSequence
    if second_sequence not in all_input_col_names:
        second_sequence = ""
    append_columns = [
        t for t in _APP_FLAGS.appendCols.split(",")
        if t and t in all_input_col_names
    ] if _APP_FLAGS.appendCols else []

    tf.logging.info(input_table)
    if FLAGS.usePAI:
        selected_cols_set = [first_sequence]
        if second_sequence:
            selected_cols_set.append(second_sequence)
        selected_cols_set.extend(append_columns)
        selected_cols_set = set(selected_cols_set)
        input_schema = get_selected_columns_schema(input_table,
                                                   selected_cols_set)
    else:
        assert _APP_FLAGS.inputSchema is not None
        input_schema = _APP_FLAGS.inputSchema
    output_schema = _APP_FLAGS.outputSchema
    for column_name in append_columns:
        output_schema += "," + column_name

    config_json = {
        'preprocess_config': {
            'input_schema': input_schema,
            'first_sequence': first_sequence,
            'second_sequence': second_sequence,
            'output_schema': output_schema,
            'sequence_length': _APP_FLAGS.sequenceLength,
            'max_predictions_per_seq': 20
        },
        'model_config': {
            'model_name': 'feat_ext_bert',
            'pretrain_model_name_or_path': _APP_FLAGS.modelName,
            'finetune_model_name': finetune_model_name,
        },
        'predict_config': {
            'predict_checkpoint_path': predict_checkpoint_path,
            'predict_batch_size': _APP_FLAGS.batchSize,
            'predict_input_fp': input_table,
            'predict_output_fp': output_table
        }
    }
    # Carry over model settings from the finetuned config that are not
    # already set above
    if train_model_config:
        for key, val in train_model_config.items():
            if key not in config_json["model_config"]:
                config_json["model_config"][str(key)] = val

    config_json["worker_hosts"] = FLAGS.worker_hosts
    config_json["task_index"] = FLAGS.task_index
    config_json["job_name"] = FLAGS.job_name
    config_json["num_gpus"] = FLAGS.workerGPU
    config_json["num_workers"] = FLAGS.workerCount
    tf.logging.info("{}".format(config_json))

    super(BertFeatConfig, self).__init__(
        mode="predict_on_the_fly", config_json=config_json)

    for key, val in self.__dict__.items():
        tf.logging.info("  {}: {}".format(key, val))
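# Checkpoint resolution in __init__, sketched with assumed values (the archive
# map entry and paths below are placeholders, not real entries):
#
#   _APP_FLAGS.modelName = "google-bert-base-zh"      # a known pretrained name
#     archive map entry:     "google-bert-base-zh/model.ckpt"
#     -> predict_checkpoint_path = <modelZooBasePath>/google-bert-base-zh
#
#   _APP_FLAGS.modelName = "oss://bucket/exp/model.ckpt-1000"  # not in the map
#     -> predict_checkpoint_path = the given path itself; train_config.json is
#        then looked up next to it to recover the finetuned model config.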