Esempio n. 1
0
    def build_predict_config(self, flags):
        """Assemble the prediction configuration from the application flags.

        A ``train_config.json`` file is looked up next to the checkpoint (or
        inside ``flags.checkpointPath`` itself when that path contains no
        ``/``).  The sequence length, the model config and -- when
        ``flags.labelEnumerateValues`` is empty -- the label values are read
        from that file.

        Every entry parsed from ``flags.advancedParameters`` is additionally
        set as an attribute on ``self``.

        Args:
            flags: parsed application flags.

        Returns:
            dict with ``preprocess_config``, ``model_config`` and
            ``predict_config`` sections.

        Raises:
            RuntimeError: if ``train_config.json`` does not exist.
        """
        running_on_pai = "PAI" in tf.__version__
        input_table = FLAGS.tables if running_on_pai else flags.inputTable
        output_table = FLAGS.outputs if running_on_pai else flags.outputTable

        checkpoint_path = flags.checkpointPath
        if '/' in checkpoint_path:
            ckp_dir = os.path.dirname(checkpoint_path)
        else:
            ckp_dir = checkpoint_path
        train_config_path = os.path.join(ckp_dir, "train_config.json")
        if not tf.gfile.Exists(train_config_path):
            raise RuntimeError("Checkpoint in {} not found".format(ckp_dir))
        predict_checkpoint_path = checkpoint_path

        with tf.gfile.Open(train_config_path, "r") as f:
            tf.logging.info("config file is {}".format(train_config_path))
            train_config_dict = json.load(f)

        first_sequence = flags.firstSequence
        second_sequence = flags.secondSequence
        append_columns = []
        if flags.appendCols:
            append_columns = flags.appendCols.split(",")

        # An explicit schema always wins; otherwise it is derived from the
        # input table on PAI, and left as given (empty) everywhere else.
        input_schema = flags.inputSchema
        if not input_schema and running_on_pai:
            wanted_columns = set([first_sequence, second_sequence])
            wanted_columns.update(append_columns)
            input_schema = get_selected_columns_schema(input_table,
                                                       wanted_columns)

        # Appended columns are carried through to the output schema.
        output_schema = flags.outputSchema
        if append_columns:
            output_schema = output_schema + "," + ",".join(append_columns)

        preprocess_section = {
            'input_schema': input_schema,
            'first_sequence': first_sequence,
            'second_sequence': second_sequence,
            'output_schema': output_schema,
            'sequence_length': train_config_dict["sequence_length"],
            'label_name': flags.labelName,
            'label_enumerate_values': (
                flags.labelEnumerateValues
                or train_config_dict["label_enumerate_values"]),
        }
        # Same dict object as in train_config_dict; it is updated in place
        # below when a vocab path is added.
        model_section = train_config_dict['_config_json']['model_config']
        predict_section = {
            'predict_checkpoint_path': predict_checkpoint_path,
            'predict_input_fp': input_table,
            'predict_output_fp': output_table,
            'predict_batch_size': flags.batchSize,
        }
        config_json = {
            'preprocess_config': preprocess_section,
            'model_config': model_section,
            'predict_config': predict_section,
        }

        if 'bert' not in model_section['model_name']:
            model_section['vocab_path'] = os.path.join(
                os.path.dirname(predict_checkpoint_path), "train_vocab.txt")

        advanced_params = get_user_defined_prams_dict(flags.advancedParameters)
        for key, val in advanced_params.items():
            setattr(self, key, val)
        return config_json
Esempio n. 2
0
    def build_preprocess_config(self, flags):
        """Assemble the preprocessing configuration from the application flags.

        Side effects on ``self``: ``model_name`` is set to ``"serialization"``,
        ``app_model_name`` is set to ``flags.modelName`` when that name is a
        registered app model and to ``"text_classify_bert"`` otherwise, and
        afterwards every entry parsed from ``flags.advancedParameters`` is set
        as an attribute (so user parameters may override the two above).

        Args:
            flags: parsed application flags.

        Returns:
            dict with a single ``preprocess_config`` section.
        """
        running_on_pai = "PAI" in tf.__version__
        first_sequence = flags.firstSequence
        second_sequence = flags.secondSequence
        label_name = flags.labelName
        input_table = FLAGS.tables if running_on_pai else flags.inputTable
        output_table = FLAGS.outputs if running_on_pai else flags.outputTable

        append_columns = []
        if flags.appendCols:
            append_columns = flags.appendCols.split(",")

        if running_on_pai:
            # On PAI the schema is derived from the columns actually used.
            wanted_columns = set([first_sequence, second_sequence, label_name])
            wanted_columns.update(append_columns)
            input_schema = get_selected_columns_schema(input_table,
                                                       wanted_columns)
        else:
            input_schema = flags.inputSchema

        # Appended columns are carried through to the output schema.
        output_schema = flags.outputSchema
        if append_columns:
            output_schema = output_schema + "," + ",".join(append_columns)

        user_param_dict = get_user_defined_prams_dict(flags.advancedParameters)
        if flags.modelName in _name_to_app_model:
            # Registered app model: tokenizer comes from the user parameters.
            tokenizer_name_or_path = user_param_dict.get(
                "tokenizer_name_or_path", "google-bert-base-zh")
            app_model_name = flags.modelName
        else:
            # Otherwise the model name itself is used as the tokenizer.
            tokenizer_name_or_path = flags.modelName
            app_model_name = "text_classify_bert"
        setattr(self, "model_name", "serialization")
        setattr(self, "app_model_name", app_model_name)

        preprocess_section = {
            "preprocess_input_fp": input_table,
            "preprocess_output_fp": output_table,
            "preprocess_batch_size": flags.batchSize,
            "sequence_length": flags.sequenceLength,
            "tokenizer_name_or_path": tokenizer_name_or_path,
            "input_schema": input_schema,
            "first_sequence": first_sequence,
            "second_sequence": second_sequence,
            "label_name": label_name,
            "label_enumerate_values":
            get_label_enumerate_values(flags.labelEnumerateValues),
            "output_schema": output_schema,
        }
        config_json = {"preprocess_config": preprocess_section}

        for key, val in user_param_dict.items():
            setattr(self, key, val)
        return config_json
Esempio n. 3
0
    def build_train_config(self, flags):
        """Assemble the training configuration from the application flags.

        The result combines the fixed flag values with the model's default
        parameters (``default_model_params()`` of the registered app model),
        where any default may be overridden through
        ``flags.advancedParameters``.  Overrides are cast to the type of the
        corresponding default value; booleans are parsed from the strings
        ``"true"`` / ``"false"`` (case-insensitive).

        Side effects on ``self``: ``shuffle_buffer_size`` and
        ``export_best_checkpoint_metric`` are always set;
        ``init_checkpoint_path`` and ``export_best_checkpoint`` are set only
        when present in the advanced parameters.

        Args:
            flags: parsed application flags.  The input table flag must hold
                exactly two comma-separated entries (train, eval).

        Returns:
            dict with ``preprocess_config``, ``model_config``,
            ``train_config`` and ``evaluate_config`` sections.

        Raises:
            NotImplementedError: if ``flags.modelName`` is not a registered
                app model.
            AssertionError: if ``export_best_checkpoint`` is given but is
                neither ``"true"`` nor ``"false"``, or (on PAI, without a
                first sequence) if ``flags.sequenceLength`` is ``None``.
        """
        # Parse input table/csv schema
        first_sequence, second_sequence, label_name = \
            flags.firstSequence, flags.secondSequence, flags.labelName
        label_enumerate_values = get_label_enumerate_values(
            flags.labelEnumerateValues)
        on_pai = "PAI" in tf.__version__
        if on_pai:
            train_input_fp, eval_input_fp = FLAGS.tables.split(",")
            if first_sequence is None:
                # No text column given: use the model's own tensor schema.
                assert flags.sequenceLength is not None
                input_schema = _name_to_app_model[
                    flags.modelName].get_input_tensor_schema(
                        sequence_length=flags.sequenceLength)
            else:
                input_schema = get_selected_columns_schema(
                    train_input_fp,
                    [first_sequence, second_sequence, label_name])
        else:
            train_input_fp, eval_input_fp = flags.inputTable.split(",")
            input_schema = flags.inputSchema
        train_input_fp = train_input_fp.strip()
        eval_input_fp = eval_input_fp.strip()

        # Parse args from APP's FLAGS
        config_json = {
            "preprocess_config": {
                "input_schema": input_schema,
                "first_sequence": first_sequence,
                "second_sequence": second_sequence,
                "sequence_length": flags.sequenceLength,
                "label_name": label_name,
                "label_enumerate_values": label_enumerate_values
            },
            "model_config": {
                "model_name": flags.modelName,
            },
            "train_config": {
                "train_input_fp": train_input_fp,
                "num_epochs": flags.numEpochs,
                "save_steps": flags.saveCheckpointSteps,
                "train_batch_size": flags.batchSize,
                "model_dir": flags.checkpointDir,
                "optimizer_config": {
                    "optimizer": flags.optimizerType,
                    "learning_rate": flags.learningRate
                },
                "distribution_config": {
                    "distribution_strategy": flags.distributionStrategy,
                }
            },
            "evaluate_config": {
                "eval_input_fp": eval_input_fp,
                "eval_batch_size": 32,
                "num_eval_steps": None
            }
        }
        model_config = config_json["model_config"]
        train_config = config_json["train_config"]
        distribution_config = train_config["distribution_config"]

        tf.logging.info(flags.advancedParameters)

        user_param_dict = get_user_defined_prams_dict(flags.advancedParameters)

        tf.logging.info(user_param_dict)
        if flags.modelName not in _name_to_app_model:
            raise NotImplementedError
        default_model_params = _name_to_app_model[
            flags.modelName].default_model_params()

        # Model parameters: user override (cast to the default's type) or
        # the default itself.
        for key, default_val in default_model_params.items():
            if key not in user_param_dict:
                model_config[key] = default_val
            elif isinstance(default_val, bool):
                # bool("false") would be True, so parse the text explicitly.
                model_config[key] = (user_param_dict[key].lower() == "true")
            else:
                model_config[key] = type(default_val)(user_param_dict[key])

        # NOTE(review): this fails if label_enumerate_values is None, yet the
        # metric selection at the end of this method has a branch for None --
        # confirm what get_label_enumerate_values can return.
        model_config["num_labels"] = len(label_enumerate_values.split(","))

        if "pretrain_model_name_or_path" in model_config:
            pretrain_model_name_or_path = model_config[
                "pretrain_model_name_or_path"]
            contrib_models_path = os.path.join(FLAGS.modelZooBasePath,
                                               "contrib_models.json")
            # The contrib-model lookup is skipped for "oss://" paths when not
            # running on PAI.
            skip_contrib_lookup = (not on_pai
                                   and "oss://" in contrib_models_path)
            if not skip_contrib_lookup and tf.gfile.Exists(
                    contrib_models_path):
                with tf.gfile.Open(contrib_models_path) as f:
                    contrib_models = json.load(f)
                if pretrain_model_name_or_path in contrib_models:
                    pretrain_model_name_or_path = contrib_models[
                        pretrain_model_name_or_path]

            model_config[
                "pretrain_model_name_or_path"] = pretrain_model_name_or_path
            config_json["preprocess_config"][
                "tokenizer_name_or_path"] = pretrain_model_name_or_path
        else:
            config_json["preprocess_config"]["tokenizer_name_or_path"] = ""

        if "num_accumulated_batches" in user_param_dict:
            distribution_config["num_accumulated_batches"] = \
                user_param_dict["num_accumulated_batches"]

        if "pull_evaluation_in_multiworkers_training" in user_param_dict:
            distribution_config["pull_evaluation_in_multiworkers_training"] = \
                (user_param_dict["pull_evaluation_in_multiworkers_training"]
                 .lower() == "true")

        # Remaining pass-through parameters, grouped by the config section
        # they belong to.  Values are copied as given (no type conversion).
        other_param_keys = {
            "train_config":
            ["throttle_secs", "keep_checkpoint_max", "log_step_count_steps"],
            "optimizer_config": [
                "weight_decay_ratio", "lr_decay", "warmup_ratio",
                "gradient_clip", "clip_norm_value"
            ],
            "evaluate_config": ["eval_batch_size", "num_eval_steps"],
        }
        for first_key, second_key_list in other_param_keys.items():
            # optimizer_config is nested inside train_config; the other
            # sections live at the top level.
            if first_key == "optimizer_config":
                section = train_config[first_key]
            else:
                section = config_json[first_key]
            for second_key in second_key_list:
                if second_key in user_param_dict:
                    section[second_key] = user_param_dict[second_key]

        if "shuffle_buffer_size" in user_param_dict:
            setattr(self, "shuffle_buffer_size",
                    int(user_param_dict["shuffle_buffer_size"]))
        else:
            setattr(self, "shuffle_buffer_size", None)

        if "init_checkpoint_path" in user_param_dict:
            setattr(self, "init_checkpoint_path",
                    user_param_dict["init_checkpoint_path"])

        if "export_best_checkpoint" in user_param_dict:
            export_flag = user_param_dict["export_best_checkpoint"].lower()
            assert export_flag in ["true", "false"]
            setattr(self, "export_best_checkpoint", export_flag == "true")

        if "export_best_checkpoint_metric" in user_param_dict:
            best_metric = user_param_dict["export_best_checkpoint_metric"]
        elif flags.modelName.startswith("text_classify"):
            best_metric = "py_accuracy"
        elif flags.modelName.startswith(
                "text_match") and label_enumerate_values is None:
            best_metric = "mse"
        else:
            best_metric = "accuracy"
        setattr(self, "export_best_checkpoint_metric", best_metric)

        return config_json
Esempio n. 4
0
    def __init__(self):
        """ Configuration adapter for `ez_bert_feat`
            It adapts user command args to configuration protocol of `ez_transfer` engine

            The configuration is read from the module-level `FLAGS` and
            `_APP_FLAGS`.  When a `train_config.json` exists in the directory
            containing the prediction checkpoint path, its `model_name` and
            `model_config` entries are merged in; explicitly set model config
            keys are never overwritten by it.

            Raises:
                AssertionError: if the first sequence is not a column of the
                    input schema, or (when not on PAI) if no input schema
                    was given.
        """
        self.mode = "predict_on_the_fly"
        if FLAGS.usePAI:
            input_table = FLAGS.tables
            output_table = FLAGS.outputs
        else:
            input_table = _APP_FLAGS.inputTable
            output_table = _APP_FLAGS.outputTable

        # Later maps take precedence on duplicate names.
        all_pretrain_model_archive_map = dict()
        for archive_map in (ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
                            BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
                            ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP):
            all_pretrain_model_archive_map.update(archive_map)

        if _APP_FLAGS.modelName not in all_pretrain_model_archive_map:
            # Not a known pretrained model name: use the value as the
            # checkpoint path itself.
            predict_checkpoint_path = _APP_FLAGS.modelName
        else:
            predict_checkpoint_path = os.path.join(
                FLAGS.modelZooBasePath,
                os.path.dirname(
                    all_pretrain_model_archive_map[_APP_FLAGS.modelName]))

        # Optional train config stored in the checkpoint path's directory.
        finetune_model_name = None
        train_model_config = None
        train_config_path = os.path.join(
            os.path.dirname(predict_checkpoint_path), "train_config.json")
        if tf.gfile.Exists(train_config_path):
            with tf.gfile.Open(train_config_path) as f:
                train_config_json = json.load(f)
            finetune_model_name = train_config_json.get("model_name")
            if "_config_json" in train_config_json:
                train_model_config = train_config_json["_config_json"][
                    "model_config"]

        if FLAGS.usePAI:
            all_input_col_names = get_all_columns_name(input_table)
        else:
            # Validate before the schema is parsed, so a missing schema is
            # reported here rather than as an AttributeError on None.
            assert _APP_FLAGS.inputSchema is not None, \
                "The input schema is required when not running on PAI"
            # Schema entries are comma-separated; the column name is the
            # part before the first ":".
            all_input_col_names = {
                t.split(":")[0] for t in _APP_FLAGS.inputSchema.split(",")
            }
        first_sequence = _APP_FLAGS.firstSequence
        assert first_sequence in all_input_col_names, "The first sequence should be in input schema"
        second_sequence = _APP_FLAGS.secondSequence
        if second_sequence not in all_input_col_names:
            second_sequence = ""
        # Unknown or empty append columns are silently dropped.
        if _APP_FLAGS.appendCols:
            append_columns = [
                t for t in _APP_FLAGS.appendCols.split(",")
                if t and t in all_input_col_names
            ]
        else:
            append_columns = []
        tf.logging.info(input_table)
        if FLAGS.usePAI:
            selected_cols_set = set([first_sequence])
            if second_sequence:
                selected_cols_set.add(second_sequence)
            selected_cols_set.update(append_columns)
            input_schema = get_selected_columns_schema(input_table,
                                                       selected_cols_set)
        else:
            input_schema = _APP_FLAGS.inputSchema
        output_schema = _APP_FLAGS.outputSchema
        for column_name in append_columns:
            output_schema += "," + column_name

        config_json = {
            'preprocess_config': {
                'input_schema': input_schema,
                'first_sequence': first_sequence,
                'second_sequence': second_sequence,
                'output_schema': output_schema,
                'sequence_length': _APP_FLAGS.sequenceLength,
                "max_predictions_per_seq": 20
            },
            'model_config': {
                'model_name': 'feat_ext_bert',
                'pretrain_model_name_or_path': _APP_FLAGS.modelName,
                'finetune_model_name': finetune_model_name,
            },
            'predict_config': {
                'predict_checkpoint_path': predict_checkpoint_path,
                'predict_batch_size': _APP_FLAGS.batchSize,
                'predict_input_fp': input_table,
                'predict_output_fp': output_table
            }
        }
        if train_model_config:
            # Fill in model settings from training without overriding the
            # keys set explicitly above.
            model_config = config_json["model_config"]
            for key, val in train_model_config.items():
                if key not in model_config:
                    model_config[str(key)] = val

        config_json["worker_hosts"] = FLAGS.worker_hosts
        config_json["task_index"] = FLAGS.task_index
        config_json["job_name"] = FLAGS.job_name
        config_json["num_gpus"] = FLAGS.workerGPU
        config_json["num_workers"] = FLAGS.workerCount
        tf.logging.info("{}".format(config_json))
        super(BertFeatConfig, self).__init__(mode="predict_on_the_fly",
                                             config_json=config_json)

        for key, val in self.__dict__.items():
            tf.logging.info("  {}: {}".format(key, val))