Example #1
    def test_process(self):
        hparam = Hparams()
        hparam.load_from_config_file('/search/odin/yyk/workspace/AiSpace/configs/custom/idiom_generator.yml')
        hparam.stand_by()
        hparam.cascade_set("model_load_path", "/search/odin/yyk/workspace/AiSpace/save/test_bert_for_text_generation_idiom__idiom_generator_119_23")
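        # build_model assembles the model together with its losses, loss weights, metrics and optimizer
        # from the hparams; model_load_path presumably points it at the trained checkpoint to restore.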
        model, (losses, loss_weights), metrics, optimizer = build_model(hparam)
        model.compile(optimizer=optimizer, loss=losses, metrics=metrics)

        tokenizer = CPMTokenizer(hparam.dataset.tokenizer)

        text = "春眠不觉晓"
        input_tokens = tokenizer.tokenize(text) + [tokenizer.vocab.sep_token]
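        # The SEP token is appended to mark the end of the prompt; the model presumably begins generating after it.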

        input_encoded = tokenizer.encode(input_tokens)

        input_ids = tf.constant([input_encoded['input_ids']], dtype=tf.int32)
        attention_mask = tf.constant([[1] * len(input_encoded['input_ids'])], dtype=tf.int32)
        input_dict = {
            "input_ids": input_ids,
            "attention_mask": attention_mask
        }
        # output = model(input_dict)
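        # generate() runs autoregressive decoding from the prompt ids; hparam.generation_attributes
        # presumably carries the decoding options (e.g. max_length) defined in the YAML config.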
        output = model.generate(input_ids, **hparam.generation_attributes)

        print(input_encoded)
        output = tokenizer.decode(output.numpy().reshape([-1]).tolist())
        print(output)
Example #2
def deploy(hparams: Hparams):
    logger = logging.getLogger(__name__)
    assert hparams.model_resume_path is not None, "Model resume path is None, must be specified."
    # reuse hparams
    model_resume_path = hparams.model_resume_path
    logger.info(f"Reuse saved json config from {os.path.join(hparams.get_workspace_dir(), 'hparams.json')}")
    hparams.reuse_saved_json_hparam()
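    # reuse_saved_json_hparam() reloads the saved hparams.json, so the cached model_resume_path is re-applied below.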
    hparams.cascade_set("model_resume_path", model_resume_path)
    # build model
    (model,) = build_model(hparams, return_losses=False, return_metrics=False,
                           return_optimizer=False)
    logger.info("Export model to deployment.")
    saved_path = model.deploy()
    logger.info(f"Save bento Service in {saved_path}")
Example #3
def load_dataset(hparams: Hparams,
                 ret_train=True,
                 ret_dev=True,
                 ret_test=True,
                 ret_info=True):
    from aispace import datasets

    train_split, validation_split, test_split = get_dataset_split(hparams)
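    # Build whichever splits were requested; dataset_info ends up coming from the last
    # split that returned one (train, then dev, then test).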
    if ret_train:
        train_datasets, dataset_info = build_dataset(hparams,
                                                     train_split,
                                                     with_info=True)
    if ret_dev:
        dev_datasets, dev_dataset_info = build_dataset(hparams,
                                                       validation_split,
                                                       with_info=True)
        if dev_dataset_info is not None:
            dataset_info = dev_dataset_info
    if ret_test:
        test_datasets, test_dataset_info = build_dataset(hparams,
                                                         test_split,
                                                         with_info=True)
        if test_dataset_info is not None:
            dataset_info = test_dataset_info

    # Check that the tokenizer used to build the dataset matches the one in use now.
    if hparams.get("dataset", {}).get("tokenizer", {}).get("name", "") != "":
        if dataset_info.metadata is None:
            logger.warning("dataset_info has no metadata attribute.")
        elif hparams.get("dataset", {}).get("tokenizer", {}).get("name", "") \
                != dataset_info.metadata.get("tokenizer", ""):
            raise ValueError(
                f'The dataset is built using tokenizer {dataset_info.metadata.get("tokenizer", "")}, '
                f'however, now is using {hparams.get("dataset", {}).get("tokenizer", {}).get("name", "")}, '
                f'please remove/rebuild the data and restart!')
        elif hparams.get("pretrained", {}).get("config", {}).get("vocab_size", 0) \
                != dataset_info.metadata.get("vocab_size", 0):
            raise ValueError(
                f'The dataset is built using tokenizer {dataset_info.metadata.get("tokenizer", "")}, '
                f'whose vocab size is {dataset_info.metadata.get("vocab_size", "xx")}, '
                f'however, now is {hparams.get("pretrained", {}).get("config", {}).get("vocab_size", 0)}, '
                f'please remove/rebuild the data and restart!')

    # data mapping
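    # build_generator routes each raw feature into either the model inputs or the label
    # outputs, based on the names declared in hparams.dataset.inputs/outputs; output
    # fields are additionally exposed to the model under their configured column name.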
    def build_generator(fields):
        input_names = [itm.get('name') for itm in hparams.dataset.inputs]
        output_names = [itm.get('name') for itm in hparams.dataset.outputs]
        output_name2column = {
            itm.get('name'): itm.get('column')
            for itm in hparams.dataset.outputs
        }
        inputs, outputs = {}, {}
        for k, v in fields.items():
            if k in input_names:
                inputs[k] = v
            elif k in output_names:
                inputs[output_name2column.get(k, k)] = v
                outputs[k] = v
            else:
                raise ValueError(f"{k} not in inputs or outputs.")
        return inputs, outputs

    training_hparams = hparams.training
    # reset some hparams
    if ret_info:
        print(dataset_info)
        # train_data_size = dataset_info.splits.get("train").num_examples
        # validation_data_size = dataset_info.splits.get("validation").num_examples
        # test_data_size = dataset_info.splits.get("test").num_examples
        # steps_per_epoch = int(train_data_size / training_hparams.batch_size)
        # num_warmup_steps = \
        #     int(
        #         training_hparams.max_epochs * train_data_size * training_hparams.warmup_factor / training_hparams.batch_size)
        # num_warmup_steps = min(steps_per_epoch, num_warmup_steps)

        # if validation_data_size is not None:
        #     validation_steps = validation_data_size // training_hparams.batch_size
        # else:
        #     validation_steps = None
        #
        # if test_data_size is not None:
        #     test_steps = test_data_size // training_hparams.batch_size
        # else:
        #     test_steps = None

    for i in range(len(train_split)):
        # build batch
        if ret_train:
            if train_datasets is not None and train_datasets[i] is not None:
                # get train_steps and reset training hparams
                logger.info(
                    "Reset training hparams according to real training data info."
                )
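                # Count examples by iterating the (still unbatched) dataset, then derive
                # steps per epoch and warmup steps from that count.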
                steps_per_epoch = 0
                for _ in train_datasets[i]:
                    steps_per_epoch += 1
                steps_per_epoch //= training_hparams.batch_size
                num_warmup_steps = \
                    int(training_hparams.max_epochs * steps_per_epoch * training_hparams.warmup_factor)
                if "num_warmup_steps" not in training_hparams or training_hparams.num_warmup_steps <= 0:
                    hparams.cascade_set('training.num_warmup_steps',
                                        num_warmup_steps)
                    logger.info(
                        f"Set training.num_warmup_steps to {num_warmup_steps}")
                else:
                    logger.info(
                        f"Get training.num_warmup_steps is {hparams.training.num_warmup_steps}"
                    )
                if "steps_per_epoch" not in training_hparams or training_hparams.steps_per_epoch <= 0:
                    hparams.cascade_set('training.steps_per_epoch',
                                        steps_per_epoch)
                    logger.info(
                        f"Set training.steps_per_epoch to {steps_per_epoch}")
                else:
                    logger.info(
                        f"Get training.steps_per_epoch is {hparams.training.steps_per_epoch}"
                    )

                # prepare train dataset
                train_dataset = train_datasets[i].\
                    map(build_generator, num_parallel_calls=tf.data.experimental.AUTOTUNE). \
                    prefetch(buffer_size=tf.data.experimental.AUTOTUNE). \
                    shuffle(hparams.training.shuffle_size).\
                    repeat(). \
                    batch(hparams.training.batch_size)

                logger.info("Train dataset has loaded.")
            else:
                train_dataset = None
                logger.info("Train dateset get None.")
        if ret_dev:
            if dev_datasets is not None and dev_datasets[i] is not None:
                logger.info(
                    "Reset validation hparams according to real validation data info."
                )
                validation_steps = 0
                for _ in dev_datasets[i]:
                    validation_steps += 1
                validation_steps //= training_hparams.batch_size
                if "validation_steps" not in training_hparams or training_hparams.validation_steps <= 0:
                    hparams.cascade_set('training.validation_steps',
                                        validation_steps)
                    logger.info(
                        f"Set training.validation_steps to {validation_steps}")
                else:
                    logger.info(
                        f"Get training.validation_steps is {hparams.training.validation_steps}"
                    )

                dev_dataset = dev_datasets[i].\
                    map(build_generator, num_parallel_calls=tf.data.experimental.AUTOTUNE). \
                    prefetch(buffer_size=tf.data.experimental.AUTOTUNE). \
                    repeat(). \
                    batch(hparams.training.batch_size)

                logger.info("Validation dataset has loaded.")
            else:
                dev_dataset = None
                logger.info("Validation dataset get None.")
        if ret_test:
            if test_datasets is not None and test_datasets[i] is not None:
                logger.info(
                    "Reset test hparams according to real test data info.")
                test_steps = 0
                for _ in test_datasets[i]:
                    test_steps += 1
                test_steps //= training_hparams.batch_size
                if "test_steps" not in training_hparams or training_hparams.test_steps <= 0:
                    hparams.cascade_set('training.test_steps', test_steps)
                    logger.info(f"Set training.test_steps to {test_steps}")
                else:
                    logger.info(
                        f"Get training.test_steps is {hparams.training.test_steps}"
                    )

                test_dataset = test_datasets[i]. \
                    map(build_generator, num_parallel_calls=tf.data.experimental.AUTOTUNE). \
                    prefetch(buffer_size=tf.data.experimental.AUTOTUNE). \
                    batch(hparams.training.batch_size)

                logger.info("Test dataset has loaded.")
            else:
                test_dataset = None
                logger.info("Test dataset get None.")

        result = ()
        if ret_train:
            result += (train_dataset, )
        if ret_dev:
            result += (dev_dataset, )
        if ret_test:
            result += (test_dataset, )

        if ret_info:
            result += (dataset_info, )

        yield result
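
Since load_dataset is a generator that yields one tuple per train split, a caller typically unpacks it as in the hedged sketch below; the unpacking order follows the ret_* flags above, and the fit() call is only illustrative.

# Hypothetical caller, assuming the default ret_* flags (all True) and a compiled
# Keras model obtained from build_model(hparams) as in the examples above.
for train_dataset, dev_dataset, test_dataset, dataset_info in load_dataset(hparams):
    model.fit(train_dataset,
              epochs=hparams.training.max_epochs,
              steps_per_epoch=hparams.training.steps_per_epoch,
              validation_data=dev_dataset,
              validation_steps=hparams.training.validation_steps)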