Example #1
    def test_process(self):
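        # Load the idiom-generator config, restore a trained checkpoint,
        # and generate a continuation for a Chinese prompt.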
        hparam = Hparams()
        hparam.load_from_config_file('/search/odin/yyk/workspace/AiSpace/configs/custom/idiom_generator.yml')
        hparam.stand_by()
        hparam.cascade_set("model_load_path", "/search/odin/yyk/workspace/AiSpace/save/test_bert_for_text_generation_idiom__idiom_generator_119_23")
        model, (losses, loss_weights), metrics, optimizer = build_model(hparam)
        model.compile(optimizer=optimizer, loss=losses, metrics=metrics)

        tokenizer = CPMTokenizer(hparam.dataset.tokenizer)

        input = "春眠不觉晓"
        input_tokens = tokenizer.tokenize(input) + [tokenizer.vocab.sep_token]

        input_encoded = tokenizer.encode(input_tokens)

        input_ids = tf.constant([input_encoded['input_ids']], dtype=tf.int32)
        attention_mask = tf.constant([[1] * len(input_encoded['input_ids'])], dtype=tf.int32)
        input_dict = {
            "input_ids": input_ids,
            "attention_mask": attention_mask
        }
        # output = model(input_dict)
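        # generation_attributes presumably holds the decoding settings
        # (e.g. max length, sampling options) defined in the YAML config.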
        output = model.generate(input_ids, **hparam.generation_attributes)

        print(input_encoded)
        output = tokenizer.decode(output.numpy().reshape([-1]).tolist())
        print(output)
Example #2
    def test_gpt2_checkpoint(self):
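        # Build the model from the test_gpt2 config and print the name
        # and shape of every trainable variable.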
        hparam = Hparams()
        hparam.load_from_config_file('/search/odin/yyk/workspace/AiSpace/configs/custom/test_gpt2.yml')
        # hparam.load_from_config_file('/search/data1/yyk/workspace/AiSpace/configs/glue_zh/tnews.yml')
        # hparam.load_from_config_file('/search/data1/yyk/workspace/AiSpace/configs/glue_zh/cmrc2018.yml')
        hparam.stand_by()

        model_path = "/search/odin/yyk/data/pretrained/gpt/cpm-lm-tf2_v2"
        # model1 = tf.keras.models.load_model(model_path)
        # model_gold = model1.trainable_variables

        # ckpt_vars = [itm for itm in tf.train.list_variables(ckpt) if itm[0].find('adam') == -1]
        # ckpt_vars = [itm for itm in tf.train.list_variables(hparam.pretrained.model_path) if itm[0].find('adam') == -1]

        model, (losses, loss_weights), metrics, optimizer = build_model(hparam)
        model.compile(optimizer=optimizer, loss=losses, metrics=metrics)
        model_vars = model.trainable_variables

        for itm in model_vars:
            print(f"{itm.name}, {itm.shape}")
            # print(itm.numpy())
            # print(type(itm.numpy()))
            # break
        print()
Example #3
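    # Evaluate previously trained k-fold TextCNN checkpoints via evaluation().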
    def test_eval(self):
        hparams = Hparams()
        hparams.load_from_config_file("../../configs/glue_zh/tnews_k_fold.yml")
        hparams.stand_by()
        ckpts = [
            "../../save/test_textcnn_for_classification_119_14/k_fold/1/model_saved/model",
            "../../save/test_textcnn_for_classification_119_14/k_fold/2/model_saved/model",
        ]
        evaluation(hparams, checkpoints=ckpts)
Example #4
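    # Adapt pretrained CPM (GPT-2) checkpoint weights into the freshly built
    # model's trainable variables via tf_huggingface_gpt2_adapter.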
    def test_process(self):
        hparam = Hparams()
        hparam.load_from_config_file(
            '/search/odin/yyk/workspace/AiSpace/configs/custom/test_gpt2.yml')
        hparam.stand_by()
        model, (losses, loss_weights), metrics, optimizer = build_model(hparam)
        model.compile(optimizer=optimizer, loss=losses, metrics=metrics)
        model_vars = model.trainable_variables
        model_path = "/search/odin/yyk/data/pretrained/gpt/cpm-lm-tf2_v2"
        tf_huggingface_gpt2_adapter(model_vars, model_path)
Example #5
    def test_init(self):
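        # Encode a Chinese sentence pair with the CPM tokenizer and print the result.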
        hparams = Hparams()
        hparams.load_from_config_file("../../../configs/custom/test_gpt2.yml")
        hparams.stand_by()
        tokenizer = CPMTokenizer(hparams.dataset.tokenizer)

        a = "这两天,XLNet貌似也引起了NLP圈的极大关注,从实验数据看,在某些场景下,确实XLNet相对Bert有很大幅度的提升。"
        b = "就像我们之前说的,感觉Bert打开两阶段模式的魔法盒开关后,在这条路上,会有越来越多的同行者,而XLNet就是其中比较引人注目的一位"

        res = tokenizer.encode(a, b)
        print(res)
Example #6
    def test_electra_checkpoint(self):
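        # Load the tnews config, list the pretrained checkpoint's variables,
        # and build the model for comparison.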
        hparam = Hparams()
        hparam.load_from_config_file('/search/data1/yyk/workspace/AiSpace/configs/glue_zh/tnews.yml')
        hparam.stand_by()

        # ckpt = "/search/data1/yyk/workspace/projects/ERNIE/ernie/checkpoints"
        # ckpt_vars = [itm for itm in tf.train.list_variables(ckpt) if itm[0].find('adam') == -1]
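        # Keep only model weights: skip Adam and LAMB optimizer slot variables (lamb_m / lamb_v).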
        ckpt_vars = [itm for itm in tf.train.list_variables(hparam.pretrained.model_path)
                     if itm[0].find('adam') == -1 and not itm[0].endswith("lamb_m") and not itm[0].endswith("lamb_v")]

        model, (losses, loss_weights), metrics, optimizer = build_model(hparam)
        model.compile(optimizer=optimizer, loss=losses, metrics=metrics)

        model_vars = model.trainable_variables

        print()
Example #7
    def test_lstc_load(self):
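        # Register dataset checksums, load the last 10% of the idiom_generator
        # training split, and count its examples.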
        hparams = Hparams()
        hparams.load_from_config_file("../configs/custom/test_gpt2.yml")
        hparams.stand_by()
        checksum_dir = "../aispace/datasets/url_checksums"
        tfds.download.add_checksums_dir(checksum_dir)
        # download_config = DownloadConfig(register_checksums=True)
        dataset = tfds.load(  # renamed from `tnews`: this loads the idiom dataset
            "idiom/idiom_generator",
            # data_dir="/search/data1/yyk/data/datasets/glue_zh",
            split="train[90%:]",
            data_dir="../data",
            builder_kwargs={'hparams': hparams},
            # download_and_prepare_kwargs={'download_config': download_config}
        )

        tokenizer = BertTokenizer(hparams.dataset.tokenizer)
        # id_to_label = {v: k for k, v in hparams.duee_event_type_labels.items()}
        label_counter = {}
        i = 0
        for itm in dataset:
            # for k, v in itm.items():
            #     if v.shape[0] == 151:
            #         print(itm)
            #         break
            # print(itm)
            # print()
            # print(tokenizer.decode([int(t) for t in itm["input_ids"].numpy().tolist()]))
            # break
            i += 1
            # l = hparams.dataset.outputs[0].labels[tf.argmax(itm["output_1"], -1).numpy().tolist()]
            # print(id_to_label[l])
            # if id_to_label[l] not in label_counter:
            #     label_counter[id_to_label[l]] = 0
            # label_counter[id_to_label[l]] += 1
        # print(label_counter)
        # print(len(label_counter))
        print(i)


# python -u aispace/trainer.py \
#    --experiment_name test \
#    --model_name bert_for_classification \
#    --schedule train_and_eval \
#    --config_name tnews \
#    --config_dir ./configs/glue_zh \
#    --gpus 0 1 2 3
Example #8
    def test_lstc_load(self):
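        # Load the LSTC 2020 DuEE_role dataset and print a single training
        # example for inspection.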
        hparams = Hparams()
        hparams.load_from_config_file(
            "../configs/2020_LSTC/DuEE_keyphrase.yml")
        hparams.stand_by()
        checksum_dir = "../aispace/datasets/url_checksums"
        tfds.download.add_checksums_dir(checksum_dir)
        # download_config = DownloadConfig(register_checksums=True)
        dataset = tfds.load(  # renamed from `tnews`: this loads DuEE_role
            "lstc_2020/DuEE_role",
            # data_dir="/search/data1/yyk/data/datasets/glue_zh",
            data_dir="../data",
            builder_kwargs={'hparams': hparams},
            # download_and_prepare_kwargs={'download_config': download_config}
        )

        # tokenizer = BertTokenizer(hparams.dataset.tokenizer)
        # s = "BCI下架新疆棉花产品"
        # res = tokenizer.tokenize(s, True)
        # print(res)
        # id_to_label = {v: k for k, v in hparams.duee_event_type_labels.items()}
        label_counter = {}
        for itm in dataset["train"]:
            # for k, v in itm.items():
            #     if v.shape[0] == 151:
            #         print(itm)
            #         break
            print(itm)
            print()
            # print(tokenizer.decode([int(t) for t in itm["input_ids"].numpy().tolist()]))
            break
            # l = hparams.dataset.outputs[0].labels[tf.argmax(itm["output_1"], -1).numpy().tolist()]
            # print(id_to_label[l])
            # if id_to_label[l] not in label_counter:
            #     label_counter[id_to_label[l]] = 0
            # label_counter[id_to_label[l]] += 1
        print(label_counter)
        print(len(label_counter))


Example #9
    def test_glue_load(self):
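        # Register checksums, download and prepare dureader/yesno, and print
        # one training example.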
        hparams = Hparams()
        hparams.load_from_config_file("../configs/qa/dureader_yesno.yml")
        hparams.stand_by()
        checksum_dir = "../aispace/datasets/url_checksums"
        tfds.download.add_checksums_dir(checksum_dir)
        download_config = DownloadConfig(register_checksums=True)
        print(tfds.list_builders())
        dureader = tfds.load(
            "dureader/yesno",
            # data_dir="/search/data1/yyk/data/datasets/glue_zh",
            data_dir="../data/dureader",
            builder_kwargs={'hparams': hparams},
            download_and_prepare_kwargs={'download_config': download_config})
        for itm in dureader['train']:
            print(itm)
            break
        print()

        # train_dataset, dev_dataset, dataset_info = next(load_dataset(hparams, ret_test=False))
        # test_dataset = next(load_dataset(hparams, ret_train=True, ret_dev=True, ret_test=True, ret_info=True))[0]

        # total, zero = 0, 0
        # for itm in tqdm(test_dataset):
        # tt = itm[0]['input_ids'].numpy().tolist()
        # print(itm[0]['p_mask'].numpy().tolist())
        # print(itm[0]['start_position'].numpy().tolist())
        # print(itm[0]['end_position'].numpy().tolist())
        # print(tt)
        # break
        # total += 1
        # zero += len([t for t in tt if t == 0])
        # print()
        # print(f"{zero}, {total}, {zero / float(total)}")
        # print(total)


Example #10
    def test_glue_load(self):
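        # Count how many start_position labels are zero across the cmrc2018 dev split.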
        hparams = Hparams()
        hparams.load_from_config_file("../configs/glue_zh/cmrc2018.yml")
        hparams.stand_by()
        # checksum_dir = "../aispace/datasets/url_checksums"
        # tfds.download.add_checksums_dir(checksum_dir)
        # download_config = DownloadConfig(register_checksums=True)
        # cmrc2018 = tfds.load("glue_zh/cmrc2018",
        #                   # data_dir="/search/data1/yyk/data/datasets/glue_zh",
        #                   data_dir="../data/glue_zh",
        #                   builder_kwargs={'hparams': hparams},
        #                   download_and_prepare_kwargs={'download_config': download_config}
        #                   )

        # train_dataset, dev_dataset, dataset_info = next(load_dataset(hparams, ret_test=False))
        test_dataset = next(
            load_dataset(hparams,
                         ret_train=False,
                         ret_dev=True,
                         ret_test=False,
                         ret_info=False))[0]

        total, zero = 0, 0
        for itm in test_dataset:
            tt = itm[0]['start_position'].numpy().tolist()
            # print(itm[0]['p_mask'].numpy().tolist())
            # print(itm[0]['start_position'].numpy().tolist())
            # print(itm[0]['end_position'].numpy().tolist())
            # break
            total += len(tt)
            zero += len([t for t in tt if t == 0])
        print()
        print(f"{zero}, {total}, {zero / float(total)}")


Example #11
    def test_electra_checkpoint(self):
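        # List variables from an ALBERT checkpoint (Adam slots excluded) and
        # build the model so its trainable variables can be compared.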
        hparam = Hparams()
        hparam.load_from_config_file(
            '/search/data1/yyk/workspace/AiSpace/configs/glue_zh/tnews.yml')
        # hparam.load_from_config_file('/search/data1/yyk/workspace/AiSpace/configs/glue_zh/cmrc2018.yml')
        hparam.stand_by()

        # ckpt = "/search/data1/yyk/workspace/projects/ERNIE/ernie/checkpoints"
        ckpt = "/search/data1/yyk/data/pretrained/albert/albert_large_zh_google/model.ckpt-best"
        ckpt_vars = [
            itm for itm in tf.train.list_variables(ckpt)
            if itm[0].find('adam') == -1
        ]
        # ckpt_vars = [itm for itm in tf.train.list_variables(hparam.pretrained.model_path) if itm[0].find('adam') == -1]

        model, (losses, loss_weights), metrics, optimizer = build_model(hparam)
        model.compile(optimizer=optimizer, loss=losses, metrics=metrics)

        model_vars = model.trainable_variables

        print()
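    # Run the k-fold experiment pipeline for the tnews config.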
    def test_dataset_split(self):
        hparams = Hparams()
        hparams.load_from_config_file("../configs/glue_zh/tnews.yml")
        hparams.stand_by()

        k_fold_experiment(hparams)