def test_bert_embed_eq_bert_piece_encoder(self):
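        # Check that BertWordPieceEncoder and BertEmbedding (pool_method='first') produce identical vectors for the same sentences.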
        ds = DataSet({
            'words':
            ["this is a texta model vocab".split(), 'this is'.split()]
        })
        encoder = BertWordPieceEncoder(
            model_dir_or_name='test/data_for_tests/embedding/small_bert')
        encoder.eval()
        encoder.index_datasets(ds, field_name='words')
        word_pieces = torch.LongTensor(ds['word_pieces'].get([0, 1]))
        word_pieces_res = encoder(word_pieces)

        vocab = Vocabulary()
        vocab.from_dataset(ds, field_name='words')
        vocab.index_dataset(ds, field_name='words', new_field_name='words')
        ds.set_input('words')
        words = torch.LongTensor(ds['words'].get([0, 1]))
        embed = BertEmbedding(
            vocab,
            model_dir_or_name='test/data_for_tests/embedding/small_bert',
            pool_method='first',
            include_cls_sep=True,
            pooled_cls=False,
            min_freq=1)
        embed.eval()
        words_res = embed(words)

        # Check that the word pieces and so on work as expected.
        self.assertEqual((word_pieces_res[0, :5] - words_res[0, :5]).sum(), 0)
        self.assertEqual((word_pieces_res[0, 6:] - words_res[0, 5:]).sum(), 0)
        self.assertEqual((word_pieces_res[1, :3] - words_res[1, :3]).sum(), 0)
Example #2
 def test_bert_embedding_1(self):
     vocab = Vocabulary().add_word_lst("this is a test . [SEP]".split())
     embed = BertEmbedding(vocab, model_dir_or_name='test/data_for_tests/embedding/small_bert', word_dropout=0.1)
     requires_grad = embed.requires_grad
     embed.requires_grad = not requires_grad
     embed.train()
     words = torch.LongTensor([[2, 3, 4, 0]])
     result = embed(words)
     self.assertEqual(result.size(), (1, 4, 16))
Example #3
    def test_download(self):
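        # Downloads the pretrained English BERT ('en') on first use, then exercises every pool_method / include_cls_sep combination.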
        # import os
        vocab = Vocabulary().add_word_lst("This is a test .".split())
        embed = BertEmbedding(vocab, model_dir_or_name='en')
        words = torch.LongTensor([[2, 3, 4, 0]])
        print(embed(words).size())

        for pool_method in ['first', 'last', 'max', 'avg']:
            for include_cls_sep in [True, False]:
                embed = BertEmbedding(vocab, model_dir_or_name='en', pool_method=pool_method,
                                      include_cls_sep=include_cls_sep)
                print(embed(words).size())
Example #4
def load_data():
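    # Build the character-level NER data bundle plus stacked static/Tencent/BERT character embeddings, bigram embeddings, and neighbor-context features.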

    paths = {
        'train': 'data/{}/train.txt'.format(dataset),
        'dev': 'data/{}/dev.txt'.format(dataset),
        'test': 'data/{}/test.txt'.format(dataset)
    }
    min_freq = 2
    data_bundle = CNNERPipe(
        bigrams=True, encoding_type=encoding_type).process_from_file(paths)

    dict_save_path = os.path.join("data/{}/data.pth".format(dataset))
    context_dict, context_word2id, context_id2word = get_neighbor_for_vocab(
        data_bundle.get_vocab('chars').word2idx, glove_path, dict_save_path)

    train_feature_data, dev_feature_data, test_feature_data = build_instances(
        "data/{}".format(dataset), context_num, context_dict)

    embed = StaticEmbedding(
        data_bundle.get_vocab('chars'),
        model_dir_or_name='data/gigaword_chn.all.a2b.uni.ite50.vec',
        min_freq=1,
        only_norm_found_vector=normalize_embed,
        word_dropout=0.01,
        dropout=0.3)

    bi_embed = StaticEmbedding(
        data_bundle.get_vocab('bigrams'),
        model_dir_or_name='data/gigaword_chn.all.a2b.bi.ite50.vec',
        word_dropout=0.02,
        dropout=0.3,
        min_freq=min_freq,
        only_norm_found_vector=normalize_embed,
        only_train_min_freq=True)

    tencent_embed = StaticEmbedding(
        data_bundle.get_vocab('chars'),
        model_dir_or_name='data/tencent_unigram.txt',
        min_freq=min_freq,
        only_norm_found_vector=normalize_embed,
        word_dropout=0.01,
        dropout=0.3)

    bert_embed = BertEmbedding(vocab=data_bundle.get_vocab('chars'),
                               model_dir_or_name=args.bert_model,
                               layers='-1',
                               pool_method=args.pool_method,
                               word_dropout=0,
                               dropout=0.5,
                               include_cls_sep=False,
                               pooled_cls=True,
                               requires_grad=False,
                               auto_truncate=False)

    embed = StackEmbedding([embed, tencent_embed, bert_embed],
                           dropout=0,
                           word_dropout=0.02)

    return data_bundle, embed, bi_embed, train_feature_data, dev_feature_data, test_feature_data, context_word2id, context_id2word
Example #5
 def test_word_drop(self):
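     # Run repeated forward passes with dropout and word_dropout enabled to make sure nothing breaks.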
     vocab = Vocabulary().add_word_lst("This is a test .".split())
     embed = BertEmbedding(vocab,
                           model_dir_or_name='en',
                           dropout=0.1,
                           word_dropout=0.2)
     for i in range(10):
         words = torch.LongTensor([[2, 3, 4, 0]])
         print(embed(words).size())
Example #6
def get_data():
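    # Keep only the pre-segmented character field and attach a fine-tunable Chinese BERT embedding.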
    data = CTBxJointPipe().process_from_file(data_folder)
    data.delete_field('bigrams')
    data.delete_field('trigrams')
    data.delete_field('chars')
    data.rename_field('pre_chars', 'chars')
    data.delete_field('pre_bigrams')
    data.delete_field('pre_trigrams')
    bert_embed = BertEmbedding(data.get_vocab('chars'), model_dir_or_name='cn', requires_grad=True)
    return data, bert_embed
Example #7
 def __init__(self,
              vocab,
              num_class,
              bert_model_dir_or_name,
              fine_tune=False):
     super(BertTC, self).__init__()
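     # BERT encoder (optionally fine-tuned) followed by a linear classification head.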
     self.embed = BertEmbedding(vocab,
                                requires_grad=fine_tune,
                                model_dir_or_name=bert_model_dir_or_name,
                                include_cls_sep=True)
     self.classifier = nn.Linear(self.embed.embedding_dim, num_class)
Example #8
def load_data():
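    # Same embedding stack as above, but the extra features come from generate_knowledge_api (dictionary/knowledge features) instead of neighbor contexts.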
    paths = {
        'train': 'data/{}/train.txt'.format(dataset),
        'dev': 'data/{}/dev.txt'.format(dataset),
        'test': 'data/{}/test.txt'.format(dataset)
    }
    min_freq = 1
    data_bundle = CNNERPipe(
        bigrams=True, encoding_type=encoding_type).process_from_file(paths)

    train_feature_data, dev_feature_data, test_feature_data, feature2count, feature2id, id2feature = generate_knowledge_api(
        os.path.join("data", dataset), "all", args.feature_level)

    embed = StaticEmbedding(
        data_bundle.get_vocab('chars'),
        model_dir_or_name='data/gigaword_chn.all.a2b.uni.ite50.vec',
        min_freq=1,
        only_norm_found_vector=normalize_embed,
        word_dropout=0.01,
        dropout=0.3)

    tencent_embed = StaticEmbedding(
        data_bundle.get_vocab('chars'),
        model_dir_or_name='data/tencent_unigram.txt',
        min_freq=1,
        only_norm_found_vector=normalize_embed,
        word_dropout=0.01,
        dropout=0.3)

    bi_embed = StaticEmbedding(
        data_bundle.get_vocab('bigrams'),
        model_dir_or_name='data/gigaword_chn.all.a2b.bi.ite50.vec',
        word_dropout=0.02,
        dropout=0.3,
        min_freq=min_freq,
        only_norm_found_vector=normalize_embed,
        only_train_min_freq=True)

    bert_embed = BertEmbedding(vocab=data_bundle.get_vocab('chars'),
                               model_dir_or_name=args.bert_model,
                               layers='-1',
                               pool_method=args.pool_method,
                               word_dropout=0,
                               dropout=0.5,
                               include_cls_sep=False,
                               pooled_cls=True,
                               requires_grad=False,
                               auto_truncate=False)

    embed = StackEmbedding([embed, tencent_embed, bert_embed],
                           dropout=0,
                           word_dropout=0.02)

    return data_bundle, embed, bi_embed, train_feature_data, dev_feature_data, test_feature_data, feature2count, feature2id, id2feature
def load_ner_data():
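    # Plain variant without the extra features: returns only the data bundle, the stacked character embeddings, and the bigram embeddings.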
    paths = {
        'train': 'data/{}/train.txt'.format(dataset),
        'dev': 'data/{}/dev.txt'.format(dataset),
        'test': 'data/{}/test.txt'.format(dataset)
    }
    min_freq = 2
    data_bundle = CNNERPipe(
        bigrams=True, encoding_type=encoding_type).process_from_file(paths)

    # train_list = data_bundle.get_dataset('train')['raw_chars']

    embed = StaticEmbedding(
        data_bundle.get_vocab('chars'),
        model_dir_or_name='data/gigaword_chn.all.a2b.uni.ite50.vec',
        min_freq=1,
        only_norm_found_vector=normalize_embed,
        word_dropout=0.01,
        dropout=0.3)

    bi_embed = StaticEmbedding(
        data_bundle.get_vocab('bigrams'),
        model_dir_or_name='data/gigaword_chn.all.a2b.bi.ite50.vec',
        word_dropout=0.02,
        dropout=0.3,
        min_freq=2,
        only_norm_found_vector=normalize_embed,
        only_train_min_freq=True)

    tencent_embed = StaticEmbedding(
        data_bundle.get_vocab('chars'),
        model_dir_or_name='data/tencent_unigram.txt',
        min_freq=1,
        only_norm_found_vector=normalize_embed,
        word_dropout=0.01,
        dropout=0.3)

    bert_embed = BertEmbedding(vocab=data_bundle.get_vocab('chars'),
                               model_dir_or_name=args.bert_model,
                               layers='-1',
                               pool_method=args.pool_method,
                               word_dropout=0,
                               dropout=0.5,
                               include_cls_sep=False,
                               pooled_cls=True,
                               requires_grad=False,
                               auto_truncate=True)

    # embed = StackEmbedding([tencent_embed, bert_embed], dropout=0, word_dropout=0.02)
    embed = StackEmbedding([embed, tencent_embed, bert_embed],
                           dropout=0,
                           word_dropout=0.02)
    return data_bundle, embed, bi_embed
def trainer(data_folder, write2model, write2vocab):
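    # Train a BERT + BiLSTM-CRF NER model on the People's Daily data, evaluate it, and save the model and vocabularies.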
    data_bundle = PeopleDailyNERLoader().load(
        data_folder)  # read the data under data_folder into a DataBundle
    data_bundle = PeopleDailyPipe().process(data_bundle)
    data_bundle.rename_field('chars', 'words')
    # save the vocabularies
    targetVocab = dict(data_bundle.vocabs["target"])
    wordsVocab = dict(data_bundle.vocabs["words"])
    targetWc = dict(data_bundle.vocabs['target'].word_count)
    wordsWc = dict(data_bundle.vocabs['words'].word_count)
    with open(write2vocab, "w", encoding="utf-8") as VocabOut:
        VocabOut.write(
            json.dumps(
                {
                    "targetVocab": targetVocab,
                    "wordsVocab": wordsVocab,
                    "targetWc": targetWc,
                    "wordsWc": wordsWc
                },
                ensure_ascii=False))

    embed = BertEmbedding(vocab=data_bundle.get_vocab('words'),
                          model_dir_or_name='cn',
                          requires_grad=False,
                          auto_truncate=True)
    model = BiLSTMCRF(embed=embed,
                      num_classes=len(data_bundle.get_vocab('target')),
                      num_layers=1,
                      hidden_size=100,
                      dropout=0.5,
                      target_vocab=data_bundle.get_vocab('target'))

    metric = SpanFPreRecMetric(tag_vocab=data_bundle.get_vocab('target'))
    optimizer = Adam(model.parameters(), lr=2e-5)
    loss = LossInForward()
    device = 0 if torch.cuda.is_available() else 'cpu'
    # device = "cpu"
    trainer = Trainer(data_bundle.get_dataset('train'),
                      model,
                      loss=loss,
                      optimizer=optimizer,
                      batch_size=8,
                      dev_data=data_bundle.get_dataset('dev'),
                      metrics=metric,
                      device=device,
                      n_epochs=1)
    trainer.train()
    tester = Tester(data_bundle.get_dataset('test'), model, metrics=metric)
    tester.test()
    saver = ModelSaver(write2model)
    saver.save_pytorch(model, param_only=False)
Example #11
def load_data():
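    # WNUT-17 NER: stack a frozen mix-layer ELMo with a frozen BERT, plus neighbor-context features built from GloVe.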

    paths = {
        "train": "../data/{}/train.txt".format(dataset),
        "test": "../data/{}/test.txt".format(dataset),
        "dev": "../data/{}/dev.txt".format(dataset)
    }
    data = WNUT_17NERPipe(encoding_type=encoding_type).process_from_file(paths)

    dict_save_path = os.path.join("../data/{}/data.pth".format(dataset))
    context_dict, context_word2id, context_id2word = get_neighbor_for_vocab(
        data.get_vocab('words').word2idx, glove_path, dict_save_path)

    train_feature_data, dev_feature_data, test_feature_data = build_instances(
        "../data/{}".format(dataset), context_num, context_dict)

    data.rename_field('words', 'chars')
    embed = ElmoEmbedding(vocab=data.get_vocab('chars'),
                          model_dir_or_name=elmo_model,
                          layers='mix',
                          requires_grad=False,
                          word_dropout=0.0,
                          dropout=0.5,
                          cache_word_reprs=False)
    embed.set_mix_weights_requires_grad()
    bert_embed = BertEmbedding(vocab=data.get_vocab('chars'),
                               model_dir_or_name=args.bert_model,
                               layers='-1',
                               pool_method=args.pool_method,
                               word_dropout=0,
                               dropout=0.5,
                               include_cls_sep=False,
                               pooled_cls=True,
                               requires_grad=False,
                               auto_truncate=False)
    embed = StackEmbedding([embed, bert_embed], dropout=0, word_dropout=0.02)

    return data, embed, train_feature_data, dev_feature_data, test_feature_data, context_word2id, context_id2word
    def test_save_load(self):
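        # Round-trip a BertEmbedding through save()/load() and check that both copies produce identical outputs.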
        bert_save_test = 'bert_save_test'
        try:
            os.makedirs(bert_save_test, exist_ok=True)
            vocab = Vocabulary().add_word_lst(
                "this is a test . [SEP] NotInBERT".split())
            embed = BertEmbedding(
                vocab,
                model_dir_or_name='test/data_for_tests/embedding/small_bert',
                word_dropout=0.1,
                auto_truncate=True)

            embed.save(bert_save_test)
            load_embed = BertEmbedding.load(bert_save_test)
            words = torch.randint(len(vocab), size=(2, 20))
            embed.eval(), load_embed.eval()
            self.assertEqual((embed(words) - load_embed(words)).sum(), 0)

        finally:
            import shutil
            shutil.rmtree(bert_save_test)
Example #13
@cache_results('caches/conll2003.pkl', _refresh=False)
def load_data():
    # replace with your own data path
    paths = 'data/conll2003'
    data = Conll2003NERPipe(
        encoding_type=encoding_type).process_from_file(paths)
    return data


data = load_data()
print(data)

embed = BertEmbedding(data.get_vocab(Const.INPUT),
                      model_dir_or_name='en-base-cased',
                      pool_method='max',
                      requires_grad=True,
                      layers='11',
                      include_cls_sep=False,
                      dropout=0.5,
                      word_dropout=0.01)

callbacks = [
    GradientClipCallback(clip_type='norm', clip_value=1),
    WarmupCallback(warmup=0.1, schedule='linear'),
    EvaluateCallback(data.get_dataset('test'))
]

model = BertCRF(embed,
                tag_vocab=data.get_vocab('target'),
                encoding_type=encoding_type)
optimizer = AdamW(model.parameters(), lr=2e-5)
Example #14
from fastNLP.io import WeiboSenti100kPipe
from fastNLP.embeddings import BertEmbedding
from fastNLP.io.pipe.qa import CMRC2018Loader
from fastNLP.io import CNXNLILoader
from fastNLP.io import WeiboNERLoader
from fastNLP.embeddings import StaticEmbedding
from fastNLP import Vocabulary

if __name__ == "__main__":
    # download the sentiment classification data
    data_bundle = WeiboSenti100kPipe().process_from_file()
    data_bundle.rename_field('chars', 'words')
    # download the BERT model
    embed = BertEmbedding(data_bundle.get_vocab('words'),
                          model_dir_or_name='cn-wwm',
                          include_cls_sep=True)
    # question answering data
    data_bundle = CMRC2018Loader().load()
    # text matching data
    data_bundle = CNXNLILoader().load()
    # NER
    data_bundle = WeiboNERLoader().load()
    # embedding
    vocab = Vocabulary()
    vocab.add_word_lst("你 好 .".split())
    embed = StaticEmbedding(vocab, model_dir_or_name='cn-sgns-literature-word')
Example #15
train_data, dev_data = data_set.split(0.015)

# training
device = 0 if torch.cuda.is_available() else 'cpu'
'''
EMBED_DIM = 100
model = CNNText((len(vocab),EMBED_DIM), num_classes=len(vocab_target), dropout=0.1)
metrics=AccuracyMetric()
loss = CrossEntropyLoss()
optimizer=optim.RMSprop(model.parameters(), lr=0.01, alpha=0.99, eps=1e-08, weight_decay=0, momentum=0, centered=False)
N_EPOCHS = 10
BATCH_SIZE = 16
trainer = Trainer(model=model, train_data=train_data, dev_data=dev_data, loss=loss, metrics=metrics,optimizer=optimizer,n_epochs=N_EPOCHS, batch_size=BATCH_SIZE, device=device)
trainer.train()
'''
embed = BertEmbedding(vocab, model_dir_or_name='en', include_cls_sep=True)
model = BertForSequenceClassification(embed, len(vocab_target))
trainer = Trainer(train_data,
                  model,
                  optimizer=Adam(model_params=model.parameters(), lr=2e-5),
                  loss=CrossEntropyLoss(),
                  device=device,
                  batch_size=8,
                  dev_data=dev_data,
                  metrics=AccuracyMetric(),
                  n_epochs=2,
                  print_every=1)
trainer.train()

saver = ModelSaver("save_model/bert2021.1.19.pkl")
saver.save_pytorch(model)
Example #16
vocab = Vocabulary()
vocab.from_dataset(train_dataset, field_name='words', no_create_entry_dataset=[test_dataset])
vocab.index_dataset(train_dataset, test_dataset, field_name='words')

target_vocab = Vocabulary(padding=None, unknown=None)
target_vocab.from_dataset(train_dataset, field_name='target', no_create_entry_dataset=[test_dataset])
target_vocab.index_dataset(train_dataset, test_dataset, field_name='target')

'''build bundle'''
data_dict = {"train":train_dataset, "test":test_dataset}
vocab_dict = {"words":vocab, "target":target_vocab}
data_bundle = DataBundle(vocab_dict, data_dict)
print(data_bundle)

'''build model'''
embed = BertEmbedding(data_bundle.get_vocab('words'), model_dir_or_name='en-base-uncased', include_cls_sep=True)
model = BertForSequenceClassification(embed, len(data_bundle.get_vocab('target')))
# model = BertForSequenceClassification(embed, 2)

device = 0 if torch.cuda.is_available() else 'cpu'
trainer = Trainer(data_bundle.get_dataset('train'), model,
                  optimizer=Adam(model_params=model.parameters(), lr=2e-5),
                  loss=CrossEntropyLoss(), device=device,
                  batch_size=8, dev_data=data_bundle.get_dataset('train'),
                  metrics=AccuracyMetric(), n_epochs=10, print_every=1)
trainer.train()

tester = Tester(data_bundle.get_dataset('test'), model, batch_size=128, metrics=AccuracyMetric())
tester.test()

Example #17
 char_vocab_pkl_file = os.path.join(model_path, 'vocab_char.pkl')
 target_vocab_pkl_file = os.path.join(model_path, 'target_char.pkl')
 logger.warn('Loading dataset')
 data_bundle = load_serialize_obj(train_data_bundle_pkl_file)
 logger.warn('Getting vocabularies')
 char_vocab = data_bundle.get_vocab('words')
 logger.info('char_vocab:{}'.format(char_vocab))
 target_vocab = data_bundle.get_vocab('target')
 logger.info('target_vocab:{}'.format(target_vocab))
 save_serialize_obj(char_vocab, char_vocab_pkl_file)
 save_serialize_obj(target_vocab, target_vocab_pkl_file)
 logger.info('Vocabulary serialized to: {}'.format(char_vocab_pkl_file))
 logger.warn('Selecting pretrained embeddings')
 # model_dir_or_name = 'cn-wwm'
 model_dir_or_name = './data/embed/ERNIE_1.0_max-len-512-pytorch'
 bert_embed = BertEmbedding(vocab=char_vocab, model_dir_or_name=model_dir_or_name, requires_grad=False)
 logger.warn('Building the model')
 model = BiLSTMCRF(embed=bert_embed, num_classes=len(target_vocab), num_layers=1, hidden_size=200, dropout=0.5,
                   target_vocab=target_vocab)
 logger.info(model)
 logger.warn('Setting training hyperparameters')
 loss = LossInForward()
 optimizer = Adam([param for param in model.parameters() if param.requires_grad])
 # metric = AccuracyMetric()
 metric = SpanFPreRecMetric(tag_vocab=data_bundle.get_vocab(Const.TARGET), only_gross=False)  # with only_gross=False, per-label metric statistics are also returned
 device = 'cuda' if torch.cuda.is_available() else 'cpu'  # run on the GPU if available; training is much faster
 logger.info('device:{}'.format(device))
 batch_size = 32
 n_epochs = 10
 early_stopping = 10
 trainer = Trainer(
    def test_bert_embedding_1(self):
        vocab = Vocabulary().add_word_lst(
            "this is a test . [SEP] NotInBERT".split())
        embed = BertEmbedding(
            vocab,
            model_dir_or_name='test/data_for_tests/embedding/small_bert',
            word_dropout=0.1)
        requires_grad = embed.requires_grad
        embed.requires_grad = not requires_grad
        embed.train()
        words = torch.LongTensor([[2, 3, 4, 0]])
        result = embed(words)
        self.assertEqual(result.size(), (1, 4, 16))

        embed = BertEmbedding(
            vocab,
            model_dir_or_name='test/data_for_tests/embedding/small_bert',
            word_dropout=0.1)
        embed.eval()
        words = torch.LongTensor([[2, 3, 4, 0]])
        result = embed(words)
        self.assertEqual(result.size(), (1, 4, 16))

        # auto-truncate instead of raising an error
        embed = BertEmbedding(
            vocab,
            model_dir_or_name='test/data_for_tests/embedding/small_bert',
            word_dropout=0.1,
            auto_truncate=True)

        words = torch.LongTensor([[2, 3, 4, 1] * 10, [2, 3] + [0] * 38])
        result = embed(words)
        self.assertEqual(result.size(), (2, 40, 16))
Example #19
def load_data():
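    # English NER (OntoNotes 5.0 or CoNLL-style files): stack ELMo, BERT, GloVe word vectors and a transformer character encoder, with optional knowledge features.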
    if dataset == 'ON5e':
        paths = 'data/ON5e/english'
        data = OntoNotesNERPipe(
            encoding_type=encoding_type).process_from_file(paths)
    else:
        paths = {
            "train": "data/{}/train.txt".format(dataset),
            "dev": "data/{}/dev.txt".format(dataset),
            "test": "data/{}/test.txt".format(dataset)
        }
        data = ENNERPipe(encoding_type=encoding_type).process_from_file(paths)

    if knowledge:
        train_feature_data, dev_feature_data, test_feature_data, feature2count, feature2id, id2feature = generate_knowledge_api(
            os.path.join("data", dataset), "all", feature_level)
    else:
        train_feature_data, dev_feature_data, test_feature_data, feature2count, feature2id, id2feature = None, None, None, None, None, None

    char_embed = TransformerCharEmbed(vocab=data.get_vocab('words'),
                                      embed_size=embed_size,
                                      char_emb_size=embed_size,
                                      word_dropout=0,
                                      dropout=0.3,
                                      pool_method='max',
                                      activation='relu',
                                      min_char_freq=2,
                                      requires_grad=True,
                                      include_word_start_end=False,
                                      char_attn_type=char_type,
                                      char_n_head=3,
                                      char_dim_ffn=60,
                                      char_scale=char_type == 'naive',
                                      char_dropout=0.15,
                                      char_after_norm=True)

    word_embed = StaticEmbedding(vocab=data.get_vocab('words'),
                                 model_dir_or_name='en-glove-6b-100d',
                                 requires_grad=True,
                                 lower=True,
                                 word_dropout=0,
                                 dropout=0.5,
                                 only_norm_found_vector=normalize_embed)
    data.rename_field('words', 'chars')

    embed = ElmoEmbedding(vocab=data.get_vocab('chars'),
                          model_dir_or_name=elmo_model,
                          layers='mix',
                          requires_grad=False,
                          word_dropout=0.0,
                          dropout=0.5,
                          cache_word_reprs=False)
    embed.set_mix_weights_requires_grad()

    bert_embed = BertEmbedding(vocab=data.get_vocab('chars'),
                               model_dir_or_name=args.bert_model,
                               layers='-1',
                               pool_method="first",
                               word_dropout=0,
                               dropout=0.5,
                               include_cls_sep=False,
                               pooled_cls=True,
                               requires_grad=False,
                               auto_truncate=False)

    embed = StackEmbedding([embed, bert_embed, word_embed, char_embed],
                           dropout=0,
                           word_dropout=0.02)

    return data, embed, train_feature_data, dev_feature_data, test_feature_data, feature2count, feature2id, id2feature
 logger.warn('Loading dataset')
 data_bundle = load_serialize_obj(train_data_bundle_pkl_file)
 logger.warn('Getting vocabularies')
 char_vocab = data_bundle.get_vocab('words')
 logger.info('char_vocab:{}'.format(char_vocab))
 target_vocab = data_bundle.get_vocab('target')
 logger.info('target_vocab:{}'.format(target_vocab))
 save_serialize_obj(char_vocab, char_vocab_pkl_file)
 save_serialize_obj(target_vocab, target_vocab_pkl_file)
 logger.info('Vocabulary serialized to: {}'.format(char_vocab_pkl_file))
 logger.warn('Selecting pretrained embeddings')
 bert_embed = BertEmbedding(vocab=char_vocab,
                            model_dir_or_name='cn-wwm',
                            pool_method='max',
                            requires_grad=True,
                            layers='11',
                            include_cls_sep=False,
                            dropout=0.5,
                            word_dropout=0.01,
                            auto_truncate=True)
 logger.warn('Building the model')
 model = BertCRF(bert_embed, tag_vocab=target_vocab, encoding_type='bio')
 logger.info(model)
 logger.warn('Setting training hyperparameters')
 loss = LossInForward()
 optimizer = AdamW(
     [param for param in model.parameters() if param.requires_grad],
     lr=2e-5)
 # metric = AccuracyMetric()
 metric = SpanFPreRecMetric(
     tag_vocab=data_bundle.get_vocab(Const.TARGET),
elif arg.task == 'qnli':
    data_bundle = QNLIBertPipe(lower=arg.to_lower,
                               tokenizer=arg.tokenizer).process_from_file()
elif arg.task == 'mnli':
    data_bundle = MNLIBertPipe(lower=arg.to_lower,
                               tokenizer=arg.tokenizer).process_from_file()
elif arg.task == 'quora':
    data_bundle = QuoraBertPipe(lower=arg.to_lower,
                                tokenizer=arg.tokenizer).process_from_file()
else:
    raise RuntimeError(f'NOT support {arg.task} task yet!')

print(data_bundle)  # print details in data_bundle

# load embedding
embed = BertEmbedding(data_bundle.vocabs[Const.INPUT],
                      model_dir_or_name=arg.bert_model_dir_or_name)

# define model
model = BertForSentenceMatching(embed,
                                num_labels=len(
                                    data_bundle.vocabs[Const.TARGET]))

# define optimizer and callback
optimizer = AdamW(lr=arg.lr, params=model.parameters())
callbacks = [
    WarmupCallback(warmup=arg.warm_up_rate, schedule='linear'),
]

if arg.task in ['snli']:
    callbacks.append(
        EvaluateCallback(data=data_bundle.datasets[arg.test_dataset_name]))
Example #22
@cache_results('imdb.pkl')
def get_data():
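    # The processed IMDB DataBundle is cached to 'imdb.pkl' by @cache_results, so later runs skip preprocessing.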
    data_bundle = IMDBLoader().process('imdb/')
    return data_bundle


data_bundle = get_data()

print(data_bundle)

# drop sequences longer than 512; English words are split into word pieces, so leave some headroom when truncating (here 400 words)
data_bundle.datasets['train'].drop(lambda x: len(x['words']) > 400)
data_bundle.datasets['dev'].drop(lambda x: len(x['words']) > 400)
data_bundle.datasets['test'].drop(lambda x: len(x['words']) > 400)
bert_embed = BertEmbedding(data_bundle.vocabs['words'],
                           requires_grad=False,
                           model_dir_or_name="en-base-uncased")
model = BiLSTMSentiment(bert_embed, len(data_bundle.vocabs['target']))

Trainer(data_bundle.datasets['train'],
        model,
        optimizer=None,
        loss=CrossEntropyLoss(),
        device=0,
        batch_size=10,
        dev_data=data_bundle.datasets['dev'],
        metrics=AccuracyMetric()).train()

# evaluate on the test set
Tester(data_bundle.datasets['test'],
       model,
Example #23
from fastNLP.embeddings import BertEmbedding
from fastNLP.models import BertForQuestionAnswering
from fastNLP.core.losses import CMRC2018Loss
from fastNLP.core.metrics import CMRC2018Metric
from fastNLP.io.pipe.qa import CMRC2018BertPipe
from fastNLP import Trainer, BucketSampler
from fastNLP import WarmupCallback, GradientClipCallback
from fastNLP.core.optimizer import AdamW


data_bundle = CMRC2018BertPipe().process_from_file()
data_bundle.rename_field('chars', 'words')

print(data_bundle)

embed = BertEmbedding(data_bundle.get_vocab('words'), model_dir_or_name='cn', requires_grad=True, include_cls_sep=False, auto_truncate=True,
                      dropout=0.5, word_dropout=0.01)
model = BertForQuestionAnswering(embed)
loss = CMRC2018Loss()
metric = CMRC2018Metric()

wm_callback = WarmupCallback(schedule='linear')
gc_callback = GradientClipCallback(clip_value=1, clip_type='norm')
callbacks = [wm_callback, gc_callback]

optimizer = AdamW(model.parameters(), lr=5e-5)

trainer = Trainer(data_bundle.get_dataset('train'), model, loss=loss, optimizer=optimizer,
                  sampler=BucketSampler(seq_len_field_name='context_len'),
                  dev_data=data_bundle.get_dataset('dev'), metrics=metric,
                  callbacks=callbacks, device=0, batch_size=6, num_workers=2, n_epochs=2, print_every=1,
                  test_use_tqdm=False, update_every=10)
Example #24
    def forward(self, chars):
        # batch_size, max_len = words.size()
        chars = self.embedding(chars)
        outputs = self.mlp(chars)

        return {Const.OUTPUT: outputs}

    def predict(self, chars):
        # batch_size, max_len = words.size()
        chars = self.embedding(chars)
        outputs = self.mlp(chars)

        return {Const.OUTPUT: outputs}

embed = BertEmbedding(data.get_vocab(Const.CHAR_INPUT), model_dir_or_name='cn-wwm-ext',
                        pool_method='first', requires_grad=True, layers='11', include_cls_sep=False, dropout=0.5)

callbacks = [
                GradientClipCallback(clip_type='norm', clip_value=1),
                WarmupCallback(warmup=0.1, schedule='linear')
            ]

model = BertCNNER(embed, len(data.vocabs[Const.TARGET]))
optimizer = AdamW(model.parameters(), lr=3e-5)

for name, dataset in data.datasets.items():
    original_len = len(dataset)
    dataset.drop(lambda x: x['seq_len'] > 256, inplace=True)
    clipped_len = len(dataset)
    print("Delete {} instances in {}.".format(original_len-clipped_len, name))
Example #25
    bi_embed = StaticEmbedding(data_bundle.get_vocab('bigrams'),
                               model_dir_or_name='cpt/gigaword/bi.ite50.vec',
                               word_dropout=0.02,
                               dropout=0.3,
                               min_freq=min_freq,
                               only_norm_found_vector=normalize_embed,
                               only_train_min_freq=True)

    return data_bundle, embed, bi_embed


data_bundle, embed, bi_embed = load_data()

bert_embed = BertEmbedding(data_bundle.get_vocab('chars'),
                           model_dir_or_name='transformer_cpt/bert',
                           requires_grad=False)

print(data_bundle)
model = TENER(tag_vocab=data_bundle.get_vocab('target'),
              embed=embed,
              num_layers=num_layers,
              d_model=d_model,
              n_head=n_heads,
              feedforward_dim=dim_feedforward,
              dropout=dropout,
              after_norm=after_norm,
              attn_type=attn_type,
              bi_embed=bi_embed,
              bert_embed=bert_embed,
              fc_dropout=fc_dropout,
Example #26
        self.embedding = Embedding(embed, dropout=0.1)
        self.tag_size = tag_size
        self.mlp = MLP(size_layer=[self.embedding.embedding_dim, tag_size])

    def forward(self, chars):
        # batch_size, max_len = words.size()
        chars = self.embedding(chars)
        outputs = self.mlp(chars)

        return {Const.OUTPUT: outputs}


embed = BertEmbedding(data.vocabs[Const.CHAR_INPUT],
                      model_dir_or_name='en-base',
                      pool_method='max',
                      requires_grad=True,
                      layers='11')

for name, dataset in data.datasets.items():
    dataset.set_pad_val(Const.TARGET, -100)

callbacks = [
    GradientClipCallback(clip_type='norm', clip_value=1),
    WarmupCallback(warmup=0.1, schedule='linear')
]

model = BertCNNER(embed, len(data.vocabs[Const.TARGET]))
optimizer = AdamW(model.parameters(), lr=1e-4)

for name, dataset in data.datasets.items():