Example 1
    def test_bert_word_piece_encoder(self):
        embed = BertWordPieceEncoder(
            model_dir_or_name='test/data_for_tests/embedding/small_bert',
            word_dropout=0.1)
        ds = DataSet({'words': ["this is a test . [SEP]".split()]})
        embed.index_datasets(ds, field_name='words')
        self.assertTrue(ds.has_field('word_pieces'))
        result = embed(torch.LongTensor([[1, 2, 3, 4]]))
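
For reference, a minimal standalone sketch of the same flow outside the test class (the model path and field name come from the example above; the fastNLP.embeddings import path and the shape noted in the final comment are assumptions, so treat this as illustrative rather than canonical):

import torch
from fastNLP import DataSet
from fastNLP.embeddings import BertWordPieceEncoder

embed = BertWordPieceEncoder(
    model_dir_or_name='test/data_for_tests/embedding/small_bert',
    word_dropout=0.1)
ds = DataSet({'words': ["this is a test . [SEP]".split()]})
embed.index_datasets(ds, field_name='words')  # adds a 'word_pieces' field to ds
word_pieces = torch.LongTensor(ds['word_pieces'].get([0]))
output = embed(word_pieces)  # expected shape: (batch, num_word_pieces, embed_size)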
Example 2
    def test_bert_embed_eq_bert_piece_encoder(self):
        ds = DataSet({
            'words':
            ["this is a texta model vocab".split(), 'this is'.split()]
        })
        encoder = BertWordPieceEncoder(
            model_dir_or_name='test/data_for_tests/embedding/small_bert')
        encoder.eval()
        encoder.index_datasets(ds, field_name='words')
        word_pieces = torch.LongTensor(ds['word_pieces'].get([0, 1]))
        word_pieces_res = encoder(word_pieces)

        vocab = Vocabulary()
        vocab.from_dataset(ds, field_name='words')
        vocab.index_dataset(ds, field_name='words', new_field_name='words')
        ds.set_input('words')
        words = torch.LongTensor(ds['words'].get([0, 1]))
        embed = BertEmbedding(
            vocab,
            model_dir_or_name='test/data_for_tests/embedding/small_bert',
            pool_method='first',
            include_cls_sep=True,
            pooled_cls=False,
            min_freq=1)
        embed.eval()
        words_res = embed(words)

        # Check that the word-piece outputs line up with the word-level outputs.
        # 'texta' is split into two word pieces here, so with pool_method='first'
        # the first five positions match one-to-one and everything after the
        # split word is offset by one on the word-piece side.
        self.assertEqual((word_pieces_res[0, :5] - words_res[0, :5]).sum(), 0)
        self.assertEqual((word_pieces_res[0, 6:] - words_res[0, 5:]).sum(), 0)
        self.assertEqual((word_pieces_res[1, :3] - words_res[1, :3]).sum(), 0)
Example 3
    def test_save_load(self):
        bert_save_test = 'bert_save_test'
        try:
            os.makedirs(bert_save_test, exist_ok=True)
            embed = BertWordPieceEncoder(
                model_dir_or_name='test/data_for_tests/embedding/small_bert',
                word_dropout=0.0,
                layers='-2')
            ds = DataSet({'words': ["this is a test . [SEP]".split()]})
            embed.index_datasets(ds, field_name='words')
            self.assertTrue(ds.has_field('word_pieces'))
            words = torch.LongTensor([[1, 2, 3, 4]])
            embed.save(bert_save_test)
            load_embed = BertWordPieceEncoder.load(bert_save_test)
            embed.eval()
            load_embed.eval()
            self.assertEqual((embed(words) - load_embed(words)).sum(), 0)
        finally:
            import shutil
            shutil.rmtree(bert_save_test)
Example 4
    def test_bert_word_piece_encoder(self):
        embed = BertWordPieceEncoder(model_dir_or_name='test/data_for_tests/embedding/small_bert', word_dropout=0.1)
        from fastNLP import DataSet
        ds = DataSet({'words': ["this is a test . [SEP]".split()]})
        embed.index_datasets(ds, field_name='words')
        self.assertTrue(ds.has_field('word_pieces'))
Example 5
data_bundle = get_data()

print(data_bundle)

if args.model_name.split("-")[0] in ("bert", "roberta", "xlnet", "xlmroberta"):
    model_type, args.model_name = (
        args.model_name[:args.model_name.index("-")],
        args.model_name[args.model_name.index("-") + 1:],
    )
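    # Illustration: a value such as "bert-<name>" splits into model_type == "bert"
    # and args.model_name == "<name>"; only the prefix before the first '-' is
    # treated as the model family.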

if model_type == "roberta":
    embed = RobertaWordPieceEncoder(model_dir_or_name=args.model_name,
                                    requires_grad=True,
                                    num_aspect=1)
elif model_type == "bert":
    embed = BertWordPieceEncoder(model_dir_or_name=args.model_name,
                                 requires_grad=True)
elif model_type == "xlnet":
    embed = XLNetModel.from_pretrained(
        pretrained_model_name_or_path=args.model_name)
elif model_type == "xlmroberta":
    embed = XLMRobertaModel.from_pretrained(
        pretrained_model_name_or_path=args.model_name)
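
# Note: the fastNLP word-piece encoders above expose an `embedding_dim` attribute,
# whereas the raw XLNetModel / XLMRobertaModel instances report their size via
# `config.hidden_size`; the `hasattr(embed, "embedding_dim")` check in AspectModel
# below presumably branches on exactly that difference.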


class AspectModel(nn.Module):
    def __init__(self, embed, dropout, num_classes, pool="max"):
        super().__init__()
        assert pool in ("max", "mean")
        self.embed = embed
        self.embed_dropout = nn.Dropout(dropout)
        if hasattr(embed, "embedding_dim"):