Example #1
 def test_same_vector(self):
     vocab = Vocabulary().add_word_lst(["The", "the", "THE"])
     embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=5, lower=True)
     words = torch.LongTensor([[vocab.to_index(word) for word in ["The", "the", "THE"]]])
     words = embed(words)
     embed_0 = words[0, 0]
     for i in range(1, words.size(1)):
         assert torch.sum(embed_0==words[0, i]).eq(len(embed_0))
Example #2
 def test_Index2WordProcessor(self):
     vocab = Vocabulary()
     vocab.add_word_lst(["a", "b", "c", "d", "e"])
     proc = Index2WordProcessor(vocab, "tag_id", "tag")
     data_set = DataSet(
         [Instance(tag_id=[np.random.randint(0, 7) for _ in range(32)])])
     data_set = proc(data_set)
     self.assertTrue("tag" in data_set)
Example #3
    def test_index(self):
        vocab = Vocabulary()
        vocab.update(text)
        res = [vocab[w] for w in set(text)]
        self.assertEqual(len(res), len(set(res)))

        res = [vocab.to_index(w) for w in set(text)]
        self.assertEqual(len(res), len(set(res)))
Example #4
def get_vocab(dataset):
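    # Note: unk_str and pad_str are module-level constants in the original source (not shown in this snippet).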
    vocabulary = Vocabulary(unknown=unk_str, padding=pad_str)
    for data, _ in dataset:
        vocabulary.add_word_lst(data)
    print('vocab', len(vocabulary))
    print('pad', vocabulary.to_index(pad_str))

    return vocabulary
Example #5
    def test_roberta_ebembedding_2(self):
        # Test whether only_use_pretrain_bpe and truncate_embed work as expected
        Embedding = RobertaEmbedding
        weight_path = 'test/data_for_tests/embedding/small_roberta'
        vocab = Vocabulary().add_word_lst("this is a texta and".split())
        embed1 = Embedding(vocab,
                           model_dir_or_name=weight_path,
                           layers=list(range(3)),
                           only_use_pretrain_bpe=True,
                           truncate_embed=True,
                           min_freq=1)
        # embed_bpe_vocab_size = len(vocab)-1 + 2  # exclude NotInBERT; additionally adds ##a and [CLS]
        # self.assertEqual(embed_bpe_vocab_size, len(embed1.model.tokenzier.vocab))

        embed2 = Embedding(vocab,
                           model_dir_or_name=weight_path,
                           layers=list(range(3)),
                           only_use_pretrain_bpe=True,
                           truncate_embed=False,
                           min_freq=1)
        # embed_bpe_vocab_size = num_word  # excludes NotInBERT
        # self.assertEqual(embed_bpe_vocab_size, len(embed2.model.tokenzier.vocab))

        embed3 = Embedding(vocab,
                           model_dir_or_name=weight_path,
                           layers=list(range(3)),
                           only_use_pretrain_bpe=False,
                           truncate_embed=True,
                           min_freq=1)
        # embed_bpe_vocab_size = len(vocab)+2  # newly adds ##a and [CLS]
        # self.assertEqual(embed_bpe_vocab_size, len(embed3.model.tokenzier.vocab))

        embed4 = Embedding(vocab,
                           model_dir_or_name=weight_path,
                           layers=list(range(3)),
                           only_use_pretrain_bpe=False,
                           truncate_embed=False,
                           min_freq=1)
        # embed_bpe_vocab_size = num_word+1  # newly adds ##a
        # self.assertEqual(embed_bpe_vocab_size, len(embed4.model.tokenzier.vocab))

        # check that the tensors below are equal across all of the settings above
        embed1.eval()
        embed2.eval()
        embed3.eval()
        embed4.eval()
        tensor = torch.LongTensor(
            [[vocab.to_index(w) for w in 'this is a texta and'.split()]])
        t1 = embed1(tensor)
        t2 = embed2(tensor)
        t3 = embed3(tensor)
        t4 = embed4(tensor)

        self.assertEqual((t1 - t2).sum(), 0)
        self.assertEqual((t1 - t3).sum(), 0)
        self.assertEqual((t1 - t4).sum(), 0)
Example #6
def prepare_env():
    vocab = Vocabulary().add_word_lst("This is a test .".split())
    vocab.add_word_lst("Another test !".split())
    embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=5)

    encoder_output = torch.randn(2, 3, 10)
    src_seq_len = torch.LongTensor([3, 2])
    encoder_mask = seq_len_to_mask(src_seq_len)

    return embed, encoder_output, encoder_mask
Example #7
    def test_fastnlp_1min_tutorial(self):
        # tutorials/fastnlp_1min_tutorial.ipynb
        data_path = "test/data_for_tests/tutorial_sample_dataset.csv"
        ds = DataSet.read_csv(data_path,
                              headers=('raw_sentence', 'label'),
                              sep='\t')
        print(ds[1])

        # lowercase all raw sentences
        ds.apply(lambda x: x['raw_sentence'].lower(),
                 new_field_name='raw_sentence')
        # convert label to int
        ds.apply(lambda x: int(x['label']),
                 new_field_name='target',
                 is_target=True)

        def split_sent(ins):
            return ins['raw_sentence'].split()

        ds.apply(split_sent, new_field_name='words', is_input=True)

        # split into train/dev sets
        train_data, dev_data = ds.split(0.3)
        print("Train size: ", len(train_data))
        print("Test size: ", len(dev_data))

        from fastNLP import Vocabulary
        vocab = Vocabulary(min_freq=2)
        train_data.apply(lambda x: [vocab.add(word) for word in x['words']])

        # index the sentences with Vocabulary.to_index(word)
        train_data.apply(
            lambda x: [vocab.to_index(word) for word in x['words']],
            new_field_name='words',
            is_input=True)
        dev_data.apply(lambda x: [vocab.to_index(word) for word in x['words']],
                       new_field_name='words',
                       is_input=True)

        from fastNLP.models import CNNText
        model = CNNText((len(vocab), 50),
                        num_classes=5,
                        padding=2,
                        dropout=0.1)

        from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric, Adam

        trainer = Trainer(model=model,
                          train_data=train_data,
                          dev_data=dev_data,
                          loss=CrossEntropyLoss(),
                          optimizer=Adam(),
                          metrics=AccuracyMetric(target='target'))
        trainer.train()
        print('Train finished!')
Example #8
def prepare_env():
    vocab = Vocabulary().add_word_lst("This is a test .".split())
    vocab.add_word_lst("Another test !".split())
    embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=5)

    src_words_idx = torch.LongTensor([[3, 1, 2], [1, 2, 0]])
    tgt_words_idx = torch.LongTensor([[1, 2, 3, 4], [2, 3, 0, 0]])
    src_seq_len = torch.LongTensor([3, 2])
    tgt_seq_len = torch.LongTensor([4, 2])

    return embed, src_words_idx, tgt_words_idx, src_seq_len, tgt_seq_len
Example #9
 def test_iteration(self):
     vocab = Vocabulary()
     text = [
         "FastNLP", "works", "well", "in", "most", "cases", "and", "scales",
         "well", "in", "works", "well", "in", "most", "cases", "scales",
         "well"
     ]
     vocab.update(text)
     text = set(text)
     for word in vocab:
         self.assertTrue(word in text)
Example #10
    def get_vocab(self):
        self.vocab = Vocabulary(min_freq=10)
        self.train_data.apply(lambda x : [self.vocab.add(word) for word in x['words']])
        self.vocab.build_vocab()
        self.vocab.build_reverse_vocab()
        self.vocab_size = self.vocab.__len__()

        self.train_data.apply(lambda x : [self.vocab.to_index(word) for word in x['words']],new_field_name='words')
        self.train_data.apply(self.pad_seq,new_field_name='pad_words')
        
        self.test_data.apply(lambda x : [self.vocab.to_index(word) for word in x['words']],new_field_name='words')
        self.test_data.apply(self.pad_seq,new_field_name='pad_words')
Example #11
 def test_iteration(self):
     vocab = Vocabulary(padding=None, unknown=None)
     text = [
         "FastNLP", "works", "well", "in", "most", "cases", "and", "scales",
         "well", "in", "works", "well", "in", "most", "cases", "scales",
         "well"
     ]
     vocab.update(text)
     text = set(text)
     for word, idx in vocab:
         self.assertTrue(word in text)
         self.assertTrue(idx < len(vocab))
Example #12
 def test_same_vector2(self):
     vocab = Vocabulary().add_word_lst(["The", 'a', 'b', "the", "THE", "B", 'a', "A"])
     embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6B-100d',
                             lower=True)
     words = torch.LongTensor([[vocab.to_index(word) for word in ["The", "the", "THE", 'b', "B", 'a', 'A']]])
     words = embed(words)
     embed_0 = words[0, 0]
     for i in range(1, 3):
         assert torch.sum(embed_0==words[0, i]).eq(len(embed_0))
     embed_0 = words[0, 3]
     for i in range(3, 5):
         assert torch.sum(embed_0 == words[0, i]).eq(len(embed_0))
Example #13
    def __init__(self,
                 min_word_count=3,
                 min_char_count=10,
                 train_file=None,
                 dev_file=None):
        self.min_word_count = min_word_count
        self.min_char_count = min_char_count
        self.train_file = train_file
        self.dev_file = dev_file
        print("Loading Squad data.")
        if self.dev_file is not None:
            suffix = self.dev_file.split('.')[-1]
            if suffix == "pkl":
                pickle = True
            else:
                pickle = False
            self.dev_data = self.load_file(self.dev_file, pickle_file=pickle)
        if self.train_file is not None:
            suffix = self.train_file.split('.')[-1]
            if suffix == "pkl":
                pickle = True
            else:
                pickle = False
            self.train_data = self.load_file(self.train_file,
                                             pickle_file=pickle)
        print("Building word vocab.")
        self.word_vocab = Vocabulary(min_freq=self.min_word_count)
        self.word_vocab.from_dataset(
            self.train_data,
            self.dev_data,
            field_name=['context_word', 'question_word'])
        self.word_vocab.index_dataset(self.train_data,
                                      self.dev_data,
                                      field_name='context_word')
        self.word_vocab.index_dataset(self.train_data,
                                      self.dev_data,
                                      field_name='question_word')
        self.word_vocab_size = len(self.word_vocab)

        print("Building char vocab.")
        self.char_vocab = Vocabulary(min_freq=self.min_char_count)
        self.char_vocab.from_dataset(
            self.train_data,
            self.dev_data,
            field_name=['context_char', 'question_char'])
        self.char_vocab.index_dataset(self.train_data,
                                      self.dev_data,
                                      field_name='question_char')
        self.char_vocab.index_dataset(self.train_data,
                                      self.dev_data,
                                      field_name='context_char')
        self.char_vocab_size = len(self.char_vocab)
Example #14
def build_vocab(dataset_list, key):
    """
        Build vocab from the given datasets on certain key
    
        :param dataset_list: List of Dataset
        :param key: string for key
        :return vocab: Vocabulary, the vocab created
    """
    vocab = Vocabulary(min_freq=1)
    for dataset in dataset_list:
        dataset.apply(lambda x: [vocab.add(word) for word in x[key]])
    vocab.build_vocab()
    return vocab
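Below is a minimal usage sketch for build_vocab, assuming two small hypothetical fastNLP DataSets that both carry a 'words' field; the data and field name are invented for illustration.

# Hypothetical illustration of calling the build_vocab function defined above.
from fastNLP import DataSet, Instance

train_ds = DataSet([Instance(words=['hello', 'world']),
                    Instance(words=['hello', 'fastNLP'])])
dev_ds = DataSet([Instance(words=['another', 'sentence'])])

vocab = build_vocab([train_ds, dev_ds], key='words')
print(len(vocab))  # vocabulary size, including the default <pad> and <unk> entries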
Example #15
def input_with_span_attr(datasets, vocabs):
    datasets['train'].apply_field(lambda x: list(map(lambda y: y[0], x)), field_name='target',
                                  new_field_name='span_label')
    if 'dev' in datasets:
        datasets['dev'].apply_field(lambda x: list(map(lambda y: y[0], x)), field_name='target',
                                    new_field_name='span_label')
    datasets['test'].apply_field(lambda x: list(map(lambda y: y[0], x)), field_name='target',
                                 new_field_name='span_label')

    datasets['train'].apply_field(lambda x: list(map(lambda y: y[2:] if y[0] in ['S', 'B'] else ATTR_NULL_TAG, x)),
                                  field_name='target', new_field_name='attr_start_label')
    if 'dev' in datasets:
        datasets['dev'].apply_field(lambda x: list(map(lambda y: y[2:] if y[0] in ['S', 'B'] else ATTR_NULL_TAG, x)),
                                    field_name='target', new_field_name='attr_start_label')
    datasets['test'].apply_field(lambda x: list(map(lambda y: y[2:] if y[0] in ['S', 'B'] else ATTR_NULL_TAG, x)),
                                 field_name='target', new_field_name='attr_start_label')

    datasets['train'].apply_field(lambda x: list(map(lambda y: y[2:] if y[0] in ['S', 'E'] else ATTR_NULL_TAG, x)),
                                  field_name='target', new_field_name='attr_end_label')
    if 'dev' in datasets:
        datasets['dev'].apply_field(lambda x: list(map(lambda y: y[2:] if y[0] in ['S', 'E'] else ATTR_NULL_TAG, x)),
                                    field_name='target', new_field_name='attr_end_label')
    datasets['test'].apply_field(lambda x: list(map(lambda y: y[2:] if y[0] in ['S', 'E'] else ATTR_NULL_TAG, x)),
                                 field_name='target', new_field_name='attr_end_label')

    span_label_vocab = Vocabulary()
    attr_label_vocab = Vocabulary()
    span_label_vocab.from_dataset(datasets['train'], field_name='span_label')
    attr_label_vocab.from_dataset(datasets['train'], field_name=['attr_start_label', 'attr_end_label'])
    vocabs['span_label'] = span_label_vocab
    vocabs['attr_label'] = attr_label_vocab
    print(f"span label: {span_label_vocab.word2idx.keys()}")
    print(f"attr label: {attr_label_vocab.word2idx.keys()}")
    return datasets, vocabs
Example #16
def load_dataset(
    data_dir='/remote-home/ygxu/workspace/Product_all',
    data_path='mr.task.train',
    # bert_dir='/home/ygxu/BERT/BERT_English_uncased_L-12_H-768_A_12',
    bert_dir='/remote-home/ygxu/workspace/BERT/BERT_English_uncased_L-24_H-1024_A_16',
):

    path = os.path.join(data_dir, data_path)

    ds = DataSet.read_csv(path, headers=('label', 'raw_sentence'), sep='\t')

    ds.apply(lambda x: x['raw_sentence'].lower(),
             new_field_name='raw_sentence')

    ds.apply(lambda x: int(x['label']),
             new_field_name='label_seq',
             is_target=True)

    def transfer_bert_to_fastnlp(ins):
        result = "[CLS] "
        bert_text = ins['bert_tokenize_list']
        for text in bert_text:
            result += text + " "
        return result.strip()

    with open(os.path.join(bert_dir, 'vocab.txt')) as f:
        lines = f.readlines()
    vocabs = []
    for line in lines:
        vocabs.append(line[:-1])

    vocab_bert = Vocabulary(unknown=None, padding=None)
    vocab_bert.add_word_lst(vocabs)
    vocab_bert.build_vocab()
    vocab_bert.unknown = '[UNK]'
    vocab_bert.padding = '[PAD]'

    from pytorch_pretrained_bert import BertTokenizer, BertModel
    tokenizer = BertTokenizer.from_pretrained(
        os.path.join(bert_dir, 'vocab.txt'))
    ds.apply(lambda x: tokenizer.tokenize(x['raw_sentence']),
             new_field_name='bert_tokenize_list')
    ds.apply(transfer_bert_to_fastnlp, new_field_name='bert_tokenize')
    ds.apply(lambda x:
             [vocab_bert.to_index(word) for word in x['bert_tokenize_list']],
             new_field_name='index_words',
             is_input=True)

    ds.rename_field('index_words', 'tokens')
    ds.apply(lambda x: [1.] * len(x['tokens']),
             new_field_name='masks',
             is_input=True)

    return ds
Example #17
def get_data():
    dataset_train, dataset_test = get_text_classification_datasets()
    # print(dataset_train.data)

    dic_train = {
        "input" : dataset_train.data,
        "target" : dataset_train.target
    }
    dic_test = {
        "input" : dataset_test.data,
        "target" : dataset_test.target
    }

    dataset = DataSet(dic_train)
    test_data = DataSet(dic_test)

    dataset.apply_field(lambda x: re.sub(r'[{}]+'.format(string.punctuation), "", x.lower()), field_name='input', new_field_name='input')
    dataset.apply_field(lambda x: re.sub(r'[{}]+'.format(string.whitespace), " ", x), field_name='input', new_field_name='input')
    dataset.apply_field(lambda x: x.split(), field_name='input', new_field_name='words')

    test_data.apply_field(lambda x: re.sub(r'[{}]+'.format(string.punctuation), "", x.lower()), field_name='input', new_field_name='input')
    test_data.apply_field(lambda x: re.sub(r'[{}]+'.format(string.whitespace), " ", x), field_name='input', new_field_name='input')
    test_data.apply_field(lambda x: x.split(), field_name='input', new_field_name='words')


    # **************************
    dataset.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len')
    test_data.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len')
    dataset.rename_field('words', Const.INPUT)
    dataset.rename_field('seq_len', Const.INPUT_LEN)
    dataset.rename_field('target', Const.TARGET)
    
    test_data.rename_field('words', Const.INPUT)
    test_data.rename_field('seq_len', Const.INPUT_LEN)
    test_data.rename_field('target', Const.TARGET)

    # dataset.set_input(Const.INPUT, Const.INPUT_LEN)
    dataset.set_input(Const.INPUT)
    dataset.set_target(Const.TARGET)

    # test_data.set_input(Const.INPUT, Const.INPUT_LEN)
    test_data.set_input(Const.INPUT)
    test_data.set_target(Const.TARGET)
    # **************************

    # only use train for vocab or train+dev
    train_data, dev_data = dataset.split(0.1)
    # print(len(train_data), len(dev_data), len(test_data))
    # print(train_data[0])

    vocab = Vocabulary(min_freq=10).from_dataset(train_data, field_name=Const.INPUT)

    vocab.index_dataset(train_data, field_name=Const.INPUT,new_field_name=Const.INPUT)
    vocab.index_dataset(dev_data, field_name=Const.INPUT,new_field_name=Const.INPUT)
    vocab.index_dataset(test_data, field_name=Const.INPUT,new_field_name=Const.INPUT)

    # print(test_data[0])
    print(len(vocab))
    return vocab, train_data, dev_data, test_data
Example #18
 def test_case_2(self):
     # Test that embeddings can be concatenated as long as their vocabularies share the same indices
     ds = DataSet([
         Instance(words=['hello', 'world']),
         Instance(words=['hello', 'Jack'])
     ])
     vocab1 = Vocabulary().from_dataset(ds, field_name='words')
     vocab2 = Vocabulary().from_dataset(ds, field_name='words')
     self.assertEqual(len(vocab1), 5)
     cnn_embed = CNNCharEmbedding(vocab1, embed_size=60)
     lstm_embed = LSTMCharEmbedding(vocab2, embed_size=70)
     embed = StackEmbedding([cnn_embed, lstm_embed])
     x = torch.LongTensor([[2, 1, 0], [4, 3, 4]])
     y = embed(x)
     self.assertEqual(tuple(y.size()), (2, 3, 130))
Example #19
class JokeData(object):
    data_set = None
    train_data = None
    test_data = None
    vocab = None
    data_num = 0
    vocab_size = 0
    max_seq_len = 0

    def __init__(self, conf):
        print(conf.data_path)
        self.data_set = get_joke_data(conf.data_path)
        self.data_num = len(self.data_set)
        self.data_set.apply(self.split_sent,new_field_name='words')
        self.max_seq_len = min(self.max_seq_len,conf.max_seq_len)
        self.data_set.apply(lambda x : len(x['words']),new_field_name='seq_len')
        self.train_data,self.test_data = self.data_set.split(0.2)

    def split_chinese_sent(self,ins,remove_punc=False):
        line = ins['raw_joke'].strip()
        words = ['<START>']
        for c in line:
            if c in [',','。','?','!']:
                if remove_punc:
                    continue
                else:
                    words.append(c)
            else:
                words.append(c)
        words.append('<EOS>')
        self.max_seq_len = max(self.max_seq_len,len(words))
        return words
    
    def split_sent(self,ins,remove_punc=False):
        words = ['<START>'] + ins['raw_joke'].split() + ['<EOS>']
        self.max_seq_len = max(self.max_seq_len,len(words))
        return words

    def pad_seq(self,ins):
        words = ins['words']
        if(len(words) < self.max_seq_len):
            words = [0]*(self.max_seq_len-len(words)) + words
        else:
            words = words[:self.max_seq_len]
        return words
        
    def get_vocab(self):
        self.vocab = Vocabulary(min_freq=10)
        self.train_data.apply(lambda x : [self.vocab.add(word) for word in x['words']])
        self.vocab.build_vocab()
        self.vocab.build_reverse_vocab()
        self.vocab_size = self.vocab.__len__()

        self.train_data.apply(lambda x : [self.vocab.to_index(word) for word in x['words']],new_field_name='words')
        self.train_data.apply(self.pad_seq,new_field_name='pad_words')
        
        self.test_data.apply(lambda x : [self.vocab.to_index(word) for word in x['words']],new_field_name='words')
        self.test_data.apply(self.pad_seq,new_field_name='pad_words')
Example #20
 def test_elmo_embedding(self):
     vocab = Vocabulary().add_word_lst("This is a test .".split())
     elmo_embed = ElmoEmbedding(vocab, model_dir_or_name='tests/data_for_tests/embedding/small_elmo', layers='0,1')
     words = torch.LongTensor([[0, 1, 2]])
     hidden = elmo_embed(words)
     print(hidden.size())
     self.assertEqual(hidden.size(), (1, 3, elmo_embed.embedding_dim))
Example #21
    def test_bert_embedding_1(self):
        vocab = Vocabulary().add_word_lst(
            "this is a test . [SEP] NotInBERT".split())
        embed = BertEmbedding(
            vocab,
            model_dir_or_name='test/data_for_tests/embedding/small_bert',
            word_dropout=0.1)
        requires_grad = embed.requires_grad
        embed.requires_grad = not requires_grad
        embed.train()
        words = torch.LongTensor([[2, 3, 4, 0]])
        result = embed(words)
        self.assertEqual(result.size(), (1, 4, 16))

        embed = BertEmbedding(
            vocab,
            model_dir_or_name='test/data_for_tests/embedding/small_bert',
            word_dropout=0.1)
        embed.eval()
        words = torch.LongTensor([[2, 3, 4, 0]])
        result = embed(words)
        self.assertEqual(result.size(), (1, 4, 16))

        # auto-truncate instead of raising an error
        embed = BertEmbedding(
            vocab,
            model_dir_or_name='test/data_for_tests/embedding/small_bert',
            word_dropout=0.1,
            auto_truncate=True)

        words = torch.LongTensor([[2, 3, 4, 1] * 10, [2, 3] + [0] * 38])
        result = embed(words)
        self.assertEqual(result.size(), (2, 40, 16))
Example #22
 def test_elmo_embedding_layer_assertion(self):
     vocab = Vocabulary().add_word_lst("This is a test .".split())
     try:
         elmo_embed = ElmoEmbedding(vocab, model_dir_or_name='tests/data_for_tests/embedding/small_elmo',
                                    layers='0,1,2')
     except AssertionError as e:
         print(e)
Example #23
 def test_fit(self):
     """文本编码.
     """
     print('{} test_fit {}'.format('-' * 15, '-' * 15))
     texts = ['温都尔站', '东乌广厦', '国电四郎', '阿尔善站', '朱日和基']
     vocab = Vocabulary()
     for text in texts:
         vocab.add_word_lst(list(text))
     print(len(vocab))
     embed = StaticEmbedding(
         vocab, model_dir_or_name='./data/cn_char_fastnlp_100d.txt')
     texts_to_id = [[vocab.to_index(word) for word in list(text)]
                    for text in ['朱日和', '东台变']]
     print(texts_to_id)  # [[16, 17, 18], [6, 1, 1]]
     words = torch.LongTensor(texts_to_id)  # convert the texts to indices
     print(embed(words).size())  # torch.Size([2, 3, 100])
Example #24
    def test_roberta_embedding_1(self):
        weight_path = 'test/data_for_tests/embedding/small_roberta'
        vocab = Vocabulary().add_word_lst(
            "this is a test . [SEP] NotInRoberta".split())
        embed = RobertaEmbedding(vocab,
                                 model_dir_or_name=weight_path,
                                 word_dropout=0.1)
        requires_grad = embed.requires_grad
        embed.requires_grad = not requires_grad
        embed.train()
        words = torch.LongTensor([[2, 3, 4, 1]])
        result = embed(words)
        self.assertEqual(result.size(), (1, 4, 16))

        embed = RobertaEmbedding(vocab,
                                 model_dir_or_name=weight_path,
                                 word_dropout=0.1,
                                 only_use_pretrain_bpe=True)
        embed.eval()
        words = torch.LongTensor([[2, 3, 4, 1]])
        result = embed(words)
        self.assertEqual(result.size(), (1, 4, 16))

        # auto-truncate instead of raising an error
        embed = RobertaEmbedding(vocab,
                                 model_dir_or_name=weight_path,
                                 word_dropout=0.1,
                                 only_use_pretrain_bpe=True,
                                 auto_truncate=True)
        words = torch.LongTensor([[2, 3, 4, 1] * 10, [2, 3] + [0] * 38])
        result = embed(words)
        self.assertEqual(result.size(), (2, 40, 16))
Example #25
 def test_case(self):
     vocab = Vocabulary().add_word_lst("This is a test .".split())
     embed = StaticEmbedding(vocab, embedding_dim=5)
     encoder = TransformerSeq2SeqEncoder(embed, num_layers=2, d_model=10, n_head=2)
     words_idx = torch.LongTensor([0, 1, 2]).unsqueeze(0)
     seq_len = torch.LongTensor([3])
     encoder_output, encoder_mask = encoder(words_idx, seq_len)
     self.assertEqual(encoder_output.size(), (1, 3, 10))
Example #26
 def test_norm1(self):
     # only vectors found in the pretrained file are normalized
     vocab = Vocabulary().add_word_lst(['the', 'a', 'notinfile'])
     embed = StaticEmbedding(vocab, model_dir_or_name='test/data_for_tests/embedding/small_static_embedding/'
                                                      'glove.6B.50d_test.txt',
                             only_norm_found_vector=True)
     self.assertEqual(round(torch.norm(embed(torch.LongTensor([[2]]))).item(), 4), 1)
     self.assertNotEqual(torch.norm(embed(torch.LongTensor([[4]]))).item(), 1)
Example #27
 def test_case_1(self):
     ds = DataSet([Instance(words=['hello', 'world']), Instance(words=['Jack'])])
     vocab = Vocabulary().from_dataset(ds, field_name='words')
     self.assertEqual(len(vocab), 5)
     embed = LSTMCharEmbedding(vocab, embed_size=60)
     x = torch.LongTensor([[2, 1, 0], [4, 3, 4]])
     y = embed(x)
     self.assertEqual(tuple(y.size()), (2, 3, 60))
Example #28
def preprocess(batch=16):
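    # Note: traindata, testdata, and the pre() tokenizer are module-level objects defined elsewhere in the original source.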
    raw_data1 = []
    raw_data2 = []

    for i in range(len(traindata.data)):
        raw_data1.append(
            Instance(sentence=traindata.data[i],
                     label=int(traindata.target[i])))
    trainset = DataSet(raw_data1)
    trainset.apply(lambda x: pre(x['sentence']), new_field_name='words')

    for i in range(len(testdata.data)):
        raw_data2.append(
            Instance(sentence=testdata.data[i], label=int(testdata.target[i])))
    testset = DataSet(raw_data2)
    testset.apply(lambda x: pre(x['sentence']), new_field_name='words')

    global vocab
    vocab = Vocabulary(min_freq=1).from_dataset(trainset,
                                                testset,
                                                field_name='words')
    vocab.index_dataset(trainset,
                        testset,
                        field_name='words',
                        new_field_name='words')
    trainset.set_input('words')
    testset.set_input('words')

    trainset.apply(lambda x: int(x['label']),
                   new_field_name='target',
                   is_target=True)
    testset.apply(lambda x: int(x['label']),
                  new_field_name='target',
                  is_target=True)

    trainset.apply(lambda x: len(x['words']), new_field_name='seq_len')
    testset.apply(lambda x: len(x['words']), new_field_name='seq_len')

    global vocabsize
    vocabsize = len(vocab)
    sampler = BucketSampler(batch_size=batch, seq_len_field_name='seq_len')
    train_batch = Batch(batch_size=batch, dataset=trainset, sampler=sampler)
    test_batch = Batch(batch_size=batch, dataset=testset, sampler=sampler)

    return train_batch, test_batch, vocabsize
Example #29
def get_fastnlp_dataset():
    text_train, text_test = get_text_classification_datasets()
    train_data = DataSet()
    test_data = DataSet()
    for i in range(len(text_train.data)):
        train_data.append(
            Instance(text=split_sent(text_train.data[i]),
                     target=int(text_train.target[i])))
    for i in range(len(text_test.data)):
        test_data.append(
            Instance(text=split_sent(text_test.data[i]),
                     target=int(text_test.target[i])))

    # build the vocabulary
    vocab = Vocabulary(min_freq=5, unknown='<unk>', padding='<pad>')
    train_data.apply(lambda x: [vocab.add(word) for word in x['text']])
    vocab.build_vocab()

    # map the sentences to indices using the vocabulary
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['text']],
                     new_field_name='word_seq')
    test_data.apply(lambda x: [vocab.to_index(word) for word in x['text']],
                    new_field_name='word_seq')

    # set the input and target fields
    train_data.set_input("word_seq")
    test_data.set_input("word_seq")
    train_data.set_target("target")
    test_data.set_target("target")

    return train_data, test_data, vocab
Example #30
def data_process():
    with open('./data.txt', encoding='utf-8') as fp:
        out = fp.readlines()
        data = list(out)

    poem = []
    cnt = 0
    for temp in data:
        cnt += 1
        if cnt % 2 == 0:
            rec = re.sub(',', '', temp)
            poem.append(rec[:-1])

    poem_normalized = []
    for i in range(len(poem)):
        if len(poem[i]) < 80:
            poem[i] = ' ' * (80 - len(poem[i])) + poem[i]
            poem_normalized.append(poem[i])
        else:
            poem_normalized.append(poem[i][:80])

    vocab = Vocabulary(min_freq=2)
    for temp in poem_normalized:
        for x in temp:
            vocab.add(x)

    vocab.build_vocab()
    dataset = []
    for temp in poem_normalized:
        dataset.append([vocab.to_index(x) for x in temp])
    return vocab, np.array(dataset)