def test_same_vector(self):
    vocab = Vocabulary().add_word_lst(["The", "the", "THE"])
    embed = StaticEmbedding(vocab, model_dir_or_name=None,
                            embedding_dim=5, lower=True)
    words = torch.LongTensor([[vocab.to_index(word)
                               for word in ["The", "the", "THE"]]])
    words = embed(words)
    embed_0 = words[0, 0]
    for i in range(1, words.size(1)):
        assert torch.sum(embed_0 == words[0, i]).eq(len(embed_0))
def test_Index2WordProcessor(self):
    vocab = Vocabulary()
    vocab.add_word_lst(["a", "b", "c", "d", "e"])
    proc = Index2WordProcessor(vocab, "tag_id", "tag")
    data_set = DataSet(
        [Instance(tag_id=[np.random.randint(0, 7) for _ in range(32)])])
    data_set = proc(data_set)
    self.assertTrue("tag" in data_set)
def test_index(self):
    vocab = Vocabulary()
    vocab.update(text)
    res = [vocab[w] for w in set(text)]
    self.assertEqual(len(res), len(set(res)))
    res = [vocab.to_index(w) for w in set(text)]
    self.assertEqual(len(res), len(set(res)))
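# A minimal round-trip sketch (illustrative, not from the original tests;
# it assumes fastNLP's Vocabulary also exposes to_word, the inverse of the
# to_index call used above): every known word should map to an index and
# back to itself.
def vocab_roundtrip_example():
    vocab = Vocabulary()
    vocab.update(["FastNLP", "works", "well"])
    for word in ["FastNLP", "works", "well"]:
        idx = vocab.to_index(word)
        assert vocab.to_word(idx) == word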
def get_vocab(dataset):
    vocabulary = Vocabulary(unknown=unk_str, padding=pad_str)
    for data, _ in dataset:
        vocabulary.add_word_lst(data)
    print('vocab', len(vocabulary))
    print('pad', vocabulary.to_index(pad_str))
    return vocabulary
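# Usage sketch for get_vocab (hypothetical toy data, not from the original
# source). get_vocab reads the module-level names unk_str and pad_str
# (e.g. '<unk>' / '<pad>', assumed here) and expects the dataset to yield
# (token_list, label) pairs, mirroring the loop above.
def get_vocab_example():
    toy_dataset = [(["hello", "world"], 0), (["hello", "again"], 1)]
    toy_vocab = get_vocab(toy_dataset)
    # known words should not collapse onto the unknown index
    assert toy_vocab.to_index("hello") != toy_vocab.to_index(unk_str)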
def test_roberta_embedding_2(self):
    # Test that only_use_pretrain_bpe and truncate_embed behave correctly.
    Embedding = RobertaEmbedding
    weight_path = 'test/data_for_tests/embedding/small_roberta'
    vocab = Vocabulary().add_word_lst("this is a texta and".split())
    embed1 = Embedding(vocab, model_dir_or_name=weight_path,
                       layers=list(range(3)),
                       only_use_pretrain_bpe=True, truncate_embed=True,
                       min_freq=1)
    # embed_bpe_vocab_size = len(vocab)-1 + 2  # exclude NotInBERT; add ##a, [CLS]
    # self.assertEqual(embed_bpe_vocab_size, len(embed1.model.tokenzier.vocab))

    embed2 = Embedding(vocab, model_dir_or_name=weight_path,
                       layers=list(range(3)),
                       only_use_pretrain_bpe=True, truncate_embed=False,
                       min_freq=1)
    # embed_bpe_vocab_size = num_word  # exclude NotInBERT
    # self.assertEqual(embed_bpe_vocab_size, len(embed2.model.tokenzier.vocab))

    embed3 = Embedding(vocab, model_dir_or_name=weight_path,
                       layers=list(range(3)),
                       only_use_pretrain_bpe=False, truncate_embed=True,
                       min_freq=1)
    # embed_bpe_vocab_size = len(vocab)+2  # newly added ##a, [CLS]
    # self.assertEqual(embed_bpe_vocab_size, len(embed3.model.tokenzier.vocab))

    embed4 = Embedding(vocab, model_dir_or_name=weight_path,
                       layers=list(range(3)),
                       only_use_pretrain_bpe=False, truncate_embed=False,
                       min_freq=1)
    # embed_bpe_vocab_size = num_word+1  # newly added ##a
    # self.assertEqual(embed_bpe_vocab_size, len(embed4.model.tokenzier.vocab))

    # In all four settings the produced tensors should be identical.
    embed1.eval()
    embed2.eval()
    embed3.eval()
    embed4.eval()
    tensor = torch.LongTensor(
        [[vocab.to_index(w) for w in 'this is a texta and'.split()]])
    t1 = embed1(tensor)
    t2 = embed2(tensor)
    t3 = embed3(tensor)
    t4 = embed4(tensor)
    self.assertEqual((t1 - t2).sum(), 0)
    self.assertEqual((t1 - t3).sum(), 0)
    self.assertEqual((t1 - t4).sum(), 0)
def prepare_env():
    vocab = Vocabulary().add_word_lst("This is a test .".split())
    vocab.add_word_lst("Another test !".split())
    embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=5)
    encoder_output = torch.randn(2, 3, 10)
    src_seq_len = torch.LongTensor([3, 2])
    encoder_mask = seq_len_to_mask(src_seq_len)
    return embed, encoder_output, encoder_mask
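# A quick shape sanity check for prepare_env (illustrative only, not part of
# the original tests): seq_len_to_mask should produce a (batch, max_len)
# mask that matches encoder_output's first two dimensions.
def check_prepare_env():
    embed, encoder_output, encoder_mask = prepare_env()
    assert encoder_mask.size() == encoder_output.size()[:2]  # (2, 3)
    assert embed.embedding_dim == 5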
def test_fastnlp_1min_tutorial(self):
    # tutorials/fastnlp_1min_tutorial.ipynb
    data_path = "test/data_for_tests/tutorial_sample_dataset.csv"
    ds = DataSet.read_csv(data_path, headers=('raw_sentence', 'label'),
                          sep='\t')
    print(ds[1])

    # lowercase the raw sentences
    ds.apply(lambda x: x['raw_sentence'].lower(),
             new_field_name='raw_sentence')

    # convert the label to int
    ds.apply(lambda x: int(x['label']), new_field_name='target',
             is_target=True)

    def split_sent(ins):
        return ins['raw_sentence'].split()

    ds.apply(split_sent, new_field_name='words', is_input=True)

    # split into train/dev sets
    train_data, dev_data = ds.split(0.3)
    print("Train size: ", len(train_data))
    print("Dev size: ", len(dev_data))

    from fastNLP import Vocabulary
    vocab = Vocabulary(min_freq=2)
    train_data.apply(lambda x: [vocab.add(word) for word in x['words']])

    # index the sentences with Vocabulary.to_index(word)
    train_data.apply(
        lambda x: [vocab.to_index(word) for word in x['words']],
        new_field_name='words', is_input=True)
    dev_data.apply(lambda x: [vocab.to_index(word) for word in x['words']],
                   new_field_name='words', is_input=True)

    from fastNLP.models import CNNText
    model = CNNText((len(vocab), 50), num_classes=5, padding=2, dropout=0.1)

    from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric, Adam
    trainer = Trainer(model=model,
                      train_data=train_data,
                      dev_data=dev_data,
                      loss=CrossEntropyLoss(),
                      optimizer=Adam(),
                      metrics=AccuracyMetric(target='target'))
    trainer.train()
    print('Train finished!')
def prepare_env():
    vocab = Vocabulary().add_word_lst("This is a test .".split())
    vocab.add_word_lst("Another test !".split())
    embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=5)
    src_words_idx = torch.LongTensor([[3, 1, 2], [1, 2, 0]])
    tgt_words_idx = torch.LongTensor([[1, 2, 3, 4], [2, 3, 0, 0]])
    src_seq_len = torch.LongTensor([3, 2])
    tgt_seq_len = torch.LongTensor([4, 2])
    return embed, src_words_idx, tgt_words_idx, src_seq_len, tgt_seq_len
def test_iteration(self):
    vocab = Vocabulary()
    text = [
        "FastNLP", "works", "well", "in", "most", "cases", "and", "scales",
        "well", "in", "works", "well", "in", "most", "cases", "scales",
        "well"
    ]
    vocab.update(text)
    text = set(text)
    for word in vocab:
        self.assertTrue(word in text)
def get_vocab(self):
    self.vocab = Vocabulary(min_freq=10)
    self.train_data.apply(
        lambda x: [self.vocab.add(word) for word in x['words']])
    self.vocab.build_vocab()
    self.vocab.build_reverse_vocab()
    self.vocab_size = len(self.vocab)
    self.train_data.apply(
        lambda x: [self.vocab.to_index(word) for word in x['words']],
        new_field_name='words')
    self.train_data.apply(self.pad_seq, new_field_name='pad_words')
    self.test_data.apply(
        lambda x: [self.vocab.to_index(word) for word in x['words']],
        new_field_name='words')
    self.test_data.apply(self.pad_seq, new_field_name='pad_words')
def test_iteration(self):
    vocab = Vocabulary(padding=None, unknown=None)
    text = [
        "FastNLP", "works", "well", "in", "most", "cases", "and", "scales",
        "well", "in", "works", "well", "in", "most", "cases", "scales",
        "well"
    ]
    vocab.update(text)
    text = set(text)
    for word, idx in vocab:
        self.assertTrue(word in text)
        self.assertTrue(idx < len(vocab))
def test_same_vector2(self):
    vocab = Vocabulary().add_word_lst(
        ["The", 'a', 'b', "the", "THE", "B", 'a', "A"])
    embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6B-100d',
                            lower=True)
    words = torch.LongTensor([[vocab.to_index(word)
                               for word in ["The", "the", "THE", 'b', "B",
                                            'a', 'A']]])
    words = embed(words)
    embed_0 = words[0, 0]
    for i in range(1, 3):
        assert torch.sum(embed_0 == words[0, i]).eq(len(embed_0))
    embed_0 = words[0, 3]
    for i in range(3, 5):
        assert torch.sum(embed_0 == words[0, i]).eq(len(embed_0))
def __init__(self, min_word_count=3, min_char_count=10,
             train_file=None, dev_file=None):
    self.min_word_count = min_word_count
    self.min_char_count = min_char_count
    self.train_file = train_file
    self.dev_file = dev_file
    print("Loading SQuAD data.")
    if self.dev_file is not None:
        use_pickle = self.dev_file.split('.')[-1] == "pkl"
        self.dev_data = self.load_file(self.dev_file,
                                       pickle_file=use_pickle)
    if self.train_file is not None:
        use_pickle = self.train_file.split('.')[-1] == "pkl"
        self.train_data = self.load_file(self.train_file,
                                         pickle_file=use_pickle)

    print("Building word vocab.")
    self.word_vocab = Vocabulary(min_freq=self.min_word_count)
    self.word_vocab.from_dataset(
        self.train_data, self.dev_data,
        field_name=['context_word', 'question_word'])
    self.word_vocab.index_dataset(self.train_data, self.dev_data,
                                  field_name='context_word')
    self.word_vocab.index_dataset(self.train_data, self.dev_data,
                                  field_name='question_word')
    self.word_vocab_size = len(self.word_vocab)

    print("Building char vocab.")
    self.char_vocab = Vocabulary(min_freq=self.min_char_count)
    self.char_vocab.from_dataset(
        self.train_data, self.dev_data,
        field_name=['context_char', 'question_char'])
    self.char_vocab.index_dataset(self.train_data, self.dev_data,
                                  field_name='question_char')
    self.char_vocab.index_dataset(self.train_data, self.dev_data,
                                  field_name='context_char')
    self.char_vocab_size = len(self.char_vocab)
def build_vocab(dataset_list, key):
    """Build a vocab from the given datasets on a certain key.

    :param dataset_list: list of DataSet
    :param key: string, the field to read words from
    :return vocab: Vocabulary, the vocab created
    """
    vocab = Vocabulary(min_freq=1)
    for dataset in dataset_list:
        dataset.apply(lambda x: [vocab.add(word) for word in x[key]])
    vocab.build_vocab()
    return vocab
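# Usage sketch for build_vocab (hypothetical toy DataSets, reusing only the
# DataSet/Instance API that appears elsewhere in this file):
def build_vocab_example():
    ds1 = DataSet([Instance(words=["hello", "world"])])
    ds2 = DataSet([Instance(words=["hello", "again"])])
    vocab = build_vocab([ds1, ds2], key="words")
    # with the default specials, padding is index 0 and unknown is index 1,
    # so real words start at index 2
    assert vocab.to_index("hello") >= 2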
def input_with_span_attr(datasets, vocabs):
    # Derive three label views from the 'target' tags (e.g. 'B-attr'):
    #   span_label: the position prefix (first character of each tag)
    #   attr_start_label: the attribute part for 'S'/'B' tags, else ATTR_NULL_TAG
    #   attr_end_label: the attribute part for 'S'/'E' tags, else ATTR_NULL_TAG
    def apply_to_splits(func, new_field_name):
        for split in ('train', 'dev', 'test'):
            if split in datasets:
                datasets[split].apply_field(func, field_name='target',
                                            new_field_name=new_field_name)

    apply_to_splits(lambda x: [y[0] for y in x], 'span_label')
    apply_to_splits(
        lambda x: [y[2:] if y[0] in ['S', 'B'] else ATTR_NULL_TAG for y in x],
        'attr_start_label')
    apply_to_splits(
        lambda x: [y[2:] if y[0] in ['S', 'E'] else ATTR_NULL_TAG for y in x],
        'attr_end_label')

    span_label_vocab = Vocabulary()
    attr_label_vocab = Vocabulary()
    span_label_vocab.from_dataset(datasets['train'], field_name='span_label')
    attr_label_vocab.from_dataset(
        datasets['train'], field_name=['attr_start_label', 'attr_end_label'])
    vocabs['span_label'] = span_label_vocab
    vocabs['attr_label'] = attr_label_vocab
    print(f"span label: {span_label_vocab.word2idx.keys()}")
    print(f"attr label: {attr_label_vocab.word2idx.keys()}")
    return datasets, vocabs
def load_dataset(
        data_dir='/remote-home/ygxu/workspace/Product_all',
        data_path='mr.task.train',
        # bert_dir='/home/ygxu/BERT/BERT_English_uncased_L-12_H-768_A_12',
        bert_dir='/remote-home/ygxu/workspace/BERT/BERT_English_uncased_L-24_H-1024_A_16',
):
    path = os.path.join(data_dir, data_path)
    ds = DataSet.read_csv(path, headers=('label', 'raw_sentence'), sep='\t')
    ds.apply(lambda x: x['raw_sentence'].lower(),
             new_field_name='raw_sentence')
    ds.apply(lambda x: int(x['label']), new_field_name='label_seq',
             is_target=True)

    def transfer_bert_to_fastnlp(ins):
        # Join BERT word pieces back into one '[CLS] ...' string.
        result = "[CLS] "
        bert_text = ins['bert_tokenize_list']
        for text in bert_text:
            result += text + " "
        return result.strip()

    # Build a Vocabulary from BERT's own vocab file so indices match the
    # pretrained model.
    with open(os.path.join(bert_dir, 'vocab.txt')) as f:
        vocabs = [line.rstrip('\n') for line in f]
    vocab_bert = Vocabulary(unknown=None, padding=None)
    vocab_bert.add_word_lst(vocabs)
    vocab_bert.build_vocab()
    vocab_bert.unknown = '[UNK]'
    vocab_bert.padding = '[PAD]'

    from pytorch_pretrained_bert import BertTokenizer
    tokenizer = BertTokenizer.from_pretrained(
        os.path.join(bert_dir, 'vocab.txt'))
    ds.apply(lambda x: tokenizer.tokenize(x['raw_sentence']),
             new_field_name='bert_tokenize_list')
    ds.apply(transfer_bert_to_fastnlp, new_field_name='bert_tokenize')
    ds.apply(lambda x: [vocab_bert.to_index(word)
                        for word in x['bert_tokenize_list']],
             new_field_name='index_words', is_input=True)
    ds.rename_field('index_words', 'tokens')
    ds.apply(lambda x: [1.] * len(x['tokens']), new_field_name='masks',
             is_input=True)
    return ds
def get_data():
    dataset_train, dataset_test = get_text_classification_datasets()
    dic_train = {
        "input": dataset_train.data,
        "target": dataset_train.target
    }
    dic_test = {
        "input": dataset_test.data,
        "target": dataset_test.target
    }
    dataset = DataSet(dic_train)
    test_data = DataSet(dic_test)

    # Lowercase, strip punctuation, collapse whitespace, then tokenize.
    for ds in (dataset, test_data):
        ds.apply_field(
            lambda x: re.sub(r'[{}]+'.format(string.punctuation), "",
                             x.lower()),
            field_name='input', new_field_name='input')
        ds.apply_field(
            lambda x: re.sub(r'[{}]+'.format(string.whitespace), " ", x),
            field_name='input', new_field_name='input')
        ds.apply_field(lambda x: x.split(), field_name='input',
                       new_field_name='words')
        ds.apply_field(lambda x: len(x), field_name='words',
                       new_field_name='seq_len')
        ds.rename_field('words', Const.INPUT)
        ds.rename_field('seq_len', Const.INPUT_LEN)
        ds.rename_field('target', Const.TARGET)
        # seq_len could also be marked: ds.set_input(Const.INPUT, Const.INPUT_LEN)
        ds.set_input(Const.INPUT)
        ds.set_target(Const.TARGET)

    # only the training split is used to build the vocab (train+dev would
    # also be possible)
    train_data, dev_data = dataset.split(0.1)
    vocab = Vocabulary(min_freq=10).from_dataset(train_data,
                                                 field_name=Const.INPUT)
    for split in (train_data, dev_data, test_data):
        vocab.index_dataset(split, field_name=Const.INPUT,
                            new_field_name=Const.INPUT)
    print(len(vocab))
    return vocab, train_data, dev_data, test_data
def test_case_2(self):
    # Test that two embeddings can be stacked as long as their vocabularies
    # share the same indices.
    ds = DataSet([
        Instance(words=['hello', 'world']),
        Instance(words=['hello', 'Jack'])
    ])
    vocab1 = Vocabulary().from_dataset(ds, field_name='words')
    vocab2 = Vocabulary().from_dataset(ds, field_name='words')
    self.assertEqual(len(vocab1), 5)
    cnn_embed = CNNCharEmbedding(vocab1, embed_size=60)
    lstm_embed = LSTMCharEmbedding(vocab2, embed_size=70)
    embed = StackEmbedding([cnn_embed, lstm_embed])

    x = torch.LongTensor([[2, 1, 0], [4, 3, 4]])
    y = embed(x)
    self.assertEqual(tuple(y.size()), (2, 3, 130))
class JokeData(object):
    data_set = None
    train_data = None
    test_data = None
    vocab = None
    data_num = 0
    vocab_size = 0
    max_seq_len = 0

    def __init__(self, conf):
        print(conf.data_path)
        self.data_set = get_joke_data(conf.data_path)
        self.data_num = len(self.data_set)
        self.data_set.apply(self.split_sent, new_field_name='words')
        self.max_seq_len = min(self.max_seq_len, conf.max_seq_len)
        self.data_set.apply(lambda x: len(x['words']),
                            new_field_name='seq_len')
        self.train_data, self.test_data = self.data_set.split(0.2)

    def split_chinese_sent(self, ins, remove_punc=False):
        line = ins['raw_joke'].strip()
        words = ['<START>']
        for c in line:
            if c in [',', '。', '?', '!']:
                if remove_punc:
                    continue
                words.append(c)
            else:
                words.append(c)
        words.append('<EOS>')
        self.max_seq_len = max(self.max_seq_len, len(words))
        return words

    def split_sent(self, ins, remove_punc=False):
        words = ['<START>'] + ins['raw_joke'].split() + ['<EOS>']
        self.max_seq_len = max(self.max_seq_len, len(words))
        return words

    def pad_seq(self, ins):
        # Left-pad with 0 (the padding index) up to max_seq_len, truncating
        # longer sequences.
        words = ins['words']
        if len(words) < self.max_seq_len:
            words = [0] * (self.max_seq_len - len(words)) + words
        else:
            words = words[:self.max_seq_len]
        return words

    def get_vocab(self):
        self.vocab = Vocabulary(min_freq=10)
        self.train_data.apply(
            lambda x: [self.vocab.add(word) for word in x['words']])
        self.vocab.build_vocab()
        self.vocab.build_reverse_vocab()
        self.vocab_size = len(self.vocab)
        self.train_data.apply(
            lambda x: [self.vocab.to_index(word) for word in x['words']],
            new_field_name='words')
        self.train_data.apply(self.pad_seq, new_field_name='pad_words')
        self.test_data.apply(
            lambda x: [self.vocab.to_index(word) for word in x['words']],
            new_field_name='words')
        self.test_data.apply(self.pad_seq, new_field_name='pad_words')
def test_elmo_embedding(self):
    vocab = Vocabulary().add_word_lst("This is a test .".split())
    elmo_embed = ElmoEmbedding(
        vocab,
        model_dir_or_name='tests/data_for_tests/embedding/small_elmo',
        layers='0,1')
    words = torch.LongTensor([[0, 1, 2]])
    hidden = elmo_embed(words)
    print(hidden.size())
    self.assertEqual(hidden.size(), (1, 3, elmo_embed.embedding_dim))
def test_bert_embedding_1(self):
    vocab = Vocabulary().add_word_lst(
        "this is a test . [SEP] NotInBERT".split())
    embed = BertEmbedding(
        vocab,
        model_dir_or_name='test/data_for_tests/embedding/small_bert',
        word_dropout=0.1)
    requires_grad = embed.requires_grad
    embed.requires_grad = not requires_grad
    embed.train()
    words = torch.LongTensor([[2, 3, 4, 0]])
    result = embed(words)
    self.assertEqual(result.size(), (1, 4, 16))

    embed = BertEmbedding(
        vocab,
        model_dir_or_name='test/data_for_tests/embedding/small_bert',
        word_dropout=0.1)
    embed.eval()
    words = torch.LongTensor([[2, 3, 4, 0]])
    result = embed(words)
    self.assertEqual(result.size(), (1, 4, 16))

    # over-long input is truncated automatically instead of raising an error
    embed = BertEmbedding(
        vocab,
        model_dir_or_name='test/data_for_tests/embedding/small_bert',
        word_dropout=0.1,
        auto_truncate=True)
    words = torch.LongTensor([[2, 3, 4, 1] * 10, [2, 3] + [0] * 38])
    result = embed(words)
    self.assertEqual(result.size(), (2, 40, 16))
def test_elmo_embedding_layer_assertion(self):
    vocab = Vocabulary().add_word_lst("This is a test .".split())
    try:
        elmo_embed = ElmoEmbedding(
            vocab,
            model_dir_or_name='tests/data_for_tests/embedding/small_elmo',
            layers='0,1,2')
    except AssertionError as e:
        print(e)
def test_fit(self):
    """Text encoding."""
    print('{} test_fit {}'.format('-' * 15, '-' * 15))
    texts = ['温都尔站', '东乌广厦', '国电四郎', '阿尔善站', '朱日和基']
    vocab = Vocabulary()
    for text in texts:
        vocab.add_word_lst(list(text))
    print(len(vocab))

    embed = StaticEmbedding(
        vocab, model_dir_or_name='./data/cn_char_fastnlp_100d.txt')
    texts_to_id = [[vocab.to_index(word) for word in list(text)]
                   for text in ['朱日和', '东台变']]
    print(texts_to_id)  # [[16, 17, 18], [6, 1, 1]]
    words = torch.LongTensor(texts_to_id)  # convert the texts to indices
    print(embed(words).size())  # torch.Size([2, 3, 100])
def test_roberta_embedding_1(self):
    weight_path = 'test/data_for_tests/embedding/small_roberta'
    vocab = Vocabulary().add_word_lst(
        "this is a test . [SEP] NotInRoberta".split())
    embed = RobertaEmbedding(vocab, model_dir_or_name=weight_path,
                             word_dropout=0.1)
    requires_grad = embed.requires_grad
    embed.requires_grad = not requires_grad
    embed.train()
    words = torch.LongTensor([[2, 3, 4, 1]])
    result = embed(words)
    self.assertEqual(result.size(), (1, 4, 16))

    embed = RobertaEmbedding(vocab, model_dir_or_name=weight_path,
                             word_dropout=0.1, only_use_pretrain_bpe=True)
    embed.eval()
    words = torch.LongTensor([[2, 3, 4, 1]])
    result = embed(words)
    self.assertEqual(result.size(), (1, 4, 16))

    # over-long input is truncated automatically instead of raising an error
    embed = RobertaEmbedding(vocab, model_dir_or_name=weight_path,
                             word_dropout=0.1, only_use_pretrain_bpe=True,
                             auto_truncate=True)
    words = torch.LongTensor([[2, 3, 4, 1] * 10, [2, 3] + [0] * 38])
    result = embed(words)
    self.assertEqual(result.size(), (2, 40, 16))
def test_case(self):
    vocab = Vocabulary().add_word_lst("This is a test .".split())
    embed = StaticEmbedding(vocab, embedding_dim=5)
    encoder = TransformerSeq2SeqEncoder(embed, num_layers=2, d_model=10,
                                        n_head=2)
    words_idx = torch.LongTensor([0, 1, 2]).unsqueeze(0)
    seq_len = torch.LongTensor([3])
    encoder_output, encoder_mask = encoder(words_idx, seq_len)
    self.assertEqual(encoder_output.size(), (1, 3, 10))
def test_norm1(self):
    # only vectors found in the pretrained file should be normalized
    vocab = Vocabulary().add_word_lst(['the', 'a', 'notinfile'])
    embed = StaticEmbedding(
        vocab,
        model_dir_or_name='test/data_for_tests/embedding/'
                          'small_static_embedding/glove.6B.50d_test.txt',
        only_norm_found_vector=True)
    self.assertEqual(
        round(torch.norm(embed(torch.LongTensor([[2]]))).item(), 4), 1)
    self.assertNotEqual(
        torch.norm(embed(torch.LongTensor([[4]]))).item(), 1)
def test_case_1(self):
    ds = DataSet([Instance(words=['hello', 'world']),
                  Instance(words=['Jack'])])
    vocab = Vocabulary().from_dataset(ds, field_name='words')
    self.assertEqual(len(vocab), 5)
    embed = LSTMCharEmbedding(vocab, embed_size=60)
    x = torch.LongTensor([[2, 1, 0], [4, 3, 4]])
    y = embed(x)
    self.assertEqual(tuple(y.size()), (2, 3, 60))
def preprocess(batch=16):
    raw_data1 = []
    raw_data2 = []
    for i in range(len(traindata.data)):
        raw_data1.append(
            Instance(sentence=traindata.data[i],
                     label=int(traindata.target[i])))
    trainset = DataSet(raw_data1)
    trainset.apply(lambda x: pre(x['sentence']), new_field_name='words')
    for i in range(len(testdata.data)):
        raw_data2.append(
            Instance(sentence=testdata.data[i],
                     label=int(testdata.target[i])))
    testset = DataSet(raw_data2)
    testset.apply(lambda x: pre(x['sentence']), new_field_name='words')

    global vocab
    vocab = Vocabulary(min_freq=1).from_dataset(trainset, testset,
                                                field_name='words')
    vocab.index_dataset(trainset, testset, field_name='words',
                        new_field_name='words')
    trainset.set_input('words')
    testset.set_input('words')
    trainset.apply(lambda x: int(x['label']), new_field_name='target',
                   is_target=True)
    testset.apply(lambda x: int(x['label']), new_field_name='target',
                  is_target=True)
    trainset.apply(lambda x: len(x['words']), new_field_name='seq_len')
    testset.apply(lambda x: len(x['words']), new_field_name='seq_len')

    global vocabsize
    vocabsize = len(vocab)

    sampler = BucketSampler(batch_size=batch, seq_len_field_name='seq_len')
    train_batch = Batch(batch_size=batch, dataset=trainset, sampler=sampler)
    test_batch = Batch(batch_size=batch, dataset=testset, sampler=sampler)
    return train_batch, test_batch, vocabsize
def get_fastnlp_dataset():
    text_train, text_test = get_text_classification_datasets()
    train_data = DataSet()
    test_data = DataSet()
    for i in range(len(text_train.data)):
        train_data.append(
            Instance(text=split_sent(text_train.data[i]),
                     target=int(text_train.target[i])))
    for i in range(len(text_test.data)):
        test_data.append(
            Instance(text=split_sent(text_test.data[i]),
                     target=int(text_test.target[i])))

    # build the vocabulary
    vocab = Vocabulary(min_freq=5, unknown='<unk>', padding='<pad>')
    train_data.apply(lambda x: [vocab.add(word) for word in x['text']])
    vocab.build_vocab()

    # map sentences to index sequences with the vocabulary
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['text']],
                     new_field_name='word_seq')
    test_data.apply(lambda x: [vocab.to_index(word) for word in x['text']],
                    new_field_name='word_seq')

    # mark input and target fields
    train_data.set_input("word_seq")
    test_data.set_input("word_seq")
    train_data.set_target("target")
    test_data.set_target("target")

    return train_data, test_data, vocab
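# Illustrative follow-up (hypothetical, not in the original script): after
# get_fastnlp_dataset(), 'word_seq' holds vocabulary indices, so a sentence
# can be mapped back to tokens with vocab.to_word (assumed fastNLP API).
def inspect_first_instance():
    train_data, test_data, vocab = get_fastnlp_dataset()
    first = train_data[0]
    print([vocab.to_word(i) for i in first['word_seq']])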
def data_process():
    with open('./data.txt', encoding='utf-8') as fp:
        out = fp.readlines()
    data = list(out)
    poem = []
    cnt = 0
    # every second line holds a poem; drop commas and the trailing newline
    for temp in data:
        cnt += 1
        if cnt % 2 == 0:
            rec = re.sub(',', '', temp)
            poem.append(rec[:-1])

    # left-pad with spaces or truncate so every poem is exactly 80 characters
    poem_normalized = []
    for i in range(len(poem)):
        if len(poem[i]) < 80:
            poem[i] = ' ' * (80 - len(poem[i])) + poem[i]
            poem_normalized.append(poem[i])
        else:
            poem_normalized.append(poem[i][:80])

    vocab = Vocabulary(min_freq=2)
    for temp in poem_normalized:
        for x in temp:
            vocab.add(x)
    vocab.build_vocab()

    dataset = []
    for temp in poem_normalized:
        dataset.append([vocab.to_index(x) for x in temp])
    return vocab, np.array(dataset)
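# A small decoding helper (hypothetical, not in the original): map one
# indexed row produced by data_process back to its characters, assuming
# vocab.to_word is available as in fastNLP's Vocabulary.
def decode_poem(vocab, indexed_poem):
    return ''.join(vocab.to_word(int(i)) for i in indexed_poem)

# Example:
# vocab, dataset = data_process()
# print(decode_poem(vocab, dataset[0]))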