import os

from fastNLP import DataSet
from fastNLP import Vocabulary


def load_dataset(
        data_dir='/remote-home/ygxu/workspace/Product_all',
        data_path='mr.task.train',
        # bert_dir='/home/ygxu/BERT/BERT_English_uncased_L-12_H-768_A_12',
        bert_dir='/remote-home/ygxu/workspace/BERT/BERT_English_uncased_L-24_H-1024_A_16',
):
    path = os.path.join(data_dir, data_path)

    # Read the raw data and normalize the text / label fields
    ds = DataSet.read_csv(path, headers=('label', 'raw_sentence'), sep='\t')
    ds.apply(lambda x: x['raw_sentence'].lower(), new_field_name='raw_sentence')
    ds.apply(lambda x: int(x['label']), new_field_name='label_seq', is_target=True)

    def transfer_bert_to_fastnlp(ins):
        # Join the BERT word pieces back into a single "[CLS] ..." string
        result = "[CLS] "
        bert_text = ins['bert_tokenize_list']
        for text in bert_text:
            result += text + " "
        return result.strip()

    # Build a fastNLP Vocabulary from the BERT vocab file
    with open(os.path.join(bert_dir, 'vocab.txt')) as f:
        lines = f.readlines()
    vocabs = []
    for line in lines:
        vocabs.append(line[:-1])

    vocab_bert = Vocabulary(unknown=None, padding=None)
    vocab_bert.add_word_lst(vocabs)
    vocab_bert.build_vocab()
    vocab_bert.unknown = '[UNK]'
    vocab_bert.padding = '[PAD]'

    # Tokenize with the pretrained BERT tokenizer and index with the BERT vocab
    from pytorch_pretrained_bert import BertTokenizer, BertModel
    tokenizer = BertTokenizer.from_pretrained(os.path.join(bert_dir, 'vocab.txt'))

    ds.apply(lambda x: tokenizer.tokenize(x['raw_sentence']),
             new_field_name='bert_tokenize_list')
    ds.apply(transfer_bert_to_fastnlp, new_field_name='bert_tokenize')
    ds.apply(lambda x: [vocab_bert.to_index(word) for word in x['bert_tokenize_list']],
             new_field_name='index_words', is_input=True)

    ds.rename_field('index_words', 'tokens')
    ds.apply(lambda x: [1.] * len(x['tokens']), new_field_name='masks', is_input=True)

    return ds
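
# A minimal usage sketch for load_dataset (not part of the original test): it assumes the
# default data_dir/bert_dir paths and the mr.task.train file exist locally, which is only
# true on the original author's machine, so treat it as illustration rather than as a test.
if __name__ == '__main__':
    bert_ds = load_dataset()
    # The returned DataSet carries 'tokens' and 'masks' as input fields and 'label_seq' as target
    print(len(bert_ds))
    print(bert_ds[0])
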
def test_fastnlp_advanced_tutorial(self):
    import os
    os.chdir("tutorials/fastnlp_advanced_tutorial")

    from fastNLP import DataSet
    from fastNLP import Instance
    from fastNLP import Vocabulary
    from fastNLP import Trainer
    from fastNLP import Tester

    # ### Instance
    # An Instance represents one sample. It consists of one or more fields (attributes/features),
    # each with its own name and value.
    # When constructing an Instance, its fields are given as "field_name=field_value" keyword arguments.

    # In[2]:
    # Build an Instance made up of three fields: premise, hypothesis, and label
    instance = Instance(premise='an premise example .', hypothesis='an hypothesis example.', label=1)
    instance

    # In[3]:
    data_set = DataSet([instance] * 5)
    data_set.append(instance)
    data_set[-2:]

    # In[4]:
    # An instance whose field type differs from the dataset's corresponding field type can still be appended
    instance2 = Instance(premise='the second premise example .',
                         hypothesis='the second hypothesis example.', label='1')
    try:
        data_set.append(instance2)
    except:
        pass
    data_set[-2:]

    # In[5]:
    # If a field name does not match, the instance cannot be appended to the dataset
    instance3 = Instance(premises='the third premise example .',
                         hypothesis='the third hypothesis example.', label=1)
    try:
        data_set.append(instance3)
    except:
        print('cannot append instance')
        pass
    data_set[-2:]

    # In[6]:
    # Besides text, a tensor can also be used as a field value
    import torch
    tensor_ins = Instance(image=torch.randn(5, 5), label=0)
    ds = DataSet()
    ds.append(tensor_ins)
    ds

    from fastNLP import DataSet
    from fastNLP import Instance

    # Read data from a csv file into a DataSet.
    # Any csv-like file, i.e. one example per line, can be read this way.
    dataset = DataSet.read_csv('tutorial_sample_dataset.csv', headers=('raw_sentence', 'label'), sep='\t')

    # Check the size of the DataSet
    len(dataset)

    # In[8]:
    # Use integer index [k] to get the k-th sample
    dataset[0]

    # In[9]:
    # The returned sample is an Instance
    type(dataset[0])

    # In[10]:
    # Use slice [a:b] to get samples a through b
    dataset[0:3]

    # In[11]:
    # Indices can also be negative
    dataset[-1]

    data_path = ['premise', 'hypothesis', 'label']

    # Read the files
    with open(data_path[0]) as f:
        premise = f.readlines()
    with open(data_path[1]) as f:
        hypothesis = f.readlines()
    with open(data_path[2]) as f:
        label = f.readlines()

    assert len(premise) == len(hypothesis) and len(hypothesis) == len(label)

    # Build the DataSet
    data_set = DataSet()
    for p, h, l in zip(premise, hypothesis, label):
        p = p.strip()  # strip trailing whitespace
        h = h.strip()  # strip trailing whitespace
        data_set.append(Instance(premise=p, hypothesis=h, truth=l))

    data_set[0]

    # ### Other operations on DataSet
    # After a DataSet is built, its contents can still be modified via DataSet.apply()

    # In[13]:
    # Lowercase all text in the premise field
    data_set.apply(lambda x: x['premise'].lower(), new_field_name='premise')
    data_set[-2:]

    # In[14]:
    # Convert labels to int
    data_set.apply(lambda x: int(x['truth']), new_field_name='truth')
    data_set[-2:]

    # In[15]:
    # Split sentences on whitespace
    def split_sent(ins):
        return ins['premise'].split()

    data_set.apply(split_sent, new_field_name='premise')
    data_set.apply(lambda x: x['hypothesis'].split(), new_field_name='hypothesis')
    data_set[-2:]

    # In[16]:
    # Filter the data
    origin_data_set_len = len(data_set)
    data_set.drop(lambda x: len(x['premise']) <= 6)
    origin_data_set_len, len(data_set)

    # In[17]:
    # Add length information
    data_set.apply(lambda x: [1] * len(x['premise']), new_field_name='premise_len')
    data_set.apply(lambda x: [1] * len(x['hypothesis']), new_field_name='hypothesis_len')
    data_set[-1]

    # In[18]:
    # Mark the input fields and the target field
    data_set.set_input("premise", "premise_len", "hypothesis", "hypothesis_len")
    data_set.set_target("truth")

    # In[19]:
    # Rename a field
    data_set.rename_field('truth', 'label')
    data_set[-1]

    # In[20]:
    # Split into train, dev, and test sets
    train_data, vad_data = data_set.split(0.5)
    dev_data, test_data = vad_data.split(0.4)
    len(train_data), len(dev_data), len(test_data)

    # In[21]:
    # Deep-copy a dataset
    import copy
    train_data_2, dev_data_2 = copy.deepcopy(train_data), copy.deepcopy(dev_data)
    del copy
    # Initialize the vocabulary: max_size is 10000, min_freq is 2, '<unk>' marks unknown words
    # and '<pad>' is the padding token.
    # Vocabulary's default init arguments are max_size=None, min_freq=None, unknown='<unk>', padding='<pad>'
    vocab = Vocabulary(max_size=10000, min_freq=2, unknown='<unk>', padding='<pad>')

    # Build the vocabulary
    train_data.apply(lambda x: [vocab.add(word) for word in x['premise']])
    train_data.apply(lambda x: [vocab.add(word) for word in x['hypothesis']])
    vocab.build_vocab()

    # In[23]:
    # Index the sentences with the vocabulary
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['premise']],
                     new_field_name='premise')
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['hypothesis']],
                     new_field_name='hypothesis')
    dev_data.apply(lambda x: [vocab.to_index(word) for word in x['premise']],
                   new_field_name='premise')
    dev_data.apply(lambda x: [vocab.to_index(word) for word in x['hypothesis']],
                   new_field_name='hypothesis')
    test_data.apply(lambda x: [vocab.to_index(word) for word in x['premise']],
                    new_field_name='premise')
    test_data.apply(lambda x: [vocab.to_index(word) for word in x['hypothesis']],
                    new_field_name='hypothesis')
    train_data[-1], dev_data[-1], test_data[-1]

    # Read the vocab file
    with open('vocab.txt') as f:
        lines = f.readlines()
    vocabs = []
    for line in lines:
        vocabs.append(line.strip())

    # Instantiate a Vocabulary
    vocab_bert = Vocabulary(unknown=None, padding=None)
    # Add the word list to the Vocabulary
    vocab_bert.add_word_lst(vocabs)
    # Build the vocabulary
    vocab_bert.build_vocab()
    # Update the unknown and padding token strings
    vocab_bert.unknown = '[UNK]'
    vocab_bert.padding = '[PAD]'

    # In[25]:
    # Index the sentences with the BERT vocabulary
    train_data_2.apply(lambda x: [vocab_bert.to_index(word) for word in x['premise']],
                       new_field_name='premise')
    train_data_2.apply(lambda x: [vocab_bert.to_index(word) for word in x['hypothesis']],
                       new_field_name='hypothesis')
    dev_data_2.apply(lambda x: [vocab_bert.to_index(word) for word in x['premise']],
                     new_field_name='premise')
    dev_data_2.apply(lambda x: [vocab_bert.to_index(word) for word in x['hypothesis']],
                     new_field_name='hypothesis')
    train_data_2[-1], dev_data_2[-1]

    # step 1: load model hyperparameters (optional)
    from fastNLP.io.config_io import ConfigSection, ConfigLoader
    args = ConfigSection()
    ConfigLoader().load_config("./data/config", {"esim_model": args})
    args["vocab_size"] = len(vocab)
    args.data

    # In[27]:
    # step 2: build the ESIM model
    from fastNLP.models import ESIM
    model = ESIM(**args.data)
    model

    # In[28]:
    # Another example: build the CNNText classification model
    from fastNLP.models import CNNText
    cnn_text_model = CNNText(embed_num=len(vocab), embed_dim=50, num_classes=5,
                             padding=2, dropout=0.1)
    cnn_text_model

    from fastNLP import CrossEntropyLoss
    from fastNLP import Adam
    from fastNLP import AccuracyMetric

    trainer = Trainer(
        train_data=train_data,
        model=model,
        loss=CrossEntropyLoss(pred='pred', target='label'),
        metrics=AccuracyMetric(),
        n_epochs=3,
        batch_size=16,
        print_every=-1,
        validate_every=-1,
        dev_data=dev_data,
        use_cuda=False,
        optimizer=Adam(lr=1e-3, weight_decay=0),
        check_code_level=-1,
        metric_key='acc',
        use_tqdm=False,
    )
    trainer.train()

    tester = Tester(
        data=test_data,
        model=model,
        metrics=AccuracyMetric(),
        batch_size=args["batch_size"],
    )
    tester.test()

    os.chdir("../..")