def test_save_and_load(self):
    model = CNNText((10, 10), 2)
    saver = ModelSaver('tmp')
    loader = ModelLoader()
    saver.save_pytorch(model)

    new_cnn = CNNText((10, 10), 2)
    loader.load_pytorch(new_cnn, 'tmp')
    new_model = loader.load_pytorch_model('tmp')

    for i in range(10):
        for j in range(10):
            self.assertEqual(model.embed.embed.weight[i, j], new_cnn.embed.embed.weight[i, j])
            self.assertEqual(model.embed.embed.weight[i, j], new_model["embed.embed.weight"][i, j])

    os.system('rm tmp')
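# A minimal plain-PyTorch sketch of the same round trip (an alternative to the
# ModelSaver/ModelLoader helpers above, not fastNLP's implementation): save the
# state dict, rebuild the model, and load the weights back.
import os
import torch

model = CNNText((10, 10), 2)
torch.save(model.state_dict(), 'tmp.pt')          # persist the parameters only

restored = CNNText((10, 10), 2)                    # same architecture
restored.load_state_dict(torch.load('tmp.pt'))     # weights now match `model`
os.remove('tmp.pt')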
def test_fastnlp_1min_tutorial(self):
    # tutorials/fastnlp_1min_tutorial.ipynb
    data_path = "test/data_for_tests/tutorial_sample_dataset.csv"
    ds = DataSet.read_csv(data_path, headers=('raw_sentence', 'label'), sep='\t')
    print(ds[1])

    # lowercase the raw sentences
    ds.apply(lambda x: x['raw_sentence'].lower(), new_field_name='raw_sentence')

    # convert label to int
    ds.apply(lambda x: int(x['label']), new_field_name='target', is_target=True)

    def split_sent(ins):
        return ins['raw_sentence'].split()

    ds.apply(split_sent, new_field_name='words', is_input=True)

    # split into train / dev sets
    train_data, dev_data = ds.split(0.3)
    print("Train size: ", len(train_data))
    print("Dev size: ", len(dev_data))

    from fastNLP import Vocabulary
    vocab = Vocabulary(min_freq=2)
    train_data.apply(lambda x: [vocab.add(word) for word in x['words']])

    # index the sentences with Vocabulary.to_index(word)
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['words']],
                     new_field_name='words', is_input=True)
    dev_data.apply(lambda x: [vocab.to_index(word) for word in x['words']],
                   new_field_name='words', is_input=True)

    from fastNLP.models import CNNText
    model = CNNText((len(vocab), 50), num_classes=5, padding=2, dropout=0.1)

    from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric, Adam
    trainer = Trainer(model=model,
                      train_data=train_data,
                      dev_data=dev_data,
                      loss=CrossEntropyLoss(),
                      optimizer=Adam(),
                      metrics=AccuracyMetric(target='target'))
    trainer.train()
    print('Train finished!')
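    # Added sketch, not part of the original 1-min tutorial: the held-out split can be
    # scored with fastNLP's Tester, mirroring the Tester usage in the longer tutorials below.
    from fastNLP import Tester
    tester = Tester(data=dev_data, model=model,
                    metrics=AccuracyMetric(target='target'), batch_size=4)
    print(tester.test())  # prints the accuracy on dev_data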
def test_fastnlp_10min_tutorial(self):
    # load data from csv into a DataSet
    sample_path = "test/data_for_tests/tutorial_sample_dataset.csv"
    dataset = CSVLoader(headers=['raw_sentence', 'label'], sep='\t')._load(sample_path)
    print(len(dataset))
    print(dataset[0])
    print(dataset[-3])

    dataset.append(Instance(raw_sentence='fake data', label='0'))
    # lowercase the raw sentences
    dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='raw_sentence')
    # convert label to int
    dataset.apply(lambda x: int(x['label']), new_field_name='label')

    # split sentences on whitespace
    def split_sent(ins):
        return ins['raw_sentence'].split()

    dataset.apply(split_sent, new_field_name='words')
    # add sequence length information
    dataset.apply(lambda x: len(x['words']), new_field_name='seq_len')
    print(len(dataset))
    print(dataset[0])

    # DataSet.drop(func) filters out samples
    dataset.drop(lambda x: x['seq_len'] <= 3, inplace=True)
    print(len(dataset))

    # mark which fields of the DataSet should be converted to tensors
    # set target: the gold labels used when computing the loss and evaluating the model
    dataset.set_target("label")
    # set input: the fields passed to the model's forward()
    dataset.set_input("words", "seq_len")

    # split into test and train sets
    test_data, train_data = dataset.split(0.5)
    print(len(test_data))
    print(len(train_data))

    # build the vocabulary with Vocabulary.add(word)
    vocab = Vocabulary(min_freq=2)
    train_data.apply(lambda x: [vocab.add(word) for word in x['words']])
    vocab.build_vocab()

    # index the sentences with Vocabulary.to_index(word)
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['words']],
                     new_field_name='words')
    test_data.apply(lambda x: [vocab.to_index(word) for word in x['words']],
                    new_field_name='words')
    print(test_data[0])

    # these preprocessing tools can also be reused for projects such as reinforcement learning or GANs
    from fastNLP.core.batch import DataSetIter
    from fastNLP.core.sampler import RandomSampler

    batch_iterator = DataSetIter(dataset=train_data, batch_size=2, sampler=RandomSampler())
    for batch_x, batch_y in batch_iterator:
        print("batch_x has: ", batch_x)
        print("batch_y has: ", batch_y)
        break

    from fastNLP.models import CNNText
    model = CNNText((len(vocab), 50), num_classes=5, dropout=0.1)

    from fastNLP import Trainer
    from copy import deepcopy

    # rename the corresponding fields so that their names match the model's forward() arguments
    train_data.rename_field('label', 'label_seq')
    test_data.rename_field('label', 'label_seq')

    loss = CrossEntropyLoss(target="label_seq")
    metric = AccuracyMetric(target="label_seq")

    # instantiate a Trainer with the model and data, then train
    # first overfit on test_data (a sanity check that the model implementation is correct)
    copy_model = deepcopy(model)
    overfit_trainer = Trainer(train_data=test_data, model=copy_model, loss=loss,
                              batch_size=32, n_epochs=5, dev_data=test_data,
                              metrics=metric, save_path=None)
    overfit_trainer.train()

    # train on train_data and validate on test_data
    trainer = Trainer(model=model, train_data=train_data, dev_data=test_data,
                      loss=CrossEntropyLoss(target="label_seq"),
                      metrics=AccuracyMetric(target="label_seq"),
                      save_path=None, batch_size=32, n_epochs=5)
    trainer.train()
    print('Train finished!')

    # evaluate on test_data with the Tester
    from fastNLP import Tester
    tester = Tester(data=test_data, model=model,
                    metrics=AccuracyMetric(target="label_seq"), batch_size=4)
    acc = tester.test()
    print(acc)
def run_cnn():
    dataset_train_p2, dataset_test_p2 = get_text_classification_datasets()

    line_len = len(dataset_train_p2.data)
    with open("formalized_train_data.csv", "w") as file:
        for i in range(line_len):
            file.write(document2line(dataset_train_p2.data[i]) + "\t" +
                       str(dataset_train_p2.target[i]) + '\n')

    line_len = len(dataset_test_p2.data)
    with open("formalized_test_data.csv", "w") as file2:
        for i in range(line_len):
            file2.write(document2line(dataset_test_p2.data[i]) + "\t" +
                        str(dataset_test_p2.target[i]) + '\n')

    loader = CSVLoader(headers=('raw_sentence', 'label'), sep='\t')
    train_dataset = loader.load("./formalized_train_data.csv")
    test_dataset = loader.load("./formalized_test_data.csv")
    os.remove("./formalized_train_data.csv")
    os.remove("./formalized_test_data.csv")

    train_dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='sentence')
    train_dataset.apply(lambda x: x['sentence'].split(), new_field_name='words', is_input=True)
    test_dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='sentence')
    test_dataset.apply(lambda x: x['sentence'].split(), new_field_name='words', is_input=True)
    # train_dataset[0], test_dataset[0]

    from fastNLP import Vocabulary

    # use Vocabulary to collect word statistics and map the word sequences to index sequences
    vocab = Vocabulary(min_freq=2).from_dataset(train_dataset, field_name='words')
    vocab.index_dataset(train_dataset, field_name='words', new_field_name='words')
    vocab.index_dataset(test_dataset, field_name='words', new_field_name='words')
    # train_dataset[0], test_dataset[0]

    # convert the labels to int and mark them as target
    train_dataset.apply(lambda x: int(x['label']), new_field_name='target', is_target=True)
    test_dataset.apply(lambda x: int(x['label']), new_field_name='target', is_target=True)
    # train_dataset[0], test_dataset[0]

    from fastNLP.models import CNNText

    embed_dim = 2048  # 50
    model = CNNText((len(vocab), embed_dim), num_classes=4, padding=2, dropout=0.1)
    print(model)

    from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric

    # define the trainer and train
    trainer = Trainer(model=model, train_data=train_dataset, dev_data=test_dataset,
                      loss=CrossEntropyLoss(), metrics=AccuracyMetric())
    trainer.train()
def test_tutorial(self):
    # load data from csv into a DataSet
    sample_path = "./data_for_tests/tutorial_sample_dataset.csv"
    dataset = DataSet.read_csv(sample_path, headers=('raw_sentence', 'label'), sep='\t')
    print(len(dataset))
    print(dataset[0])

    dataset.append(Instance(raw_sentence='fake data', label='0'))
    dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='raw_sentence')
    # convert label to int
    dataset.apply(lambda x: int(x['label']), new_field_name='label')

    # split sentences on whitespace
    def split_sent(ins):
        return ins['raw_sentence'].split()

    dataset.apply(split_sent, new_field_name='words')
    # add sequence length information
    dataset.apply(lambda x: len(x['words']), new_field_name='seq_len')
    # print(len(dataset))
    # print(dataset[0])

    # DataSet.drop(func) filters out samples
    dataset.drop(lambda x: x['seq_len'] <= 3)
    print(len(dataset))

    # mark which fields of the DataSet should be converted to tensors
    # set target: the gold labels used when computing the loss and evaluating the model
    dataset.set_target("label")
    # set input: the fields passed to the model's forward()
    dataset.set_input("words")

    # split into test and train sets
    test_data, train_data = dataset.split(0.5)
    # print(len(test_data))
    # print(len(train_data))

    # build the vocabulary with Vocabulary.add(word)
    vocab = Vocabulary(min_freq=2)
    train_data.apply(lambda x: [vocab.add(word) for word in x['words']])
    vocab.build_vocab()

    # index the sentences with Vocabulary.to_index(word)
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['words']],
                     new_field_name='words')
    test_data.apply(lambda x: [vocab.to_index(word) for word in x['words']],
                    new_field_name='words')
    print(test_data[0])

    model = CNNText(embed_num=len(vocab), embed_dim=50, num_classes=5, padding=2, dropout=0.1)

    from fastNLP import Trainer
    from copy import deepcopy

    # rename the corresponding fields so that their names match the model's forward() arguments
    train_data.rename_field('words', 'word_seq')  # input field names must match forward()'s arguments
    train_data.rename_field('label', 'label_seq')
    test_data.rename_field('words', 'word_seq')
    test_data.rename_field('label', 'label_seq')

    # instantiate a Trainer with the model and data, then train
    copy_model = deepcopy(model)
    overfit_trainer = Trainer(train_data=test_data, model=copy_model,
                              loss=CrossEntropyLoss(pred="output", target="label_seq"),
                              metrics=AccuracyMetric(pred="predict", target="label_seq"),
                              n_epochs=10, batch_size=4, dev_data=test_data,
                              save_path="./save")
    overfit_trainer.train()

    trainer = Trainer(train_data=train_data, model=model,
                      loss=CrossEntropyLoss(pred="output", target="label_seq"),
                      metrics=AccuracyMetric(pred="predict", target="label_seq"),
                      n_epochs=10, batch_size=4, dev_data=test_data,
                      save_path="./save")
    trainer.train()
    print('Train finished!')

    # evaluate with fastNLP's Tester
    tester = Tester(data=test_data, model=model,
                    metrics=AccuracyMetric(pred="predict", target="label_seq"),
                    batch_size=4)
    acc = tester.test()
    print(acc)
def test_summary(self):
    model = CNNText(embed=(4, 4), num_classes=2, kernel_nums=(9, 5), kernel_sizes=(1, 3))
    # 4 * 4 + 4 * (9 * 1 + 5 * 3) + 2 * (9 + 5 + 1) = 142
    self.assertSequenceEqual((142, 142, 0), summary(model))
    model.embed.requires_grad = False
    self.assertSequenceEqual((142, 126, 16), summary(model))
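# The 142 asserted above decomposes as: embedding 4 * 4 = 16, convolution weights
# 4 * (9 * 1 + 5 * 3) = 96, and the final linear layer 2 * (9 + 5 + 1) = 30
# (the arithmetic in the test's comment counts no bias terms for the convolutions).
# A minimal, framework-agnostic cross-check of that total (a sketch, not part of the test):
m = CNNText(embed=(4, 4), num_classes=2, kernel_nums=(9, 5), kernel_sizes=(1, 3))
print(sum(p.numel() for p in m.parameters()))  # expected to print 142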
target_vocab_pkl_file = os.path.join(model_path, 'target_char.pkl')

logger.warning('loading the dataset')
data_bundle = load_serialize_obj(train_data_bundle_pkl_file)

logger.warning('fetching the vocabularies')
char_vocab = data_bundle.get_vocab('words')
logger.info('char_vocab:{}'.format(char_vocab))
target_vocab = data_bundle.get_vocab('target')
logger.info('target_vocab:{}'.format(target_vocab))
save_serialize_obj(char_vocab, char_vocab_pkl_file)
save_serialize_obj(target_vocab, target_vocab_pkl_file)
logger.info('vocabulary serialized to:{}'.format(char_vocab_pkl_file))

logger.warning('selecting pretrained word embeddings')
word2vec_embed = StaticEmbedding(char_vocab, model_dir_or_name='cn-char-fastnlp-100d')

logger.warning('building the neural network model')
model = CNNText(word2vec_embed, num_classes=len(target_vocab))
logger.info(model)

logger.warning('setting the training hyperparameters')
loss = CrossEntropyLoss()
optimizer = Adam([param for param in model.parameters() if param.requires_grad])
# metric = AccuracyMetric()
metric = ClassifyFPreRecMetric(tag_vocab=data_bundle.get_vocab(Const.TARGET),
                               only_gross=False)  # with only_gross=False, per-label metric statistics are also returned
device = 'cuda' if torch.cuda.is_available() else 'cpu'  # run on GPU if one is available; training is much faster
logger.info('device:{}'.format(device))
batch_size = 32
n_epochs = 10
early_stopping = 10
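# The excerpt above stops after defining the hyperparameters. A minimal sketch of how they
# might be wired into a fastNLP Trainer (the dataset names 'train'/'dev' inside the data_bundle
# are assumptions, and EarlyStopCallback is borrowed from the training scripts further below):
from fastNLP import Trainer
from fastNLP.core.callback import EarlyStopCallback

trainer = Trainer(train_data=data_bundle.get_dataset('train'),  # assumed dataset name
                  dev_data=data_bundle.get_dataset('dev'),      # assumed dataset name
                  model=model, loss=loss, optimizer=optimizer, metrics=metric,
                  device=device, batch_size=batch_size, n_epochs=n_epochs,
                  callbacks=[EarlyStopCallback(early_stopping)])
trainer.train()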
def train(args):
    text_data = TextData()
    with open(os.path.join(args.vocab_dir, args.vocab_data), 'rb') as fin:
        text_data = pickle.load(fin)
    vocab_size = text_data.vocab_size
    class_num = text_data.class_num
    # class_num = 1
    seq_len = text_data.max_seq_len
    print("(vocab_size,class_num,seq_len):({0},{1},{2})".format(vocab_size, class_num, seq_len))

    train_data = text_data.train_set
    val_data = text_data.val_set
    test_data = text_data.test_set
    train_data.set_input('words', 'seq_len')
    train_data.set_target('target')
    val_data.set_input('words', 'seq_len')
    val_data.set_target('target')
    test_data.set_input('words', 'seq_len')
    test_data.set_target('target')

    init_embeds = None
    if args.pretrain_model == "None":
        print("No pretrained model will be used.")
        print("vocabsize:{0}".format(vocab_size))
        init_embeds = (vocab_size, args.embed_size)
    elif args.pretrain_model == "word2vec":
        embeds_path = os.path.join(args.prepare_dir, 'w2v_embeds.pkl')
        print("Loading Word2Vec pretrained embedding from {0}.".format(embeds_path))
        with open(embeds_path, 'rb') as fin:
            init_embeds = pickle.load(fin)
    elif args.pretrain_model == 'glove':
        embeds_path = os.path.join(args.prepare_dir, 'glove_embeds.pkl')
        print("Loading Glove pretrained embedding from {0}.".format(embeds_path))
        with open(embeds_path, 'rb') as fin:
            init_embeds = pickle.load(fin)
    elif args.pretrain_model == 'glove2wv':
        embeds_path = os.path.join(args.prepare_dir, 'glove2wv_embeds.pkl')
        print("Loading Glove pretrained embedding from {0}.".format(embeds_path))
        with open(embeds_path, 'rb') as fin:
            init_embeds = pickle.load(fin)
    else:
        init_embeds = (vocab_size, args.embed_size)

    if args.model == "CNNText":
        print("Using CNN Model.")
        model = CNNText(init_embeds, num_classes=class_num, padding=2, dropout=args.dropout)
    elif args.model == "StarTransformer":
        print("Using StarTransformer Model.")
        model = STSeqCls(init_embeds, num_cls=class_num, hidden_size=args.hidden_size)
    elif args.model == "MyCNNText":
        model = MyCNNText(init_embeds=init_embeds, num_classes=class_num, padding=2, dropout=args.dropout)
        print("Using user defined CNNText")
    elif args.model == "LSTMText":
        print("Using LSTM Model.")
        model = LSTMText(init_embeds=init_embeds, output_dim=class_num,
                         hidden_dim=args.hidden_size, num_layers=args.num_layers,
                         dropout=args.dropout)
    elif args.model == "Bert":
        print("Using Bert Model.")
    else:
        print("Using default model: CNNText.")
        model = CNNText((vocab_size, args.embed_size), num_classes=class_num, padding=2, dropout=0.1)
    print(model)

    if args.cuda:
        device = torch.device('cuda')
    else:
        device = None

    print("train_size:{0} ; val_size:{1} ; test_size:{2}".format(
        train_data.get_length(), val_data.get_length(), test_data.get_length()))

    if args.optim == "Adam":
        print("Using Adam as optimizer.")
        optimizer = fastnlp_optim.Adam(lr=0.001, weight_decay=args.weight_decay)
        if args.model_suffix == "default":
            args.model_suffix = args.optim
    else:
        print("No Optimizer will be used.")
        optimizer = None

    criterion = CrossEntropyLoss()
    metric = AccuracyMetric()
    model_save_path = os.path.join(args.model_dir, args.model, args.model_suffix)

    earlystop = EarlyStopCallback(args.patience)
    fitlog_back = FitlogCallback({"val": val_data, "train": train_data})
    trainer = Trainer(train_data=train_data, model=model, save_path=model_save_path,
                      device=device, n_epochs=args.epochs, optimizer=optimizer,
                      dev_data=val_data, loss=criterion, batch_size=args.batch_size,
                      metrics=metric, callbacks=[fitlog_back, earlystop])
    trainer.train()
    print("Train Done.")

    tester = Tester(data=val_data, model=model, metrics=metric,
                    batch_size=args.batch_size, device=device)
    tester.test()
    print("Test Done.")

    print("Predict the answer with best model...")
    acc = 0.0
    output = []
    data_iterator = Batch(test_data, batch_size=args.batch_size)
    for data_x, batch_y in data_iterator:
        i_data = Variable(data_x['words']).cuda()
        pred = model(i_data)[C.OUTPUT]
        pred = pred.sigmoid()
        # print(pred.shape)
        output.append(pred.cpu().data)
    output = torch.cat(output, 0).numpy()
    print(output.shape)
    print("Predict Done. {} records".format(len(output)))

    result_save_path = os.path.join(args.result_dir, args.model + "_" + args.model_suffix)
    with open(result_save_path + ".pkl", 'wb') as f:
        pickle.dump(output, f)
    output = output.squeeze()[:, 1].tolist()

    projectid = text_data.test_projectid.values
    answers = []
    count = 0
    for i in range(len(output)):
        if output[i] > 0.5:
            count += 1
    print("true sample count:{}".format(count))

    add_count = 0
    for i in range(len(projectid) - len(output)):
        output.append([0.13])
        add_count += 1
    print("Add {} default result in predict.".format(add_count))

    df = pd.DataFrame()
    df['projectid'] = projectid
    df['y'] = output
    df.to_csv(result_save_path + ".csv", index=False)
    print("Predict Done, results saved to {}".format(result_save_path))

    fitlog.finish()
def test_fastnlp_advanced_tutorial(self):
    import os
    os.chdir("tutorials/fastnlp_advanced_tutorial")

    from fastNLP import DataSet
    from fastNLP import Instance
    from fastNLP import Vocabulary
    from fastNLP import Trainer
    from fastNLP import Tester

    # ### Instance
    # An Instance represents one sample and consists of one or more fields (attributes/features),
    # each with its own name and value.
    # The fields can be defined when the Instance is created, using the "field_name=field_value" syntax.

    # In[2]:

    # build an Instance made up of three fields: premise, hypothesis and label
    instance = Instance(premise='an premise example .', hypothesis='an hypothesis example.', label=1)
    instance

    # In[3]:

    data_set = DataSet([instance] * 5)
    data_set.append(instance)
    data_set[-2:]

    # In[4]:

    # an instance can still be appended to the dataset even if one of its fields
    # has a different type from the corresponding field of the dataset
    instance2 = Instance(premise='the second premise example .',
                         hypothesis='the second hypothesis example.',
                         label='1')
    try:
        data_set.append(instance2)
    except:
        pass
    data_set[-2:]

    # In[5]:

    # if a field name does not match, the instance cannot be appended to the dataset
    instance3 = Instance(premises='the third premise example .',
                         hypothesis='the third hypothesis example.',
                         label=1)
    try:
        data_set.append(instance3)
    except:
        print('cannot append instance')
        pass
    data_set[-2:]

    # In[6]:

    # besides text, a tensor can also be used as the value of a field
    import torch
    tensor_ins = Instance(image=torch.randn(5, 5), label=0)
    ds = DataSet()
    ds.append(tensor_ins)
    ds

    from fastNLP import DataSet
    from fastNLP import Instance

    # load data from csv into a DataSet
    # any csv-like file, i.e. one example per line, can be read this way
    dataset = DataSet.read_csv('tutorial_sample_dataset.csv',
                               headers=('raw_sentence', 'label'), sep='\t')

    # check the size of the DataSet
    len(dataset)

    # In[8]:

    # use an integer index [k] to get the k-th sample
    dataset[0]

    # In[9]:

    # the retrieved sample is an Instance
    type(dataset[0])

    # In[10]:

    # use a slice [a: b] to get samples a through b
    dataset[0:3]

    # In[11]:

    # negative indices also work
    dataset[-1]

    data_path = ['premise', 'hypothesis', 'label']

    # read the files
    with open(data_path[0]) as f:
        premise = f.readlines()
    with open(data_path[1]) as f:
        hypothesis = f.readlines()
    with open(data_path[2]) as f:
        label = f.readlines()

    assert len(premise) == len(hypothesis) and len(hypothesis) == len(label)

    # build the DataSet
    data_set = DataSet()
    for p, h, l in zip(premise, hypothesis, label):
        p = p.strip()  # strip trailing whitespace
        h = h.strip()  # strip trailing whitespace
        data_set.append(Instance(premise=p, hypothesis=h, truth=l))

    data_set[0]

    # ### Other DataSet operations
    # After a DataSet is built, its contents can still be manipulated through DataSet.apply()

    # In[13]:

    # lowercase all text in the premise field
    data_set.apply(lambda x: x['premise'].lower(), new_field_name='premise')
    data_set[-2:]

    # In[14]:

    # convert label to int
    data_set.apply(lambda x: int(x['truth']), new_field_name='truth')
    data_set[-2:]

    # In[15]:

    # split sentences on whitespace
    def split_sent(ins):
        return ins['premise'].split()

    data_set.apply(split_sent, new_field_name='premise')
    data_set.apply(lambda x: x['hypothesis'].split(), new_field_name='hypothesis')
    data_set[-2:]

    # In[16]:

    # filter the data
    origin_data_set_len = len(data_set)
    data_set.drop(lambda x: len(x['premise']) <= 6)
    origin_data_set_len, len(data_set)

    # In[17]:

    # add length information
    data_set.apply(lambda x: [1] * len(x['premise']), new_field_name='premise_len')
    data_set.apply(lambda x: [1] * len(x['hypothesis']), new_field_name='hypothesis_len')
    data_set[-1]

    # In[18]:

    # mark the input (feature) fields and the target (label) fields
    data_set.set_input("premise", "premise_len", "hypothesis", "hypothesis_len")
    data_set.set_target("truth")

    # In[19]:

    # rename a field
    data_set.rename_field('truth', 'label')
    data_set[-1]

    # In[20]:

    # split into train, dev and test sets
    train_data, vad_data = data_set.split(0.5)
    dev_data, test_data = vad_data.split(0.4)
    len(train_data), len(dev_data), len(test_data)

    # In[21]:

    # make a deep copy of a dataset
    import copy
    train_data_2, dev_data_2 = copy.deepcopy(train_data), copy.deepcopy(dev_data)
    del copy
    # initialize the vocabulary: the maximum vocab_size is 10000, each word must appear at least
    # twice to be kept, '<unk>' marks unknown words and '<pad>' marks padding tokens
    # Vocabulary's default arguments are max_size=None, min_freq=None, unknown='<unk>', padding='<pad>'
    vocab = Vocabulary(max_size=10000, min_freq=2, unknown='<unk>', padding='<pad>')

    # build the vocabulary
    train_data.apply(lambda x: [vocab.add(word) for word in x['premise']])
    train_data.apply(lambda x: [vocab.add(word) for word in x['hypothesis']])
    vocab.build_vocab()

    # In[23]:

    # index the sentences with the vocabulary
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['premise']],
                     new_field_name='premise')
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['hypothesis']],
                     new_field_name='hypothesis')
    dev_data.apply(lambda x: [vocab.to_index(word) for word in x['premise']],
                   new_field_name='premise')
    dev_data.apply(lambda x: [vocab.to_index(word) for word in x['hypothesis']],
                   new_field_name='hypothesis')
    test_data.apply(lambda x: [vocab.to_index(word) for word in x['premise']],
                    new_field_name='premise')
    test_data.apply(lambda x: [vocab.to_index(word) for word in x['hypothesis']],
                    new_field_name='hypothesis')
    train_data[-1], dev_data[-1], test_data[-1]

    # read in a vocab file
    with open('vocab.txt') as f:
        lines = f.readlines()
    vocabs = []
    for line in lines:
        vocabs.append(line.strip())

    # instantiate a Vocabulary
    vocab_bert = Vocabulary(unknown=None, padding=None)
    # add the words in vocabs to the Vocabulary
    vocab_bert.add_word_lst(vocabs)
    # build the vocabulary
    vocab_bert.build_vocab()
    # update the unknown and padding tokens
    vocab_bert.unknown = '[UNK]'
    vocab_bert.padding = '[PAD]'

    # In[25]:

    # index the sentences with the vocabulary
    train_data_2.apply(lambda x: [vocab_bert.to_index(word) for word in x['premise']],
                       new_field_name='premise')
    train_data_2.apply(lambda x: [vocab_bert.to_index(word) for word in x['hypothesis']],
                       new_field_name='hypothesis')
    dev_data_2.apply(lambda x: [vocab_bert.to_index(word) for word in x['premise']],
                     new_field_name='premise')
    dev_data_2.apply(lambda x: [vocab_bert.to_index(word) for word in x['hypothesis']],
                     new_field_name='hypothesis')
    train_data_2[-1], dev_data_2[-1]

    # step 1: load the model hyperparameters (optional)
    from fastNLP.io.config_io import ConfigSection, ConfigLoader
    args = ConfigSection()
    ConfigLoader().load_config("./data/config", {"esim_model": args})
    args["vocab_size"] = len(vocab)
    args.data

    # In[27]:

    # step 2: load the ESIM model
    from fastNLP.models import ESIM
    model = ESIM(**args.data)
    model

    # In[28]:

    # another example: load the CNN text classification model
    from fastNLP.models import CNNText
    cnn_text_model = CNNText(embed_num=len(vocab), embed_dim=50, num_classes=5, padding=2, dropout=0.1)
    cnn_text_model

    from fastNLP import CrossEntropyLoss
    from fastNLP import Adam
    from fastNLP import AccuracyMetric

    trainer = Trainer(
        train_data=train_data,
        model=model,
        loss=CrossEntropyLoss(pred='pred', target='label'),
        metrics=AccuracyMetric(),
        n_epochs=3,
        batch_size=16,
        print_every=-1,
        validate_every=-1,
        dev_data=dev_data,
        use_cuda=False,
        optimizer=Adam(lr=1e-3, weight_decay=0),
        check_code_level=-1,
        metric_key='acc',
        use_tqdm=False,
    )
    trainer.train()

    tester = Tester(
        data=test_data,
        model=model,
        metrics=AccuracyMetric(),
        batch_size=args["batch_size"],
    )
    tester.test()

    os.chdir("../..")
def train(args):
    text_data = TextData()
    with open(os.path.join(args.vocab_dir, args.vocab_data), 'rb') as fin:
        text_data = pickle.load(fin)
    vocab_size = text_data.vocab_size
    class_num = text_data.class_num
    seq_len = text_data.max_seq_len
    print("(vocab_size,class_num,seq_len):({0},{1},{2})".format(vocab_size, class_num, seq_len))

    train_data = text_data.train_set
    test_dev_data = text_data.test_set
    train_data.set_input('words', 'seq_len')
    train_data.set_target('target')
    test_dev_data.set_input('words', 'seq_len')
    test_dev_data.set_target('target')
    test_data, dev_data = test_dev_data.split(0.2)
    test_data = test_dev_data  # keep the full test_dev set as the test data

    init_embeds = None
    if args.pretrain_model == "None":
        print("No pretrained model will be used.")
        print("vocabsize:{0}".format(vocab_size))
        init_embeds = (vocab_size, args.embed_size)
    elif args.pretrain_model == "word2vec":
        embeds_path = os.path.join(args.prepare_dir, 'w2v_embeds.pkl')
        print("Loading Word2Vec pretrained embedding from {0}.".format(embeds_path))
        with open(embeds_path, 'rb') as fin:
            init_embeds = pickle.load(fin)
    elif args.pretrain_model == 'glove':
        embeds_path = os.path.join(args.prepare_dir, 'glove_embeds.pkl')
        print("Loading Glove pretrained embedding from {0}.".format(embeds_path))
        with open(embeds_path, 'rb') as fin:
            init_embeds = pickle.load(fin)
    elif args.pretrain_model == 'glove2wv':
        embeds_path = os.path.join(args.prepare_dir, 'glove2wv_embeds.pkl')
        print("Loading Glove pretrained embedding from {0}.".format(embeds_path))
        with open(embeds_path, 'rb') as fin:
            init_embeds = pickle.load(fin)
    else:
        init_embeds = (vocab_size, args.embed_size)

    if args.model == "CNNText":
        print("Using CNN Model.")
        model = CNNText(init_embeds, num_classes=class_num, padding=2, dropout=args.dropout)
    elif args.model == "StarTransformer":
        print("Using StarTransformer Model.")
        model = STSeqCls(init_embeds, num_cls=class_num, hidden_size=args.hidden_size)
    elif args.model == "MyCNNText":
        model = MyCNNText(init_embeds=init_embeds, num_classes=class_num, padding=2, dropout=args.dropout)
        print("Using user defined CNNText")
    elif args.model == "LSTMText":
        print("Using LSTM Model.")
        model = LSTMText(init_embeds=init_embeds, output_dim=class_num,
                         hidden_dim=args.hidden_size, num_layers=args.num_layers,
                         dropout=args.dropout)
    elif args.model == "Bert":
        print("Using Bert Model.")
    else:
        print("Using default model: CNNText.")
        model = CNNText((vocab_size, args.embed_size), num_classes=class_num, padding=2, dropout=0.1)
    print(model)

    if args.cuda:
        device = torch.device('cuda')
    else:
        device = None

    print("train_size:{0} ; dev_size:{1} ; test_size:{2}".format(
        train_data.get_length(), dev_data.get_length(), test_data.get_length()))

    if args.optim == "Adam":
        print("Using Adam as optimizer.")
        optimizer = fastnlp_optim.Adam(lr=0.001, weight_decay=args.weight_decay)
        if args.model_suffix == "default":
            args.model_suffix = args.optim
    else:
        print("No Optimizer will be used.")
        optimizer = None

    criterion = CrossEntropyLoss()
    metric = AccuracyMetric()
    model_save_path = os.path.join(args.model_dir, args.model, args.model_suffix)
    earlystop = EarlyStopCallback(args.patience)

    trainer = Trainer(train_data=train_data, model=model, save_path=model_save_path,
                      device=device, n_epochs=args.epochs, optimizer=optimizer,
                      dev_data=test_data, loss=criterion, batch_size=args.batch_size,
                      metrics=metric, callbacks=[FitlogCallback(test_data), earlystop])
    trainer.train()
    print("Train Done.")

    tester = Tester(data=test_data, model=model, metrics=metric,
                    batch_size=args.batch_size, device=device)
    tester.test()
    print("Test Done.")

    fitlog.finish()
    test_dataset.set_input(Const.INPUT, Const.INPUT_LEN)
    test_dataset.set_target(Const.TARGET)

    # Split into development dataset
    train_dataset, dev_dataset = train_dataset.split(0.1)
    return train_dataset, dev_dataset, test_dataset, vocab


if __name__ == '__main__':
    # Usage
    train_data, dev_data, test_data, vocab = get_train_dev_test_vocab()
    model_cnn = CNNText((len(vocab), 50), num_classes=20, padding=2, dropout=0.1)
    loss = CrossEntropyLoss(pred=Const.OUTPUT, target=Const.TARGET)
    metrics = AccuracyMetric(pred=Const.OUTPUT, target=Const.TARGET)
    trainer = Trainer(model=model_cnn, train_data=train_data, dev_data=dev_data,
                      loss=loss, metrics=metrics)
    trainer.train()
    tester = Tester(test_data, model_cnn, metrics=AccuracyMetric())
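    # The excerpt above builds the Tester but stops before running it; evaluation on
    # test_data would be triggered with the call below (a minimal completion, not part
    # of the original snippet).
    tester.test()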