def train():
    # Config Loader
    train_args = ConfigSection()
    test_args = ConfigSection()
    ConfigLoader().load_config(cfgfile, {"train": train_args, "test": test_args})

    print("loading data set...")
    data = SeqLabelDataSet(load_func=TokenizeDataSetLoader.load)
    data.load(cws_data_path)
    data_train, data_dev = data.split(ratio=0.3)
    train_args["vocab_size"] = len(data.word_vocab)
    train_args["num_classes"] = len(data.label_vocab)
    print("vocab size={}, num_classes={}".format(len(data.word_vocab), len(data.label_vocab)))

    change_field_is_target(data_dev, "truth", True)
    save_pickle(data_dev, "./save/", "data_dev.pkl")
    save_pickle(data.word_vocab, "./save/", "word2id.pkl")
    save_pickle(data.label_vocab, "./save/", "label2id.pkl")

    # Trainer
    trainer = SeqLabelTrainer(epochs=train_args["epochs"],
                              batch_size=train_args["batch_size"],
                              validate=train_args["validate"],
                              use_cuda=train_args["use_cuda"],
                              pickle_path=train_args["pickle_path"],
                              save_best_dev=True,
                              print_every_step=10,
                              model_name="trained_model.pkl",
                              evaluator=SeqLabelEvaluator())

    # Model
    model = AdvSeqLabel(train_args)
    try:
        ModelLoader.load_pytorch(model, "./save/saved_model.pkl")
        print('model parameter loaded!')
    except Exception as e:
        print("No saved model. Continue.")
        pass

    # Start training
    trainer.train(model, data_train, data_dev)
    print("Training finished!")

    # Saver
    saver = ModelSaver("./save/trained_model.pkl")
    saver.save_pytorch(model)
    print("Model saved!")
def predict():
    # Config Loader
    test_args = ConfigSection()
    ConfigLoader().load_config(cfgfile, {"POS_test": test_args})

    # fetch dictionary size and number of labels from pickle files
    word2index = load_pickle(pickle_path, "word2id.pkl")
    test_args["vocab_size"] = len(word2index)
    index2label = load_pickle(pickle_path, "label2id.pkl")
    test_args["num_classes"] = len(index2label)

    # load dev data
    dev_data = load_pickle(pickle_path, "data_dev.pkl")

    # Define the same model
    model = AdvSeqLabel(test_args)

    # Dump trained parameters into the model
    ModelLoader.load_pytorch(model, "./save/trained_model.pkl")
    print("model loaded!")

    # Tester
    test_args["evaluator"] = SeqLabelEvaluator()
    tester = SeqLabelTester(**test_args.data)

    # Start testing
    tester.test(model, dev_data)
def infer():
    # Config Loader
    test_args = ConfigSection()
    ConfigLoader().load_config(cfgfile, {"POS_test": test_args})

    # fetch dictionary size and number of labels from pickle files
    word2index = load_pickle(pickle_path, "word2id.pkl")
    test_args["vocab_size"] = len(word2index)
    index2label = load_pickle(pickle_path, "label2id.pkl")
    test_args["num_classes"] = len(index2label)

    # Define the same model
    model = AdvSeqLabel(test_args)

    try:
        ModelLoader.load_pytorch(model, "./save/trained_model.pkl")
        print('model loaded!')
    except Exception as e:
        print('cannot load model!')
        raise

    # Data Loader
    infer_data = SeqLabelDataSet(load_func=BaseLoader.load_lines)
    infer_data.load(data_infer_path, vocabs={"word_vocab": word2index}, infer=True)
    print('data loaded')

    # Inference interface
    infer = SeqLabelInfer(pickle_path)
    results = infer.predict(model, infer_data)
    print(results)
    print("Inference finished!")
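
# Minimal entry-point sketch (an assumption, not part of the original script):
# cfgfile, cws_data_path, data_infer_path and pickle_path are expected to be
# defined at module level, as the three functions above assume. This simply
# chains the stages.
if __name__ == "__main__":
    train()    # fit the model and pickle the vocabularies and dev set
    predict()  # evaluate the saved model on the pickled dev set
    infer()    # run inference on raw input lines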
def test_case_2(self):
    config = "[section_A]\n[section_B]\n"

    with open("./test.cfg", "w", encoding="utf-8") as f:
        f.write(config)

    saver = ConfigSaver("./test.cfg")

    section = ConfigSection()
    section["doubles"] = 0.8
    section["tt"] = [1, 2, 3]
    section["test"] = 105
    section["str"] = "this is a str"

    saver.save_config_file("section_A", section)

    os.system("rm ./test.cfg")
def _load_all(src):
    model_path = src
    src = os.path.dirname(src)

    word_v = _load(src + '/word_v.pkl')
    pos_v = _load(src + '/pos_v.pkl')
    tag_v = _load(src + '/tag_v.pkl')
    pos_pp = torch.load(src + '/pos_pp.pkl')['pipeline']

    model_args = ConfigSection()
    ConfigLoader.load_config('cfg.cfg', {'model': model_args})
    model_args['word_vocab_size'] = len(word_v)
    model_args['pos_vocab_size'] = len(pos_v)
    model_args['num_label'] = len(tag_v)

    model = BiaffineParser(**model_args.data)
    model.load_state_dict(torch.load(model_path))
    return {
        'word_v': word_v,
        'pos_v': pos_v,
        'tag_v': tag_v,
        'model': model,
        'pos_pp': pos_pp,
    }
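
# Usage sketch (an assumption, not in the original file): given a state-dict
# checkpoint stored alongside its vocabulary pickles and POS pipeline,
# _load_all restores everything needed to run the parser. The checkpoint path
# below is only an illustrative placeholder, e.g.
#
#   loaded = _load_all('./save/model.pkl')
#   parser, tag_v = loaded['model'], loaded['tag_v']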
# emb_file_name = '/home/yfshao/glove.6B.100d.txt'
# loader = ConlluDataLoader()

datadir = '/home/yfshao/workdir/parser-data/'
train_data_name = "train_ctb5.txt"
dev_data_name = "dev_ctb5.txt"
test_data_name = "test_ctb5.txt"
emb_file_name = "/home/yfshao/workdir/parser-data/word_OOVthr_30_100v.txt"
# emb_file_name = "/home/yfshao/workdir/word_vector/cc.zh.300.vec"
loader = CTBDataLoader()

cfgfile = './cfg.cfg'
processed_datadir = './save'

# Config Loader
train_args = ConfigSection()
test_args = ConfigSection()
model_args = ConfigSection()
optim_args = ConfigSection()
ConfigLoader.load_config(cfgfile, {
    "train": train_args,
    "test": test_args,
    "model": model_args,
    "optim": optim_args
})
print('train Args:', train_args.data)
print('test Args:', test_args.data)
print('optim Args:', optim_args.data)
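
# For reference, cfg.cfg is assumed to be an INI-style file with bracketed
# section headers and key = value pairs, matching the section names passed to
# load_config above. The keys shown here are illustrative placeholders only,
# not the real contents of cfg.cfg:
#
#   [train]
#   epochs = 20
#   batch_size = 32
#   use_cuda = true
#
#   [model]
#   word_vocab_size = -1   # filled in at runtime from the vocabularies
#   pos_vocab_size = -1
#
#   [optim]
#   lr = 0.001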
import os

import torch
from fastNLP import Trainer
from fastNLP import Tester
from fastNLP import CrossEntropyLoss
from fastNLP import Adam
from fastNLP import AccuracyMetric
from fastNLP.io.config_io import ConfigSection, ConfigLoader
import fastNLP.core.utils as util
from model import myESIM

args = ConfigSection()
ConfigLoader().load_config("../data/config.json", {"train": args})

# Load the training set, dev set and word embeddings
print("\t* Loading train data...")
train_data = util.load_pickle(os.path.normpath(args["data_dir"]), args["train_file"])
print("\t* Loading dev data...")
dev_data = util.load_pickle(os.path.normpath(args["data_dir"]), args["dev_file"])
print("\t* Loading word embeddings...")
embeddings = util.load_pickle(os.path.normpath(args["data_dir"]), args["embeddings_file"])
embeddings = torch.Tensor(embeddings)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model = myESIM(embeddings.shape[0],
               embeddings.shape[1],
               300,
               embeddings=embeddings,
               dropout=0.5,
def train(train_data_path, dev_data_path, checkpoint=None, save=None):
    # load config
    train_param = ConfigSection()
    model_param = ConfigSection()
    ConfigLoader().load_config(cfgfile, {
        "train": train_param,
        "model": model_param
    })
    print("config loaded")

    # Data Loader
    print("loading training set...")
    dataset = ConllxDataLoader().load(train_data_path, return_dataset=True)
    print("loading dev set...")
    dev_data = ConllxDataLoader().load(dev_data_path, return_dataset=True)
    print(dataset)
    print("================= dataset ready =====================")

    dataset.rename_field("tag", "truth")
    dev_data.rename_field("tag", "truth")

    vocab_proc = VocabIndexerProcessor("words", new_added_filed_name="word_seq")
    tag_proc = VocabIndexerProcessor("truth", is_input=True)
    seq_len_proc = SeqLenProcessor(field_name="word_seq",
                                   new_added_field_name="word_seq_origin_len",
                                   is_input=True)
    set_input_proc = SetInputProcessor("word_seq", "word_seq_origin_len")

    vocab_proc(dataset)
    tag_proc(dataset)
    seq_len_proc(dataset)

    # index dev set
    word_vocab, tag_vocab = vocab_proc.vocab, tag_proc.vocab
    dev_data.apply(lambda ins: [word_vocab.to_index(w) for w in ins["words"]],
                   new_field_name="word_seq")
    dev_data.apply(lambda ins: [tag_vocab.to_index(w) for w in ins["truth"]],
                   new_field_name="truth")
    dev_data.apply(lambda ins: len(ins["word_seq"]),
                   new_field_name="word_seq_origin_len")

    # set input & target
    dataset.set_input("word_seq", "word_seq_origin_len", "truth")
    dev_data.set_input("word_seq", "word_seq_origin_len", "truth")
    dataset.set_target("truth", "word_seq_origin_len")
    dev_data.set_target("truth", "word_seq_origin_len")

    # dataset.set_is_target(tag_ids=True)
    model_param["vocab_size"] = vocab_proc.get_vocab_size()
    model_param["num_classes"] = tag_proc.get_vocab_size()
    print("vocab_size={} num_classes={}".format(model_param["vocab_size"],
                                                model_param["num_classes"]))

    # define a model
    if checkpoint is None:
        # pre_trained = load_tencent_embed("/home/zyfeng/data/char_tencent_embedding.pkl", vocab_proc.vocab.word2idx)
        pre_trained = None
        model = AdvSeqLabel(model_param, id2words=None, emb=pre_trained)
        print(model)
    else:
        model = torch.load(checkpoint)

    # call trainer to train
    trainer = Trainer(dataset, model, loss=None,
                      metrics=SpanFPreRecMetric(tag_proc.vocab,
                                                pred="predict",
                                                target="truth",
                                                seq_lens="word_seq_origin_len"),
                      dev_data=dev_data, metric_key="f",
                      use_tqdm=True, use_cuda=True,
                      print_every=10, n_epochs=20, save_path=save)
    trainer.train(load_best_model=True)

    # save model & pipeline
    model_proc = ModelProcessor(model, seq_len_field_name="word_seq_origin_len")
    id2tag = Index2WordProcessor(tag_proc.vocab, "predict", "tag")

    pp = Pipeline([vocab_proc, seq_len_proc, set_input_proc, model_proc, id2tag])
    save_dict = {"pipeline": pp, "model": model, "tag_vocab": tag_proc.vocab}
    torch.save(save_dict, os.path.join(save, "model_pp.pkl"))
    print("pipeline saved")
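
# Entry-point sketch (an assumption, not part of the original script): the file
# paths are placeholders. train() above expects ConllX-formatted training and
# dev files, an optional torch checkpoint to resume from, and a directory to
# write model_pp.pkl into.
if __name__ == "__main__":
    train("data/train.conllx", "data/dev.conllx", checkpoint=None, save="./save")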
def train(checkpoint=None):
    # load config
    train_param = ConfigSection()
    model_param = ConfigSection()
    ConfigLoader().load_config(cfgfile, {
        "train": train_param,
        "model": model_param
    })
    print("config loaded")

    # Data Loader
    dataset = ZhConllPOSReader().load("/home/hyan/train.conllx")
    print(dataset)
    print("dataset transformed")

    dataset.rename_field("tag", "truth")

    vocab_proc = VocabIndexerProcessor("words", new_added_filed_name="word_seq")
    tag_proc = VocabIndexerProcessor("truth")
    seq_len_proc = SeqLenProcessor(field_name="word_seq",
                                   new_added_field_name="word_seq_origin_len",
                                   is_input=True)

    vocab_proc(dataset)
    tag_proc(dataset)
    seq_len_proc(dataset)

    dataset.set_input("word_seq", "word_seq_origin_len", "truth")
    dataset.set_target("truth", "word_seq_origin_len")

    print("processors defined")

    # dataset.set_is_target(tag_ids=True)
    model_param["vocab_size"] = vocab_proc.get_vocab_size()
    model_param["num_classes"] = tag_proc.get_vocab_size()
    print("vocab_size={} num_classes={}".format(model_param["vocab_size"],
                                                model_param["num_classes"]))

    # define a model
    if checkpoint is None:
        # pre_trained = load_tencent_embed("/home/zyfeng/data/char_tencent_embedding.pkl", vocab_proc.vocab.word2idx)
        pre_trained = None
        model = AdvSeqLabel(model_param, id2words=tag_proc.vocab.idx2word, emb=pre_trained)
        print(model)
    else:
        model = torch.load(checkpoint)

    # call trainer to train
    trainer = Trainer(dataset, model, loss=None,
                      metrics=SpanFPreRecMetric(tag_proc.vocab,
                                                pred="predict",
                                                target="truth",
                                                seq_lens="word_seq_origin_len"),
                      dev_data=dataset, metric_key="f",
                      use_tqdm=True, use_cuda=True,
                      print_every=5, n_epochs=6, save_path="./save")
    trainer.train(load_best_model=True)

    # save model & pipeline
    model_proc = ModelProcessor(model, seq_len_field_name="word_seq_origin_len")
    id2tag = Index2WordProcessor(tag_proc.vocab, "predict", "tag")

    pp = Pipeline([vocab_proc, seq_len_proc, model_proc, id2tag])
    save_dict = {"pipeline": pp, "model": model, "tag_vocab": tag_proc.vocab}
    torch.save(save_dict, "model_pp.pkl")
    print("pipeline saved")

    torch.save(model, "./save/best_model.pkl")
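
# Inference sketch (an assumption, not in the original script): the dict saved
# above bundles the fitted processors and the trained model, so tagging new
# data only needs the pipeline. Applying a Pipeline to a DataSet is assumed to
# run its processors in order; the final Index2WordProcessor writes the
# predicted labels into a "tag" field.
if __name__ == "__main__":
    saved = torch.load("model_pp.pkl")
    pp, tag_vocab = saved["pipeline"], saved["tag_vocab"]
    # new_data = ZhConllPOSReader().load("/path/to/some.conllx")  # placeholder path
    # pp(new_data)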
                                       dim=attention_unit,
                                       num_vec=attention_hops)
        self.mlp = MLP(size_layer=[lstm_hidden_size * 2 * attention_hops, nfc, class_num])

    def forward(self, x):
        x_emb = self.embedding(x)
        output = self.lstm(x_emb)
        after_attention, penalty = self.attention(output, x)
        after_attention = after_attention.view(after_attention.size(0), -1)
        output = self.mlp(after_attention)
        return output

    def loss(self, predict, ground_truth):
        print("predict:%s; g:%s" % (str(predict.size()), str(ground_truth.size())))
        print(ground_truth)
        return F.cross_entropy(predict, ground_truth)


train_args = ConfigSection()
ConfigLoader("good path").load_config('config.cfg', {"train": train_args})
train_args['vocab'] = len(word2index)

trainer = ClassificationTrainer(**train_args.data)

# for k in train_args.__dict__.keys():
#     print(k, train_args[k])

model = SELF_ATTENTION_YELP_CLASSIFICATION(train_args)

trainer.train(model, train_data, dev_data)
def test_fastnlp_advanced_tutorial(self):
    import os
    os.chdir("tutorials/fastnlp_advanced_tutorial")

    from fastNLP import DataSet
    from fastNLP import Instance
    from fastNLP import Vocabulary
    from fastNLP import Trainer
    from fastNLP import Tester

    # ### Instance
    # An Instance represents one sample and consists of one or more fields
    # (attributes/features); each field has its own name and value.
    # The fields an Instance contains are defined at construction time,
    # using the "field_name=field_value" syntax.

    # In[2]:

    # Build an Instance made up of three fields: premise, hypothesis and label
    instance = Instance(premise='an premise example .',
                        hypothesis='an hypothesis example.',
                        label=1)
    instance

    # In[3]:

    data_set = DataSet([instance] * 5)
    data_set.append(instance)
    data_set[-2:]

    # In[4]:

    # An instance whose field type differs from the dataset's corresponding
    # field type can still be added to the dataset
    instance2 = Instance(premise='the second premise example .',
                         hypothesis='the second hypothesis example.',
                         label='1')
    try:
        data_set.append(instance2)
    except:
        pass
    data_set[-2:]

    # In[5]:

    # If a field name does not match, the instance cannot be appended to the dataset
    instance3 = Instance(premises='the third premise example .',
                         hypothesis='the third hypothesis example.',
                         label=1)
    try:
        data_set.append(instance3)
    except:
        print('cannot append instance')
        pass
    data_set[-2:]

    # In[6]:

    # Besides text, a tensor can also serve as the value of a field
    import torch
    tensor_ins = Instance(image=torch.randn(5, 5), label=0)
    ds = DataSet()
    ds.append(tensor_ins)
    ds

    from fastNLP import DataSet
    from fastNLP import Instance

    # Read data from a csv file into a DataSet
    # Any csv-like file, i.e. one example per line, can be read this way
    dataset = DataSet.read_csv('tutorial_sample_dataset.csv',
                               headers=('raw_sentence', 'label'),
                               sep='\t')

    # Check the size of the DataSet
    len(dataset)

    # In[8]:

    # Use an integer index [k] to get the k-th sample
    dataset[0]

    # In[9]:

    # The retrieved sample is an Instance
    type(dataset[0])

    # In[10]:

    # Use a slice [a: b] to get samples a through b
    dataset[0:3]

    # In[11]:

    # The index can also be negative
    dataset[-1]

    data_path = ['premise', 'hypothesis', 'label']

    # Read in the files
    with open(data_path[0]) as f:
        premise = f.readlines()

    with open(data_path[1]) as f:
        hypothesis = f.readlines()

    with open(data_path[2]) as f:
        label = f.readlines()

    assert len(premise) == len(hypothesis) and len(hypothesis) == len(label)

    # Build the DataSet
    data_set = DataSet()
    for p, h, l in zip(premise, hypothesis, label):
        p = p.strip()  # strip trailing whitespace
        h = h.strip()  # strip trailing whitespace
        data_set.append(Instance(premise=p, hypothesis=h, truth=l))

    data_set[0]

    # ### Other DataSet operations
    # After a DataSet is built, its contents can still be modified; the interface is DataSet.apply()

    # In[13]:

    # Lowercase all text in the premise field
    data_set.apply(lambda x: x['premise'].lower(), new_field_name='premise')
    data_set[-2:]

    # In[14]:

    # Convert labels to int
    data_set.apply(lambda x: int(x['truth']), new_field_name='truth')
    data_set[-2:]

    # In[15]:

    # Split sentences on whitespace
    def split_sent(ins):
        return ins['premise'].split()

    data_set.apply(split_sent, new_field_name='premise')
    data_set.apply(lambda x: x['hypothesis'].split(), new_field_name='hypothesis')
    data_set[-2:]

    # In[16]:

    # Filter the data
    origin_data_set_len = len(data_set)
    data_set.drop(lambda x: len(x['premise']) <= 6)
    origin_data_set_len, len(data_set)

    # In[17]:

    # Add length information
    data_set.apply(lambda x: [1] * len(x['premise']), new_field_name='premise_len')
    data_set.apply(lambda x: [1] * len(x['hypothesis']), new_field_name='hypothesis_len')
    data_set[-1]

    # In[18]:

    # Set the input (feature) fields and the target (label) field
    data_set.set_input("premise", "premise_len", "hypothesis", "hypothesis_len")
    data_set.set_target("truth")

    # In[19]:

    # Rename a field
    data_set.rename_field('truth', 'label')
    data_set[-1]

    # In[20]:

    # Split into train, dev and test sets
    train_data, vad_data = data_set.split(0.5)
    dev_data, test_data = vad_data.split(0.4)
    len(train_data), len(dev_data), len(test_data)

    # In[21]:

    # Deep-copy a dataset
    import copy
    train_data_2, dev_data_2 = copy.deepcopy(train_data), copy.deepcopy(dev_data)
    del copy
    # Initialize the vocabulary: its maximum vocab_size is 10000, the minimum
    # frequency for a word to be kept is 2, '<unk>' stands for unknown words
    # and '<pad>' is the padding token
    # Vocabulary's default init arguments are max_size=None, min_freq=None, unknown='<unk>', padding='<pad>'
    vocab = Vocabulary(max_size=10000, min_freq=2, unknown='<unk>', padding='<pad>')

    # Build the vocabulary
    train_data.apply(lambda x: [vocab.add(word) for word in x['premise']])
    train_data.apply(lambda x: [vocab.add(word) for word in x['hypothesis']])
    vocab.build_vocab()

    # In[23]:

    # Index the sentences with the vocabulary
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['premise']],
                     new_field_name='premise')
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['hypothesis']],
                     new_field_name='hypothesis')
    dev_data.apply(lambda x: [vocab.to_index(word) for word in x['premise']],
                   new_field_name='premise')
    dev_data.apply(lambda x: [vocab.to_index(word) for word in x['hypothesis']],
                   new_field_name='hypothesis')
    test_data.apply(lambda x: [vocab.to_index(word) for word in x['premise']],
                    new_field_name='premise')
    test_data.apply(lambda x: [vocab.to_index(word) for word in x['hypothesis']],
                    new_field_name='hypothesis')
    train_data[-1], dev_data[-1], test_data[-1]

    # Read in the vocab file
    with open('vocab.txt') as f:
        lines = f.readlines()
    vocabs = []
    for line in lines:
        vocabs.append(line.strip())

    # Instantiate a Vocabulary
    vocab_bert = Vocabulary(unknown=None, padding=None)
    # Add the words in the vocabs list to the Vocabulary
    vocab_bert.add_word_lst(vocabs)
    # Build the vocabulary
    vocab_bert.build_vocab()
    # Update the text of the unknown and padding tokens
    vocab_bert.unknown = '[UNK]'
    vocab_bert.padding = '[PAD]'

    # In[25]:

    # Index the sentences with the BERT vocabulary
    train_data_2.apply(lambda x: [vocab_bert.to_index(word) for word in x['premise']],
                       new_field_name='premise')
    train_data_2.apply(lambda x: [vocab_bert.to_index(word) for word in x['hypothesis']],
                       new_field_name='hypothesis')
    dev_data_2.apply(lambda x: [vocab_bert.to_index(word) for word in x['premise']],
                     new_field_name='premise')
    dev_data_2.apply(lambda x: [vocab_bert.to_index(word) for word in x['hypothesis']],
                     new_field_name='hypothesis')
    train_data_2[-1], dev_data_2[-1]

    # step 1: load model hyper-parameters (optional)
    from fastNLP.io.config_io import ConfigSection, ConfigLoader
    args = ConfigSection()
    ConfigLoader().load_config("./data/config", {"esim_model": args})
    args["vocab_size"] = len(vocab)
    args.data

    # In[27]:

    # step 2: load the ESIM model
    from fastNLP.models import ESIM
    model = ESIM(**args.data)
    model

    # In[28]:

    # Another example: load the CNN text-classification model
    from fastNLP.models import CNNText
    cnn_text_model = CNNText(embed_num=len(vocab), embed_dim=50, num_classes=5,
                             padding=2, dropout=0.1)
    cnn_text_model

    from fastNLP import CrossEntropyLoss
    from fastNLP import Adam
    from fastNLP import AccuracyMetric

    trainer = Trainer(
        train_data=train_data,
        model=model,
        loss=CrossEntropyLoss(pred='pred', target='label'),
        metrics=AccuracyMetric(),
        n_epochs=3,
        batch_size=16,
        print_every=-1,
        validate_every=-1,
        dev_data=dev_data,
        use_cuda=False,
        optimizer=Adam(lr=1e-3, weight_decay=0),
        check_code_level=-1,
        metric_key='acc',
        use_tqdm=False,
    )
    trainer.train()

    tester = Tester(
        data=test_data,
        model=model,
        metrics=AccuracyMetric(),
        batch_size=args["batch_size"],
    )
    tester.test()

    os.chdir("../..")
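
    # Optional follow-up (an assumption, not part of the original tutorial):
    # the trained model could be persisted with plain torch serialization, the
    # same way other scripts in this collection save theirs, e.g.
    #
    #   torch.save(model, "esim_model.pkl")
    #   reloaded = torch.load("esim_model.pkl")
    #   Tester(data=test_data, model=reloaded, metrics=AccuracyMetric()).test()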
def test_case_1(self):
    config_file_dir = "test/io/"
    config_file_name = "config"
    config_file_path = os.path.join(config_file_dir, config_file_name)

    tmp_config_file_path = os.path.join(config_file_dir, "tmp_config")

    with open(config_file_path, "r") as f:
        lines = f.readlines()

    standard_section = ConfigSection()
    t_section = ConfigSection()
    ConfigLoader().load_config(config_file_path, {
        "test": standard_section,
        "t": t_section
    })

    config_saver = ConfigSaver(config_file_path)

    section = ConfigSection()
    section["doubles"] = 0.8
    section["tt"] = 0.5
    section["test"] = 105
    section["str"] = "this is a str"

    test_case_2_section = section
    test_case_2_section["double"] = 0.5

    for k in section.__dict__.keys():
        standard_section[k] = section[k]

    config_saver.save_config_file("test", section)
    config_saver.save_config_file("another-test", section)
    config_saver.save_config_file("one-another-test", section)
    config_saver.save_config_file("test-case-2", section)

    test_section = ConfigSection()
    at_section = ConfigSection()
    another_test_section = ConfigSection()
    one_another_test_section = ConfigSection()
    a_test_case_2_section = ConfigSection()

    ConfigLoader().load_config(config_file_path, {
        "test": test_section,
        "another-test": another_test_section,
        "t": at_section,
        "one-another-test": one_another_test_section,
        "test-case-2": a_test_case_2_section
    })

    assert test_section == standard_section
    assert at_section == t_section
    assert another_test_section == section
    assert one_another_test_section == section
    assert a_test_case_2_section == test_case_2_section

    config_saver.save_config_file("test", section)

    with open(config_file_path, "w") as f:
        f.writelines(lines)

    with open(tmp_config_file_path, "w") as f:
        f.write('[test]\n')
        f.write('this is an fault example\n')

    tmp_config_saver = ConfigSaver(tmp_config_file_path)
    try:
        tmp_config_saver._read_section()
    except Exception as e:
        pass
    os.remove(tmp_config_file_path)

    try:
        tmp_config_saver = ConfigSaver("file-NOT-exist")
    except Exception as e:
        pass