def bilstm_text():
    w = pickle.load(open("weight.bin", "rb"))
    (vocab, train_data, dev_data, test_data) = read_data()
    model_lstm = MyBLSTMText(class_num=4, vocab_size=len(vocab), dropout=0.5,
                             embed_weights=w)
    loss = CrossEntropyLoss()
    metrics = AccuracyMetric()
    trainer = Trainer(model=model_lstm, train_data=train_data, dev_data=dev_data,
                      optimizer=Adam(lr=0.0015), print_every=10, use_tqdm=False,
                      device='cuda:0', save_path="./lstm_model",
                      loss=loss, metrics=metrics)
                      # callbacks=[EarlyStopCallback(10)])
    trainer.train()
    tester = Tester(test_data, model_lstm, metrics=AccuracyMetric())
    tester.test()
def test_trainer_suggestion6(self):
    # Check that the error message correctly alerts the user:
    # pass in a redundant field here so that the inputs are duplicated.
    dataset = prepare_fake_dataset2('x1', 'x_unused')
    dataset.rename_field('x_unused', 'x2')
    dataset.set_input('x1', 'x2')
    dataset.set_target('y', 'x1')

    class Model(nn.Module):
        def __init__(self):
            super().__init__()
            self.fc = nn.Linear(5, 4)

        def forward(self, x1, x2):
            x1 = self.fc(x1)
            x2 = self.fc(x2)
            x = x1 + x2
            time.sleep(0.1)
            # loss = F.cross_entropy(x, y)
            return {'preds': x}

    model = Model()
    with self.assertRaises(NameError):
        trainer = Trainer(train_data=dataset, model=model, loss=CrossEntropyLoss(),
                          print_every=2, dev_data=dataset,
                          metrics=AccuracyMetric(), use_tqdm=False)
def run1(self):
    # test distributed training
    print('local rank', get_local_rank())
    set_rng_seed(100)
    data_set = prepare_fake_dataset()
    data_set.set_input("x", flag=True)
    data_set.set_target("y", flag=True)

    model = NaiveClassifier(2, 2)

    trainer = DistTrainer(
        model=model, train_data=data_set, optimizer=SGD(lr=0.1),
        loss=CrossEntropyLoss(pred="predict", target="y"),
        batch_size_per_gpu=8, n_epochs=3, print_every=50,
        save_path=self.save_path,
    )
    trainer.train()  # should run without errors
    if trainer.is_master and os.path.exists(self.save_path):
        shutil.rmtree(self.save_path)
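# Note (assumption, not stated in this file): DistTrainer builds on
# torch.distributed, so a script containing run1 is normally started through
# PyTorch's distributed launcher, which provides the local rank that
# get_local_rank() reads. A typical invocation, with a hypothetical script
# name, would be:
#
#   python -m torch.distributed.launch --nproc_per_node=2 dist_trainer_test.py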
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--methods", "-m", default="lstm",
                        choices=["rnn", "lstm", "cnn"])
    parser.add_argument("--n_epochs", "-n", default=5, type=int)
    parser.add_argument("--embedding", "-e", default=100, type=int)
    parser.add_argument("--category", "-c", default=4, type=int)
    parser.add_argument("--batch", "-b", default=4, type=int)
    parser.add_argument("--learning_rate", "-l", default=0.005, type=float)
    args = parser.parse_args()

    if args.category > 20 or args.category < 1:
        raise Exception("the number of categories must be between 1 and 20")

    train_data, test_data, dic_size = handle_data(args.category)

    if args.methods == "rnn":
        model = rnn(dic_size, args.category)
        output = "rnn_model.pth"
    elif args.methods == "lstm":
        model = myLSTM(dic_size, args.category)
        output = "lstm_model.pth"
    else:
        # model = cnn(dic_size, args.category)
        model = torch.load("cnn_model.pth")
        output = "cnn_model.pth"

    trainer = Trainer(train_data, model,
                      loss=CrossEntropyLoss(pred="pred", target='target'),
                      optimizer=SGD(model.parameters(), lr=args.learning_rate),
                      n_epochs=args.n_epochs, dev_data=test_data,
                      metrics=AccuracyMetric(pred="pred", target='target'),
                      batch_size=args.batch)
    trainer.train()
    torch.save(model, output)
def test_collect_fn3(self):
    """
    Test that the collect_fn overrides existing batch fields.
    :return:
    """
    dataset = prepare_fake_dataset2('x1', 'x2')
    dataset.set_input('x1', 'x2')
    dataset.set_target('y')
    import torch

    def fn(ins_list):
        x = []
        for ind, ins in ins_list:
            x.append(ins['x1'] + ins['x2'])
        x = torch.FloatTensor(x)
        return {'x1': torch.zeros_like(x)}, \
               {'target': torch.zeros(x.size(0)).long(), 'y': x}

    dataset.add_collect_fn(fn)

    class Model(nn.Module):
        def __init__(self):
            super().__init__()
            self.fc = nn.Linear(5, 1, bias=False)

        def forward(self, x1):
            x1 = self.fc(x1)
            assert x1.sum() == 0, "x1 should have been replaced by zeros"
            # loss = F.cross_entropy(x, y)
            return {'pred': x1}

    model = Model()
    trainer = Trainer(train_data=dataset, model=model, loss=CrossEntropyLoss(),
                      print_every=2, dev_data=dataset, metrics=AccuracyMetric(),
                      use_tqdm=False, n_epochs=1)
    best_metric = trainer.train()['best_eval']['AccuracyMetric']['acc']
    self.assertTrue(best_metric == 1)
def test_collect_fn2(self):
    """Test whether a collect_fn can supply both batch_x and batch_y."""
    dataset = prepare_fake_dataset2('x1', 'x2')
    dataset.set_input('x1', 'x2')
    dataset.set_target('y', 'x1')
    import torch

    def fn(ins_list):
        x = []
        for ind, ins in ins_list:
            x.append(ins['x1'] + ins['x2'])
        x = torch.FloatTensor(x)
        return {'x': x}, {'target': x[:, :4].argmax(dim=-1)}

    dataset.add_collect_fn(fn)

    class Model(nn.Module):
        def __init__(self):
            super().__init__()
            self.fc = nn.Linear(5, 4)

        def forward(self, x1, x2, x):
            x1 = self.fc(x1)
            x2 = self.fc(x2)
            x = self.fc(x)
            sum_x = x1 + x2 + x
            time.sleep(0.1)
            # loss = F.cross_entropy(x, y)
            return {'pred': sum_x}

    model = Model()
    trainer = Trainer(train_data=dataset, model=model, loss=CrossEntropyLoss(),
                      print_every=2, dev_data=dataset, metrics=AccuracyMetric(),
                      use_tqdm=False)
    trainer.train()
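# A minimal sketch of the collect_fn contract as the two tests above exercise
# it (the field names 'x1', 'x', and 'target' mirror those tests and are
# otherwise arbitrary): the function receives a list of (index, instance)
# pairs and returns a (batch_x, batch_y) pair of dicts. New keys are added to
# the batch, and keys that collide with existing fields replace them.
def example_collect_fn(ins_list):
    import torch
    xs = [ins['x1'] for ind, ins in ins_list]
    batch_x = {'x': torch.FloatTensor(xs)}              # extra model input
    batch_y = {'target': torch.zeros(len(xs)).long()}   # extra target field
    return batch_x, batch_y
# Registered the same way as in the tests:
# dataset.add_collect_fn(example_collect_fn)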
def train():
    train_data, dev_data, test_data, vocab = get_train_dev_test_vocab()
    model = CNNText(vocab_size=len(vocab), embedding_dim=50, output_size=20)
    model = torch.load(load_path)
    loss = CrossEntropyLoss(pred=Const.OUTPUT, target=Const.TARGET)
    metrics = AccuracyMetric(pred=Const.OUTPUT, target=Const.TARGET)
    '''
    trainer = Trainer(model=model, train_data=train_data, dev_data=dev_data,
                      loss=loss, metrics=metrics, n_epochs=100,
                      save_path=checkpoint_path)
    trainer.train()
    '''
    tester = Tester(test_data, model, metrics=AccuracyMetric())
    tester.test()
def train(config):
    train_data = pickle.load(open(os.path.join(config.data_path, config.train_name), "rb"))
    dev_data = pickle.load(open(os.path.join(config.data_path, config.dev_name), "rb"))
    test_data = pickle.load(open(os.path.join(config.data_path, config.test_name), "rb"))
    vocabulary = pickle.load(open(os.path.join(config.data_path, config.vocabulary_name), "rb"))
    # load w2v data
    weight = pickle.load(open(os.path.join(config.data_path, config.weight_name), "rb"))

    if config.task_name == "lstm":
        text_model = LSTM(vocab_size=len(vocabulary), embed_dim=config.embed_dim,
                          output_dim=config.class_num, hidden_dim=config.hidden_dim,
                          num_layers=config.num_layers, dropout=config.dropout)
    elif config.task_name == "lstm_maxpool":
        text_model = LSTM_maxpool(vocab_size=len(vocabulary), embed_dim=config.embed_dim,
                                  output_dim=config.class_num, hidden_dim=config.hidden_dim,
                                  num_layers=config.num_layers, dropout=config.dropout)
    elif config.task_name == "rnn":
        text_model = RNN(vocab_size=len(vocabulary), embed_dim=config.embed_dim,
                         output_dim=config.class_num, hidden_dim=config.hidden_dim,
                         num_layers=config.num_layers, dropout=config.dropout)
    elif config.task_name == "cnn":
        text_model = CNN(vocab_size=len(vocabulary), embed_dim=config.embed_dim,
                         class_num=config.class_num, kernel_num=config.kernel_num,
                         kernel_sizes=config.kernel_sizes, dropout=config.dropout,
                         static=config.static, in_channels=config.in_channels)
    elif config.task_name == "cnn_w2v":
        text_model = CNN_w2v(vocab_size=len(vocabulary), embed_dim=config.embed_dim,
                             class_num=config.class_num, kernel_num=config.kernel_num,
                             kernel_sizes=config.kernel_sizes, dropout=config.dropout,
                             static=config.static, in_channels=config.in_channels,
                             weight=weight)
    elif config.task_name == "rcnn":
        text_model = RCNN(vocab_size=len(vocabulary), embed_dim=config.embed_dim,
                          output_dim=config.class_num, hidden_dim=config.hidden_dim,
                          num_layers=config.num_layers, dropout=config.dropout)

    optimizer = Adam(lr=config.lr, weight_decay=config.weight_decay)
    timing = TimingCallback()
    early_stop = EarlyStopCallback(config.patience)
    accuracy = AccuracyMetric(pred='output', target='target')

    trainer = Trainer(train_data=train_data, model=text_model, loss=CrossEntropyLoss(),
                      batch_size=config.batch_size, check_code_level=0,
                      metrics=accuracy, n_epochs=config.epoch,
                      dev_data=dev_data, save_path=config.save_path,
                      print_every=config.print_every,
                      validate_every=config.validate_every,
                      optimizer=optimizer, use_tqdm=False,
                      device=config.device, callbacks=[timing, early_stop])
    trainer.train()

    # test result
    tester = Tester(test_data, text_model, metrics=accuracy)
    tester.test()
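# A hedged sketch (all values are placeholders, not from the original project)
# of the config object that train(config) above expects; every attribute name
# is taken from the accesses in the function body, using the "lstm" branch so
# the CNN-only attributes can be omitted.
from types import SimpleNamespace

demo_config = SimpleNamespace(
    data_path="data/", train_name="train.pkl", dev_name="dev.pkl",
    test_name="test.pkl", vocabulary_name="vocab.pkl", weight_name="w2v.pkl",
    task_name="lstm", embed_dim=128, class_num=4, hidden_dim=256, num_layers=2,
    dropout=0.5, lr=1e-3, weight_decay=0.0, patience=10, batch_size=32,
    epoch=20, save_path="./save", print_every=100, validate_every=-1,
    device="cuda:0",
)
# train(demo_config)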
def train_TextRNN():
    model = TextRNN(TextRNNConfig)
    loss = CrossEntropyLoss(pred="pred", target="target")
    metrics = AccuracyMetric(pred="pred", target="target")
    trainer = Trainer(model=model, train_data=dataset_train, dev_data=dataset_dev,
                      loss=loss, metrics=metrics, batch_size=16, n_epochs=20)
    trainer.train()
    tester = Tester(dataset_test, model, metrics)
    tester.test()
def train_classifier():
    n_epochs = 50
    trainer = Trainer(train_data=fast_data.train_data, dev_data=fast_data.test_data,
                      model=disc,
                      loss=CrossEntropyLoss(target='label_seq'),
                      metrics=AccuracyMetric(target='label_seq'),
                      n_epochs=n_epochs, batch_size=batch_size,
                      optimizer=Adam(lr=0.001, weight_decay=0,
                                     model_params=disc.parameters()))
    trainer.train()
    print('Disc Train finished!')
def __init__(self, embed, label_vocab, pos_idx=31,
             Parsing_rnn_layers=3, Parsing_arc_mlp_size=500,
             Parsing_label_mlp_size=100, Parsing_use_greedy_infer=False,
             encoding_type='bmeso', embedding_dim=768, dropout=0.1,
             use_pos_embedding=True, use_average=True):
    super().__init__()
    self.embed = embed
    self.use_pos_embedding = use_pos_embedding
    self.use_average = use_average
    self.label_vocab = label_vocab
    self.pos_idx = pos_idx
    self.user_dict_weight = 0.05
    embedding_dim_1 = 512
    embedding_dim_2 = 256
    self.layers_map = {'CWS': '-1', 'POS': '-1', 'Parsing': '-1', 'NER': '-1'}

    # NER
    self.ner_linear = nn.Linear(embedding_dim, len(label_vocab['NER']))
    trans = allowed_transitions(label_vocab['NER'], encoding_type='bmeso',
                                include_start_end=True)
    self.ner_crf = ConditionalRandomField(len(label_vocab['NER']),
                                          include_start_end_trans=True,
                                          allowed_transitions=trans)

    # parsing
    self.biaffine_parser = BertCharParser(
        app_index=self.label_vocab['Parsing'].to_index('APP'),
        vector_size=768,
        num_label=len(label_vocab['Parsing']),
        rnn_layers=Parsing_rnn_layers,
        arc_mlp_size=Parsing_arc_mlp_size,
        label_mlp_size=Parsing_label_mlp_size,
        dropout=dropout,
        use_greedy_infer=Parsing_use_greedy_infer)

    if self.use_pos_embedding:
        self.pos_embedding = nn.Embedding(len(self.label_vocab['pos']),
                                          embedding_dim, padding_idx=0)

    self.loss = CrossEntropyLoss(padding_idx=0)

    # CWS
    self.cws_mlp = MLP([embedding_dim, embedding_dim_1, embedding_dim_2,
                        len(label_vocab['CWS'])], 'relu', output_activation=None)
    trans = allowed_transitions(label_vocab['CWS'], include_start_end=True)
    self.cws_crf = ConditionalRandomField(len(label_vocab['CWS']),
                                          include_start_end_trans=True,
                                          allowed_transitions=trans)

    # POS
    self.pos_mlp = MLP([embedding_dim, embedding_dim_1, embedding_dim_2,
                        len(label_vocab['POS'])], 'relu', output_activation=None)
    trans = allowed_transitions(label_vocab['POS'], include_start_end=True)
    self.pos_crf = ConditionalRandomField(len(label_vocab['POS']),
                                          include_start_end_trans=True,
                                          allowed_transitions=trans)
def cnn_text():
    w = pickle.load(open("weight.bin", "rb"))
    (vocab, train_data, dev_data, test_data) = read_data()
    model_cnn = MyCNNText(class_num=4, vocab_size=len(vocab), embed_weights=w)
    loss = CrossEntropyLoss()
    metrics = AccuracyMetric()
    trainer = Trainer(model=model_cnn, train_data=train_data, dev_data=dev_data,
                      batch_size=32, print_every=10, use_tqdm=False,
                      device='cuda:0', save_path="./cnn_model",
                      loss=loss, metrics=metrics)
    trainer.train()
    tester = Tester(test_data, model_cnn, metrics=AccuracyMetric())
    tester.test()
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--method", default='cnn', help="train model and test it",
                        choices=['cnn', 'cnn_glove', 'rnn', 'rnn_maxpool', 'rnn_avgpool'])
    parser.add_argument("--dataset", default='1', help="1: small dataset; 2: big dataset",
                        choices=['1', '2'])
    args = parser.parse_args()

    # hyperparameters
    embedding_dim = 256
    batch_size = 32
    # RNN
    hidden_dim = 256
    # CNN
    kernel_sizes = (3, 4, 5)
    num_channels = (120, 160, 200)
    acti_function = 'relu'
    learning_rate = 1e-3
    train_patience = 8
    cate_num = 4
    # GloVe
    embedding_file_path = "glove.6B.100d.txt"

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    vocab = read_vocab("vocab.txt")
    print("vocabulary length:", len(vocab))
    train_data = DataSet().load("train_set")
    dev_data = DataSet().load("dev_set")
    test_data = DataSet().load("test_set")

    if args.dataset == '1':
        cate_num = 4
        num_channels = (48, 48, 48)
        embedding_dim = 128
        hidden_dim = 128
    elif args.dataset == '2':
        cate_num = 20

    if args.method == 'cnn':
        model = TextCNN(vocab_size=len(vocab), embedding_dim=embedding_dim,
                        kernel_sizes=kernel_sizes, num_channels=num_channels,
                        num_classes=cate_num, activation=acti_function)
    elif args.method == 'cnn_glove':
        glove_embedding = EmbedLoader.load_with_vocab(embedding_file_path, vocab)
        embedding_dim = glove_embedding.shape[1]
        print("GloVe embedding_dim:", embedding_dim)
        model = TextCNN_glove(vocab_size=len(vocab), embedding_dim=embedding_dim,
                              kernel_sizes=kernel_sizes, num_channels=num_channels,
                              num_classes=cate_num, activation=acti_function)
        model.embedding.load_state_dict(
            {"weight": torch.from_numpy(glove_embedding)})
        model.constant_embedding.load_state_dict(
            {"weight": torch.from_numpy(glove_embedding)})
        model.constant_embedding.weight.requires_grad = False
        model.embedding.weight.requires_grad = True
    elif args.method == 'rnn':
        embedding_dim = 128
        hidden_dim = 128
        model = BiRNNText(vocab_size=len(vocab), embedding_dim=embedding_dim,
                          output_dim=cate_num, hidden_dim=hidden_dim)
    elif args.method == 'rnn_maxpool':
        model = BiRNNText_pool(vocab_size=len(vocab), embedding_dim=embedding_dim,
                               output_dim=cate_num, hidden_dim=hidden_dim,
                               pool_name="max")
    elif args.method == 'rnn_avgpool':
        model = BiRNNText_pool(vocab_size=len(vocab), embedding_dim=embedding_dim,
                               output_dim=cate_num, hidden_dim=hidden_dim,
                               pool_name="avg")

    tester = Tester(test_data, model, metrics=AccuracyMetric())
    trainer = Trainer(
        train_data=train_data,
        model=model,
        loss=CrossEntropyLoss(pred=Const.OUTPUT, target=Const.TARGET),
        metrics=AccuracyMetric(),
        n_epochs=80,
        batch_size=batch_size,
        print_every=10,
        validate_every=-1,
        dev_data=dev_data,
        optimizer=torch.optim.Adam(model.parameters(), lr=learning_rate),
        check_code_level=2,
        metric_key='acc',
        use_tqdm=True,
        callbacks=[EarlyStopCallback(train_patience)],
        device=device,
    )
    trainer.train()
    tester.test()
    {
        'dev_matched': data_info.datasets['dev_matched'],
        'dev_mismatched': data_info.datasets['dev_mismatched']
    },
    verbose=1))

trainer = Trainer(train_data=data_info.datasets['train'], model=model,
                  optimizer=optimizer, num_workers=0,
                  batch_size=arg.batch_size,
                  n_epochs=arg.n_epochs, print_every=-1,
                  dev_data=data_info.datasets[arg.devset_name],
                  metrics=AccuracyMetric(pred="pred", target="target"),
                  metric_key='acc',
                  device=[i for i in range(torch.cuda.device_count())],
                  check_code_level=-1,
                  callbacks=callbacks,
                  loss=CrossEntropyLoss(pred="pred", target="target"))
trainer.train(load_best_model=True)

tester = Tester(
    data=data_info.datasets[arg.testset_name],
    model=model,
    metrics=AccuracyMetric(),
    batch_size=arg.batch_size,
    device=[i for i in range(torch.cuda.device_count())],
)
tester.test()
def train():
    train_data = pickle.load(open(opt.train_data_path, 'rb'))
    validate_data = pickle.load(open(opt.validate_data_path, 'rb'))
    vocab = pickle.load(open(opt.vocab, 'rb'))
    word2idx = vocab.word2idx
    idx2word = vocab.idx2word

    input_size = len(word2idx)
    vocab_size = opt.class_num
    class_num = opt.class_num
    embedding_dim = opt.embedding_dim

    if opt.model_name == "LSTMModel":
        model = utils.find_class_by_name(opt.model_name, [models])(
            input_size, vocab_size, embedding_dim,
            opt.use_word2vec, opt.embedding_weight_path)
    elif opt.model_name == "B_LSTMModel":
        model = utils.find_class_by_name(opt.model_name, [models])(
            input_size, vocab_size, embedding_dim,
            opt.use_word2vec, opt.embedding_weight_path)
    elif opt.model_name == "CNNModel":
        model = utils.find_class_by_name(opt.model_name, [models])(
            input_size, vocab_size, embedding_dim,
            opt.use_word2vec, opt.embedding_weight_path)
    elif opt.model_name == "MyBertModel":
        # bert_dir = "./BertPretrain"
        # bert_dir = None
        # model = utils.find_class_by_name(opt.model_name, [models])(10, 0.1, 4, bert_dir)
        train_data.apply(lambda x: x['input_data'][:2500], new_field_name='input_data')
        validate_data.apply(lambda x: x['input_data'][:2500], new_field_name='input_data')
        model = utils.find_class_by_name(opt.model_name, [models])(
            input_size=input_size,
            hidden_size=512,
            hidden_dropout_prob=0.1,
            num_labels=class_num,
            use_word2vec=opt.use_word2vec,
            embedding_weight_path=opt.embedding_weight_path,
        )

    if not os.path.exists(opt.save_model_path):
        os.mkdir(opt.save_model_path)

    # define dataloader
    train_data.set_input('input_data', flag=True)
    train_data.set_target('target', flag=True)
    validate_data.set_input('input_data', flag=True)
    validate_data.set_target('target', flag=True)

    if opt.optimizer == 'SGD':
        _optimizer = SGD(lr=opt.learning_rate, momentum=0)
    elif opt.optimizer == 'SGD_momentum':
        _optimizer = SGD(lr=opt.learning_rate, momentum=0.9)
    elif opt.optimizer == 'Adam':
        _optimizer = Adam(lr=opt.learning_rate, weight_decay=0)

    overfit_trainer = Trainer(
        model=model,
        train_data=train_data,
        loss=CrossEntropyLoss(pred="output", target="target"),
        n_epochs=opt.epoch,
        batch_size=opt.batch_size,
        device=[0, 1, 2, 3],
        # device=None,
        dev_data=validate_data,
        metrics=AccuracyMetric(pred="output", target="target"),
        metric_key="+acc",
        validate_every=opt.validate_every,
        optimizer=_optimizer,
        callbacks=[EarlyStopCallback(opt.patience)],
        save_path=opt.save_model_path)
    overfit_trainer.train()
# pad the input array
bundle.set_pad_val("words", 0)
bundle.set_input("words")
bundle.set_target("target")

model = BertForSentenceMatching(embed)

from fastNLP import AccuracyMetric
metrics = AccuracyMetric(pred=Const.OUTPUT, target=Const.TARGET)

# train the model
from fastNLP import Trainer
from fastNLP import CrossEntropyLoss

N_EPOCHS = 1
BATCH_SIZE = 16
trainer = Trainer(loss=CrossEntropyLoss(), model=model,
                  train_data=bundle.get_dataset("train"),
                  dev_data=bundle.get_dataset("dev"),
                  metrics=metrics,
                  n_epochs=N_EPOCHS, batch_size=BATCH_SIZE)
trainer.train()

from fastNLP import Tester
tester = Tester(bundle.get_dataset("test"), model, metrics)
tester.test()  # print result
        chars = self.embedding(chars)
        outputs = self.mlp(chars)
        return {Const.OUTPUT: outputs}


embed = BertEmbedding(data.get_vocab(Const.CHAR_INPUT), model_dir_or_name='cn-wwm-ext',
                      pool_method='first', requires_grad=True,
                      layers='11', include_cls_sep=False, dropout=0.5)

callbacks = [
    GradientClipCallback(clip_type='norm', clip_value=1),
    WarmupCallback(warmup=0.1, schedule='linear')
]

model = BertCNNER(embed, len(data.vocabs[Const.TARGET]))
optimizer = AdamW(model.parameters(), lr=3e-5)

for name, dataset in data.datasets.items():
    original_len = len(dataset)
    dataset.drop(lambda x: x['seq_len'] > 256, inplace=True)
    clipped_len = len(dataset)
    print("Delete {} instances in {}.".format(original_len - clipped_len, name))

trainer = Trainer(train_data=data.datasets['train'], model=model, optimizer=optimizer,
                  sampler=BucketSampler(), device=0,
                  dev_data=data.datasets['test'], batch_size=6,
                  metrics=SpanFPreRecMetric(tag_vocab=data.vocabs[Const.TARGET],
                                            encoding_type=encoding_type),
                  loss=CrossEntropyLoss(reduction='sum'),
                  callbacks=callbacks, num_workers=2, n_epochs=5,
                  check_code_level=0, update_every=3)
trainer.train()
def run_cnn():
    dataset_train_p2, dataset_test_p2 = get_text_classification_datasets()
    line_len = len(dataset_train_p2.data)
    with open("formalized_train_data.csv", "w") as file:
        for i in range(line_len):
            file.write(document2line(dataset_train_p2.data[i]) + "\t" +
                       str(dataset_train_p2.target[i]) + '\n')
    file.close()
    line_len = len(dataset_test_p2.data)
    with open("formalized_test_data.csv", "w") as file2:
        for i in range(line_len):
            file2.write(document2line(dataset_test_p2.data[i]) + "\t" +
                        str(dataset_test_p2.target[i]) + '\n')
    file2.close()

    loader = CSVLoader(headers=('raw_sentence', 'label'), sep='\t')
    train_dataset = loader.load("./formalized_train_data.csv")
    test_dataset = loader.load("./formalized_test_data.csv")
    os.remove("./formalized_train_data.csv")
    os.remove("./formalized_test_data.csv")

    train_dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='sentence')
    train_dataset.apply(lambda x: x['sentence'].split(), new_field_name='words',
                        is_input=True)
    test_dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='sentence')
    test_dataset.apply(lambda x: x['sentence'].split(), new_field_name='words',
                       is_input=True)
    # train_dataset[0], test_dataset[0]

    from fastNLP import Vocabulary
    # Use the Vocabulary class to build the word list and map word sequences
    # to index sequences.
    vocab = Vocabulary(min_freq=2).from_dataset(train_dataset, field_name='words')
    vocab.index_dataset(train_dataset, field_name='words', new_field_name='words')
    vocab.index_dataset(test_dataset, field_name='words', new_field_name='words')
    # train_dataset[0], test_dataset[0]

    # Convert the label to an integer and mark it as the target.
    train_dataset.apply(lambda x: int(x['label']), new_field_name='target',
                        is_target=True)
    test_dataset.apply(lambda x: int(x['label']), new_field_name='target',
                       is_target=True)
    # train_dataset[0], test_dataset[0]

    from fastNLP.models import CNNText
    embed_dim = 2048  # 50
    model = CNNText((len(vocab), embed_dim), num_classes=4, padding=2, dropout=0.1)

    from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric
    # Define the trainer and run training.
    trainer = Trainer(model=model, train_data=train_dataset, dev_data=test_dataset,
                      loss=CrossEntropyLoss(), metrics=AccuracyMetric())
    trainer.train()
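# Hedged addition (not part of the original run_cnn): a small helper that
# evaluates a trained fastNLP model on a held-out DataSet with the Tester
# pattern used by the other scripts in this collection. run_cnn could call
# evaluate(model, test_dataset) after trainer.train().
def evaluate(model, dataset):
    from fastNLP import Tester, AccuracyMetric
    tester = Tester(dataset, model, metrics=AccuracyMetric())
    return tester.test()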
class CharModel(nn.Module):
    def __init__(self, embed, label_vocab, pos_idx,
                 Parsing_rnn_layers, Parsing_arc_mlp_size, Parsing_label_mlp_size,
                 Parsing_use_greedy_infer=False, encoding_type='bmeso',
                 embedding_dim=768, dropout=0.1,
                 use_pos_embedding=False, use_average=False):
        super().__init__()
        self.embed = embed
        self.use_pos_embedding = use_pos_embedding
        self.use_average = use_average
        self.label_vocab = label_vocab
        self.pos_idx = pos_idx
        embedding_dim_1 = 512
        embedding_dim_2 = 256
        self.layers_map = {'CWS': '1', 'POS': '2', 'Parsing': '3', 'NER': '2'}

        # NER
        self.ner_linear = nn.Linear(embedding_dim, len(label_vocab['NER']))
        trans = allowed_transitions(label_vocab['NER'], encoding_type='bmeso',
                                    include_start_end=True)
        self.ner_crf = ConditionalRandomField(len(label_vocab['NER']),
                                              include_start_end_trans=True,
                                              allowed_transitions=trans)

        # parsing
        self.biaffine_parser = BertCharParser(
            vector_size=768,
            num_label=len(label_vocab['Parsing']),
            rnn_layers=Parsing_rnn_layers,
            arc_mlp_size=Parsing_arc_mlp_size,
            label_mlp_size=Parsing_label_mlp_size,
            dropout=dropout,
            use_greedy_infer=Parsing_use_greedy_infer)

        if self.use_pos_embedding:
            self.pos_embedding = nn.Embedding(len(self.label_vocab['pos']),
                                              embedding_dim, padding_idx=0)

        self.loss = CrossEntropyLoss(padding_idx=0)

        # CWS
        self.cws_mlp = MLP([
            embedding_dim, embedding_dim_1, embedding_dim_2,
            len(label_vocab['CWS'])
        ], 'relu', output_activation=None)

        # POS
        self.pos_mlp = MLP([
            embedding_dim, embedding_dim_1, embedding_dim_2,
            len(label_vocab['POS'])
        ], 'relu', output_activation=None)

    def _generate_embedding(self, feats, word_lens, seq_len, pos):
        new_feats = []
        batch_size = feats.size()[0]
        sentence_length = feats.size()[1]
        device = feats.device
        if not self.use_average:
            for i in range(batch_size):
                new_feats.append(torch.index_select(feats[i], 0, word_lens[i]))
            new_feats = torch.stack(new_feats, 0)
        else:
            for i in range(batch_size):
                feats_for_one_sample = []
                for j in range(word_lens.size()[1]):
                    if word_lens[i][j] == 0 and j != 0:
                        feats_for_one_word = torch.zeros(feats.size()[-1])
                    else:
                        if j == word_lens.size()[1] - 1 or word_lens[i][j + 1] == 0:
                            index = range(word_lens[i][j], seq_len[i])
                        else:
                            index = range(word_lens[i][j], word_lens[i][j + 1])
                        index = torch.tensor(index).to(device)
                        feats_for_one_word = torch.index_select(feats[i], 0, index)
                        word_len = feats_for_one_word.size()[0]
                        feats_for_one_word = torch.mean(feats_for_one_word, dim=0)
                    feats_for_one_sample.append(feats_for_one_word)
                feats_for_one_sample = torch.stack(feats_for_one_sample, dim=0)
                new_feats.append(feats_for_one_sample)
            new_feats = torch.stack(new_feats, 0)
        if self.use_pos_embedding:
            pos_feats = self.pos_embedding(pos)
            new_feats = new_feats + pos_feats
        return new_feats

    def _generate_from_pos(self, paths, seq_len):
        device = paths.device
        word_lens = []
        batch_size = paths.size()[0]
        new_seq_len = []
        batch_pos = []
        for i in range(batch_size):
            word_len = []
            pos = []
            for j in range(seq_len[i]):
                tag = paths[i][j]
                tag = self.label_vocab['POS'].to_word(int(tag))
                if tag.startswith('<'):
                    continue
                tag1, tag2 = tag.split('-')
                tag2 = self.label_vocab['pos'].to_index(tag2)
                if tag1 == 'S' or tag1 == 'B':
                    word_len.append(j)
                    pos.append(tag2)
            if len(pos) == 1:
                word_len.append(seq_len[i] - 1)
                pos.append(tag2)
            new_seq_len.append(len(pos))
            word_lens.append(word_len)
            batch_pos.append(pos)
        max_len = max(new_seq_len)
        for i in range(batch_size):
            word_lens[i] = word_lens[i] + [0] * (max_len - new_seq_len[i])
            batch_pos[i] = batch_pos[i] + [0] * (max_len - new_seq_len[i])
        word_lens = torch.tensor(word_lens, device=device)
        batch_pos = torch.tensor(batch_pos, device=device)
        new_seq_len = torch.tensor(new_seq_len, device=device)
        return word_lens, batch_pos, new_seq_len

    def _decode_parsing(self, dep_head, dep_label, seq_len,
                        seq_len_for_wordlist, word_lens):
        device = dep_head.device
        heads = []
        labels = []
        batch_size = dep_head.size()[0]
        app_index = self.label_vocab['Parsing'].to_index('APP')
        max_len = seq_len.max()
        for i in range(batch_size):
            head = list(range(1, seq_len[i] + 1))
            label = [app_index] * int(seq_len[i])
            head[0] = 0
            for j in range(1, seq_len_for_wordlist[i]):
                if j + 1 == seq_len_for_wordlist[i]:
                    idx = seq_len[i] - 1
                else:
                    idx = word_lens[i][j + 1] - 1
                label[idx] = int(dep_label[i][j])
                root = dep_head[i][j]
                if root >= seq_len_for_wordlist[i] - 1:
                    head[idx] = int(seq_len[i] - 1)
                else:
                    try:
                        head[idx] = int(word_lens[i][root + 1] - 1)
                    except:
                        print(len(head), idx, word_lens.size(), i, root)
            head = head + [0] * int(max_len - seq_len[i])
            label = label + [0] * int(max_len - seq_len[i])
            heads.append(head)
            labels.append(label)
        heads = torch.tensor(heads, device=device)
        labels = torch.tensor(labels, device=device)
        return heads, labels

    def forward(self, chars, seq_len, task_class, target,
                seq_len_for_wordlist=None, dep_head=None, dep_label=None,
                pos=None, word_lens=None):
        task = task_class[0]
        mask = chars.ne(0)
        layers = self.layers_map[task]
        feats = self.embed(chars, layers)

        if task == 'Parsing':
            parsing_feats = self._generate_embedding(feats, word_lens, seq_len, pos)
            loss_parsing = self.biaffine_parser(parsing_feats, seq_len_for_wordlist,
                                                dep_head, dep_label)
            return loss_parsing

        if task == 'NER':
            # ? Is the ReLU needed here?
            feats = F.relu(self.ner_linear(feats))
            logits = F.log_softmax(feats, dim=-1)
            loss = self.ner_crf(logits, target, mask)
            return {'loss': loss}

        if task == 'CWS':
            feats = self.cws_mlp(feats)
            # logits = F.log_softmax(feats, dim=-1)
            # loss = self.cws_crf(logits, target, mask)
            loss = self.loss.get_loss(feats, target, seq_len)
            return {'loss': loss}

        if task == 'POS':
            feats = self.pos_mlp(feats)
            # logits = F.log_softmax(feats, dim=-1)
            # loss = self.pos_crf(logits, target, mask)
            loss = self.loss.get_loss(feats, target, seq_len)
            return {'loss': loss}

    def predict(self, chars, seq_len, task_class):
        task = task_class[0]
        mask = chars.ne(0)
        layers = self.layers_map[task]
        feats = self.embed(chars, layers)

        if task == 'Parsing':
            for sample in chars:
                sample[0] = self.pos_idx
            pos_feats = self.embed(chars, '2')
            pos_feats = self.pos_mlp(pos_feats)
            # logits = F.log_softmax(pos_feats, dim=-1)
            # paths, _ = self.pos_crf.viterbi_decode(logits, mask)
            paths = pos_feats.max(dim=-1)[1]
            word_lens, batch_pos, seq_len_for_wordlist = self._generate_from_pos(
                paths, seq_len)
            parsing_feats = self._generate_embedding(feats, word_lens, seq_len,
                                                     batch_pos)
            answer = self.biaffine_parser.predict(parsing_feats,
                                                  seq_len_for_wordlist)
            head_preds = answer['head_preds']
            label_preds = answer['label_preds']
            heads, labels = self._decode_parsing(head_preds, label_preds, seq_len,
                                                 seq_len_for_wordlist, word_lens)
            return {'head_preds': heads, 'label_preds': labels, 'pred': paths}

        if task == 'CWS':
            feats = self.cws_mlp(feats)
            # logits = F.log_softmax(feats, dim=-1)
            # paths, _ = self.cws_crf.viterbi_decode(logits, mask)
            paths = feats.max(dim=-1)[1]
            return {'pred': paths}

        if task == 'POS':
            feats = self.pos_mlp(feats)
            # logits = F.log_softmax(feats, dim=-1)
            # paths, _ = self.pos_crf.viterbi_decode(logits, mask)
            paths = feats.max(dim=-1)[1]
            return {'pred': paths}  # output = feats.max(dim=-1)[1]

        if task == 'NER':
            feats = F.relu(self.ner_linear(feats))
            logits = F.log_softmax(feats, dim=-1)
            paths, _ = self.ner_crf.viterbi_decode(logits, mask)
            return {'pred': paths}
from create_dataset import create_dataset
from rnn import lstm
from fastNLP import Trainer
from fastNLP import CrossEntropyLoss
from fastNLP import AccuracyMetric

vocab, train_data, dev_data, test_data = create_dataset()

model = lstm(vocab_size=len(vocab), embedding_length=200,
             hidden_size=128, output_size=20)
model.cuda()

loss = CrossEntropyLoss(pred='pred', target='target')
metrics = AccuracyMetric(pred='pred', target='target')

trainer = Trainer(model=model, train_data=train_data, dev_data=dev_data,
                  loss=loss, metrics=metrics, save_path='./',
                  device=0, n_epochs=20)
trainer.train()
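# Hedged addition (not in the original script): the test split returned by
# create_dataset() above is never evaluated; the Tester pattern used elsewhere
# in this collection would finish the job.
from fastNLP import Tester

tester = Tester(test_data, model, metrics=metrics)
tester.test()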
if torch.cuda.is_available():
    model.cuda()
optimizer = optim.AdamW(model.parameters(), lr=lr)

data = {}
for name in ['seen', 'unseen', 'desc']:
    data[name] = data_bundle.get_dataset(name)

callbacks = [GradientClipCallback(clip_type='value', clip_value=5),
             WarmupCallback(warmup=0.01, schedule='linear')]
callbacks.append(FitlogCallback(data=data, verbose=1))

train_data = data_bundle.get_dataset('train')
train_data.add_seq_len('input')
# from collections import Counter
# print(Counter(train_data.get_field('seq_len').content))
# exit(0)
sampler = BucketSampler()
clip_max_length(train_data, data_bundle)

trainer = Trainer(train_data=train_data, model=model, optimizer=optimizer,
                  loss=CrossEntropyLoss(), batch_size=batch_size,
                  sampler=sampler, drop_last=False, update_every=1,
                  num_workers=1, n_epochs=n_epochs, print_every=5,
                  dev_data=data_bundle.get_dataset('dev'),
                  metrics=MonoMetric(), metric_key='t10',
                  validate_every=-1, save_path='save_models/',
                  use_tqdm=True, device=None, callbacks=callbacks,
                  check_code_level=0)
trainer.train(load_best_model=False)
fitlog.add_other(trainer.start_time, name='start_time')
vocab.from_dataset(train_dataset, field_name='words',
                   no_create_entry_dataset=[test_dataset])
vocab.index_dataset(train_dataset, test_dataset, field_name='words')
target_vocab = Vocabulary(padding=None, unknown=None)
target_vocab.from_dataset(train_dataset, field_name='target',
                          no_create_entry_dataset=[test_dataset])
target_vocab.index_dataset(train_dataset, test_dataset, field_name='target')

'''build bundle'''
data_dict = {"train": train_dataset, "test": test_dataset}
vocab_dict = {"words": vocab, "target": target_vocab}
data_bundle = DataBundle(vocab_dict, data_dict)
print(data_bundle)

'''build model'''
embed = BertEmbedding(data_bundle.get_vocab('words'),
                      model_dir_or_name='en-base-uncased', include_cls_sep=True)
model = BertForSequenceClassification(embed, len(data_bundle.get_vocab('target')))
# model = BertForSequenceClassification(embed, 2)

device = 0 if torch.cuda.is_available() else 'cpu'
# Note: dev_data is the training split here, so validation measures fit on
# the training data rather than generalization.
trainer = Trainer(data_bundle.get_dataset('train'), model,
                  optimizer=Adam(model_params=model.parameters(), lr=2e-5),
                  loss=CrossEntropyLoss(), device=device,
                  batch_size=8, dev_data=data_bundle.get_dataset('train'),
                  metrics=AccuracyMetric(), n_epochs=10, print_every=1)
trainer.train()

tester = Tester(data_bundle.get_dataset('test'), model, batch_size=128,
                metrics=AccuracyMetric())
tester.test()
"mysgd": lambda: MySGD(params=net.parameters(), lr=C.lr), "adam": lambda: tc.optim.Adam(params=net.parameters(), lr=C.lr), "sgd": lambda: tc.optim.SGD(params=net.parameters(), lr=C.lr), } optim = optims[C.optim]() trainer = Trainer( train_data=data["train"], dev_data=data[C.valid_data], model=net, batch_size=C.batch_size, loss=CrossEntropyLoss(pred="pred", target="label"), metrics=AccuracyMetric(pred="pred", target="label"), optimizer=optim, n_epochs=C.n_epochs, save_path=os.path.join(C.model_save, C.model, "./"), device=C.gpus, use_tqdm=True, check_code_level=-1, ) train_result = trainer.train(load_best_model=True) logger.log("train: {0}".format(train_result)) print("Training done. Now testing.") tester = Tester( data=test_data,
if __name__ == "__main__": vocab = pickle.load(open(config.vocab_path, 'rb')) train_data = pickle.load(open(config.train_data_path, 'rb')) dev_data = pickle.load(open(config.dev_data_path, 'rb')) test_data = pickle.load(open(config.test_data_path, 'rb')) if config.model == "CNN": model = CNN(len(vocab), config.intput_size, config.class_num) elif config.model == "RNN": model = RNN(len(vocab), config.intput_size, config.hidden_size, config.class_num, config.rnn_type) optimizer = Adam(lr=config.learning_rate, weight_decay=0) loss = CrossEntropyLoss(pred="output", target="target") metrics = AccuracyMetric(pred="output", target="target") trainer = Trainer(model=model, n_epochs=config.epoch, validate_every=config.validate_every, optimizer=optimizer, train_data=train_data, dev_data=dev_data, metrics=metrics, loss=loss, batch_size=config.batch_size, device='cuda:0', save_path=config.save_path, metric_key="acc", callbacks=[EarlyStopCallback(config.patience)]) trainer.train()
data_dict = {"train": train_dataset, "dev": dev_dataset, "test": test_dataset} vocab_dict = {"words": vocab, "target": target_vocab} data_bundle = DataBundle(vocab_dict, data_dict) print(data_bundle) '''build model''' embed = BertEmbedding(data_bundle.get_vocab('words'), model_dir_or_name='en-base-uncased', include_cls_sep=True) model = BertForSequenceClassification(embed, len(data_bundle.get_vocab('target'))) # model = BertForSequenceClassification(embed, 2) device = 0 if torch.cuda.is_available() else 'cpu' trainer = Trainer(data_bundle.get_dataset('train'), model, optimizer=Adam(model_params=model.parameters(), lr=2e-5), loss=CrossEntropyLoss(target='target'), device=device, batch_size=8, dev_data=data_bundle.get_dataset('dev'), metrics=AccuracyMetric(target='target'), n_epochs=2, print_every=1) trainer.train() tester = Tester(data_bundle.get_dataset('test'), model, batch_size=128, metrics=AccuracyMetric()) tester.test()
print("\t* Loading word embeddings...") embeddings = util.load_pickle(os.path.normpath(args["data_dir"]), args["embeddings_file"]) embeddings = torch.Tensor(embeddings) device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") model = myESIM(embeddings.shape[0], embeddings.shape[1], 300, embeddings=embeddings, dropout=0.5, num_classes=3, device=device).to(device) trainer = Trainer(train_data=train_data, model=model, loss=CrossEntropyLoss(pred='pred', target='label'), metrics=AccuracyMetric(), n_epochs=10, batch_size=32, print_every=-1, validate_every=-1, dev_data=dev_data, use_cuda=True, optimizer=Adam(lr=0.0004, weight_decay=0), check_code_level=-1, metric_key='acc', use_tqdm=False) trainer.train() # 训练结束后model为dev的最佳模型,保存 torch.save(model.state_dict(), '../data/checkpoints/best_model.pkl')
                                  kernel_sizes=kernel_sizes, padding=padding)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(sum(kernel_nums), num_classes)

    def forward(self, words, seq_len=None):
        x = self.embed(words)      # [N,L] -> [N,L,C]
        x = self.conv_pool(x)      # [N,L,C] -> [N,C]
        x = self.dropout(x)
        x = self.fc(x)             # [N,C] -> [N, N_class]
        return {C.OUTPUT: x}

    def predict(self, words, seq_len=None):
        output = self(words, seq_len)
        _, predict = output[C.OUTPUT].max(dim=1)
        return {C.OUTPUT: predict}


# demo version
trainData.apply(lambda x: x['data'].lower(), new_field_name='sentence')
trainData.apply(lambda x: x['sentence'].split(), new_field_name='words',
                is_input=True)

vocab = Vocabulary(min_freq=2)
vocab = vocab.from_dataset(trainData, field_name='words')
# change to index
vocab.index_dataset(trainData, field_name='words', new_field_name='words')
trainData.set_target('target')

model = CNNText((len(vocab), 128), num_classes=20, padding=2, dropout=0.1)

train_data, dev_data = trainData.split(0.2)
trainer = Trainer(model=model, train_data=train_data, dev_data=dev_data,
                  loss=CrossEntropyLoss(), metrics=AccuracyMetric(),
                  batch_size=16)
trainer.train()
dataset.rename_field('words', Const.INPUT)
dataset.rename_field('target', Const.TARGET)
dataset.rename_field('seq_len', Const.INPUT_LEN)
dataset.set_input(Const.INPUT, Const.INPUT_LEN)
dataset.set_target(Const.TARGET)

testset.rename_field('words', Const.INPUT)
testset.rename_field('target', Const.TARGET)
testset.rename_field('seq_len', Const.INPUT_LEN)
testset.set_input(Const.INPUT, Const.INPUT_LEN)
testset.set_target(Const.TARGET)

train_data, dev_data = dataset.split(0.1)

loss = CrossEntropyLoss(pred=Const.OUTPUT, target=Const.TARGET)
metrics = AccuracyMetric(pred=Const.OUTPUT, target=Const.TARGET)

trainer = Trainer(model=model, train_data=train_data, dev_data=dev_data,
                  loss=loss, batch_size=16, metrics=metrics, n_epochs=20,
                  callbacks=[FitlogCallback(dataset)])
trainer.train()

tester = Tester(data=testset, model=model, metrics=metrics)
tester.test()

tester = Tester(data=train_data, model=model, metrics=metrics)
logger.warn('Loading vocabularies')
char_vocab = data_bundle.get_vocab('words')
logger.info('char_vocab:{}'.format(char_vocab))
target_vocab = data_bundle.get_vocab('target')
logger.info('target_vocab:{}'.format(target_vocab))
save_serialize_obj(char_vocab, char_vocab_pkl_file)
save_serialize_obj(target_vocab, target_vocab_pkl_file)
logger.info('Vocabulary serialized to: {}'.format(char_vocab_pkl_file))

logger.warn('Selecting pretrained word embeddings')
word2vec_embed = StaticEmbedding(char_vocab,
                                 model_dir_or_name='cn-char-fastnlp-100d')

logger.warn('Building the neural network model')
model = CNNText(word2vec_embed, num_classes=len(target_vocab))
logger.info(model)

logger.warn('Setting training hyperparameters')
loss = CrossEntropyLoss()
optimizer = Adam([param for param in model.parameters() if param.requires_grad])
# metric = AccuracyMetric()
# With only_gross=False, per-label metric statistics are returned as well.
metric = ClassifyFPreRecMetric(tag_vocab=data_bundle.get_vocab(Const.TARGET),
                               only_gross=False)
# Run on GPU if available; training is much faster there.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
logger.info('device:{}'.format(device))
batch_size = 32
n_epochs = 5
early_stopping = 10

trainer = Trainer(save_path=model_path,
                  train_data=data_bundle.get_dataset('train'),
                  model=model,
def run_rnn():
    dataset_train_p2, dataset_test_p2 = get_text_classification_datasets()
    line_len = len(dataset_train_p2.data)
    with open("formalized_train_data.csv", "w") as file:
        for i in range(line_len):
            file.write(document2line(dataset_train_p2.data[i]) + "\t" +
                       str(dataset_train_p2.target[i]) + '\n')
    file.close()
    line_len = len(dataset_test_p2.data)
    with open("formalized_test_data.csv", "w") as file2:
        for i in range(line_len):
            file2.write(document2line(dataset_test_p2.data[i]) + "\t" +
                        str(dataset_test_p2.target[i]) + '\n')
    file2.close()

    loader = CSVLoader(headers=('raw_sentence', 'label'), sep='\t')
    train_dataset = loader.load("./formalized_train_data.csv")
    test_dataset = loader.load("./formalized_test_data.csv")

    train_dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='sentence')
    train_dataset.apply(lambda x: x['sentence'].split(), new_field_name='words',
                        is_input=True)
    test_dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='sentence')
    test_dataset.apply(lambda x: x['sentence'].split(), new_field_name='words',
                       is_input=True)

    from fastNLP import Vocabulary
    # Use the Vocabulary class to build the word list and map word sequences
    # to index sequences.
    vocab = Vocabulary(min_freq=2).from_dataset(train_dataset, field_name='words')
    vocab.index_dataset(train_dataset, field_name='words', new_field_name='words')
    vocab.index_dataset(test_dataset, field_name='words', new_field_name='words')

    # Convert the label to an integer and mark it as the target.
    train_dataset.apply(lambda x: int(x['label']), new_field_name='target',
                        is_target=True)
    test_dataset.apply(lambda x: int(x['label']), new_field_name='target',
                       is_target=True)

    embed_dim = 1024
    hidden_dim = 128
    layer = 4
    model = Rnn(len(vocab), embed_dim, hidden_dim, layer, 4)

    use_gpu = torch.cuda.is_available()  # check whether GPU acceleration is available
    if use_gpu:
        model = model.cuda()

    trainer = Trainer(model=model, train_data=train_dataset, dev_data=test_dataset,
                      loss=CrossEntropyLoss(), n_epochs=100,
                      metrics=AccuracyMetric())
    trainer.train()