def train_mlp(train_dataset, val_dataset):
    """Train HaptMlpModel on train_dataset and track the best val F1.

    Args:
        train_dataset: dataset exposing ``input_size()``; used for training.
        val_dataset: dataset used for per-epoch validation.

    Returns:
        (best_val_acc, best_val_f1) — metrics of the epoch with the highest
        validation F1.

    NOTE(review): reads module-level hyperparameters ``learning_rate``,
    ``batch_size``, ``num_workers`` and ``total_epoch`` — confirm they are
    defined at import time.
    """
    # Favor throughput over reproducibility: let cuDNN autotune kernels.
    cudnn.benchmark = True
    torch.backends.cudnn.deterministic = False
    torch.backends.cudnn.enabled = True

    config = {
        'input_size': train_dataset.input_size(),
        'output_size': 12,
        'print_freq': 100,
    }
    model = HaptMlpModel(config).cuda()

    # Weighted cross-entropy: the last two classes are up-weighted x10.
    class_weights = [1] * (config['output_size'] - 2) + [10, 10]
    criterion = nn.CrossEntropyLoss(torch.Tensor(class_weights)).cuda()
    optimizer = torch.optim.Adam(
        filter(lambda p: p.requires_grad, model.parameters()),
        lr=learning_rate)

    loader_kwargs = dict(batch_size=batch_size, shuffle=True,
                         num_workers=num_workers, pin_memory=True)
    train_loader = torch.utils.data.DataLoader(train_dataset, **loader_kwargs)
    val_loader = torch.utils.data.DataLoader(val_dataset, **loader_kwargs)

    summary = SummaryWriter()
    best_val_acc, best_val_f1 = 0, 0
    for epoch in range(total_epoch):
        train(config, train_loader, model, criterion, optimizer, epoch,
              summary)
        val_acc, val_f1 = validate(config, val_loader, model, criterion,
                                   epoch, summary)
        # Always refresh the "latest" checkpoint; keep a separate copy of
        # the best-so-far (by validation F1).
        save_checkpoint(model, epoch, optimizer, './checkpoints',
                        'checkpoint_mlp.pth.tar')
        if val_f1 > best_val_f1:
            save_checkpoint(model, epoch, optimizer, './checkpoints',
                            'best_mlp.pth.tar')
            best_val_acc, best_val_f1 = val_acc, val_f1
    return best_val_acc, best_val_f1
def run(mtd="fold_split"):
    """Dataset-preparation / training entry point, dispatched on ``mtd``.

    mtd:
        "fold_split"   — split the raw corpus into train/dev/test files.
        "process_data" — preprocess the train/dev files.
        "train"        — train ``model`` with focal loss and early stopping.

    NOTE(review): relies on module-level names (``model``, ``config``,
    ``cfg``, ``dataset_processer``, ``scores``, ``loss_factory``,
    ``label_encoder``, path variables) — confirm they exist at call time.
    """

    def _eval(data):
        """Run the model over ``data`` without gradients; return (score, f1)."""
        model.eval()  # disable BatchNormalization and Dropout
        y_pred = []
        y_true = []
        with torch.no_grad():
            for batch_data in dataset_processer.data_iter(
                    data, config['test_batch_size'], shuffle=False):
                torch.cuda.empty_cache()
                batch_inputs, batch_labels = dataset_processer.batch2tensor(
                    batch_data)
                batch_outputs = model(batch_inputs)
                y_pred.extend(
                    torch.max(batch_outputs, dim=1)[1].cpu().numpy().tolist())
                y_true.extend(batch_labels.cpu().numpy().tolist())
        score, dev_f1 = scores.get_score(y_true, y_pred)
        return score, dev_f1

    if mtd == "fold_split":
        demo_preprocess.split_dataset(raw_path, train_path, dev_path,
                                      test_path)
    elif mtd == "process_data":
        demo_preprocess.process_data(config, train_path, dev_path)
    elif mtd == "train":
        Train_data = file_utils.read_json(config["train_set"])
        Dev_data = file_utils.read_json(config["dev_set"])
        # Convert raw records into the model-consumable example format.
        train_data = dataset_processer.get_examples(Train_data, label_encoder)
        dev_data = dataset_processer.get_examples(Dev_data, label_encoder)
        del Train_data, Dev_data

        # Number of batches per epoch (ceil so the last partial batch counts).
        batch_num = int(
            np.ceil(len(train_data) / float(config["train_batch_size"])))
        print("batch_num:{}".format(batch_num))

        optimizer = Optimizer(model.all_parameters,
                              steps=batch_num * config["epochs"])
        criterion = loss_factory.focal_loss()

        best_train_f1, best_dev_f1 = 0, 0
        early_stop = -1
        EarlyStopEpochs = 10  # stop after this many epochs without dev improvement

        print("start train")
        for epoch in range(cfg.RESUME_EPOCH + 1, config["epochs"] + 1):
            optimizer.zero_grad()
            model.train()  # enable BatchNormalization and Dropout
            overall_losses = 0
            losses = 0
            y_pred = []
            y_true = []
            step = 0
            for batch_data in dataset_processer.data_iter(
                    train_data, config["train_batch_size"], shuffle=True):
                torch.cuda.empty_cache()
                batch_inputs, batch_labels = dataset_processer.batch2tensor(
                    batch_data)
                batch_outputs = model(batch_inputs)
                # FIX: this line was commented out, leaving `loss` undefined
                # at loss.backward() (NameError on the first batch).
                loss = criterion(batch_outputs, batch_labels)
                loss.backward()

                loss_value = loss.detach().cpu().item()
                losses += loss_value
                overall_losses += loss_value
                y_pred.extend(
                    torch.max(batch_outputs, dim=1)[1].cpu().numpy().tolist())
                y_true.extend(batch_labels.cpu().numpy().tolist())

                # nn.utils.clip_grad_norm_(optimizer.all_params, max_norm=config["clip"])  # gradient clipping (disabled)
                for cur_optim, scheduler in zip(optimizer.optims,
                                                optimizer.schedulers):
                    cur_optim.step()
                    scheduler.step()
                optimizer.zero_grad()
                step += 1

            overall_losses /= batch_num
            overall_losses = scores.reformat(overall_losses, 4)
            score, train_f1 = scores.get_score(y_true, y_pred)
            # FIX: arguments were swapped relative to the labels
            # (score went to train_f1 and vice versa).
            print("epoch:{},train_score:{}, train_f1:{}, overall_loss:{} ".
                  format(epoch, score, train_f1, overall_losses))

            # eval
            _, dev_f1 = _eval(data=dev_data)
            if best_dev_f1 < dev_f1:
                best_dev_f1 = dev_f1
                early_stop = 0
                best_train_f1 = train_f1
                save_path = model_utils.save_checkpoint(
                    model,
                    epoch,
                    save_folder=os.path.join(cfg.proj_path, "data/bert_nn"))
                print("save_path:{}".format(save_path))
            else:
                early_stop += 1
                if early_stop == EarlyStopEpochs:
                    # Dev metric stagnated for too long: stop training.
                    break
            # FIX: score/dev_f1 arguments were swapped relative to the labels.
            print(
                "early_stop:{}, score:{}, dev_f1:{}, best_train_f1:{}, best_dev_f1:{}"
                .format(early_stop, score, dev_f1, best_train_f1, best_dev_f1))
def train_and_evaluate(model, data_loader, optimizer, loss_fn, metrics,
                       params, model_dir, restore_file=None):
    """Train the model and evaluate every epoch, checkpointing the best F1.

    Args:
        model: (torch.nn.Module) the neural network
        data_loader: object providing ``data_iterator(split=..., batch_size=...)``
        optimizer: (torch.optim) optimizer for parameters of model
        loss_fn: a function that takes batch_output and batch_labels and
            computes the loss for the batch
        metrics: (dict) a dictionary of functions that compute a metric using
            the output and labels of each batch
        params: (Params) hyperparameters
        model_dir: (string) directory containing config, weights and log
        restore_file: (string) optional — name of file to restore from
            (without its extension .pth.tar)
    """
    ema = utils.EMA(model, params.ema_decay)

    # reload weights from restore_file if specified
    if restore_file is not None:
        # FIX: the path was built from the global `args` instead of the
        # function's own model_dir/restore_file parameters.
        restore_path = os.path.join(model_dir, restore_file + '.pth.tar')
        logging.info("Restoring parameters from {}".format(restore_path))
        utils.load_checkpoint(restore_path, model, optimizer)

    best_val_acc = 0.0
    for epoch in range(params.num_epochs):
        # Run one epoch
        logging.info("Epoch {}/{}".format(epoch + 1, params.num_epochs))

        # number of batches in one full pass over the training set
        num_steps = (params.train_size + 1) // params.batch_size
        train_data_iterator = data_loader.data_iterator(
            split='train', batch_size=params.batch_size)
        train(model, optimizer, loss_fn, train_data_iterator, metrics, params,
              num_steps, ema)

        # Evaluate for one epoch on the validation set with EMA weights.
        ema.assign(model)
        num_steps = (params.val_size + 1) // params.batch_size
        val_data_iterator = data_loader.data_iterator(
            split='val', batch_size=params.batch_size)
        val_metrics = evaluate(model, loss_fn, val_data_iterator, metrics,
                               params, num_steps)

        # "accuracy" here is actually the F1 score reported by evaluate().
        val_acc = val_metrics['f1']
        is_best = val_acc >= best_val_acc

        # Save weights (latest always; save_checkpoint handles the best copy).
        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optim_dict': optimizer.state_dict()
            },
            is_best=is_best,
            checkpoint=model_dir)

        # If best_eval, best_save_path
        if is_best:
            logging.info("- Found new best accuracy")
            best_val_acc = val_acc
            # Save best val metrics in a json file in the model directory
            best_json_path = os.path.join(model_dir,
                                          "metrics_val_best_weights.json")
            utils.save_dict_to_json(val_metrics, best_json_path)

        # Save latest val metrics in a json file in the model directory
        last_json_path = os.path.join(model_dir,
                                      "metrics_val_last_weights.json")
        utils.save_dict_to_json(val_metrics, last_json_path)

        # Restore the raw (non-EMA) weights before the next training epoch.
        ema.resume(model)
def run(method="train", save_path=None, infer_texts=None):
    """Train / test / run inference for the baidu text classifier.

    Args:
        method: "train", "test" or "infer".
        save_path: checkpoint path; set by training, required for the
            non-train branches.
        infer_texts: raw texts to classify when method == "infer".

    NOTE(review): relies on module-level names (``ShuffleSlicer``, ``Vocab``,
    ``Model``, ``label_encoder``, ``data_iter``, ``batch2tensor``,
    ``get_score``, ``get_examples_bert``, ``model_utils``, ``segment``, …).
    """
    # FIX: mutable default argument ([]) replaced with a None sentinel.
    infer_texts = [] if infer_texts is None else infer_texts

    shuffle_slicer = ShuffleSlicer()
    raw_data_path = "/home/wujinjie/kesci_question_multilabel_classification/data/raw_data/baidu/nlp_db.baidu_text.csv"
    texts = pd.read_csv(raw_data_path)
    train_df, dev_df, test_df = shuffle_slicer.split(texts, dev=True)

    clip = 5.0
    epochs = 100
    test_batch_size = 128
    train_batch_size = 128

    train_texts, train_labels = process_corpus_dl(train_df)
    Train_data = {'label': train_labels, 'text': train_texts}
    dev_texts, dev_labels = process_corpus_dl(dev_df)
    Dev_data = {'label': dev_labels, 'text': dev_texts}
    vocab = Vocab(Train_data)
    step = 0

    def _eval(data):
        """Evaluate ``model`` on ``data``; return (score, f1)."""
        model.eval()  # disable BatchNormalization and Dropout
        y_pred = []
        y_true = []
        with torch.no_grad():
            for batch_data in data_iter(data, test_batch_size, shuffle=False):
                torch.cuda.empty_cache()
                batch_inputs, batch_labels = batch2tensor(batch_data)
                batch_outputs = model(batch_inputs)
                y_pred.extend(torch.max(batch_outputs, dim=1)
                              [1].cpu().numpy().tolist())
                y_true.extend(batch_labels.cpu().numpy().tolist())
        score, dev_f1 = get_score(y_true, y_pred)
        return score, dev_f1

    def _infer(data):
        """Predict labels for ``data`` and print their human-readable names."""
        model.eval()
        y_pred = []
        with torch.no_grad():
            for batch_data in data_iter(data, test_batch_size, shuffle=False):
                torch.cuda.empty_cache()
                batch_inputs, batch_labels = batch2tensor(batch_data)
                batch_outputs = model(batch_inputs)
                y_pred.extend(torch.max(batch_outputs, dim=1)
                              [1].cpu().numpy().tolist())
        print(label_encoder.label2name(y_pred))

    if method == "train":
        model = Model(vocab, label_encoder)
        criterion = nn.CrossEntropyLoss()
        # Convert raw dicts into the model-consumable example format.
        train_data = get_examples_bert(Train_data, model.word_encoder, vocab,
                                       label_encoder)
        dev_data = get_examples_bert(Dev_data, model.word_encoder, vocab,
                                     label_encoder)
        # Batches per epoch (ceil so the final partial batch counts).
        batch_num = int(np.ceil(len(train_data) / float(train_batch_size)))
        optimizer = Optimizer(model.all_parameters, steps=batch_num * epochs)

        best_train_f1, best_dev_f1 = 0, 0
        early_stop = -1
        EarlyStopEpochs = 3  # stop after this many epochs without dev improvement

        print("start train")
        for epoch in range(1, epochs + 1):
            optimizer.zero_grad()
            model.train()  # enable BatchNormalization and Dropout
            overall_losses = 0
            losses = 0
            y_pred = []
            y_true = []
            for batch_data in data_iter(train_data, train_batch_size,
                                        shuffle=True):
                torch.cuda.empty_cache()
                batch_inputs, batch_labels = batch2tensor(batch_data)
                batch_outputs = model(batch_inputs)
                loss = criterion(batch_outputs, batch_labels)
                loss.backward()

                loss_value = loss.detach().cpu().item()
                losses += loss_value
                overall_losses += loss_value
                y_pred.extend(torch.max(batch_outputs, dim=1)
                              [1].cpu().numpy().tolist())
                y_true.extend(batch_labels.cpu().numpy().tolist())

                nn.utils.clip_grad_norm_(
                    optimizer.all_params, max_norm=clip)  # gradient clipping
                for cur_optim, scheduler in zip(optimizer.optims,
                                                optimizer.schedulers):
                    cur_optim.step()
                    scheduler.step()
                optimizer.zero_grad()
                step += 1

            print(epoch)
            overall_losses /= batch_num
            overall_losses = reformat(overall_losses, 4)
            score, train_f1 = get_score(y_true, y_pred)
            # FIX: arguments were swapped relative to the labels.
            print("score:{}, train_f1:{}".format(score, train_f1))

            # eval
            _, dev_f1 = _eval(data=dev_data)
            if best_dev_f1 <= dev_f1:
                best_dev_f1 = dev_f1
                early_stop = 0
                best_train_f1 = train_f1
                save_path = model_utils.save_checkpoint(
                    model,
                    epoch,
                    save_folder="/home/wujinjie/kesci_question_multilabel_classification/data/textbert")
                print("save_path:{}".format(save_path))
            else:
                early_stop += 1
                if early_stop == EarlyStopEpochs:
                    # Dev metric stagnated for too long: stop training.
                    break
            # FIX: score/dev_f1 arguments were swapped relative to the labels.
            print("score:{}, dev_f1:{}, best_train_f1:{}, best_dev_f1:{}".format(
                score, dev_f1, best_train_f1, best_dev_f1))
    else:
        model = model_utils.load_checkpoint(save_path)

    if method == "test":
        # FIX: was process_corpus_dl(train_df) — the "test" branch was
        # evaluating on the training split instead of test_df.
        test_texts, test_labels = process_corpus_dl(test_df)
        Test_data = {'label': test_labels, 'text': test_texts}
        test_data = get_examples_bert(Test_data, model.word_encoder, vocab,
                                      label_encoder)
        _, dev_f1 = _eval(data=test_data)
        print(dev_f1)
    elif method == "infer":
        infer_texts = list(map(segment, infer_texts))
        Infer_data = {'label': [0] * len(infer_texts), 'text': infer_texts}
        infer_data = get_examples_bert(Infer_data, model.word_encoder, vocab,
                                       label_encoder)
        _infer(data=infer_data)
def run(self, method="train", save_path=None, infer_texts=None):
    """Train ``self.model`` on ``train_iter`` with Adam + early stopping.

    Args:
        method: only "train" is handled by this block.
        save_path: overwritten with the checkpoint path of the best epoch.
        infer_texts: unused in the "train" branch; kept for interface parity.

    NOTE(review): relies on module-level names (``train_iter``, ``val_iter``,
    ``metric_obj``, ``model_utils``, ``reformat``, ``EarlyStopEpochs``) —
    confirm they are defined where this method is called.
    """
    # FIX: mutable default argument ([]) replaced with a None sentinel.
    infer_texts = [] if infer_texts is None else infer_texts
    step = 0
    if method == "train":
        self.model.train()
        optimizer = optim.Adam(
            filter(lambda p: p.requires_grad, self.model.parameters()),
            lr=0.001)
        loss_function = F.cross_entropy
        epochs = 10
        best_dev_f1 = 0.0
        # FIX: initialize so the else-branch and the final print cannot hit
        # UnboundLocalError before the first dev improvement.
        best_train_f1 = 0.0
        early_stop = 0
        for epoch in range(epochs):
            overall_losses = 0
            y_pred = []
            y_true = []
            for step, batch in enumerate(train_iter):
                optimizer.zero_grad()
                batch_labels = batch.label
                # batch.text is (seq, batch); the model expects (batch, seq).
                batch_input = batch.text.transpose(0, 1)
                batch_outputs = self.model(batch_input)
                loss = loss_function(batch_outputs, batch_labels)
                loss.backward()
                optimizer.step()

                loss_value = loss.detach().cpu().item()
                overall_losses += loss_value
                y_pred.extend(
                    torch.argmax(batch_outputs, dim=1).cpu().numpy().tolist())
                y_true.extend(batch_labels.cpu().numpy().tolist())

            # NOTE(review): divides by batch_size, not the number of batches —
            # preserved as-is, but confirm this is the intended normalization.
            overall_losses /= train_iter.batch_size
            overall_losses = reformat(overall_losses, 4)
            score, train_f1 = metric_obj.get_score(y_true, y_pred)
            # FIX: arguments were swapped relative to the labels
            # (train_f1 filled the metric slot and vice versa).
            print("epoch:{}, {}:{}, train_f1:{}".format(
                epoch, metric_obj.name(), score, train_f1))

            _, dev_f1 = self._eval(data=val_iter)
            if best_dev_f1 <= dev_f1:
                best_dev_f1 = dev_f1
                early_stop = 0
                best_train_f1 = train_f1
                save_path = model_utils.save_checkpoint(
                    self.model, epoch, save_folder="./data/datawhale")
                print("save_path:{}".format(save_path))
            else:
                early_stop += 1
                if early_stop == EarlyStopEpochs:
                    # Dev metric stagnated for too long: stop training.
                    break
            # FIX: score/dev_f1 arguments were swapped relative to the labels.
            print(
                "score:{}, dev_f1:{}, best_train_f1:{}, best_dev_f1:{}, overall_loss:{}"
                .format(score, dev_f1, best_train_f1, best_dev_f1,
                        overall_losses))