        # --- end of the inner batch loop: periodic logging (fragment) ---
        losses_per_batch.append(args.gradient_acc_steps * total_loss / 100)
        print('[Epoch: %d, %5d/%d points] total loss per batch: %.7f'
              % (e, (i + 1) * args.batch_size, train_length, losses_per_batch[-1]))
        total_loss = 0.0

    # --- per-epoch bookkeeping ---
    losses_per_epoch.append(sum(losses_per_batch) / len(losses_per_batch))
    accuracy_per_epoch.append(evaluate_results(net, train_iter, cuda))
    print("Losses at Epoch %d: %.7f" % (e, losses_per_epoch[-1]))
    print("Accuracy at Epoch %d: %.7f" % (e, accuracy_per_epoch[-1]))

    if accuracy_per_epoch[-1] > acc:
        acc = accuracy_per_epoch[-1]
        torch.save({
            'epoch': e + 1,
            'state_dict': net.state_dict(),
            'best_acc': acc,
            'optimizer': optimizer.state_dict(),
            'scheduler': scheduler.state_dict(),
        }, os.path.join("./data/", "test_model_best_%d.pth.tar" % args.model_no))

    if (e % 1) == 0:  # always true: snapshot every epoch
        save_as_pickle("test_losses_per_epoch_%d.pkl" % args.model_no,
                       losses_per_epoch)
        save_as_pickle("test_accuracy_per_epoch_%d.pkl" % args.model_no,
                       accuracy_per_epoch)
        # The source is truncated below; the 'scheduler' entry and the
        # checkpoint filename are reconstructed by analogy with the
        # best-model save above.
        torch.save({
            'epoch': e + 1,
            'state_dict': net.state_dict(),
            'best_acc': accuracy_per_epoch[-1],
            'optimizer': optimizer.state_dict(),
            'scheduler': scheduler.state_dict(),
        }, os.path.join("./data/", "test_checkpoint_%d.pth.tar" % args.model_no))
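
# ---------------------------------------------------------------------------
# The "args.gradient_acc_steps * total_loss / 100" bookkeeping above undoes
# the per-step loss scaling used for gradient accumulation. Below is a
# minimal, self-contained sketch of that pattern; the toy model, the random
# data, and the 100-step logging interval (implied by the "/ 100") are
# assumptions for illustration, not the original training code.
# ---------------------------------------------------------------------------
import torch
import torch.nn as nn

gradient_acc_steps = 4
net = nn.Linear(10, 2)
optimizer = torch.optim.SGD(net.parameters(), lr=0.1)
criterion = nn.CrossEntropyLoss()
total_loss = 0.0

for i in range(400):
    x = torch.randn(8, 10)
    y = torch.randint(0, 2, (8,))
    loss = criterion(net(x), y) / gradient_acc_steps  # scale so grads average
    loss.backward()                                   # gradients accumulate
    total_loss += loss.item()
    if (i + 1) % gradient_acc_steps == 0:
        optimizer.step()       # apply the accumulated gradients
        optimizer.zero_grad()
    if (i + 1) % 100 == 0:
        # multiply back by gradient_acc_steps to report the unscaled loss,
        # averaged over the 100-step logging window (as in the fragment above)
        print('avg loss per batch: %.7f' % (gradient_acc_steps * total_loss / 100))
        total_loss = 0.0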
import os
import time

import torch
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

# hparams, vocab_size, EOS_id, PAD_id, save_path, Transformer, make_dataset,
# custom_collate, train, and evaluate are project-local and defined elsewhere.


def main():
    device = torch.device("cpu" if hparams.no_cuda else "cuda")

    print("=== build model ===")
    start = time.time()
    model = Transformer(hparams.d_model, hparams.d_ff, vocab_size,
                        hparams.num_heads, hparams.num_layers, hparams.max_len,
                        hparams.dropout, EOS_id, PAD_id, device).to(device)
    end = time.time()
    print("=== build model done === {} seconds".format(end - start))

    train.global_step = 0  # reset the step counter stored on the train() function

    # train_dataset, val_dataset = split_data(train_path_en, train_path_de, hparams.validation_rate)
    train_dataset = make_dataset(train_path_en, train_path_de)
    val_dataset = make_dataset(val_path_en, val_path_de)
    train_loader = DataLoader(train_dataset, batch_size=hparams.batch_size,
                              collate_fn=custom_collate, shuffle=True,
                              num_workers=hparams.num_workers)
    val_loader = DataLoader(val_dataset, batch_size=hparams.batch_size,
                            collate_fn=custom_collate,
                            num_workers=hparams.num_workers)

    criterion = torch.nn.NLLLoss(ignore_index=PAD_id, reduction="sum").to(device)
    optimizer = torch.optim.Adam(model.parameters(), hparams.lr)
    writer = SummaryWriter()

    for epoch in range(hparams.max_epochs):
        # --- train ---
        print("=== train start ===")
        start = time.time()
        loss, bleu_score = train(model, train_loader, criterion, optimizer,
                                 device, writer, epoch, hparams.print_steps)
        end = time.time()
        print("=== train done === {} seconds".format(end - start))
        print("epoch: {}/{}, loss: {}, bleu score: {}".format(
            epoch + 1, hparams.max_epochs, loss, bleu_score))

        torch.save(model.state_dict(), save_path)
        print("model saved to '{}'".format(os.path.abspath(save_path)))

        writer.add_scalar("Loss/train", loss, epoch + 1)
        writer.add_scalar("Bleu score/train", bleu_score, epoch + 1)

        # --- evaluate ---
        print("=== evaluation start ===")
        start = time.time()
        loss, bleu_score = evaluate(model, val_loader, criterion, optimizer,
                                    device, writer)
        end = time.time()
        print("=== evaluation done === {} seconds".format(end - start))
        print("epoch: {}/{}, loss: {}, bleu score: {}".format(
            epoch + 1, hparams.max_epochs, loss, bleu_score))

        writer.add_scalar("Loss/eval", loss, epoch + 1)
        writer.add_scalar("Bleu score/eval", bleu_score, epoch + 1)
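
# ---------------------------------------------------------------------------
# custom_collate is referenced by the DataLoaders above but not defined in
# this file. The sketch below shows one plausible implementation, assuming
# each dataset item is a (source_ids, target_ids) pair of token-id sequences
# and PAD_id marks padding (matching the ignore_index of the NLLLoss above).
# This is an assumption about the data pipeline, not the repository's code.
# ---------------------------------------------------------------------------
import torch

PAD_id = 0  # placeholder; the real value comes from the vocabulary


def custom_collate(batch):
    """Pad variable-length (source, target) id sequences to the batch max."""
    srcs, tgts = zip(*batch)
    src_max = max(len(s) for s in srcs)
    tgt_max = max(len(t) for t in tgts)
    src_batch = torch.full((len(batch), src_max), PAD_id, dtype=torch.long)
    tgt_batch = torch.full((len(batch), tgt_max), PAD_id, dtype=torch.long)
    for row, (s, t) in enumerate(zip(srcs, tgts)):
        src_batch[row, :len(s)] = torch.as_tensor(s, dtype=torch.long)
        tgt_batch[row, :len(t)] = torch.as_tensor(t, dtype=torch.long)
    return src_batch, tgt_batch


# quick check
src, tgt = custom_collate([([5, 6, 7], [8, 9]), ([5, 6], [8, 9, 10, 11])])
print(src.shape, tgt.shape)  # torch.Size([2, 3]) torch.Size([2, 4])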
import os
from glob import glob

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

# Rossler, dataset, Transformer, show_a_test_window, show_long_window, and
# the alternative models (MLP, MLP_3D, LSTM) are project-local imports.


class ModelDev:
    def __init__(self, config):
        self.config = config
        self.prepare_dataloaders(config['data'])
        # self.model = MLP(config['MLP'])
        # self.model = MLP_3D(config['MLP'])
        # self.model = LSTM(config['LSTM'])
        self.model = Transformer(config['Trans'])
        print(self.model)

        self.model_name = config['train']['model_name']
        self.checkpoint_dir = './checkpoint_dir/{}/'.format(self.model_name)
        os.makedirs(self.checkpoint_dir, exist_ok=True)  # also creates parents
        self.tb_log_dir = './tb_log/{}/'.format(self.model_name)
        os.makedirs(self.tb_log_dir, exist_ok=True)

        self.optimal_metric = 100000
        self.cur_metric = 100000
        self.loss = nn.MSELoss()
        self.optim = optim.Adam(self.model.parameters(),
                                lr=self.config['train']['lr'],
                                betas=(0.5, 0.999))

    def prepare_dataloaders(self, config):
        data = Rossler(config)
        train_data = dataset(data.train_X, data.train_Y, data.train_Z,
                             config['w_size'])
        self.train_dataloader = DataLoader(train_data,
                                           batch_size=config['batch_size'],
                                           shuffle=True, drop_last=True)
        valid_data = dataset(data.valid_X, data.valid_Y, data.valid_Z,
                             config['w_size'])
        self.valid_dataloader = DataLoader(valid_data,
                                           batch_size=config['batch_size'],
                                           shuffle=False, drop_last=True)
        test_data = dataset(data.test_X, data.test_Y, data.test_Z,
                            config['w_size'])
        self.test_dataloader = DataLoader(test_data,
                                          batch_size=config['batch_size'],
                                          shuffle=False, drop_last=True)
        self.data = data

    def train(self):
        self.writer = SummaryWriter(self.tb_log_dir)
        for self.epoch in range(self.config['train']['epochs']):
            self.train_on_epoch()
            self.cur_metric = self.valid_on_epoch()
            print(self.cur_metric)
            if self.needToSave():
                self.saveWeights()

    def train_on_epoch(self):
        self.model.train(True)  # was train(False): training needs train mode
        LOSS = []
        for X_i, Y_i, Z_i, X_o, Y_o, Z_o in self.train_dataloader:
            X_i, Y_i, Z_i, X_o, Y_o, Z_o = self.cast_to_float(
                X_i, Y_i, Z_i, X_o, Y_o, Z_o)
            X_i += torch.normal(0, 0.1, X_i.shape)  # input noise as regularization
            self.model.zero_grad()
            pred = self.model(X_i)
            loss = self.loss(pred, X_o)
            loss.backward()
            self.optim.step()
            LOSS.append(loss.data.cpu().numpy())
        self.writer.add_scalar('train Loss', np.mean(LOSS), self.epoch)

    def valid_on_epoch(self):
        self.model.train(False)
        LOSS = []
        with torch.no_grad():
            for X_i, Y_i, Z_i, X_o, Y_o, Z_o in self.valid_dataloader:
                X_i, Y_i, Z_i, X_o, Y_o, Z_o = self.cast_to_float(
                    X_i, Y_i, Z_i, X_o, Y_o, Z_o)
                pred = self.model(X_i)
                loss = self.loss(pred, X_o)
                LOSS.append(loss.data.cpu().numpy())
        self.writer.add_scalar('valid Loss', np.mean(LOSS), self.epoch)
        return np.mean(LOSS)

    def cast_to_float(self, X_i, Y_i, Z_i, X_o, Y_o, Z_o):
        return (X_i.float(), Y_i.float(), Z_i.float(),
                X_o.float(), Y_o.float(), Z_o.float())

    def needToSave(self):
        if self.cur_metric < self.optimal_metric:
            self.optimal_metric = self.cur_metric
            return True
        return False

    def saveWeights(self, clean_previous=True):
        if clean_previous:
            files = glob(self.checkpoint_dir + '*.pth')
            for f in files:
                os.remove(f)
        torch.save(self.model.state_dict(),
                   '{}model_{}.pth'.format(self.checkpoint_dir, self.epoch))

    def test_MSE(self):
        self.model.train(False)
        self.load_weights()
        LOSS = []
        with torch.no_grad():
            for X_i, Y_i, Z_i, X_o, Y_o, Z_o in self.test_dataloader:
                X_i, Y_i, Z_i, X_o, Y_o, Z_o = self.cast_to_float(
                    X_i, Y_i, Z_i, X_o, Y_o, Z_o)
                pred = self.model(X_i)
                loss = self.loss(pred, X_o)
                LOSS.append(loss.data.cpu().numpy())
        return np.mean(LOSS)

    def test_a_window(self):
        self.model.train(False)
        self.load_weights()
        idx = 0
        with torch.no_grad():
            for X_i, Y_i, Z_i, X_o, Y_o, Z_o in self.test_dataloader:
                X_i, Y_i, Z_i, X_o, Y_o, Z_o = self.cast_to_float(
                    X_i, Y_i, Z_i, X_o, Y_o, Z_o)
                pred = self.model(X_i)
                show_a_test_window(X_i.data.numpy()[0, :],
                                   X_o.data.numpy()[0, :],
                                   pred.data.numpy()[0, :],
                                   idx, self.config['data']['stride'])
                idx += 1

    def test_long_window(self, length):
        self.model.train(False)
        self.load_weights()
        for start_idx in [100, 200, 300, 400, 500]:
            X_I = self.data.test_X[start_idx:start_idx + length]
            # seed the rollout with the first (w_size - 1) ground-truth points,
            # then predict autoregressively; list() guarantees .append works
            # even if test_X is an ndarray
            X_pred = list(X_I[:self.config['data']['w_size'] - 1])
            with torch.no_grad():
                while len(X_pred) < len(X_I):
                    nparray = np.array(X_pred[-self.config['data']['w_size'] + 1:])
                    nparray = np.expand_dims(nparray, axis=0)
                    torchTensor = torch.FloatTensor(nparray)
                    pred = self.model(torchTensor).data.squeeze().numpy().tolist()
                    X_pred.append(pred)
            show_long_window(X_I, X_pred, self.config['data']['stride'],
                             self.config['data']['w_size'], start_idx,
                             self.config['train']['model_name'])

    def load_weights(self):
        target_file = list(glob(self.checkpoint_dir + 'model*.pth'))[0]
        print('loading ', target_file)
        weights = torch.load(target_file)
        self.model.load_state_dict(weights)
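
# ---------------------------------------------------------------------------
# Hedged usage sketch for ModelDev. The config keys mirror the attribute
# accesses in the class above ('data', 'Trans', 'train'), but every value
# shown is a placeholder assumption, not the repository's actual settings,
# and running it requires the project-local Rossler/dataset/Transformer.
# ---------------------------------------------------------------------------
config = {
    'data': {'w_size': 64, 'batch_size': 32, 'stride': 1},
    'Trans': {},  # Transformer hyperparameters; project-specific
    'train': {'model_name': 'trans_rossler', 'lr': 1e-3, 'epochs': 50},
}

dev = ModelDev(config)
dev.train()                          # train + validate, checkpointing the best epoch
print('test MSE:', dev.test_MSE())   # reload best weights and score the test set
dev.test_long_window(length=500)     # autoregressive rollout from several offsets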