elif args.dataset_name == 'coildel':
    EMBED_DIM = 2
    num_classes = 100
    num_heads = 8
    depth = 6
    p, q = 1, 1
    # k, num_heads, depth, seq_length, num_tokens, num_classes
    model = Transformer(EMBED_DIM, num_heads, test_dataset.walklength,
                        depth, num_classes).to(device)

lr_warmup = 10000
lr = 1e-3
opt = torch.optim.Adam(lr=lr, params=model.parameters())
sch = torch.optim.lr_scheduler.LambdaLR(
    opt, lambda i: min(i / (lr_warmup / args.batch_size), 1.0))
loss_func = nn.NLLLoss()


def train_validate(model, loader, opt, loss_func, train, device):
    if train:
        model.train()
    else:
        model.eval()
    batch_loss = 0
    batch_acc = 0
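# Aside on the schedule configured above (illustrative only, not part of the script):
# the LambdaLR multiplies the base lr (1e-3) by min(step / (lr_warmup / batch_size), 1.0),
# i.e. a linear warmup that reaches the full lr after lr_warmup / batch_size optimizer
# steps and stays flat afterwards. Quick sanity check, assuming batch_size=16:
lr_warmup, base_lr, batch_size = 10000, 1e-3, 16
for step in (0, 100, 312, 625, 1000):
    factor = min(step / (lr_warmup / batch_size), 1.0)
    print(step, base_lr * factor)   # reaches the full lr at step 625, flat afterwards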
def main():
    device = torch.device("cpu" if hparams.no_cuda else "cuda")

    print("=== build model ===")
    start = time.time()
    model = Transformer(hparams.d_model, hparams.d_ff, vocab_size,
                        hparams.num_heads, hparams.num_layers, hparams.max_len,
                        hparams.dropout, EOS_id, PAD_id, device).to(device)
    end = time.time()
    print("=== build model done === {} seconds".format(end - start))

    train.global_step = 0

    # train_dataset, val_dataset = split_data(train_path_en, train_path_de, hparams.validation_rate)
    train_dataset = make_dataset(train_path_en, train_path_de)
    val_dataset = make_dataset(val_path_en, val_path_de)
    train_loader = DataLoader(train_dataset, batch_size=hparams.batch_size,
                              collate_fn=custom_collate, shuffle=True,
                              num_workers=hparams.num_workers)
    val_loader = DataLoader(val_dataset, batch_size=hparams.batch_size,
                            collate_fn=custom_collate,
                            num_workers=hparams.num_workers)

    criterion = torch.nn.NLLLoss(ignore_index=PAD_id, reduction="sum").to(device)
    optimizer = torch.optim.Adam(model.parameters(), hparams.lr)
    writer = SummaryWriter()

    for epoch in range(hparams.max_epochs):
        """train"""
        print("=== train start ===")
        start = time.time()
        loss, bleu_score = train(model, train_loader, criterion, optimizer,
                                 device, writer, epoch, hparams.print_steps)
        end = time.time()
        print("=== train done === {} seconds".format(end - start))
        print("epoch: {}/{}, loss: {}, bleu score: {}".format(
            epoch + 1, hparams.max_epochs, loss, bleu_score))

        torch.save(model.state_dict(), save_path)
        print("model saved to '{}'".format(os.path.abspath(save_path)))

        writer.add_scalar("Loss/train", loss, epoch + 1)
        writer.add_scalar("Bleu score/train", bleu_score, epoch + 1)
        """"""

        print("=== evaluation start ===")
        start = time.time()
        loss, bleu_score = evaluate(model, val_loader, criterion, optimizer,
                                    device, writer)
        end = time.time()
        print("=== evaluation done === {} seconds".format(end - start))
        print("epoch: {}/{}, loss: {}, bleu score: {}".format(
            epoch + 1, hparams.max_epochs, loss, bleu_score))

        writer.add_scalar("Loss/eval", loss, epoch + 1)
        writer.add_scalar("Bleu score/eval", bleu_score, epoch + 1)
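# custom_collate is defined elsewhere in the original project. For context, a minimal
# padding collate_fn could look like the sketch below; pad_sequence, the (src, tgt)
# tuple format, and padding with PAD_id are assumptions, not the author's code.
from torch.nn.utils.rnn import pad_sequence

def collate_sketch(batch):
    srcs, tgts = zip(*batch)                                    # lists of 1-D LongTensors
    src = pad_sequence(srcs, batch_first=True, padding_value=PAD_id)
    tgt = pad_sequence(tgts, batch_first=True, padding_value=PAD_id)
    return src, tgt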
                    type=int, default=350, help="No of epochs")
args = parser.parse_args()

train_iter, FR, EN, train_length = load_dataloaders(args)
src_vocab = len(EN.vocab)
trg_vocab = len(FR.vocab)
cuda = torch.cuda.is_available()

net = Transformer(src_vocab=src_vocab, trg_vocab=trg_vocab,
                  d_model=args.d_model, num=args.num, n_heads=args.n_heads)
for p in net.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

criterion = nn.CrossEntropyLoss(reduction="mean", ignore_index=1)
optimizer = optim.Adam(net.parameters(), lr=args.lr, betas=(0.9, 0.98), eps=1e-9)
# scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[10,20,30,40,50,100,200], gamma=0.7)
scheduler = CosineWithRestarts(optimizer, T_max=500)
if cuda:
    net.cuda()
start_epoch, acc = load_state(net, optimizer, scheduler, args.model_no,
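# Note (illustrative, not part of the original file): ignore_index=1 hard-codes the
# target pad token index. With legacy torchtext Fields it can be read from the vocab
# instead, assuming the usual '<pad>' special token is present:
pad_idx = FR.vocab.stoi['<pad>']   # typically 1
criterion = nn.CrossEntropyLoss(reduction="mean", ignore_index=pad_idx)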
model = Transformer(len(SRC.vocab), len(TGT.vocab), N=ARGS.n_layers,
                    d_model=ARGS.d_model, d_ff=4 * ARGS.d_model,
                    h=ARGS.n_heads, dropout=ARGS.p_dropout).to(ARGS.device)
criterion = LabelSmoothing(size=len(TGT.vocab), padding_idx=pad_idx,
                           smoothing=0.1).to(ARGS.device)

# train
if ARGS.run_mode == 'train':
    optimizer = NoamOpt(
        ARGS.d_model, 1, 2000,
        torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))
    iter_cnt = 1
    min_norm_val_loss = math.inf

    model.train()
    for epoch in range(ARGS.n_epochs):
        for train_batch in train_iter:
            train_batch = utils.rebatch(pad_idx, train_batch)
            train_out = model(train_batch.src, train_batch.trg,
                              train_batch.src_mask, train_batch.trg_mask)
            train_loss = criterion(
                train_out.contiguous().view(-1, train_out.size(-1)),
                train_batch.trg_y.contiguous().view(-1)) / train_batch.ntokens
            train_loss.backward()
            optimizer.step()
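# For reference (not part of this file): a NoamOpt in the Annotated-Transformer style
# sets the Adam learning rate each step to
#     rate = factor * d_model**(-0.5) * min(step**(-0.5), step * warmup**(-1.5))
# With factor=1 and warmup=2000 as above, the lr rises linearly for 2000 steps and then
# decays roughly as 1/sqrt(step). Stand-alone sketch of the rate function:
def noam_rate(step, d_model, factor=1, warmup=2000):
    return factor * d_model ** (-0.5) * min(step ** (-0.5), step * warmup ** (-1.5))

print(noam_rate(100, 512), noam_rate(2000, 512), noam_rate(20000, 512))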
class ModelDev:
    def __init__(self, config):
        self.config = config
        self.prepare_dataloaders(config['data'])
        # self.model = MLP(config['MLP'])
        # self.model = MLP_3D(config['MLP'])
        # self.model = LSTM(config['LSTM'])
        self.model = Transformer(config['Trans'])
        print(self.model)

        self.model_name = config['train']['model_name']
        self.checkpoint_dir = './checkpoint_dir/{}/'.format(self.model_name)
        if not os.path.exists(self.checkpoint_dir):
            os.mkdir(self.checkpoint_dir)
        self.tb_log_dir = './tb_log/{}/'.format(self.model_name)
        if not os.path.exists(self.tb_log_dir):
            os.mkdir(self.tb_log_dir)

        self.optimal_metric = 100000
        self.cur_metric = 100000
        self.loss = nn.MSELoss()
        self.optim = optim.Adam(self.model.parameters(),
                                lr=self.config['train']['lr'],
                                betas=(0.5, 0.999))

    def prepare_dataloaders(self, config):
        data = Rossler(config)
        train_data = dataset(data.train_X, data.train_Y, data.train_Z,
                             config['w_size'])
        self.train_dataloader = DataLoader(train_data,
                                           batch_size=config['batch_size'],
                                           shuffle=True, drop_last=True)
        valid_data = dataset(data.valid_X, data.valid_Y, data.valid_Z,
                             config['w_size'])
        self.valid_dataloader = DataLoader(valid_data,
                                           batch_size=config['batch_size'],
                                           shuffle=False, drop_last=True)
        test_data = dataset(data.test_X, data.test_Y, data.test_Z,
                            config['w_size'])
        self.test_dataloader = DataLoader(test_data,
                                          batch_size=config['batch_size'],
                                          shuffle=False, drop_last=True)
        self.data = data

    def train(self):
        self.writer = SummaryWriter(self.tb_log_dir)
        for self.epoch in range(self.config['train']['epochs']):
            self.train_on_epoch()
            self.cur_metric = self.valid_on_epoch()
            print(self.cur_metric)
            if self.needToSave():
                self.saveWeights()

    def train_on_epoch(self):
        self.model.train(True)  # was train(False); training should run in train mode
        LOSS = []
        for X_i, Y_i, Z_i, X_o, Y_o, Z_o in self.train_dataloader:
            X_i, Y_i, Z_i, X_o, Y_o, Z_o = self.cast_to_float(
                X_i, Y_i, Z_i, X_o, Y_o, Z_o)
            X_i += torch.normal(0, 0.1, X_i.shape)  # additive Gaussian input noise
            self.model.zero_grad()
            pred = self.model(X_i)
            loss = self.loss(pred, X_o)
            loss.backward()
            self.optim.step()
            LOSS.append(loss.data.cpu().numpy())
        self.writer.add_scalar('train Loss', np.mean(LOSS), self.epoch)

    def valid_on_epoch(self):
        self.model.train(False)
        LOSS = []
        with torch.no_grad():
            for X_i, Y_i, Z_i, X_o, Y_o, Z_o in self.valid_dataloader:
                X_i, Y_i, Z_i, X_o, Y_o, Z_o = self.cast_to_float(
                    X_i, Y_i, Z_i, X_o, Y_o, Z_o)
                pred = self.model(X_i)
                loss = self.loss(pred, X_o)
                LOSS.append(loss.data.cpu().numpy())
        self.writer.add_scalar('valid Loss', np.mean(LOSS), self.epoch)
        return np.mean(LOSS)

    def cast_to_float(self, X_i, Y_i, Z_i, X_o, Y_o, Z_o):
        X_i = X_i.float()
        Y_i = Y_i.float()
        Z_i = Z_i.float()
        X_o = X_o.float()
        Y_o = Y_o.float()
        Z_o = Z_o.float()
        return X_i, Y_i, Z_i, X_o, Y_o, Z_o

    def needToSave(self):
        if self.cur_metric < self.optimal_metric:
            self.optimal_metric = self.cur_metric
            return True
        return False

    def saveWeights(self, clean_previous=True):
        if clean_previous:
            files = glob(self.checkpoint_dir + '*.pth')
            for f in files:
                os.remove(f)
        torch.save(self.model.state_dict(),
                   '{}model_{}.pth'.format(self.checkpoint_dir, self.epoch))

    def test_MSE(self):
        self.model.train(False)
        self.load_weights()
        LOSS = []
        with torch.no_grad():
            for X_i, Y_i, Z_i, X_o, Y_o, Z_o in self.test_dataloader:
                X_i, Y_i, Z_i, X_o, Y_o, Z_o = self.cast_to_float(
                    X_i, Y_i, Z_i, X_o, Y_o, Z_o)
                pred = self.model(X_i)
                loss = self.loss(pred, X_o)
                LOSS.append(loss.data.cpu().numpy())
        return np.mean(LOSS)

    def test_a_window(self):
        self.model.train(False)
        self.load_weights()
        idx = 0
        with torch.no_grad():
            for X_i, Y_i, Z_i, X_o, Y_o, Z_o in self.test_dataloader:
                X_i, Y_i, Z_i, X_o, Y_o, Z_o = self.cast_to_float(
                    X_i, Y_i, Z_i, X_o, Y_o, Z_o)
                pred = self.model(X_i)
                show_a_test_window(X_i.data.numpy()[0, :],
                                   X_o.data.numpy()[0, :],
                                   pred.data.numpy()[0, :], idx,
                                   self.config['data']['stride'])
                idx += 1

    def test_long_window(self, length):
        self.model.train(False)
        self.load_weights()
        for start_idx in [100, 200, 300, 400, 500]:
            X_I = self.data.test_X[start_idx:start_idx + length]
            X_pred = X_I[:self.config['data']['w_size'] - 1]
            with torch.no_grad():
                while len(X_pred) < len(X_I):
                    nparray = np.array(X_pred[-self.config['data']['w_size'] + 1:])
                    nparray = np.expand_dims(nparray, axis=0)
                    torchTensor = torch.FloatTensor(nparray)
                    pred = self.model(
                        torchTensor).data.squeeze().numpy().tolist()
                    X_pred.append(pred)
            show_long_window(X_I, X_pred, self.config['data']['stride'],
                             self.config['data']['w_size'], start_idx,
                             self.config['train']['model_name'])

    def load_weights(self):
        target_file = list(glob(self.checkpoint_dir + 'model*.pth'))[0]
        print('loading ', target_file)
        weights = torch.load(target_file)
        self.model.load_state_dict(weights)
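# Typical driver for ModelDev (illustrative sketch; the 'config.yaml' path and the
# exact YAML layout are assumptions, but the keys follow the usages above:
# config['data'], config['Trans'], config['train']['lr'], ...):
import yaml

if __name__ == '__main__':
    with open('config.yaml') as f:
        config = yaml.safe_load(f)
    dev = ModelDev(config)
    dev.train()
    print('test MSE:', dev.test_MSE())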
        child_name = "{}.{}".format(name, n)
        param_trace(child_name, m, depth + 1, max_depth, threshold)


# Debug dump of per-module parameter counts; note that exit() stops the script here.
param_trace('seq2seq', seq2seq, 0, max_depth=5, threshold=K * 100)
exit()

# optimizer = optim.SGD(seq2seq.parameters(), lr=0.25)
# lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=2, min_lr=1e-4,
#                                                     verbose=True)
# optimizer = optim.Adamax(seq2seq.parameters())
T_ep = len(train_loader)
# optimizer = optim.Adam(seq2seq.parameters(), lr=3e-4, betas=(0.9, 0.98), eps=1e-9)
optimizer = optim.Adam(seq2seq.parameters(), lr=3e-4)
lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=T_ep * epochs,
                                                    eta_min=3e-6)
if 'warmup' in cfg['train']:
    warmup_ep = cfg['train']['warmup']
    lr_scheduler = WarmupLR(optimizer, init_scale=1e-3, T_max=T_ep * warmup_ep,
                            after=lr_scheduler)

if VIZ_ATTN:
    utils.makedirs('evals')
    evaluateAndShowAttentions(seq2seq, dset.in_lang, dset.out_lang,
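# Note on usage (sketch, not the original loop): since T_max is counted in iterations
# (T_ep * epochs), both the cosine schedule and the warmup wrapper are meant to be
# stepped once per batch rather than once per epoch; compute_loss below is a
# hypothetical placeholder for the actual forward/loss computation.
for epoch in range(epochs):
    for batch in train_loader:
        loss = compute_loss(seq2seq, batch)   # hypothetical helper
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()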