# Training loops for the s01-s04 experiments, collected in one place. Shared
# imports below; project-specific helpers (ROOT_DIR, inc_folder_no, iter_batch,
# eval, evaluate, AverageMeter, seq_accuracy, padded_aware_nllloss) are assumed
# to come from the project's own utility modules.
import math

import torch
import torch.nn as nn
from torch.nn.utils import clip_grad_value_
from torch.nn.utils.rnn import pack_padded_sequence
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm


def train(model: nn.Module, scheduler, optimizer, images, datasets,
          n_epoches: int, batch_size: int, eval_valid_freq: int = 1,
          eval_test_freq: int = 3, device=None):
    log_dir = inc_folder_no(ROOT_DIR / "runs" / "s02_exp" / "run_")
    writer = SummaryWriter(log_dir=log_dir)
    global_step = 0
    model.train()

    # NaN placeholders so the epoch postfix renders before the first eval
    valid_res = {
        'loss': float('nan'),
        'nt_accuracy': float('nan'),
        'nbtn_accuracy': float('nan'),
        'nrow_accuracy': float('nan'),
    }
    test_res = {
        'loss': float('nan'),
        'nt_accuracy': float('nan'),
        'nbtn_accuracy': float('nan'),
        'nrow_accuracy': float('nan'),
    }
    best_performance = 0.0
    train_examples, valid_examples, test_examples = datasets

    try:
        n_batches = math.ceil(sum(len(x) for x in train_examples) / batch_size)
        with tqdm(range(n_epoches), desc='epoch') as epoches, \
                tqdm(total=n_batches * n_epoches, desc='training') as pbar:
            for i in epoches:
                for bimgs, bx, bnx, bxlen, bnt, bnbtn, bnrow in iter_batch(
                        batch_size, images, train_examples,
                        shuffle=True, device=device):
                    pbar.update()
                    global_step += 1
                    model.zero_grad()

                    bnt_pred, bnts_pred, bnbtn_pred, bnrow_pred = model(bimgs, bx, bxlen)
                    bnt_acc = (torch.argmax(bnt_pred, dim=1) == bnt).float().mean().item()
                    bnbtn_acc = (torch.argmax(bnbtn_pred, dim=1) == bnbtn).float().mean().item()
                    bnrow_acc = (torch.argmax(bnrow_pred, dim=1) == bnrow).float().mean().item()

                    loss, sublosses = model.loss_func(bnts_pred, bnx, bnt_pred, bnt,
                                                      bnbtn_pred, bnbtn, bnrow_pred, bnrow)
                    loss.backward()
                    optimizer.step()

                    writer.add_scalar('train/total_loss', loss.item(), global_step)
                    for j, l in enumerate(sublosses):
                        writer.add_scalar(f'train/loss_{j}', l.item(), global_step)
                    writer.add_scalar('train/nt_accuracy', bnt_acc, global_step)
                    writer.add_scalar('train/nbtn_accuracy', bnbtn_acc, global_step)
                    writer.add_scalar('train/nrow_accuracy', bnrow_acc, global_step)
                    pbar.set_postfix(loss=f"{loss.item():.5f}",
                                     nt_accuracy=f"{bnt_acc:.5f}",
                                     nbtn_accuracy=f"{bnbtn_acc:.5f}",
                                     nrow_accuracy=f"{bnrow_acc:.5f}",
                                     **{f"loss_{j}": f"{l.item():.5f}"
                                        for j, l in enumerate(sublosses)})

                # step the LR schedule once per epoch, after the optimizer
                # updates (stepping before any update skips the first LR value
                # in PyTorch >= 1.1)
                scheduler.step()

                if (i + 1) % eval_valid_freq == 0:
                    valid_res = eval(model, images, valid_examples, device)
                    for k, v in valid_res.items():
                        writer.add_scalar(f'valid/{k}', v, global_step)
                    # checkpoint whenever validation nt accuracy improves
                    if valid_res['nt_accuracy'] > best_performance:
                        best_performance = valid_res['nt_accuracy']
                        torch.save(model, log_dir + f"/model.{i}.bin")

                if (i + 1) % eval_test_freq == 0:
                    test_res = eval(model, images, test_examples, device)
                    for k, v in test_res.items():
                        writer.add_scalar(f'test/{k}', v, global_step)

                epoches.set_postfix(
                    v_l=f'{valid_res["loss"]:.5f}',
                    v_nt_a=f'{valid_res["nt_accuracy"]:.5f}',
                    v_nbtn_a=f'{valid_res["nbtn_accuracy"]:.5f}',
                    v_nrow_a=f'{valid_res["nrow_accuracy"]:.5f}',
                    t_l=f'{test_res["loss"]:.5f}',
                    t_nt_a=f'{test_res["nt_accuracy"]:.5f}',
                    t_nbtn_a=f'{test_res["nbtn_accuracy"]:.5f}',
                    t_nrow_a=f'{test_res["nrow_accuracy"]:.5f}',
                )
    finally:
        writer.close()
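# --- assumed helper (sketch) -------------------------------------------------
# `inc_folder_no` is a project utility whose implementation lives elsewhere; a
# minimal sketch of the contract the loops here rely on: given a path prefix,
# create and return the first unused "<prefix><n>" directory as a string (the
# training code concatenates it with "/model.bin"-style paths).
import itertools
from pathlib import Path


def inc_folder_no(prefix) -> str:
    """Create and return the first non-existing folder named `<prefix><n>`."""
    prefix = Path(prefix)
    for n in itertools.count(0):
        candidate = prefix.parent / f"{prefix.name}{n}"
        if not candidate.exists():
            candidate.mkdir(parents=True)
            return str(candidate)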
def train(model: nn.Module, scheduler, optimizer, images, datasets,
          n_epoches: int, batch_size: int, eval_valid_freq: int = 1,
          eval_test_freq: int = 3, device=None):
    log_dir = inc_folder_no(ROOT_DIR / "runs" / "s04_exp" / "run_")
    writer = SummaryWriter(log_dir=log_dir)
    global_step = 0
    model.train()

    # NaN placeholders so the epoch postfix renders before the first eval
    valid_res = {'loss': float('nan'), 'accuracy': float('nan')}
    test_res = {'loss': float('nan'), 'accuracy': float('nan')}
    best_performance = 0.0
    train_examples, valid_examples, test_examples = datasets

    try:
        n_batches = math.ceil(sum(len(x) for x in train_examples) / batch_size)
        with tqdm(range(n_epoches), desc='epoch') as epoches, \
                tqdm(total=n_batches * n_epoches, desc='training') as pbar:
            for i in epoches:
                for bimgs, bx, bnts, bxlen, bnt, bnbtn, bnrow in iter_batch(
                        batch_size, images, train_examples,
                        shuffle=True, device=device):
                    pbar.update()
                    global_step += 1
                    model.zero_grad()

                    bnts_pred = model(bimgs, bx, bxlen)
                    # padding-aware NLL: `mask` zeroes out padded positions and
                    # `btokens` counts the real (non-padding) tokens
                    loss, mask, btokens = padded_aware_nllloss(bnts_pred, bnts)
                    n_correct = ((torch.argmax(bnts_pred, dim=1) ==
                                  bnts.view(-1)).float() * mask).sum().item()
                    accuracy = n_correct / btokens
                    loss.backward()
                    optimizer.step()

                    writer.add_scalar('train/loss', loss.item(), global_step)
                    writer.add_scalar('train/accuracy', accuracy, global_step)
                    pbar.set_postfix(loss=f"{loss.item():.5f}",
                                     accuracy=f"{accuracy:.5f}")

                scheduler.step()  # once per epoch, after the optimizer updates

                if (i + 1) % eval_valid_freq == 0:
                    valid_res = eval(model, images, valid_examples, device)
                    for k, v in valid_res.items():
                        writer.add_scalar(f'valid/{k}', v, global_step)
                    if valid_res['accuracy'] > best_performance:
                        best_performance = valid_res['accuracy']
                        torch.save(model, log_dir + f"/model.{i}.bin")

                if (i + 1) % eval_test_freq == 0:
                    test_res = eval(model, images, test_examples, device)
                    for k, v in test_res.items():
                        writer.add_scalar(f'test/{k}', v, global_step)

                epoches.set_postfix(
                    v_l=f'{valid_res["loss"]:.5f}',
                    v_a=f'{valid_res["accuracy"]:.5f}',
                    t_l=f'{test_res["loss"]:.5f}',
                    t_a=f'{test_res["accuracy"]:.5f}',
                )
    finally:
        writer.close()
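# --- assumed helper (sketch) -------------------------------------------------
# `padded_aware_nllloss` is a project helper; this sketch matches the contract
# used above: `y_pred` is (n_tokens, n_classes) log-probabilities, `y` is the
# padded target tensor, and the padding class index (0 here) is an assumption.
# Returns the masked mean loss, the flat 0/1 mask, and the real-token count.
def padded_aware_nllloss(y_pred, y, pad_idx: int = 0):
    y = y.view(-1)
    mask = (y != pad_idx).float()
    n_tokens = mask.sum().item()
    token_loss = torch.nn.functional.nll_loss(y_pred, y, reduction='none')
    loss = (token_loss * mask).sum() / n_tokens
    return loss, mask, n_tokens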
def train(model: nn.Module, loss_func, scheduler, optimizer, images, datasets,
          n_epoches: int, batch_size: int, eval_valid_freq: int = 1,
          eval_test_freq: int = 3, device=None):
    histories = {'train': [], 'valid': [], 'test': []}
    log_dir = inc_folder_no(ROOT_DIR / "runs" / "s02_exp_")
    writer = SummaryWriter(log_dir=log_dir)
    global_step = 0
    model.train()

    # NaN placeholders so the epoch postfix renders before the first eval
    valid_res = {'loss': float('nan'), 'accuracy': float('nan')}
    test_res = {'loss': float('nan'), 'accuracy': float('nan')}
    best_performance = 0.0
    train_examples, valid_examples, test_examples = datasets

    try:
        n_batches = math.ceil(sum(len(x) for x in train_examples) / batch_size)
        with tqdm(range(n_epoches), desc='epoch') as epoches, \
                tqdm(total=n_batches * n_epoches, desc='training') as pbar:
            for i in epoches:
                for bimgs, bx, by, bxlen in iter_batch(batch_size, images,
                                                       train_examples,
                                                       shuffle=True,
                                                       device=device):
                    pbar.update()
                    global_step += 1
                    model.zero_grad()

                    bypred = model(bimgs, bx, bxlen)
                    accuracy = (torch.argmax(bypred, dim=1) == by).float().mean().item()
                    loss = loss_func(bypred, by)
                    loss.backward()
                    optimizer.step()

                    # store plain floats, not tensors, so the history does not
                    # keep device memory alive
                    histories['train'].append((loss.item(), accuracy))
                    writer.add_scalar('train/loss', loss.item(), global_step)
                    writer.add_scalar('train/accuracy', accuracy, global_step)
                    pbar.set_postfix(train_loss=f"{loss.item():.5f}",
                                     train_accuracy=f"{accuracy:.5f}")

                scheduler.step()  # once per epoch, after the optimizer updates

                if (i + 1) % eval_valid_freq == 0:
                    valid_res = eval(model, loss_func, images, valid_examples, device)
                    writer.add_scalar('valid/loss', valid_res['loss'], global_step)
                    writer.add_scalar('valid/accuracy', valid_res['accuracy'], global_step)
                    histories['valid'].append(valid_res)
                    if valid_res['accuracy'] > best_performance:
                        best_performance = valid_res['accuracy']
                        torch.save(model, log_dir + f"/model.{i}.bin")

                if (i + 1) % eval_test_freq == 0:
                    test_res = eval(model, loss_func, images, test_examples, device)
                    writer.add_scalar('test/loss', test_res['loss'], global_step)
                    writer.add_scalar('test/accuracy', test_res['accuracy'], global_step)
                    histories['test'].append(test_res)

                epoches.set_postfix(
                    valid_l=f'{valid_res["loss"]:.5f}',
                    valid_a=f'{valid_res["accuracy"]:.5f}',
                    test_l=f'{test_res["loss"]:.5f}',
                    test_a=f'{test_res["accuracy"]:.5f}',
                )
    finally:
        writer.close()

    return histories
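# --- assumed helper (sketch) -------------------------------------------------
# The `eval` function called above (it shadows the Python builtin, as in the
# rest of these scripts) is defined elsewhere in the project; this sketch only
# reflects the contract the single-task loop relies on: average loss and
# accuracy over one split, with the model restored to train mode afterwards.
# The internal batch size of 500 is an illustrative assumption.
def eval(model, loss_func, images, examples, device=None):
    model.eval()
    total_loss, n_correct, n = 0.0, 0.0, 0
    with torch.no_grad():
        for bimgs, bx, by, bxlen in iter_batch(500, images, examples,
                                               shuffle=False, device=device):
            bypred = model(bimgs, bx, bxlen)
            total_loss += loss_func(bypred, by).item() * by.shape[0]
            n_correct += (torch.argmax(bypred, dim=1) == by).float().sum().item()
            n += by.shape[0]
    model.train()
    return {'loss': total_loss / n, 'accuracy': n_correct / n}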
def train(model: nn.Module, loss_func, scheduler, optimizer, images, datasets,
          n_epoches: int, batch_size: int, clip_grad_val: float,
          eval_batch_size: int = 500, eval_valid_freq: int = 1,
          eval_test_freq: int = 3, device=None, exp_dir: str = "exp"):
    log_dir = inc_folder_no(ROOT_DIR / "runs" / exp_dir / "run_")
    print("log_dir:", log_dir)
    writer = SummaryWriter(log_dir=log_dir)
    global_step = 0
    model.train()

    # empty meters so the epoch postfix renders before the first eval
    valid_res = {
        'loss': AverageMeter(),
        'top_1_acc': AverageMeter(),
        'top_3_acc': AverageMeter(),
        'top_5_acc': AverageMeter(),
    }
    test_res = {
        'loss': AverageMeter(),
        'top_1_acc': AverageMeter(),
        'top_3_acc': AverageMeter(),
        'top_5_acc': AverageMeter(),
    }
    best_performance = 0.0
    train_examples, valid_examples, test_examples = datasets

    try:
        n_batches = math.ceil(len(train_examples) / batch_size)
        with tqdm(range(n_epoches), desc='epoch---') as epoches, \
                tqdm(total=n_batches * n_epoches, desc='training-') as pbar:
            for i in epoches:
                batch_loss = AverageMeter()
                batch_top_1_acc = AverageMeter()
                batch_top_3_acc = AverageMeter()

                for bimgs, bx, bnx, bxlen, sorted_idx in iter_batch(
                        batch_size, images, train_examples,
                        shuffle=True, device=device):
                    pbar.update()
                    global_step += 1
                    model.zero_grad()

                    bnx_pred = model(bimgs, bx, bxlen)
                    # flatten away the padding so the loss only sees real time
                    # steps; `.data` is used rather than tuple-unpacking the
                    # PackedSequence, which grew extra fields after PyTorch 1.1
                    bnx_pred = pack_padded_sequence(bnx_pred, bxlen, batch_first=True).data
                    bnx = pack_padded_sequence(bnx, bxlen, batch_first=True).data

                    loss = loss_func(bnx_pred, bnx)
                    loss.backward()
                    clip_grad_value_(model.parameters(), clip_grad_val)  # prevent exploding gradients
                    optimizer.step()

                    loss = float(loss)
                    top_k_acc = seq_accuracy(bnx_pred, bnx, [1, 3])
                    writer.add_scalar('train/loss', loss, global_step)
                    writer.add_scalar('train/top_1_acc', top_k_acc[0], global_step)
                    writer.add_scalar('train/top_3_acc', top_k_acc[1], global_step)

                    n_tokens = bnx.shape[0]
                    batch_loss.update(loss, n_tokens)
                    batch_top_1_acc.update(top_k_acc[0], n_tokens)
                    batch_top_3_acc.update(top_k_acc[1], n_tokens)
                    pbar.set_postfix(loss=f"{loss:.5f}",
                                     top_1_acc=f"{top_k_acc[0]:.5f}",
                                     top_3_acc=f"{top_k_acc[1]:.5f}")

                scheduler.step()  # once per epoch, after the optimizer updates

                if (i + 1) % eval_valid_freq == 0:
                    valid_res = evaluate(model, loss_func, images, valid_examples,
                                         device, batch_size=eval_batch_size)
                    for k, v in valid_res.items():
                        writer.add_scalar(f'valid/{k}', v.avg, i)
                    if valid_res['top_1_acc'].avg > best_performance:
                        best_performance = valid_res['top_1_acc'].avg
                        torch.save(
                            {
                                "epoch": i,
                                "model": model.state_dict(),
                                "optimizer": optimizer.state_dict(),
                            }, log_dir + "/model.bin")

                if (i + 1) % eval_test_freq == 0:
                    test_res = evaluate(model, loss_func, images, test_examples,
                                        device, batch_size=eval_batch_size)
                    for k, v in test_res.items():
                        writer.add_scalar(f'test/{k}', v.avg, i)

                epoches.set_postfix(
                    b_l=f'{batch_loss.avg:.5f}',
                    b_a1=f'{batch_top_1_acc.avg:.5f}',
                    b_a3=f'{batch_top_3_acc.avg:.5f}',
                    v_l=f'{valid_res["loss"].avg:.5f}',
                    v_a1=f'{valid_res["top_1_acc"].avg:.5f}',
                    v_a3=f'{valid_res["top_3_acc"].avg:.5f}',
                    v_a5=f'{valid_res["top_5_acc"].avg:.5f}',
                    t_l=f'{test_res["loss"].avg:.5f}',
                    t_a1=f'{test_res["top_1_acc"].avg:.5f}',
                    t_a3=f'{test_res["top_3_acc"].avg:.5f}',
                    t_a5=f'{test_res["top_5_acc"].avg:.5f}',
                )
    finally:
        writer.close()
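# --- assumed helpers (sketch) ------------------------------------------------
# `AverageMeter` and `seq_accuracy` are project utilities; sketches of the
# contracts the loop above relies on: a running weighted average in the style
# of the PyTorch ImageNet example, and top-k accuracy over flattened token
# predictions of shape (n_tokens, n_classes).
class AverageMeter:
    """Track a running sum and weighted average via `update(value, n)`."""

    def __init__(self):
        self.sum, self.count, self.avg = 0.0, 0, float('nan')

    def update(self, value, n: int = 1):
        self.sum += value * n
        self.count += n
        self.avg = self.sum / self.count


def seq_accuracy(y_pred, y, top_ks):
    """Fraction of tokens whose target is within the top-k predicted classes."""
    _, pred = y_pred.topk(max(top_ks), dim=1)  # (n_tokens, max_k) class indices
    correct = pred.eq(y.view(-1, 1))           # broadcast targets across ranks
    return [correct[:, :k].any(dim=1).float().mean().item() for k in top_ks]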
def train(model: nn.Module, loss_func1, loss_func2, scheduler, optimizer,
          datasets, n_epoches: int, batch_size: int, device=None):
    task1_histories = {'train': [], 'valid': [], 'test': []}
    task2_histories = {'train': [], 'valid': [], 'test': []}
    train_X, train_y1, train_y2 = datasets['train']
    valid_X, valid_y1, valid_y2 = datasets['valid']
    test_X, test_y1, test_y2 = datasets['test']

    writer = SummaryWriter(log_dir=inc_folder_no(ROOT_DIR / "runs" / "s01_exp_"))
    global_step = 0

    try:
        n_batches = math.ceil(len(train_X) / batch_size)
        with tqdm(range(n_epoches), desc='epoch') as epoches, \
                tqdm(total=n_batches * n_epoches, desc='training') as pbar:
            for i in epoches:
                for bx, by1, by2 in iter_batch(batch_size, train_X, train_y1,
                                               train_y2, shuffle=True,
                                               device=device):
                    pbar.update()
                    global_step += 1

                    # alternate the two task heads: one backward/step per task
                    model.zero_grad()
                    by1_pred = model.forward_task1(bx)
                    loss1 = loss_func1(by1_pred, by1)
                    loss1.backward()
                    optimizer.step()

                    model.zero_grad()
                    by2_pred = model.forward_task2(bx)
                    loss2 = loss_func2(by2_pred, by2)
                    loss2.backward()
                    optimizer.step()

                    # store plain floats, not tensors, so the histories do not
                    # keep device memory alive
                    task1_histories['train'].append(loss1.item())
                    task2_histories['train'].append(loss2.item())
                    writer.add_scalar('train/loss1', loss1.item(), global_step)
                    writer.add_scalar('train/loss2', loss2.item(), global_step)
                    pbar.set_postfix(train_loss1=f"{loss1.item():.5f}",
                                     train_loss2=f"{loss2.item():.5f}")

                scheduler.step()  # once per epoch, after the optimizer updates

                valid_res = eval(model, loss_func1, loss_func2, valid_X,
                                 valid_y1, valid_y2, device)
                writer.add_scalar('valid/loss1', valid_res['task1_loss'], global_step)
                writer.add_scalar('valid/loss2', valid_res['task2_loss'], global_step)
                writer.add_scalar('valid/accuracies1', valid_res['task1_accuracies'], global_step)
                writer.add_scalar('valid/accuracies2', valid_res['task2_accuracies'], global_step)

                test_res = eval(model, loss_func1, loss_func2, test_X, test_y1,
                                test_y2, device)
                writer.add_scalar('test/loss1', test_res['task1_loss'], global_step)
                writer.add_scalar('test/loss2', test_res['task2_loss'], global_step)
                writer.add_scalar('test/accuracies1', test_res['task1_accuracies'], global_step)
                writer.add_scalar('test/accuracies2', test_res['task2_accuracies'], global_step)

                # record the eval results for both tasks, on both splits
                task1_histories['valid'].append(valid_res)
                task2_histories['valid'].append(valid_res)
                task1_histories['test'].append(test_res)
                task2_histories['test'].append(test_res)

                epoches.set_postfix(
                    valid_l1=f'{valid_res["task1_loss"]:.5f}',
                    valid_l2=f'{valid_res["task2_loss"]:.5f}',
                    valid_a1=f'{valid_res["task1_accuracies"]:.5f}',
                    valid_a2=f'{valid_res["task2_accuracies"]:.5f}',
                    test_l1=f'{test_res["task1_loss"]:.5f}',
                    test_l2=f'{test_res["task2_loss"]:.5f}',
                    test_a1=f'{test_res["task1_accuracies"]:.5f}',
                    test_a2=f'{test_res["task2_accuracies"]:.5f}',
                )
    finally:
        writer.close()

    return task1_histories, task2_histories
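# --- assumed helper (sketch) -------------------------------------------------
# `iter_batch` is the project's batching helper and appears with several
# different arities above; this sketch covers only the plain-tensor case used
# by the two-task loop: index all tensors in unison (shuffled or not), slice
# into mini-batches, and move each batch to `device`.
def iter_batch(batch_size: int, *tensors, shuffle: bool = False, device=None):
    n = tensors[0].shape[0]
    idx = torch.randperm(n) if shuffle else torch.arange(n)
    for start in range(0, n, batch_size):
        batch_idx = idx[start:start + batch_size]
        yield tuple(t[batch_idx].to(device) for t in tensors)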