def run():
    df = pd.read_csv(config.TRAIN_PATH)
    kfold = KFold(n_splits=5, random_state=config.SEED, shuffle=True)
    fold_losses = []
    for i, (train_idx, val_idx) in enumerate(kfold.split(df)):
        print("-------------------------------------------------------")
        print(f"Training fold {i}")
        print("-------------------------------------------------------")
        train = df.iloc[train_idx]
        validation = df.iloc[val_idx]

        train_dataset = PicDataset(train)
        train_data_loader = torch.utils.data.DataLoader(
            train_dataset, batch_size=config.BATCH_SIZE
        )
        val_dataset = PicDataset(validation)
        val_data_loader = torch.utils.data.DataLoader(
            val_dataset, batch_size=config.BATCH_SIZE
        )

        device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
        model = DNN()
        model.to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=config.LR)

        loss = 0
        for _ in range(config.EPOCHS):
            engine.train_fn(train_data_loader, model, optimizer, device)
            loss = engine.eval_fn(val_data_loader, model, device)

        print(f"Loss on fold {i} is {loss}")
        fold_losses.append(loss)
        torch.save(model.state_dict(), f'./models/model_{i}.bin')

    print(f"Average loss on cross validation is {sum(fold_losses) / 5}")
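
# The `engine` module used in run() is not shown here. Below is a minimal,
# hypothetical sketch of what engine.train_fn / engine.eval_fn could look like,
# assuming PicDataset yields (image, target) pairs and the model is trained with
# MSE loss; the actual engine implementation may differ.
def train_fn(data_loader, model, optimizer, device):
    # One pass over the training data.
    model.train()
    criterion = torch.nn.MSELoss()
    for images, targets in data_loader:
        images, targets = images.to(device), targets.to(device)
        optimizer.zero_grad()
        loss = criterion(model(images), targets)
        loss.backward()
        optimizer.step()


def eval_fn(data_loader, model, device):
    # Average validation loss over the whole loader.
    model.eval()
    criterion = torch.nn.MSELoss()
    total_loss, n = 0.0, 0
    with torch.no_grad():
        for images, targets in data_loader:
            images, targets = images.to(device), targets.to(device)
            total_loss += criterion(model(images), targets).item() * images.size(0)
            n += images.size(0)
    return total_loss / n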
def train(args, config, io):
    train_loader, validation_loader = get_loader(args, config)
    device = torch.device("cuda" if args.cuda else "cpu")

    # Try to load a pretrained model if a path is given
    model = DNN(args).to(device)
    if args.model_path != "":
        model.load_state_dict(torch.load(args.model_path))

    if args.use_sgd:
        opt = optim.SGD(model.parameters(), lr=args.lr * 100,
                        momentum=args.momentum, weight_decay=1e-4)
    else:
        opt = optim.Adam(model.parameters(), lr=args.lr, weight_decay=1e-4)

    scheduler = CosineAnnealingLR(opt, args.epochs, eta_min=args.lr)
    criterion = nn.MSELoss()

    best_test_loss = 9999999.
    for epoch in range(args.epochs):
        startTime = time.time()

        ####################
        # Train
        ####################
        train_loss = 0.0
        train_dis = 0.0
        count = 0.0
        model.train()
        for data, label in train_loader:
            data, label = data.to(device), label.to(device)
            data = drop(jitter(data, device), device)
            batch_size = data.shape[0]
            logits = model(data)
            loss = criterion(logits, label)
            opt.zero_grad()
            loss.backward()
            opt.step()
            dis = distance(logits, label)
            count += batch_size
            train_loss += loss.item() * batch_size
            train_dis += dis.item() * batch_size
        scheduler.step()
        outstr = 'Train %d, loss: %.6f, distance: %.6f' % (
            epoch, train_loss * 1.0 / count, train_dis * 1.0 / count)
        io.cprint(outstr)

        ####################
        # Evaluation
        ####################
        test_loss = 0.0
        test_dis = 0.0
        count = 0.0
        model.eval()
        with torch.no_grad():
            for data, label in validation_loader:
                data, label = data.to(device), label.to(device)
                batch_size = data.shape[0]
                logits = model(data)
                loss = criterion(logits, label)
                dis = distance(logits, label)
                count += batch_size
                test_loss += loss.item() * batch_size
                test_dis += dis.item() * batch_size
        outstr = 'Test %d, loss: %.6f, distance: %.6f' % (
            epoch, test_loss * 1.0 / count, test_dis * 1.0 / count)
        io.cprint(outstr)

        if test_loss <= best_test_loss:
            best_test_loss = test_loss
            torch.save(model.state_dict(),
                       'checkpoints/%s/models/model.t7' % args.exp_name)
            torch.save(model, config.root + config.model_path)
        io.cprint('Time: %.3f sec' % (time.time() - startTime))
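
# `jitter`, `drop`, and `distance` are helpers defined elsewhere in this repo (not
# shown). The sketch below is a hypothetical version, assuming inputs of shape
# (batch, num_points, dims); the real augmentations and metric may differ.
def jitter(data, device, delta=0.01):
    # Add small Gaussian noise to every coordinate.
    return data + delta * torch.randn_like(data)


def drop(data, device, p=0.1):
    # Randomly zero out a fraction of the points in each sample.
    mask = (torch.rand(data.shape[:2], device=device) > p).float().unsqueeze(-1)
    return data * mask


def distance(pred, target):
    # Mean Euclidean distance between predictions and targets.
    return torch.norm(pred - target, dim=-1).mean()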
            loss = criterion(y_pred, y.long())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if (idx + 1) % 100 == 0:
                print('Epoch: [{}/{}], Step: [{}/{}], Loss: {:.4f}, Acc: {:.4f}'.format(
                    epoch + 1, args.epochs, idx + 1, len(train_loader),
                    loss.item(), 100 * correct / total))

        # Save the model parameters
        torch.save(
            model.state_dict(),
            os.path.join('./log', '{}_{}_{}.ckpt'.format(args.model, args.dataset, args.epochs)))

        model.eval()
        with torch.no_grad():
            correct = 0
            total = 0
            for idx, (x, y) in enumerate(test_loader):
                x, y = x.to(device), y.to(device)
                y_pred = model(x)
                _, y_pred = torch.max(y_pred.data, 1)
                total += y.size(0)
                correct += (y_pred == y).sum().item()
                if idx % 100 == 0:
def train(args, config, io):
    train_loader, validation_loader, unlabelled_loader = get_loader(args, config)
    device = torch.device("cuda" if args.cuda else "cpu")

    # Try to load models
    model = DNN(args).to(device)
    ema_model = DNN(args).to(device)
    for param in ema_model.parameters():
        param.detach_()

    if device == torch.device("cuda"):
        model = nn.DataParallel(model)
        ema_model = nn.DataParallel(ema_model)

    if args.model_path != "":
        model.load_state_dict(torch.load(args.model_path))
        ema_model.load_state_dict(torch.load(args.model_path))

    if args.use_sgd:
        print("Use SGD")
        opt = optim.SGD(model.parameters(), lr=args.lr * 100,
                        momentum=args.momentum, weight_decay=1e-4)
    else:
        print("Use Adam")
        opt = optim.Adam(model.parameters(), lr=args.lr, weight_decay=1e-4)

    scheduler = CosineAnnealingLR(opt, args.epochs, eta_min=args.lr)
    criterion = nn.MSELoss()
    consistency_criterion = nn.MSELoss()

    best_test_loss = 9999999.
    global_step = 0
    for epoch in range(args.epochs):
        startTime = time.time()

        ####################
        # Train
        ####################
        train_loss = 0.0
        count = 0.0
        model.train()
        ema_model.train()
        i = -1
        for (data, label), (u, _) in zip(cycle(train_loader), unlabelled_loader):
            i = i + 1
            # Truncate both batches to the same size when they differ.
            if data.shape[0] != u.shape[0]:
                bt_size = np.minimum(data.shape[0], u.shape[0])
                data = data[0:bt_size]
                label = label[0:bt_size]
                u = u[0:bt_size]
            data, label, u = data.to(device), label.to(device), u.to(device)
            batch_size = data.shape[0]

            # Supervised loss on labelled data
            logits = model(data)
            class_loss = criterion(logits, label)

            # Consistency loss between student and EMA teacher on unlabelled data
            u_student = jitter(u, device)
            u_teacher = jitter(u, device)
            logits_unlabeled = model(u_student)
            ema_logits_unlabeled = ema_model(u_teacher).detach()
            consistency_loss = consistency_criterion(logits_unlabeled, ema_logits_unlabeled)

            if epoch < args.consistency_rampup_starts:
                consistency_weight = 0.0
            else:
                consistency_weight = get_current_consistency_weight(
                    args, args.final_consistency, epoch, i, len(unlabelled_loader))
            consistency_loss = consistency_weight * consistency_loss
            loss = class_loss + consistency_loss

            opt.zero_grad()
            loss.backward()
            opt.step()
            global_step += 1
            update_ema_variables(model, ema_model, args.ema_decay, global_step)

            count += batch_size
            train_loss += loss.item() * batch_size
        scheduler.step()
        outstr = 'Train %d, loss: %.6f' % (epoch, train_loss * 1.0 / count)
        io.cprint(outstr)

        ####################
        # Evaluation
        ####################
        test_loss = 0.0
        count = 0.0
        model.eval()
        ema_model.eval()
        with torch.no_grad():
            for data, label in validation_loader:
                data, label = data.to(device), label.to(device)
                batch_size = data.shape[0]
                logits = ema_model(data)
                loss = criterion(logits, label)
                count += batch_size
                test_loss += loss.item() * batch_size
        outstr = 'Test %d, loss: %.6f' % (epoch, test_loss * 1.0 / count)
        io.cprint(outstr)

        if test_loss <= best_test_loss:
            best_test_loss = test_loss
            torch.save(ema_model.state_dict(),
                       'checkpoints/%s/models/model.t7' % args.exp_name)
            torch.save(ema_model, config.root + config.model_path)
        io.cprint('Time: %.3f sec' % (time.time() - startTime))
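
# `update_ema_variables` and `get_current_consistency_weight` are the usual
# mean-teacher helpers; the sketch below is a hypothetical version. The ramp-up
# length `args.consistency_rampup_ends` is an assumed argument, and the real
# schedule used by this repo may differ.
def update_ema_variables(model, ema_model, ema_decay, global_step):
    # Use a smaller decay early on so the teacher tracks the student quickly,
    # then settle at `ema_decay`.
    alpha = min(1.0 - 1.0 / (global_step + 1), ema_decay)
    for ema_param, param in zip(ema_model.parameters(), model.parameters()):
        ema_param.data.mul_(alpha).add_(param.data, alpha=1.0 - alpha)


def get_current_consistency_weight(args, final_weight, epoch, batch_idx, num_batches):
    # Linearly ramp the consistency weight from 0 to `final_weight` between
    # consistency_rampup_starts and consistency_rampup_ends (measured in epochs).
    progress = epoch - args.consistency_rampup_starts + batch_idx / num_batches
    length = max(args.consistency_rampup_ends - args.consistency_rampup_starts, 1)
    return final_weight * min(max(progress / length, 0.0), 1.0)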
def main():
    print('> Starting execution...')

    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--fit', action='store_true',
                       help='fit the tuned model on digits 0-4')
    group.add_argument('--transfer', action='store_true',
                       help='train a pretrained model on digits 5-9')
    parser.add_argument('--batch-size', type=int, default=256, metavar='N',
                        help='input batch size for training (default: 256)')
    parser.add_argument('--epochs', type=int, default=50, metavar='E',
                        help='number of epochs to train (default: 50)')
    parser.add_argument('--lr', type=float, default=1e-3, metavar='L',
                        help='learning rate (default: 1e-3)')
    parser.add_argument('--early-stopping', type=int, default=7, metavar='E',
                        help='early stopping (default: 7 epochs)')
    parser.add_argument('--size', type=int, default=100, metavar='S',
                        help='size of the training data for transfer learning (default: 100)')
    parser.add_argument('--seed', type=int, default=23, metavar='S',
                        help='random seed (default: 23)')
    args = parser.parse_args()

    use_cuda = torch.cuda.is_available()  # use CUDA if available
    device = torch.device("cuda" if use_cuda else "cpu")
    torch.manual_seed(args.seed)  # random seed

    print('> Loading MNIST data')
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])
    train_set = datasets.MNIST(MNIST_DATA_DIR, train=True, download=True,
                               transform=transform)
    test_set = datasets.MNIST(MNIST_DATA_DIR, train=False, download=True,
                              transform=transform)

    train_digits_04 = np.where(train_set.targets < 5)[0]
    train_digits_59 = np.where(train_set.targets > 4)[0]
    test_digits_04 = np.where(test_set.targets < 5)[0]
    test_digits_59 = np.where(test_set.targets > 4)[0]

    if args.fit:
        # Training the tuned model on digits 0-4
        print('> Training a new model on MNIST digits 0-4')
        X_train_04, y_train_04, X_valid_04, y_valid_04 = data_to_numpy(
            train_set, test_set, INPUT_DIM, train_digits_04, test_digits_04)

        torch.manual_seed(args.seed)
        print('> Initializing the model')
        model = DNN(INPUT_DIM, OUTPUT_DIM, HIDDEN_DIM, batch_norm=True)
        model.apply(init_he_normal)  # He initialization
        model = model.to(device)

        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=args.lr)

        print('> Training the model')
        model, _, _ = train_model(model, device, X_train_04, y_train_04,
                                  criterion, optimizer,
                                  X_valid=X_valid_04, y_valid=y_valid_04,
                                  batch_size=args.batch_size,
                                  n_epochs=args.epochs,
                                  early_stopping=args.early_stopping)

        print(f'> Saving the model state at {MODEL_04_PATH}')
        torch.save(model.state_dict(), MODEL_04_PATH)

    elif args.transfer:
        # Transfer learning
        print('> Training a model on MNIST digits 5-9 from a pretrained model for digits 0-4')
        if os.path.isfile(MODEL_04_PATH):
            print('> Loading the pretrained model')
            model = DNN(INPUT_DIM, OUTPUT_DIM, HIDDEN_DIM, batch_norm=True).to(device)
            model.load_state_dict(torch.load(MODEL_04_PATH))
            for param in model.parameters():
                param.requires_grad = False
            # Parameters of newly constructed modules have requires_grad=True by default
            model.fc4 = nn.Linear(HIDDEN_DIM, HIDDEN_DIM)
            model.fc5 = nn.Linear(HIDDEN_DIM, HIDDEN_DIM)
            model.out = nn.Linear(HIDDEN_DIM, OUTPUT_DIM)
            print('> Using saved model state')
        else:
            print('> Model state file not found; fit a model before running transfer learning')
            print('> Stopping execution')
            return

        X_train_59, y_train_59, X_valid_59, y_valid_59 = data_to_numpy(
            train_set, test_set, INPUT_DIM,
            train_digits_59[:args.size], test_digits_59)

        # Shift labels 5-9 down to 0-4 so they match the model's output classes
        y_train_59 = y_train_59 - 5
        y_valid_59 = y_valid_59 - 5

        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=args.lr)

        print('> Training the model')
        model, _, _ = train_model(model, device, X_train_59, y_train_59,
                                  criterion, optimizer,
                                  X_valid=X_valid_59, y_valid=y_valid_59,
                                  batch_size=args.batch_size,
                                  n_epochs=args.epochs,
                                  early_stopping=args.early_stopping)

        print(f'> Saving the model state at {MODEL_59_PATH}')
        torch.save(model.state_dict(), MODEL_59_PATH)

    else:
        print('> Incorrect mode, try either `--fit` or `--transfer`')
        print('> Stopping execution')
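
# `init_he_normal` is passed to model.apply() in the --fit branch above. This is a
# minimal hypothetical version, assuming the DNN is built from nn.Linear layers
# with ReLU activations; the project's actual initializer may differ.
def init_he_normal(module):
    # Apply He (Kaiming) normal initialization to every linear layer.
    if isinstance(module, nn.Linear):
        nn.init.kaiming_normal_(module.weight, nonlinearity='relu')
        if module.bias is not None:
            nn.init.zeros_(module.bias)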
        noisy_batch_var = noisy_batch_var.to(device)
        outputs = model(noisy_batch_var)
        loss = MSE(outputs, clean_batch_var)

        # back-propagate and update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # record scalar data for TensorBoard
        tbwriter.add_scalar('loss', loss.item(), total_steps)
        if (i + 1) % 100 == 0:
            print('Epoch {}\t'
                  'Step {}\t'
                  'loss {:.5f}'.format(epoch + 1, i + 1, loss.item()))
        total_steps += 1

    # save various states
    state_path = os.path.join(checkpoint_path, 'state-{}.pkl'.format(epoch + 1))
    state = {
        'DNN': model.state_dict(),
        'optimizer': optimizer.state_dict(),
    }
    torch.save(state, state_path)

tbwriter.close()
print('Finished Training!')
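
# A hypothetical helper showing how the per-epoch state saved above could be
# restored to resume training; the 'DNN' and 'optimizer' keys match the `state`
# dict written in the loop, but the exact resume logic is an assumption.
def load_checkpoint(model, optimizer, state_path, device):
    state = torch.load(state_path, map_location=device)
    model.load_state_dict(state['DNN'])
    optimizer.load_state_dict(state['optimizer'])
    return model, optimizer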