def run(config, num_checkpoints, cuda=False):
    train_joint_transform_list, train_img_transform, train_label_transform = get_transforms(
        config, mode="train")
    val_joint_transform_list, val_img_transform, val_label_transform = None, None, None

    train_dataset = DataSet(mode="train",
                            joint_transform_list=train_joint_transform_list,
                            img_transform=train_img_transform,
                            label_transform=train_label_transform)
    val_dataset = DataSet(mode="val",
                          joint_transform_list=val_joint_transform_list,
                          img_transform=val_img_transform,
                          label_transform=val_label_transform)
    train_loader = data.DataLoader(train_dataset, batch_size=config.batch_size,
                                   shuffle=True, num_workers=config.num_workers,
                                   drop_last=True)
    val_loader = data.DataLoader(val_dataset, batch_size=config.batch_size,
                                 shuffle=False, num_workers=config.num_workers)

    criterion, val_criterion = get_loss(config, cuda=cuda)
    model = get_net(config, criterion, cuda=cuda)

    checkpoints = get_checkpoints(config, num_checkpoints)
    print("[*] Checkpoints as follows:")
    pprint.pprint(checkpoints)

    # Load the first checkpoint, then fold each remaining checkpoint into a running average.
    util_checkpoint.load_checkpoint(model, None, checkpoints[0])
    for i, checkpoint in enumerate(checkpoints[1:]):
        model2 = get_net(config, criterion, cuda=cuda)
        util_checkpoint.load_checkpoint(model2, None, checkpoint)
        swa.moving_average(model, model2, 1. / (i + 2))

    # Recompute BatchNorm statistics for the averaged weights.
    with torch.no_grad():
        swa.update_bn(train_loader, model, cuda=cuda)

    # Save the averaged model.
    output_name = "model-swa.pth"
    checkpoint_dir = os.path.join(ROOT_DIR, LOG_DIR, os.path.basename(config.model_dir))
    util_checkpoint.save_checkpoint(checkpoint_dir, output_name, model)
    print(f"[*] SAVED: to {output_name}")

    # Evaluate the averaged model on the validation set.
    scores = validation(config, val_loader, model, val_criterion, "swa",
                        cuda=cuda, is_record=False)
    print(scores)
    with open(os.path.join(checkpoint_dir, "swa-scores.json"), "w") as f:
        json.dump(scores["FWIOU"], f)
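# NOTE: a minimal sketch of the cumulative averaging done by swa.moving_average above,
# assuming it follows the reference SWA utilities (the project-specific `swa` module may
# differ). With alpha = 1 / (i + 2), the in-place update w <- (1 - alpha) * w + alpha * w_new
# keeps `net1` equal to the uniform mean of all checkpoints folded in so far; update_bn then
# refreshes the BatchNorm running statistics with a forward pass over the training data.
def moving_average_sketch(net1, net2, alpha):
    # Hypothetical helper, not part of the project code above.
    for p1, p2 in zip(net1.parameters(), net2.parameters()):
        p1.data.mul_(1.0 - alpha).add_(p2.data, alpha=alpha)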
def main(mode, epoches, learning_rate, train_path, val_path, batch_size, model_path):
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    model = Network(7).to(device)

    if os.path.isfile(model_path):
        print("loading model")
        model.load_state_dict(torch.load(model_path))

    if mode == "train":
        train_loader = dataloader(train_path, batch_size)
        val_loader = dataloader(val_path, batch_size)
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)
        loss_F = nn.NLLLoss()
        train(epoches, model, train_loader, val_loader, optimizer, loss_F, device)
        torch.save(model.state_dict(), model_path)
    else:
        val_loader = dataloader(val_path, batch_size)
        acc = validation(model, val_loader, device)
        print("\taccuracy: %.2f%%" % acc)
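# NOTE: a minimal sketch of the `validation` helper used above, assuming the model returns
# log-probabilities (to match nn.NLLLoss) and accuracy is reported as a percentage; the
# actual implementation may differ.
def validation_sketch(model, val_loader, device):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device)
            preds = model(images).argmax(dim=1)  # class with the highest log-probability
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    return 100.0 * correct / total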
# train
train_loss, train_f1_macro, train_f1_micro, train_stat = train(
    epoch, model, args.output_dim, optimizer, scheduler, criterion, params['alpha'],
    train_loader, device, args.log_interval, append_line_to_log, checkPath)
# train_loss, train_f1 = train(epoch, model, optimizer, scheduler, criterion, train_loader, device, args.log_interval, append_line_to_log, checkPath)
history["train_loss"].append(train_loss)
history["train_f1_macro"].append(train_f1_macro)
history["train_f1_micro"].append(train_f1_micro)

train_stat_filename = stat_train_path + "/train_" + str(epoch) + ".pkl"
with open(train_stat_filename, "wb") as fout:
    pickle.dump(train_stat, fout)

# validation
valid_loss, valid_f1_macro, valid_f1_micro, valid_stat = validation(
    epoch, model, args.output_dim, criterion, params['alpha'],
    valid_loader, device, append_line_to_log)
# valid_loss, valid_f1 = validation(epoch, model, criterion, valid_loader, device, append_line_to_log)
history["valid_loss"].append(valid_loss)
history["valid_f1_macro"].append(valid_f1_macro)
history["valid_f1_micro"].append(valid_f1_micro)

valid_stat_filename = stat_valid_path + "/valid_" + str(epoch) + ".pkl"
with open(valid_stat_filename, "wb") as fout:
    pickle.dump(valid_stat, fout)

scheduler.step(train_loss)

# save the model of this epoch
model_file = "/model_" + str(epoch) + ".pth"
model_file = modelPath + model_file
torch.save(model.state_dict(), model_file)
def pseudo_labeling(num_epochs, model, data_loader, val_loader, unlabeled_loader, device,
                    val_every, file_name):
    # Instead of the current epoch, a separate "step" counter drives alpha_weight;
    # this helps the model converge faster.
    from torch.optim.swa_utils import AveragedModel, SWALR
    from segmentation_models_pytorch.losses import SoftCrossEntropyLoss, JaccardLoss
    from adamp import AdamP

    criterion = [SoftCrossEntropyLoss(smooth_factor=0.1),
                 JaccardLoss('multiclass', classes=12)]
    optimizer = AdamP(params=model.parameters(), lr=0.0001, weight_decay=1e-6)
    swa_scheduler = SWALR(optimizer, swa_lr=0.0001)
    swa_model = AveragedModel(model)
    optimizer = Lookahead(optimizer, la_alpha=0.5)

    step = 100
    size = 256
    best_mIoU = 0
    model.train()
    print('Start Pseudo-Labeling..')

    for epoch in range(num_epochs):
        hist = np.zeros((12, 12))
        for batch_idx, (imgs, image_infos) in enumerate(unlabeled_loader):
            # Forward pass on the unlabeled batch to generate the pseudo labels
            model.eval()
            outs = model(torch.stack(imgs).to(device))
            oms = torch.argmax(outs.squeeze(), dim=1).detach().cpu().numpy()
            oms = torch.Tensor(oms)
            oms = oms.long()
            oms = oms.to(device)

            # Train on the pseudo labels
            model.train()
            imgs = torch.stack(imgs)
            imgs = imgs.to(device)
            output = model(imgs)

            # Unlabeled loss, scaled by the pseudo-labeling ramp alpha_weight(step)
            loss = 0
            for each in criterion:
                loss += each(output, oms)
            unlabeled_loss = alpha_weight(step) * loss

            # Backpropagate
            optimizer.zero_grad()
            unlabeled_loss.backward()
            optimizer.step()

            output = torch.argmax(output.squeeze(), dim=1).detach().cpu().numpy()
            hist = add_hist(hist, oms.detach().cpu().numpy(), output, n_class=12)

            if (batch_idx + 1) % 25 == 0:
                acc, acc_cls, mIoU, fwavacc = label_accuracy_score(hist)
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, mIoU: {:.4f}'.format(
                    epoch + 1, num_epochs, batch_idx + 1, len(unlabeled_loader),
                    unlabeled_loss.item(), mIoU))

            # Every 50 batches, train one pass over the labeled data
            if batch_idx % 50 == 0:
                # Normal training procedure (separate loop variable so batch_idx is not shadowed)
                for labeled_idx, (images, masks, _) in enumerate(data_loader):
                    labeled_loss = 0
                    images = torch.stack(images)        # (batch, channel, height, width)
                    masks = torch.stack(masks).long()

                    # Move tensors to the device for GPU computation
                    images, masks = images.to(device), masks.to(device)
                    output = model(images)
                    for each in criterion:
                        labeled_loss += each(output, masks)

                    optimizer.zero_grad()
                    labeled_loss.backward()
                    optimizer.step()

                # Now increment step by 1
                step += 1

        if (epoch + 1) % val_every == 0:
            avrg_loss, val_mIoU = validation(epoch + 1, model, val_loader, criterion, device)
            if val_mIoU > best_mIoU:
                print('Best performance at epoch: {}'.format(epoch + 1))
                print('Save model in', saved_dir)
                best_mIoU = val_mIoU
                save_model(model, file_name=file_name)
            model.train()

        if epoch > 3:
            swa_model.update_parameters(model)
            swa_scheduler.step()
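# NOTE: a minimal sketch of the `alpha_weight` ramp used above, assuming the schedule from
# the standard pseudo-labeling recipe: the unlabeled-loss weight rises linearly from 0 to
# alpha_f between steps T1 and T2 and stays constant afterwards. T1, T2 and alpha_f are
# illustrative values, not taken from the code above.
def alpha_weight_sketch(step, T1=100, T2=600, alpha_f=3.0):
    if step < T1:
        return 0.0
    if step < T2:
        return (step - T1) / (T2 - T1) * alpha_f
    return alpha_f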
def main():
    # argument parsing
    parser = argparse.ArgumentParser()
    parser.add_argument('--max-epochs', type=int, default=2)
    parser.add_argument('--batch-size', type=int, default=4)
    parser.add_argument('--max-sequence-length', type=int, default=128)
    parser.add_argument('--seed', type=int, default=None)
    parser.add_argument('--data-dir', type=str, default='data')
    parser.add_argument('--real-dataset', type=str, default='webtext')
    parser.add_argument('--fake-dataset', type=str, default='xl-1542M-nucleus')
    parser.add_argument('--save-dir', type=str, default='bert_logs')
    parser.add_argument('--learning-rate', type=float, default=2e-5)
    parser.add_argument('--weight-decay', type=float, default=0)
    parser.add_argument('--model-name', type=str, default='bert-base-cased')
    parser.add_argument('--wandb', type=bool, default=True)
    args = parser.parse_args()

    if args.wandb:
        wandb.init(project=args.model_name)

    device = "cuda" if torch.cuda.is_available() else "cpu"

    # config, tokenizer, model
    config = AutoConfig.from_pretrained(args.model_name, num_labels=2)
    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
    tokenization_utils.logger.setLevel('DEBUG')
    model = AutoModelForSequenceClassification.from_pretrained(args.model_name, config=config)
    model.to(device)

    # load data
    train_loader, validation_loader, test_loader = load_datasets(args, tokenizer)

    # optimizer
    optimizer = AdamW(model.parameters(), lr=args.learning_rate,
                      weight_decay=args.weight_decay)

    best_val = 0.
    for epoch in range(args.max_epochs):
        train(model, optimizer, train_loader, args, device)
        val_acc = validation(model, validation_loader, args, device)
        test_acc = test(model, test_loader, args, device)
        print(f"Epoch {epoch + 1} | val_acc: {val_acc} test_acc: {test_acc}")

        # keep the checkpoint with the best validation accuracy
        if val_acc > best_val:
            os.makedirs(args.save_dir, exist_ok=True)
            model_name = 'baseline_' + args.model_name + '.pt'
            model_to_save = model.module if hasattr(model, 'module') else model
            torch.save(dict(epoch=epoch + 1,
                            model_state_dict=model_to_save.state_dict(),
                            optimizer_state_dict=optimizer.state_dict(),
                            args=args),
                       os.path.join(args.save_dir, model_name))
            print("Model saved to", args.save_dir)
            best_val = val_acc
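# NOTE: a minimal sketch of reloading the checkpoint dict saved above (epoch, model and
# optimizer state, args) to resume or evaluate; the helper name and path are illustrative.
def load_baseline_checkpoint_sketch(path, model, optimizer, device):
    ckpt = torch.load(path, map_location=device)
    model.load_state_dict(ckpt['model_state_dict'])
    optimizer.load_state_dict(ckpt['optimizer_state_dict'])
    return ckpt['epoch']  # last completed epoch stored by the training loop above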
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ]))
test_data_set = CIFAR10('./data', train=False, download=True,
                        transform=transforms.Compose([
                            transforms.ToTensor(),
                            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
                        ]))
triplet_train_data_loader, test_data_loader = createTripletDataLoaders(train_data_set, test_data_set)
model = TripletNet()

argumento = input('Type train for Train or val for Validation: ')
if argumento == 'train':
    train(model, triplet_train_data_loader, device, PATH)
elif argumento == 'val':
    config = {
        'batch_size': 128,
        'num_workers': 2
    }
    train_data_loader = DataLoader(train_data_set, **config)
    validation(train_data_loader, test_data_loader, device, PATH)
# Loop through each epoch.
print('Epoch')
for epoch in tqdm(range(epochs)):
    print()
    print('Training on batches...')
    # Perform one full pass over the training set.
    train_labels, train_predict, train_loss = train(train_dataloader, model, optimizer,
                                                    scheduler, device, scaler)
    train_acc = accuracy_score(train_labels, train_predict)

    # Get predictions from the model on the validation data.
    print('Validation on batches...')
    valid_labels, valid_predict, val_loss = validation(valid_dataloader, model, device)
    val_acc = accuracy_score(valid_labels, valid_predict)

    # Print loss and accuracy values to see how training evolves.
    print(" train_loss: %.5f - val_loss: %.5f - train_acc: %.5f - valid_acc: %.5f"
          % (train_loss, val_loss, train_acc, val_acc))
    print()

    # Store the loss and accuracy values for plotting the learning curves.
    all_loss['train_loss'].append(train_loss)
    all_loss['val_loss'].append(val_loss)
    all_acc['train_acc'].append(train_acc)
    all_acc['val_acc'].append(val_acc)

save_path = "/home/jovyan/data-vol-1/gpt2/fine_tuned_models/test_gp2_full"
# set the best validation loss to infinity
best_val_loss = np.inf

# training process
start_epoch = 1
for epoch in range(start_epoch, args.epochs + 1):
    # train
    train_loss, train_acc = train(epoch, model, optimizer, criterions, train_loader, device,
                                  args.log_interval, append_line_to_log, checkPath)
    history["train_loss"].append(train_loss)
    history["train_acc"].append(train_acc)

    # validation
    valid_loss, valid_acc = validation(epoch, model, criterions, valid_loader, device,
                                       append_line_to_log)
    history["valid_loss"].append(valid_loss)
    history["valid_acc"].append(valid_acc)

    # scheduler.step(valid_loss)

    # save the best model
    is_best = valid_loss < best_val_loss
    best_val_loss = min(valid_loss, best_val_loss)
    if is_best:
        best_model_file = "/best_model_" + str(epoch) + ".pth"
        best_model_file = bestPath + best_model_file
        torch.save(model.state_dict(), best_model_file)

    # save the model of this epoch