def train_ensamble():
    """Training function for the weighted ensemble."""
    X_train, y_train, X_test, y_test = load_train_data(
        preds_list=p_list, mode='train')
    train_dataset = MyDataset(X_train, y_train)
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True,
        num_workers=4, pin_memory=True, sampler=None)
    val_dataset = MyDataset(X_test, y_test)
    val_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=batch_size, shuffle=False,
        num_workers=4, pin_memory=True)
    print(f'[+] training with {len(train_dataset)} samples, '
          f'validation with {len(val_dataset)} samples')
    model = WeightedEnsambleModel(num_classes, len(p_list))
    criterion = torch.nn.CrossEntropyLoss().cuda()
    EPOCHS = 100
    min_loss = float("inf")
    lr = 0.001
    patience = 0
    for epoch in range(EPOCHS):
        print(f'[+] epoch {epoch}')
        # after 3 epochs without improvement, reload the best checkpoint
        # and reduce the learning rate
        if patience == 3:
            patience = 0
            model.load_state_dict(torch.load(best_checkpoint_file))
            lr /= 3
            print(f'[+] set lr={lr}')
        optimizer = torch.optim.Adam(
            filter(lambda p: p.requires_grad, model.parameters()), lr=lr)
        # train for one epoch
        utils.train_one_epoch(train_loader, model, criterion, optimizer, epoch)
        # evaluate on the validation set after each epoch
        log_loss = utils.validate(val_loader, model, criterion)
        if log_loss < min_loss:
            torch.save(model.state_dict(), best_checkpoint_file)
            print(f'[+] lr = {lr}, val loss improved from '
                  f'{min_loss:.5f} to {log_loss:.5f}. Saved!')
            min_loss = log_loss
            patience = 0
        else:
            patience += 1
    print(f'[*] training done with {EPOCHS} epochs')
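import torch

# --- A minimal sketch (assumptions, not the repo's actual classes) of the two
# pieces train_ensamble() relies on: MyDataset wraps stacked per-model
# predictions, and WeightedEnsambleModel learns one weight per base model and
# takes a softmax-weighted sum of their class scores.
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, X, y):
        # X: (n_samples, n_models, num_classes) stacked predictions
        self.X = torch.as_tensor(X, dtype=torch.float32)
        self.y = torch.as_tensor(y, dtype=torch.long)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


class WeightedEnsambleModel(torch.nn.Module):
    def __init__(self, num_classes, n_models):
        super().__init__()
        # one learnable weight per base model, normalized with softmax
        self.weights = torch.nn.Parameter(torch.ones(n_models))

    def forward(self, x):
        # x: (batch, n_models, num_classes) -> (batch, num_classes)
        w = torch.softmax(self.weights, dim=0)
        return (x * w.view(1, -1, 1)).sum(dim=1)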
def main(args):
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')
    print("load dataset")
    num_classes = 2
    data = HandDataset(args.data_path, utils.get_transform(train=True))
    indices = torch.randperm(len(data)).tolist()
    test_cnt = int(len(data) / 10)
    dataset = torch.utils.data.Subset(data, indices[:-test_cnt])
    # the held-out indices index the full dataset, not the train subset
    dataset_test = torch.utils.data.Subset(data, indices[-test_cnt:])
    # define training and validation data loaders
    data_loader = torch.utils.data.DataLoader(
        dataset, batch_size=2, shuffle=True, num_workers=4,
        collate_fn=lambda x: tuple(zip(*x)))
    data_loader_test = torch.utils.data.DataLoader(
        dataset_test, batch_size=1, shuffle=False, num_workers=4,
        collate_fn=lambda x: tuple(zip(*x)))
    print("load model")
    model = MaskRcnn.get_pretrained_resnet50_model(num_classes)
    model.to(device)
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.005,
                                momentum=0.9, weight_decay=0.0005)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=3,
                                                   gamma=0.1)
    print("begin train")
    num_epochs = 10
    for epoch in range(num_epochs):
        utils.train_one_epoch(model, optimizer, data_loader, device, epoch,
                              print_freq=10)
        lr_scheduler.step()
        # evaluate(model, data_loader_test, device=device)
    print("That's it!")
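import torch

# --- A sketch (an assumption that HandDataset follows the standard torchvision
# detection contract) of what each __getitem__ must return for the zip(*x)
# collate_fn and the Mask R-CNN losses to work: an image tensor plus a target
# dict with boxes, labels, and instance masks.
def example_item(h=256, w=256):
    image = torch.rand(3, h, w)                       # FloatTensor[C, H, W] in [0, 1]
    target = {
        "boxes": torch.tensor([[10.0, 10.0, 100.0, 120.0]]),  # FloatTensor[N, 4], (x1, y1, x2, y2)
        "labels": torch.tensor([1], dtype=torch.int64),       # Int64Tensor[N]
        "masks": torch.zeros(1, h, w, dtype=torch.uint8),     # UInt8Tensor[N, H, W]
    }
    return image, target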
def train(lr=0.1, batch_size=64, max_epoch=700, rs=7, save=False, title='0',
          outdir='fig3', resume_model=None, resume_epoch=0,
          half_dataset=False):
    experiment_dir = os.path.join('exp', title,
                                  datetime.now().strftime('%b%d_%H-%M-%S'))
    os.makedirs(experiment_dir, exist_ok=True)
    # Set the seed
    torch.manual_seed(rs)
    np.random.seed(rs)
    if torch.cuda.is_available():
        device = torch.device('cuda:0')
    else:
        device = torch.device('cpu')
    model = models.get_model('resnet18').to(device)
    if resume_model is not None:
        model = resume_model
    loaders = datasets.get_dataset(
        'first_half_cifar10' if half_dataset else 'cifar10',
        batch_size=batch_size)
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    criterion = torch.nn.CrossEntropyLoss()
    start = time.time()
    for epoch in range(resume_epoch, max_epoch + 1):
        print(f"Epoch {epoch}")
        train_loss, train_accuracy = train_one_epoch(
            device, model, optimizer, criterion, loaders["train_loader"])
        print("Train accuracy: {} Train loss: {}".format(
            train_accuracy, train_loss))
        test_accuracy = 0
        if not half_dataset:
            test_loss, test_accuracy = eval_on_dataloader(
                device, criterion, model, loaders['test_loader'])
            print("Test accuracy: {} Test loss: {}".format(
                test_accuracy, test_loss))
        if train_accuracy > 0.99:
            cost = time.time() - start
            return train_accuracy, test_accuracy, cost, model
    # never reached 99% train accuracy within max_epoch
    return 0, 0, 0, None
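import torch

# --- A minimal sketch (an assumption about helpers defined elsewhere in the
# repo) of the train_one_epoch / eval_on_dataloader pair these scripts share:
# both make one pass over the loader and return (mean_loss, accuracy).
def train_one_epoch(device, model, optimizer, criterion, loader):
    model.train()
    total_loss, correct, seen = 0.0, 0, 0
    for inputs, targets in loader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * targets.size(0)
        correct += (outputs.argmax(dim=1) == targets).sum().item()
        seen += targets.size(0)
    return total_loss / seen, correct / seen


def eval_on_dataloader(device, criterion, model, loader):
    model.eval()
    total_loss, correct, seen = 0.0, 0, 0
    with torch.no_grad():
        for inputs, targets in loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            total_loss += criterion(outputs, targets).item() * targets.size(0)
            correct += (outputs.argmax(dim=1) == targets).sum().item()
            seen += targets.size(0)
    return total_loss / seen, correct / seen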
def main():
    args = parse_args()
    path = args.path
    dataset = args.dataset
    # parse the layer sizes, e.g. "[64,32,16]", without a bare eval()
    layers = ast.literal_eval(args.layers)
    weight_decay = args.weight_decay
    num_negatives_train = args.num_neg_train
    num_negatives_test = args.num_neg_test
    dropout = args.dropout
    learner = args.learner
    learning_rate = args.lr
    batch_size = args.batch_size
    epochs = args.epochs
    verbose = args.verbose
    topK = 10
    print("MLP arguments: %s " % (args))
    # model_out_file = 'Pretrain/%s_MLP_%s_%d.h5' % (args.dataset, args.layers, time())

    # Load data
    t1 = time()
    full_dataset = MovieDataset(
        path + dataset,
        num_negatives_train=num_negatives_train,
        num_negatives_test=num_negatives_test)
    train, testRatings, testNegatives = (full_dataset.trainMatrix,
                                         full_dataset.testRatings,
                                         full_dataset.testNegatives)
    num_users, num_items = train.shape
    print("Load data done [%.1f s]. #user=%d, #item=%d, #train=%d, #test=%d"
          % (time() - t1, num_users, num_items, train.nnz, len(testRatings)))
    training_data_generator = DataLoader(
        full_dataset, batch_size=batch_size, shuffle=True, num_workers=0)

    # Build model
    model = MLP(num_users, num_items, layers=layers, dropout=dropout)
    # Transfer the model to GPU, if one is available
    model.to(device)
    if verbose:
        print(model)

    loss_fn = torch.nn.BCELoss()
    # Use Adam optimizer
    optimizer = torch.optim.Adam(model.parameters(), weight_decay=weight_decay)

    # Record performance
    hr_list = []
    ndcg_list = []
    BCE_loss_list = []

    # Check initial (pre-training) performance
    hr, ndcg = test(model, full_dataset, topK)
    hr_list.append(hr)
    ndcg_list.append(ndcg)
    BCE_loss_list.append(1)

    # do the epochs now
    for epoch in range(epochs):
        epoch_loss = train_one_epoch(
            model, training_data_generator, loss_fn, optimizer, epoch, device)
        if epoch % verbose == 0:
            hr, ndcg = test(model, full_dataset, topK)
            hr_list.append(hr)
            ndcg_list.append(ndcg)
            BCE_loss_list.append(epoch_loss)
            # if hr > best_hr:
            #     best_hr, best_ndcg, best_iter = hr, ndcg, epoch
            #     if args.out > 0:
            #         model.save(model_out_file, overwrite=True)
    print("hr for epochs: ", hr_list)
    print("ndcg for epochs: ", ndcg_list)
    print("loss for epochs: ", BCE_loss_list)

    # note: index 0 of hr_list is the pre-training evaluation
    best_iter = np.argmax(np.array(hr_list))
    best_hr = hr_list[best_iter]
    best_ndcg = ndcg_list[best_iter]
    print("End. Best Iteration %d: HR = %.4f, NDCG = %.4f. "
          % (best_iter, best_hr, best_ndcg))
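import numpy as np

# --- A sketch (an assumption about test()'s internals) of leave-one-out Hit
# Ratio / NDCG evaluation: rank the held-out positive item among its sampled
# negatives and check whether it lands in the top-K.
def hit_ratio_ndcg(ranked_items, gt_item, k=10):
    topk = ranked_items[:k]
    if gt_item not in topk:
        return 0.0, 0.0
    rank = topk.index(gt_item)          # 0-based position in the top-K
    hr = 1.0
    ndcg = 1.0 / np.log2(rank + 2)      # DCG of a single relevant item
    return hr, ndcg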
def main(args):
    device = torch.device(args.device if torch.cuda.is_available() else "cpu")

    if not os.path.exists("./weights"):
        os.makedirs("./weights")

    tb_writer = SummaryWriter()

    train_images_path, train_images_label, val_images_path, val_images_label = read_split_data(
        args.data_path)

    img_size = 224
    data_transform = {
        "train": transforms.Compose([
            transforms.RandomResizedCrop(img_size),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406],
                                 [0.229, 0.224, 0.225])
        ]),
        "val": transforms.Compose([
            transforms.Resize(int(img_size * 1.143)),  # 224 * 1.143 ≈ 256
            transforms.CenterCrop(img_size),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406],
                                 [0.229, 0.224, 0.225])
        ])
    }

    # instantiate the training dataset
    train_dataset = MyDataSet(images_path=train_images_path,
                              images_class=train_images_label,
                              transform=data_transform["train"])
    # instantiate the validation dataset
    val_dataset = MyDataSet(images_path=val_images_path,
                            images_class=val_images_label,
                            transform=data_transform["val"])

    batch_size = args.batch_size
    nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8])  # number of workers
    print('Using {} dataloader workers per process'.format(nw))

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True,
        num_workers=nw, collate_fn=train_dataset.collate_fn)
    val_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=batch_size, shuffle=False, pin_memory=True,
        num_workers=nw, collate_fn=val_dataset.collate_fn)

    model = create_model(num_classes=args.num_classes).to(device)

    if args.weights != "":
        assert os.path.exists(args.weights), \
            "weights file '{}' does not exist.".format(args.weights)
        weights_dict = torch.load(args.weights, map_location=device)["model"]
        # drop the classification-head weights, which depend on num_classes
        for k in list(weights_dict.keys()):
            if "head" in k:
                del weights_dict[k]
        print(model.load_state_dict(weights_dict, strict=False))

    if args.freeze_layers:
        for name, para in model.named_parameters():
            # freeze everything except the head
            if "head" not in name:
                para.requires_grad_(False)
            else:
                print("training {}".format(name))

    pg = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.AdamW(pg, lr=args.lr, weight_decay=5E-2)

    for epoch in range(args.epochs):
        # train
        train_loss, train_acc = train_one_epoch(model=model,
                                                optimizer=optimizer,
                                                data_loader=train_loader,
                                                device=device,
                                                epoch=epoch)
        # validate
        val_loss, val_acc = evaluate(model=model,
                                     data_loader=val_loader,
                                     device=device,
                                     epoch=epoch)

        tags = ["train_loss", "train_acc", "val_loss", "val_acc",
                "learning_rate"]
        tb_writer.add_scalar(tags[0], train_loss, epoch)
        tb_writer.add_scalar(tags[1], train_acc, epoch)
        tb_writer.add_scalar(tags[2], val_loss, epoch)
        tb_writer.add_scalar(tags[3], val_acc, epoch)
        tb_writer.add_scalar(tags[4], optimizer.param_groups[0]["lr"], epoch)

        torch.save(model.state_dict(), "./weights/model-{}.pth".format(epoch))
def main(args):
    device = torch.device(args.device if torch.cuda.is_available() else "cpu")
    print(args)
    print('Start Tensorboard with "tensorboard --logdir=runs", '
          'view at http://localhost:6006/')
    tb_writer = SummaryWriter()
    if not os.path.exists("./weights"):
        os.makedirs("./weights")

    train_images_path, train_images_label, val_images_path, val_images_label = read_split_data(
        args.data_path)

    data_transform = {
        "train": transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406],
                                 [0.229, 0.224, 0.225])
        ]),
        "val": transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406],
                                 [0.229, 0.224, 0.225])
        ])
    }

    # instantiate the training dataset
    train_dataset = MyDataSet(images_path=train_images_path,
                              images_class=train_images_label,
                              transform=data_transform["train"])
    # instantiate the validation dataset
    val_dataset = MyDataSet(images_path=val_images_path,
                            images_class=val_images_label,
                            transform=data_transform["val"])

    batch_size = args.batch_size
    nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8])  # number of workers
    print('Using {} dataloader workers per process'.format(nw))

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True,
        num_workers=nw, collate_fn=train_dataset.collate_fn)
    val_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=batch_size, shuffle=False, pin_memory=True,
        num_workers=nw, collate_fn=val_dataset.collate_fn)

    # load pretrained weights if provided
    model = densenet121(num_classes=args.num_classes).to(device)
    if args.weights != "":
        if os.path.exists(args.weights):
            load_state_dict(model, args.weights)
        else:
            raise FileNotFoundError(
                "weights file not found: {}".format(args.weights))

    # optionally freeze weights
    if args.freeze_layers:
        for name, para in model.named_parameters():
            # freeze everything except the final classifier
            if "classifier" not in name:
                para.requires_grad_(False)

    pg = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.SGD(pg, lr=args.lr, momentum=0.9, weight_decay=1E-4,
                          nesterov=True)
    # Scheduler https://arxiv.org/pdf/1812.01187.pdf
    lf = lambda x: ((1 + math.cos(x * math.pi / args.epochs)) / 2) * (1 - args.lrf) + args.lrf  # cosine
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)

    for epoch in range(args.epochs):
        # train
        mean_loss = train_one_epoch(model=model,
                                    optimizer=optimizer,
                                    data_loader=train_loader,
                                    device=device,
                                    epoch=epoch)
        scheduler.step()

        # validate
        acc = evaluate(model=model, data_loader=val_loader, device=device)
        print("[epoch {}] accuracy: {}".format(epoch, round(acc, 3)))

        tags = ["loss", "accuracy", "learning_rate"]
        tb_writer.add_scalar(tags[0], mean_loss, epoch)
        tb_writer.add_scalar(tags[1], acc, epoch)
        tb_writer.add_scalar(tags[2], optimizer.param_groups[0]["lr"], epoch)

        torch.save(model.state_dict(), "./weights/model-{}.pth".format(epoch))
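import math

# --- Sanity check of the cosine schedule above: the multiplier lf(x) decays
# smoothly from 1.0 at epoch 0 to args.lrf at the final epoch. With assumed
# values epochs=10 and lrf=0.1:
epochs, lrf = 10, 0.1  # hypothetical values for illustration
lf_demo = lambda x: ((1 + math.cos(x * math.pi / epochs)) / 2) * (1 - lrf) + lrf
print([round(lf_demo(x), 3) for x in range(epochs + 1)])
# [1.0, 0.978, 0.914, 0.815, 0.689, 0.55, 0.411, 0.285, 0.186, 0.122, 0.1]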
def main(args):
    print("Running with arguments:")
    args_dict = {}
    for key in vars(args):
        if key == "default_function":
            continue
        args_dict[key] = getattr(args, key)
        print(key, ": ", args_dict[key])
    print("---")

    experiment_time = datetime.now().strftime('%b%d_%H-%M-%S')
    experiment_dir = os.path.join('exp', args.title, experiment_time)
    os.makedirs(experiment_dir)
    with open(os.path.join(experiment_dir, "config.json"), "w") as f:
        json.dump(args_dict, f, indent=4, sort_keys=True,
                  default=lambda x: x.__name__)

    if torch.cuda.is_available():
        device = torch.device(f'cuda:{args.gpu_id}')
        print("CUDA Recognized")
    else:
        device = torch.device('cpu')

    # Training with random initialization
    overall_result = {}
    init_type = "random"
    dataset_result = {}
    for (dataset_name, num_classes) in [("cifar10", 10), ("cifar100", 100),
                                        ("svhn", 10)]:
        model_args = {
            "resnet18": {"num_classes": num_classes},
            "mlp": {"input_dim": 32 * 32 * 3, "num_classes": num_classes,
                    'activation': 'tanh', 'bias': True},
            "logistic": {"input_dim": 32 * 32 * 3, "num_classes": num_classes},
        }
        optimizer_result = {}
        for optimizer_name in ["adam", "sgd", "sgd-momentum"]:
            model_result = {}
            for model_name in ["mlp", "logistic", "resnet18"]:
                print(f"Training model {model_name} on {dataset_name} "
                      f"with {optimizer_name} optimizer.")
                torch.manual_seed(args.random_seed)
                np.random.seed(args.random_seed)
                model = models.get_model(model_name,
                                         **model_args[model_name]).to(device)
                loaders = datasets.get_dataset(dataset_name)
                criterion = torch.nn.CrossEntropyLoss()
                if optimizer_name == "adam":
                    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
                elif optimizer_name == "sgd-momentum":
                    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
                                                momentum=0.9)
                else:
                    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr)

                train_accuracies = []
                stop_indicator = False
                epoch = 0
                while not stop_indicator:
                    if epoch % 5 == 0:
                        print(f"\tTraining in epoch {epoch + 1}")
                    train_loss, train_accuracy = train_one_epoch(
                        device, model, optimizer, criterion,
                        loaders['train_loader'])
                    train_loss, train_accuracy = eval_on_dataloader(
                        device, criterion, model, loaders['train_loader'])
                    train_accuracies.append(train_accuracy)
                    epoch += 1

                    if train_accuracy >= 0.99:
                        print("Convergence condition met. "
                              "Training accuracy >= 0.99")
                        stop_indicator = True
                    if len(train_accuracies) >= args.convergence_epochs:
                        if np.std(train_accuracies[-args.convergence_epochs:]) \
                                < args.convergence_accuracy_change_threshold:
                            print(f"\tConvergence condition met. Training "
                                  f"accuracy = {train_accuracy} stopped improving")
                            stop_indicator = True

                test_loss, test_accuracy = eval_on_dataloader(
                    device, criterion, model, loaders['test_loader'])
                print(f"\tTest accuracy = {test_accuracy}")
                model_result[model_name] = test_accuracy
            optimizer_result[optimizer_name] = model_result
        dataset_result[dataset_name] = optimizer_result
    overall_result[init_type] = dataset_result

    # Training with warm start
    init_type = "warm-start"
    dataset_result = {}
    for (dataset_name, num_classes) in [("cifar10", 10), ("cifar100", 100),
                                        ("svhn", 10)]:
        model_args = {
            "resnet18": {"num_classes": num_classes},
            "mlp": {"input_dim": 32 * 32 * 3, "num_classes": num_classes},
            "logistic": {"input_dim": 32 * 32 * 3, "num_classes": num_classes},
        }
        optimizer_result = {}
        for optimizer_name in ["adam", "sgd", "sgd-momentum"]:
            model_result = {}
            for model_name in ["mlp", "logistic", "resnet18"]:
                print(f"Training model {model_name} on half of {dataset_name} "
                      f"with {optimizer_name} optimizer.")
                torch.manual_seed(args.random_seed)
                np.random.seed(args.random_seed)
                model = models.get_model(model_name,
                                         **model_args[model_name]).to(device)
                loaders = datasets.get_dataset(f"half_{dataset_name}")
                criterion = torch.nn.CrossEntropyLoss()
                if optimizer_name == "adam":
                    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
                elif optimizer_name == "sgd-momentum":
                    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
                                                momentum=0.9)
                else:
                    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr)

                # Phase 1: pre-train on half the dataset
                train_accuracies = []
                stop_indicator = False
                epoch = 0
                while not stop_indicator:
                    if epoch % 5 == 0:
                        print(f"\tPre-training in epoch {epoch + 1}")
                    train_loss, train_accuracy = train_one_epoch(
                        device, model, optimizer, criterion,
                        loaders['train_loader'])
                    train_loss, train_accuracy = eval_on_dataloader(
                        device, criterion, model, loaders['train_loader'])
                    train_accuracies.append(train_accuracy)
                    epoch += 1
                    if train_accuracy >= 0.99:
                        print("Convergence condition met. "
                              "Training accuracy >= 0.99")
                        stop_indicator = True
                    if len(train_accuracies) >= args.convergence_epochs:
                        if np.std(train_accuracies[-args.convergence_epochs:]) \
                                < args.convergence_accuracy_change_threshold:
                            print(f"\tConvergence condition met. Training "
                                  f"accuracy = {train_accuracy} stopped improving")
                            stop_indicator = True

                # Phase 2: continue training on the full dataset
                loaders = datasets.get_dataset(f"{dataset_name}")
                criterion = torch.nn.CrossEntropyLoss()
                train_accuracies = []
                stop_indicator = False
                epoch = 0
                while not stop_indicator:
                    if epoch % 5 == 0:
                        print(f"\tTraining in epoch {epoch + 1}")
                    train_loss, train_accuracy = train_one_epoch(
                        device, model, optimizer, criterion,
                        loaders['train_loader'])
                    train_loss, train_accuracy = eval_on_dataloader(
                        device, criterion, model, loaders['train_loader'])
                    train_accuracies.append(train_accuracy)
                    epoch += 1
                    if train_accuracy >= 0.99:
                        print("Convergence condition met. "
                              "Training accuracy >= 0.99")
                        stop_indicator = True
                    if len(train_accuracies) >= args.convergence_epochs:
                        if np.std(train_accuracies[-args.convergence_epochs:]) \
                                < args.convergence_accuracy_change_threshold:
                            print(f"\tConvergence condition met. Training "
                                  f"accuracy = {train_accuracy} stopped improving")
                            stop_indicator = True

                test_loss, test_accuracy = eval_on_dataloader(
                    device, criterion, model, loaders['test_loader'])
                print(f"\tTest accuracy = {test_accuracy}")
                model_result[model_name] = test_accuracy
            optimizer_result[optimizer_name] = model_result
        dataset_result[dataset_name] = optimizer_result
    overall_result[init_type] = dataset_result

    np.save(f"tables/table1-svhn-seed{args.random_seed}.npy", overall_result)
if ("features.top" not in name) and ("classifier" not in name): para.requires_grad_(False) else: print("training {}".format(name)) pg = [p for p in model.parameters() if p.requires_grad] optimizer = optim.SGD(pg, lr=args.lr, momentum=0.9, weight_decay=1E-4) # Scheduler https://arxiv.org/pdf/1812.01187.pdf lf = lambda x: ((1 + math.cos(x * math.pi / args.epochs)) / 2) * (1 - args.lrf) + args.lrf # cosine scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) for epoch in range(args.epochs): # train mean_loss = train_one_epoch(model=model, optimizer=optimizer, data_loader=train_loader, device=device, epoch=epoch) scheduler.step() # validate sum_num = evaluate(model=model, data_loader=val_loader, device=device) acc = sum_num / len(val_data_set) print("[epoch {}] accuracy: {}".format(epoch, round(acc, 3))) tags = ["loss", "accuracy", "learning_rate"] tb_writer.add_scalar(tags[0], mean_loss, epoch) tb_writer.add_scalar(tags[1], acc, epoch) tb_writer.add_scalar(tags[2], optimizer.param_groups[0]["lr"], epoch)
def train(train_data, use_asymm_gen_loss=True, use_gpu=False):
    """
    :param train_data: np.ndarray of shape (20000, 1)
    :param use_asymm_gen_loss: bool
    :param use_gpu: bool
    :return:
    """
    """ Build training configurations """
    hp = dict(n_epochs=20, batch_size=64, n_disc_updates=2)
    hp = EasyDict(hp)
    constant = dict(device=torch.device("cpu" if not use_gpu else "cuda:0"))
    constant = EasyDict(constant)
    if use_gpu:
        torch.cuda.set_device(constant.device)

    """ Build data loader and data processor function """
    train_loader = data.DataLoader(dataset=train_data,
                                   batch_size=hp.batch_size,
                                   shuffle=True)

    """ Build networks """
    gen = Generator().to(constant.device)
    disc = Discriminator().to(constant.device)

    """ Build optimizers """
    optimizer_g = torch.optim.Adam(gen.parameters(), lr=1e-4, betas=(0, 0.9))
    optimizer_d = torch.optim.Adam(disc.parameters(), lr=1e-4, betas=(0, 0.9))

    """ Build loss functions """
    def disc_loss_fn(real, fake):
        # standard discriminator loss on detached samples
        return (-disc(real.detach()).log().mean()
                - (1 - disc(fake.detach())).log().mean())

    if use_asymm_gen_loss:
        # non-saturating generator loss
        def gen_loss_fn(real, fake):
            return -disc(fake).log().mean()
    else:
        # original minimax generator loss
        def gen_loss_fn(real, fake):
            return (1 - disc(fake)).log().mean()

    """ Training loop """
    history = dict(losses=[])
    for epoch in range(hp.n_epochs):
        losses_one_epoch = train_one_epoch(
            n_disc_updates=hp.n_disc_updates,
            batch_iterator=train_loader,
            process_batch_fn=process_batch_fn,
            gen=gen,
            disc=disc,
            optimizer_g=optimizer_g,
            optimizer_d=optimizer_d,
            gen_loss_fn=gen_loss_fn,
            disc_loss_fn=disc_loss_fn,
            device=constant.device,
            # max_n_iterations=1,  # uncomment this line if trying to debug
        )
        history["losses"].extend(losses_one_epoch)
        print(f"Epoch {epoch}: loss = {np.mean(losses_one_epoch)}")
        if epoch == 0 or epoch == hp.n_epochs - 1:
            fake, disc_in, disc_out = eval_one_epoch(gen=gen, disc=disc,
                                                     device=constant.device)
            plot_eval(
                real=train_data,
                fake=fake,
                disc_in=disc_in,
                disc_out=disc_out,
                epoch=epoch,
            )
    history["losses"] = torch.stack(history["losses"]).to("cpu").numpy()
    plot_loss_curve(
        history["losses"],
        title="train_minimal",
        save_to=os.path.join(DATA_DIR, "train_minimal_loss"),
    )
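# --- A sketch (an assumption; the real hook may do more) of the shared
# process_batch_fn passed to train_one_epoch above: cast the raw batch from
# the DataLoader to float and move it to the training device.
def process_batch_fn(batch, device):
    return batch.float().to(device)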
def main(args):
    device = torch.device(args.device if torch.cuda.is_available() else "cpu")

    if not os.path.exists("./weights"):
        os.makedirs("./weights")

    tb_writer = SummaryWriter()

    train_images_path, train_images_label, val_images_path, val_images_label = read_split_data(
        args.data_path)

    data_transform = {
        "train": transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
        ]),
        "val": transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
        ])
    }

    # instantiate the training dataset
    train_dataset = MyDataSet(images_path=train_images_path,
                              images_class=train_images_label,
                              transform=data_transform["train"])
    # instantiate the validation dataset
    val_dataset = MyDataSet(images_path=val_images_path,
                            images_class=val_images_label,
                            transform=data_transform["val"])

    batch_size = args.batch_size
    nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8])  # number of workers
    print('Using {} dataloader workers per process'.format(nw))

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True,
        num_workers=nw, collate_fn=train_dataset.collate_fn)
    val_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=batch_size, shuffle=False, pin_memory=True,
        num_workers=nw, collate_fn=val_dataset.collate_fn)

    model = create_model(num_classes=5, has_logits=False).to(device)

    if args.weights != "":
        assert os.path.exists(args.weights), \
            "weights file '{}' does not exist.".format(args.weights)
        weights_dict = torch.load(args.weights, map_location=device)
        # remove the weights that depend on the classification head
        del_keys = ['head.weight', 'head.bias'] if model.has_logits \
            else ['pre_logits.fc.weight', 'pre_logits.fc.bias',
                  'head.weight', 'head.bias']
        for k in del_keys:
            del weights_dict[k]
        # strict=False load prints the missing/unexpected keys for inspection
        print(model.load_state_dict(weights_dict, strict=False))

    if args.freeze_layers:
        for name, para in model.named_parameters():
            # freeze everything except head and pre_logits
            if "head" not in name and "pre_logits" not in name:
                para.requires_grad_(False)
            else:
                print("training {}".format(name))

    pg = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.SGD(pg, lr=args.lr, momentum=0.9, weight_decay=5E-5)
    # Scheduler https://arxiv.org/pdf/1812.01187.pdf
    lf = lambda x: ((1 + math.cos(x * math.pi / args.epochs)) / 2) * (1 - args.lrf) + args.lrf  # cosine
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)

    for epoch in range(args.epochs):
        # train
        train_loss, train_acc = train_one_epoch(model=model,
                                                optimizer=optimizer,
                                                data_loader=train_loader,
                                                device=device,
                                                epoch=epoch)
        scheduler.step()

        # validate
        val_loss, val_acc = evaluate(model=model,
                                     data_loader=val_loader,
                                     device=device,
                                     epoch=epoch)

        tags = ["train_loss", "train_acc", "val_loss", "val_acc",
                "learning_rate"]
        tb_writer.add_scalar(tags[0], train_loss, epoch)
        tb_writer.add_scalar(tags[1], train_acc, epoch)
        tb_writer.add_scalar(tags[2], val_loss, epoch)
        tb_writer.add_scalar(tags[3], val_acc, epoch)
        tb_writer.add_scalar(tags[4], optimizer.param_groups[0]["lr"], epoch)

        torch.save(model.state_dict(), "./weights/model-{}.pth".format(epoch))
# model = model.to(device)
criterion = torch.nn.MSELoss()
lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
                                                    patience=lr_patience)

#%% create save path
savepath = checkpoint_path if (is_resuming and checkpoint_path
                               and not is_transfer) else create_savepath(
                                   rootpath=save_root_path, is_debug=is_debug)

for epoch in range(epoch_start, epoch_start + num_epochs):
    train_loss = train_one_epoch(model, optimizer, criterion, train_loader,
                                 epoch, device, epoch_start + num_epochs)
    train_losses.append(train_loss)
    valid_loss = validate(model, criterion, valid_loader, device)
    valid_losses.append(valid_loss)
    lr_scheduler.step(valid_loss)
    save_checkpoint(savepath=savepath,
                    epoch=epoch,
                    model=model,
                    optimizer=optimizer,
                    train_losses=train_losses,
                    valid_losses=valid_losses,
                    lr=lr,
                    lr_patience=lr_patience,
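import os
import torch

# --- A sketch of save_checkpoint (the signature is inferred from the call
# above, which is truncated in the source; the real helper likely takes more
# fields): bundle model and optimizer state with the loss history.
def save_checkpoint(savepath, epoch, model, optimizer,
                    train_losses, valid_losses, lr, lr_patience, **extra):
    state = {
        "epoch": epoch,
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
        "train_losses": train_losses,
        "valid_losses": valid_losses,
        "lr": lr,
        "lr_patience": lr_patience,
        **extra,
    }
    torch.save(state, os.path.join(savepath, f"checkpoint_{epoch}.pt"))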
def main(args):
    print("Running with arguments:")
    args_dict = {}
    for key in vars(args):
        if key == "default_function":
            continue
        args_dict[key] = getattr(args, key)
        print(key, ": ", args_dict[key])
    print("---")

    experiment_time = datetime.now().strftime('%b%d_%H-%M-%S')
    if args.exp_dir:
        experiment_dir = args.exp_dir
    else:
        experiment_dir = os.path.join('exp', args.title, experiment_time)
    os.makedirs(experiment_dir, exist_ok=True)
    with open(os.path.join(experiment_dir, "config.json"), "w") as f:
        json.dump(args_dict, f, indent=4, sort_keys=True,
                  default=lambda x: x.__name__)

    if torch.cuda.is_available():
        device = torch.device('cuda:0')
        print("CUDA Recognized")
    else:
        device = torch.device('cpu')

    # older tensorboardX versions take the logdir keyword; newer writers
    # accept it positionally
    try:
        summary_writer = SummaryWriter(logdir=experiment_dir)
    except TypeError:
        summary_writer = SummaryWriter(experiment_dir)

    print("Starting Online Learning")
    # Online learning setup
    torch.manual_seed(args.random_seed)
    np.random.seed(args.random_seed)
    model = models.get_model(args.model).to(device)
    criterion = torch.nn.CrossEntropyLoss()
    loaders = datasets.get_dataset("online_with_val_cifar10",
                                   split_size=args.split_size)

    number_of_samples_online = []
    test_accuracies_online = []
    training_times_online = []
    epoch = 0
    for i, train_loader in enumerate(loaders['train_loaders']):
        t_start = datetime.now()
        n_train = (i + 1) * args.split_size
        number_of_samples_online.append(n_train)
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

        # shrink the inherited weights and perturb them with a fresh random
        # model before training on the enlarged dataset
        random_model = models.get_model(args.model).to(device)
        with torch.no_grad():
            for real_parameter, random_parameter in zip(
                    model.parameters(), random_model.parameters()):
                real_parameter.mul_(args.checkpoint_shrink).add_(
                    random_parameter, alpha=args.checkpoint_perturb)

        train_accuracies = []
        while True:
            if epoch % 5 == 0:
                print(f"Starting training in epoch {epoch + 1}")
            train_loss, train_accuracy = train_one_epoch(
                device, model, optimizer, criterion, train_loader)
            val_loss, val_accuracy = eval_on_dataloader(
                device, criterion, model, loaders['val_loader'])
            test_loss, test_accuracy = eval_on_dataloader(
                device, criterion, model, loaders['test_loader'])
            train_accuracies.append(train_accuracy)
            epoch += 1
            summary_writer.add_scalar("test_accuracy", test_accuracy, epoch)
            summary_writer.add_scalar("test_loss", test_loss, epoch)
            summary_writer.add_scalar("train_accuracy", train_accuracy, epoch)
            summary_writer.add_scalar("train_loss", train_loss, epoch)
            summary_writer.add_scalar("val_accuracy", val_accuracy, epoch)
            summary_writer.add_scalar("val_loss", val_loss, epoch)
            # if len(train_accuracies) >= args.convergence_epochs and \
            #         max(train_accuracies) not in train_accuracies[-args.convergence_epochs:]:
            if train_accuracy >= 0.99:
                print("Convergence condition met")
                break

        val_loss, val_accuracy = eval_on_dataloader(device, criterion, model,
                                                    loaders['val_loader'])
        test_loss, test_accuracy = eval_on_dataloader(device, criterion, model,
                                                      loaders['test_loader'])
        summary_writer.add_scalar("online_val_accuracy", val_accuracy, n_train)
        summary_writer.add_scalar("online_val_loss", val_loss, n_train)
        summary_writer.add_scalar("online_test_accuracy", test_accuracy, n_train)
        summary_writer.add_scalar("online_test_loss", test_loss, n_train)
        t_end = datetime.now()
        training_time = (t_end - t_start).total_seconds()
        training_times_online.append(training_time)
        summary_writer.add_scalar("online_train_time", training_time, n_train)
    summary_writer.close()
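import torch

# --- The in-loop parameter update above is the "shrink and perturb"
# warm-start trick: theta <- shrink * theta + perturb * theta_random. The same
# update, factored into a reusable helper (hypothetical name) for clarity:
def shrink_and_perturb(model, random_model, shrink, perturb):
    with torch.no_grad():
        for theta, theta_rand in zip(model.parameters(),
                                     random_model.parameters()):
            theta.mul_(shrink).add_(theta_rand, alpha=perturb)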
def train(train_data, use_gpu=False):
    """
    :param train_data: np.array of shape (None, 3, 32, 32) with values in [0, 1]
    :param use_gpu:
    :return:
    """
    """ Build training configurations """
    hp = dict(
        n_iterations=25000,
        batch_size=256,
        n_disc_updates=5,
        lmbda=10,
    )
    hp = EasyDict(hp)
    constant = dict(device=torch.device("cpu" if not use_gpu else "cuda:0"))
    constant = EasyDict(constant)
    if use_gpu:
        torch.cuda.set_device(constant.device)

    """ Build data loader and data processor function """
    train_loader = data.DataLoader(dataset=train_data,
                                   batch_size=hp.batch_size,
                                   shuffle=True)
    n_batches = len(train_loader)
    hp.n_epochs = hp.n_iterations // n_batches
    hp.n_iterations = hp.n_epochs * n_batches
    print('n_epochs', hp.n_epochs, 'n_iterations', hp.n_iterations)

    """ Build networks """
    gen = Generator().to(constant.device)
    disc = Discriminator().to(constant.device)

    """ Build optimizers """
    optimizer_g = torch.optim.Adam(gen.parameters(), lr=2e-4, betas=(0, 0.9))
    optimizer_d = torch.optim.Adam(disc.parameters(), lr=2e-4, betas=(0, 0.9))

    """ Build loss functions """
    def disc_loss_fn(real, fake):
        current_batch_size = real.shape[0]
        real, fake = real.detach(), fake.detach()
        # WGAN-GP interpolates with uniform (not normal) coefficients
        eps = torch.rand(current_batch_size, 1, 1, 1).to(constant.device)
        x_hat = (eps * real + (1 - eps) * fake).requires_grad_()
        disc_out = disc(x_hat)
        original_disc_loss = disc_out.mean() - disc(real).mean()
        grad, = torch.autograd.grad(
            outputs=disc_out,
            inputs=x_hat,
            grad_outputs=torch.ones_like(disc_out),
            create_graph=True,
            retain_graph=True,
        )
        # penalize per-sample gradient norms, averaged over the batch
        grad_penalty = (grad.flatten(start_dim=1).norm(2, dim=1) - 1) \
            .square().mean()
        return original_disc_loss + hp.lmbda * grad_penalty

    def gen_loss_fn(real, fake):
        # the critic output is unbounded under WGAN-GP, so no log here
        return -disc(fake).mean()

    """ Build learning rate schedulers """
    max_n_iterations = max(hp.n_iterations, 25000)
    scheduler_g = torch.optim.lr_scheduler.LambdaLR(
        optimizer=optimizer_g,
        lr_lambda=lambda itr: (max_n_iterations - itr) / max_n_iterations,
        last_epoch=-1,
    )
    scheduler_d = torch.optim.lr_scheduler.LambdaLR(
        optimizer=optimizer_d,
        lr_lambda=lambda itr: (max_n_iterations - itr) / max_n_iterations,
        last_epoch=-1,
    )

    """ Training loop """
    history = dict(
        losses=[],
    )
    for epoch in tqdm(range(hp.n_epochs)):
        losses_one_epoch = train_one_epoch(
            n_disc_updates=hp.n_disc_updates,
            batch_iterator=train_loader,
            process_batch_fn=process_batch_fn,
            gen=gen,
            disc=disc,
            optimizer_g=optimizer_g,
            optimizer_d=optimizer_d,
            gen_loss_fn=gen_loss_fn,
            disc_loss_fn=disc_loss_fn,
            device=constant.device,
            scheduler_g=scheduler_g,
            scheduler_d=scheduler_d,
            # max_n_iterations=1,  # debug
        )
        history['losses'].extend(losses_one_epoch)
        print(f"Epoch {epoch}: loss = {np.mean(losses_one_epoch)}")
        if epoch == hp.n_epochs - 1:
            fake = eval_one_epoch(gen=gen, disc=disc, device=constant.device)
            plot_eval(fake=fake, epoch=epoch)
    history['losses'] = torch.stack(history['losses']).to('cpu').numpy()
    plot_loss_curve(
        history["losses"],
        title="train_cifar10",
        save_to=os.path.join(DATA_DIR, "train_cifar10_loss"),
    )
# for plotting
mvn = torch.distributions.Normal(0, 1)
z_norm = mvn.sample([num_samples, np.ceil(n_in).astype(int)])
val_batch = next(iter(val_loader)).float()

start = time.time()

# for early stopping
i = 0
best_loss = np.inf
epochs_list = []
train_losses = []
val_losses = []
for epoch in range(1, epochs + 1):
    epochs_list.append(epoch)
    train_loss = train_one_epoch(model, epoch, optimizer, train_loader)
    val_loss = val(model, train, val_loader)
    train_losses.append(train_loss)
    val_losses.append(val_loss)
    # keep the best model so far; stop after 30 epochs without improvement
    if val_loss < best_loss:
        best_loss = val_loss
        i = 0
        torch.save(model, path + "model.pt")
    else:
        i += 1
        if i >= 30:
            break
testloader = DataLoader(testdataset,
                        batch_size=args.batch_size,
                        shuffle=False,
                        num_workers=args.num_workers)

best_acc1 = 0
best_acc2 = 0
best_loss = np.inf
counter = 0

for epoch in range(args.epochs):
    print(f'Epoch {epoch+1}/{args.epochs}')

    train_loss, _, _, _, _ = train_one_epoch(trainloader, model, optimizer,
                                             criterion, device, scaler, args,
                                             idx2ans)
    val_loss, predictions, val_acc, val_bleu = validate(
        valloader, model, criterion, device, scaler, args, val_df, idx2ans)
    test_loss, predictions, acc, bleu = test(
        testloader, model, criterion, device, scaler, args, test_df, idx2ans)

    scheduler.step(val_loss)

    if not args.category:
        log_dict = acc
        for k, v in bleu.items():
            log_dict[k] = v
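# --- scheduler.step(val_loss) above implies a plateau-based scheduler; a
# plausible construction (an assumption; the actual factor/patience values are
# not shown in this snippet) would be:
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.1, patience=2)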
def main(args):
    print("Running with arguments:")
    args_dict = {}
    for key in vars(args):
        if key == "default_function":
            continue
        args_dict[key] = getattr(args, key)
        print(key, ": ", args_dict[key])
    print("---")

    experiment_time = datetime.now().strftime('%b%d_%H-%M-%S')
    experiment_dir = os.path.join('exp', args.title, experiment_time)
    os.makedirs(experiment_dir)
    with open(os.path.join(experiment_dir, "config.json"), "w") as f:
        json.dump(args_dict, f, indent=4, sort_keys=True,
                  default=lambda x: x.__name__)

    if torch.cuda.is_available():
        device = torch.device('cuda:0')
        print("CUDA Recognized")
    else:
        device = torch.device('cpu')

    try:
        summary_writer = SummaryWriter(logdir=experiment_dir)
    except TypeError:
        summary_writer = SummaryWriter(experiment_dir)

    print("Starting Online Learning")
    # Online learning setup
    torch.manual_seed(args.random_seed)
    np.random.seed(args.random_seed)
    model = models.get_model(args.model).to(device)
    criterion = torch.nn.CrossEntropyLoss()
    loaders = datasets.get_dataset(f"online_with_val_{args.dataset}",
                                   split_size=args.split_size)

    number_of_samples_online = []
    test_accuracies_online = []
    training_times_online = []
    for i, train_loader in enumerate(loaders['train_loaders']):
        t_start = datetime.now()
        n_train = (i + 1) * args.split_size
        number_of_samples_online.append(n_train)
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        print(f"Warm-Start Training with {n_train} data.")

        train_accuracies = []
        stop_indicator = False
        epoch = 0
        while not stop_indicator:
            if epoch % 5 == 0:
                print(f"Starting training in epoch {epoch + 1}")
            train_loss, train_accuracy = train_one_epoch(
                device, model, optimizer, criterion, train_loader)
            train_loss, train_accuracy = eval_on_dataloader(
                device, criterion, model, train_loader)
            train_accuracies.append(train_accuracy)
            epoch += 1
            if train_accuracy >= args.acc_threshold:
                print(f"Convergence condition met. Training accuracy "
                      f"> {100 * args.acc_threshold}")
                stop_indicator = True

        t_end = datetime.now()
        training_time = (t_end - t_start).total_seconds()
        test_loss, test_accuracy = eval_on_dataloader(device, criterion, model,
                                                      loaders['test_loader'])
        test_accuracies_online.append(test_accuracy)
        training_times_online.append(training_time)
        summary_writer.add_scalar("test_accuracy_online", test_accuracy, n_train)
        summary_writer.add_scalar("train_time_online", training_time, n_train)

    print("Starting Offline Learning")
    # Offline learning setup
    n_experiments = len(loaders['train_loaders'])
    number_of_samples_offline = []
    test_accuracies_offline = []
    training_times_offline = []
    for i in range(1, n_experiments + 1):
        t_start = datetime.now()
        n_train = i * args.split_size
        number_of_samples_offline.append(n_train)
        print(f"Running {i}-th experiment with Train size = {n_train}")

        # Set the seed
        torch.manual_seed(args.random_seed)
        np.random.seed(args.random_seed)
        loaders = datasets.get_dataset(f"partial_with_val_{args.dataset}",
                                       n_train)
        model = models.get_model(args.model).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        criterion = torch.nn.CrossEntropyLoss()

        train_accuracies = []
        stop_indicator = False
        epoch = 0
        while not stop_indicator:
            if epoch % 5 == 0:
                print(f"Starting training in epoch {epoch + 1}")
            train_loss, train_accuracy = train_one_epoch(
                device, model, optimizer, criterion, loaders['train_loader'])
            # val_loss, val_accuracy = eval_on_dataloader(model, loaders['val_loader'])
            # re-evaluate to get the model's final accuracy
            train_loss, train_accuracy = eval_on_dataloader(
                device, criterion, model, loaders['train_loader'])
            train_accuracies.append(train_accuracy)
            epoch += 1
            if train_accuracy >= args.acc_threshold:
                print(f"Convergence condition met. Training accuracy "
                      f"> {100 * args.acc_threshold}")
                stop_indicator = True

        t_end = datetime.now()
        training_time = (t_end - t_start).total_seconds()
        test_loss, test_accuracy = eval_on_dataloader(device, criterion, model,
                                                      loaders['test_loader'])
        test_accuracies_offline.append(test_accuracy)
        training_times_offline.append(training_time)
        summary_writer.add_scalar("test_accuracy_offline", test_accuracy, n_train)
        summary_writer.add_scalar("train_time_offline", training_time, n_train)

    import matplotlib.pyplot as plt
    fig, axs = plt.subplots(1, 2, figsize=(10, 5))
    number_of_samples_online = np.array(number_of_samples_online) / 1000
    number_of_samples_offline = np.array(number_of_samples_offline) / 1000
    axs[0].plot(number_of_samples_online, test_accuracies_online,
                label='warm start', color='C0')
    axs[0].plot(number_of_samples_offline, test_accuracies_offline,
                label='random', color='C1')
    axs[0].set_ylabel("Test Accuracy")
    axs[0].set_xlabel("Number of Samples (thousands)")
    axs[1].plot(number_of_samples_online, training_times_online,
                label='warm start', color='C0')
    axs[1].plot(number_of_samples_offline, training_times_offline,
                label='random', color='C1')
    axs[1].set_ylabel("Train Time (seconds)")
    axs[1].set_xlabel("Number of Samples (thousands)")
    plt.legend()
    plt.savefig(
        f"figures/figure2-{args.dataset}-{100 * args.acc_threshold}.pdf")
def main(args):
    device = torch.device(args.device if torch.cuda.is_available() else "cpu")
    print(args)
    print('Start Tensorboard with "tensorboard --logdir=runs", '
          'view at http://localhost:6006/')
    tb_writer = SummaryWriter()
    if not os.path.exists("./weights"):
        os.makedirs("./weights")

    train_images_path, train_images_label, val_images_path, val_images_label = read_split_data(
        args.data_path)

    # input resolution for each EfficientNet variant
    img_size = {"B0": 224, "B1": 240, "B2": 260, "B3": 300,
                "B4": 380, "B5": 456, "B6": 528, "B7": 600}
    num_model = "B0"
    data_transform = {
        "train": transforms.Compose([
            transforms.RandomResizedCrop(img_size[num_model]),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406],
                                 [0.229, 0.224, 0.225])
        ]),
        "val": transforms.Compose([
            transforms.Resize(img_size[num_model]),
            transforms.CenterCrop(img_size[num_model]),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406],
                                 [0.229, 0.224, 0.225])
        ])
    }

    # instantiate the training dataset
    train_data_set = MyDataSet(images_path=train_images_path,
                               images_class=train_images_label,
                               transform=data_transform["train"])
    # instantiate the validation dataset
    val_data_set = MyDataSet(images_path=val_images_path,
                             images_class=val_images_label,
                             transform=data_transform["val"])

    batch_size = args.batch_size
    nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8])  # number of workers
    print('Using {} dataloader workers per process'.format(nw))
    train_loader = torch.utils.data.DataLoader(
        train_data_set, batch_size=batch_size, shuffle=True, pin_memory=True,
        num_workers=nw, collate_fn=train_data_set.collate_fn)
    val_loader = torch.utils.data.DataLoader(
        val_data_set, batch_size=batch_size, shuffle=False, pin_memory=True,
        num_workers=nw, collate_fn=val_data_set.collate_fn)

    # load pretrained weights if the file exists
    model = create_model(num_classes=args.num_classes).to(device)
    if os.path.exists(args.weights):
        weights_dict = torch.load(args.weights, map_location=device)
        # keep only tensors whose shapes match the current model
        load_weights_dict = {
            k: v for k, v in weights_dict.items()
            if model.state_dict()[k].numel() == v.numel()
        }
        print(model.load_state_dict(load_weights_dict, strict=False))

    # optionally freeze weights
    if args.freeze_layers:
        for name, para in model.named_parameters():
            # freeze everything except the last conv stage and the classifier
            if ("features.top" not in name) and ("classifier" not in name):
                para.requires_grad_(False)
            else:
                print("training {}".format(name))

    pg = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.SGD(pg, lr=args.lr, momentum=0.9, weight_decay=1E-4)
    # Scheduler https://arxiv.org/pdf/1812.01187.pdf
    lf = lambda x: ((1 + math.cos(x * math.pi / args.epochs)) / 2) * (1 - args.lrf) + args.lrf  # cosine
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)

    for epoch in range(args.epochs):
        # train
        mean_loss = train_one_epoch(model=model,
                                    optimizer=optimizer,
                                    data_loader=train_loader,
                                    device=device,
                                    epoch=epoch)
        scheduler.step()

        # validate
        sum_num = evaluate(model=model, data_loader=val_loader, device=device)
        acc = sum_num / len(val_data_set)
        print("[epoch {}] accuracy: {}".format(epoch, round(acc, 3)))

        tags = ["loss", "accuracy", "learning_rate"]
        tb_writer.add_scalar(tags[0], mean_loss, epoch)
        tb_writer.add_scalar(tags[1], acc, epoch)
        tb_writer.add_scalar(tags[2], optimizer.param_groups[0]["lr"], epoch)

        torch.save(model.state_dict(), "./weights/model-{}.pth".format(epoch))
num_epoch = args.max_epoch
loss_rec = {"train": [], "valid": []}
acc_rec = {"train": [], "valid": []}
best_acc, best_epoch = 0, 0

LR = args.lr
optimizer = torch.optim.SGD(model.parameters(), lr=LR, momentum=0.9,
                            weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, gamma=0.1,
                                                 milestones=[92, 136])

for epoch in range(num_epoch):
    loss_train, acc_train, mat_train = train_one_epoch(
        train_loader, model, criterion, optimizer, epoch, device)
    loss_valid, acc_valid, mat_valid = valid_one_epoch(
        test_loader, model, criterion, device)
    print("Epoch[{:0>3}/{:0>3}] Train Acc: {:.2%} Valid Acc:{:.2%} "
          "Train loss:{:.4f} Valid loss:{:.4f} LR:{}".format(
              epoch + 1, num_epoch, acc_train, acc_valid, loss_train,
              loss_valid, optimizer.param_groups[0]["lr"]))

    if 'patience' in dir(scheduler):
        scheduler.step(acc_valid)  # ReduceLROnPlateau
    else:
        scheduler.step()  # MultiStepLR / StepLR

    loss_rec["train"].append(loss_train)
    loss_rec["valid"].append(loss_valid)
    acc_rec["train"].append(acc_train)
    acc_rec["valid"].append(acc_valid)
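import torch

# --- A sketch (an assumption that mat_train / mat_valid are per-class
# confusion matrices) of a valid_one_epoch compatible with the loop above:
# returns mean loss, accuracy, and a num_classes x num_classes matrix whose
# entry [t, p] counts samples of true class t predicted as class p.
def valid_one_epoch(loader, model, criterion, device, num_classes=10):
    model.eval()
    conf_mat = torch.zeros(num_classes, num_classes)
    total_loss, correct, seen = 0.0, 0, 0
    with torch.no_grad():
        for inputs, targets in loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            total_loss += criterion(outputs, targets).item() * targets.size(0)
            preds = outputs.argmax(dim=1)
            correct += (preds == targets).sum().item()
            seen += targets.size(0)
            for t, p in zip(targets.cpu(), preds.cpu()):
                conf_mat[t, p] += 1
    return total_loss / seen, correct / seen, conf_mat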