def batch_size_linear_search():
    min_batch_size = 8
    max_batch_size = 600
    step_size = 8

    optimizer = lambda x: torch.optim.SGD(x, lr=0.1)
    experiment_name = "batch_size_linear_search"

    t = Timer()
    batch_size_times = {}
    for i, batch_size in enumerate(range(min_batch_size, max_batch_size, step_size)):
        t.start()
        # Reusing the same experiment_name means each call resumes from the
        # previous checkpoint, so with epochs=i+2 only the newly added epoch(s)
        # are trained and timed in each iteration.
        main(experiment_name, optimizer, epochs=i + 2, batch_size=batch_size)
        elapsed_time = t.stop()
        batch_size_times[batch_size] = elapsed_time

        # Persist timings after every run in case the search is interrupted.
        with open("batch_size_times.pickle", "wb") as f:
            pickle.dump(batch_size_times, f)

    # Plot
    batch_sizes = []
    times = []
    for k in sorted(batch_size_times):
        batch_sizes.append(k)
        times.append(batch_size_times[k])

    plt.plot(np.array(batch_sizes), np.array(times))
    plt.xlabel("Batch Size")
    plt.ylabel("Epoch Time")
    plt.title("Batch Size vs Epoch Time")
    plt.show()

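# A minimal sketch of how the pickled timings could be reloaded and inspected
# later without rerunning the search. It assumes the "batch_size_times.pickle"
# file written above exists; the helper name report_batch_size_times is
# illustrative, not part of the original code.
def report_batch_size_times(pickle_path="batch_size_times.pickle"):
    with open(pickle_path, "rb") as f:
        batch_size_times = pickle.load(f)

    batch_sizes = sorted(batch_size_times)
    times = [batch_size_times[b] for b in batch_sizes]

    fastest = min(batch_size_times, key=batch_size_times.get)
    print(f"Fastest batch size: {fastest} ({batch_size_times[fastest]:0.2f}s)")

    plt.plot(np.array(batch_sizes), np.array(times))
    plt.xlabel("Batch Size")
    plt.ylabel("Epoch Time")
    plt.title("Batch Size vs Epoch Time")
    plt.show()
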
def main(experiment_name,
         optimizer,
         output_directory_root="experiments/resnet18_logistic_cifar10",
         epochs=60,
         batch_size=512,
         num_workers=1):
    output_directory = os.path.join(output_directory_root, experiment_name)
    if not os.path.isdir(output_directory):
        os.makedirs(output_directory, exist_ok=True)

    # Setup regular log file + tensorboard
    logfile_path = os.path.join(output_directory, "logfile.txt")
    setup_logger_tqdm(logfile_path)

    tensorboard_log_directory = os.path.join("runs", "resnet18_logistic_cifar10", experiment_name)
    tensorboard_summary_writer = SummaryWriter(log_dir=tensorboard_log_directory)

    # Choose Training Device
    use_cuda = torch.cuda.is_available()
    logger.info(f"CUDA Available? {use_cuda}")
    device = "cuda" if use_cuda else "cpu"

    # Datasets and Loaders
    train_set_loader, test_set_loader = get_data_loaders(batch_size, num_workers)

    # Create Model & Optimizer
    # Freeze the pretrained backbone and train only the replacement final
    # layer (logistic regression on top of ResNet-18 features).
    model = torchvision.models.resnet18(pretrained=True)
    for param in model.parameters():
        param.requires_grad = False
    num_classes = 10
    model.fc = nn.Linear(model.fc.in_features, num_classes)
    model.to(device)
    optimizer = optimizer(model.parameters())

    logger.info("=========== Commencing Training ===========")
    logger.info(f"Epoch Count: {epochs}")
    logger.info(f"Batch Size: {batch_size}")

    # Load Checkpoint
    checkpoint_file_path = os.path.join(output_directory, "checkpoint.pth")
    start_epoch = 0
    if os.path.exists(checkpoint_file_path):
        logger.info("Checkpoint Found - Loading!")

        checkpoint = torch.load(checkpoint_file_path)
        logger.info(f"Last completed epoch: {checkpoint['epoch']}")
        logger.info(f"Average Train Loss: {checkpoint['train_loss']}")
        logger.info(f"Top-1 Train Accuracy: {checkpoint['train_accuracy']}")
        logger.info(f"Top-1 Test Accuracy: {checkpoint['test_accuracy']}")
        start_epoch = checkpoint["epoch"] + 1
        logger.info(f"Resuming at epoch {start_epoch}")

        model.load_state_dict(checkpoint["model_state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
    else:
        logger.info("No checkpoint found, starting from scratch.")

    # Training Loop
    t = Timer()
    for epoch in range(start_epoch, epochs):
        t.start()
        logger.info("-" * 10)
        logger.info(f"Epoch {epoch}")
        logger.info("-" * 10)

        train_loss, train_accuracy = train_model(device, model, train_set_loader, optimizer)
        tensorboard_summary_writer.add_scalar("train_loss", train_loss, epoch)
        tensorboard_summary_writer.add_scalar("train_accuracy", train_accuracy, epoch)

        test_accuracy = test_model(device, model, test_set_loader, optimizer)
        tensorboard_summary_writer.add_scalar("test_accuracy", test_accuracy, epoch)

        # Save Checkpoint
        logger.info("Saving checkpoint.")
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'train_loss': train_loss,
            'train_accuracy': train_accuracy,
            'test_accuracy': test_accuracy
        }, checkpoint_file_path)

        elapsed_time = t.stop()
        logger.info(f"End of epoch {epoch}, took {elapsed_time:0.4f} seconds.")
        logger.info(f"Average Train Loss: {train_loss}")
        logger.info(f"Top-1 Train Accuracy: {train_accuracy}")
        logger.info(f"Top-1 Test Accuracy: {test_accuracy}")
        logger.info("")

def main(device, mp_args, dataloader_func, model, optimizer_callback,
         output_directory, tensorboard_log_directory, epochs):
    global_rank = mp_args.nr * mp_args.gpus + device
    dist.init_process_group(backend='nccl',
                            init_method='env://',
                            world_size=mp_args.world_size,
                            rank=global_rank)

    output_directory = os.path.join(output_directory, f"rank_{global_rank}")
    if not os.path.isdir(output_directory):
        os.makedirs(output_directory, exist_ok=True)

    # Setup regular log file
    logfile_path = os.path.join(output_directory, "logfile.txt")
    setup_logger_tqdm(logfile_path)

    # Setup TensorBoard logging
    tensorboard_log_directory = os.path.join(tensorboard_log_directory, f"rank_{global_rank}")
    tensorboard_summary_writer = SummaryWriter(log_dir=tensorboard_log_directory)

    # Dataloaders
    train_set_loader, test_set_loader = dataloader_func(mp_args.world_size, global_rank)

    # Model & Optimizer
    model.to(device)
    optimizer = optimizer_callback(model)
    model = nn.parallel.DistributedDataParallel(model, device_ids=[device])

    logger.info(f"Epoch Count: {epochs}")

    # Load Checkpoint
    checkpoint_file_path = os.path.join(output_directory, "checkpoint.pth")
    start_epoch = 0
    if os.path.exists(checkpoint_file_path):
        logger.info("Checkpoint Found - Loading!")

        checkpoint = torch.load(checkpoint_file_path)
        logger.info(f"Last completed epoch: {checkpoint['epoch']}")
        logger.info(f"Average Train Loss: {checkpoint['train_loss']}")
        logger.info(f"Top-1 Train Accuracy: {checkpoint['train_accuracy']}")
        logger.info(f"Top-1 Test Accuracy: {checkpoint['test_accuracy']}")
        start_epoch = checkpoint["epoch"] + 1
        logger.info(f"Resuming at epoch {start_epoch}")

        model.load_state_dict(checkpoint["model_state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
    else:
        logger.info("No checkpoint found, starting from scratch.")

    # Training Loop
    t = Timer()
    # progress = tqdm(total=epochs, initial=start_epoch, desc="Epochs")
    for epoch in range(start_epoch, epochs):
        t.start()
        logger.info(f"Commence EPOCH {epoch}")

        # Train
        train_loss, train_accuracy = train_model(device, model, train_set_loader, optimizer)
        tensorboard_summary_writer.add_scalar("train_loss", train_loss, epoch)
        tensorboard_summary_writer.add_scalar("train_accuracy", train_accuracy, epoch)

        # Test
        test_accuracy = test_model(device, model, test_set_loader)
        tensorboard_summary_writer.add_scalar("test_accuracy", test_accuracy, epoch)

        # Save Checkpoint
        logger.info("Saving checkpoint.")
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'train_loss': train_loss,
            'train_accuracy': train_accuracy,
            'test_accuracy': test_accuracy
        }, checkpoint_file_path)

        elapsed_time = t.stop()
        logger.info(f"End of epoch {epoch}, took {elapsed_time:0.4f} seconds.")
        logger.info(f"Average Train Loss: {train_loss}")
        logger.info(f"Top-1 Train Accuracy: {train_accuracy}")
        logger.info(f"Top-1 Test Accuracy: {test_accuracy}")

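# A minimal single-node launcher sketch for the distributed main() above, one
# process per GPU. The argument names (nodes, gpus, nr), the hyperparameters,
# and the get_data_loaders helper are illustrative assumptions; with
# init_method='env://', MASTER_ADDR and MASTER_PORT must be set before spawning.
def launch():
    import argparse
    import torch.multiprocessing as mp

    parser = argparse.ArgumentParser()
    parser.add_argument("--nodes", type=int, default=1)
    parser.add_argument("--gpus", type=int, default=torch.cuda.device_count())
    parser.add_argument("--nr", type=int, default=0)  # index of this node
    mp_args = parser.parse_args()
    mp_args.world_size = mp_args.nodes * mp_args.gpus

    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")

    model = torchvision.models.resnet18(pretrained=False, num_classes=10)
    optimizer_callback = lambda m: torch.optim.SGD(m.parameters(), lr=0.1)
    # Assumed helper with signature (world_size, global_rank) -> (train_loader, test_loader)
    dataloader_func = get_data_loaders

    # mp.spawn passes the local GPU index as the first argument ("device").
    mp.spawn(main,
             nprocs=mp_args.gpus,
             args=(mp_args, dataloader_func, model, optimizer_callback,
                   "experiments/distributed_example", "runs/distributed_example", 60))
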
def main(dataloader_func, model, optimizer_callback, output_directory,
         tensorboard_log_directory, lr_scheduler=None, epochs=150):
    if not os.path.isdir(output_directory):
        os.makedirs(output_directory, exist_ok=True)

    # Setup regular log file
    logfile_path = os.path.join(output_directory, "logfile.txt")
    setup_logger_tqdm(logfile_path)

    # Setup TensorBoard logging
    tensorboard_summary_writer = SummaryWriter(log_dir=tensorboard_log_directory)

    # Choose Training Device
    use_cuda = torch.cuda.is_available()
    logger.info(f"CUDA Available? {use_cuda}")
    device = "cuda" if use_cuda else "cpu"

    # Dataloaders
    train_set_loader, test_set_loader = dataloader_func()

    # Model & Optimizer
    model.to(device)
    optimizer = optimizer_callback(model)
    if lr_scheduler:
        lr_scheduler = lr_scheduler(optimizer)

    logger.info(f"Epoch Count: {epochs}")

    # Load Checkpoint
    checkpoint_file_path = os.path.join(output_directory, "checkpoint.pth")
    start_epoch = 0
    if os.path.exists(checkpoint_file_path):
        logger.info("Checkpoint Found - Loading!")

        checkpoint = torch.load(checkpoint_file_path)
        logger.info(f"Last completed epoch: {checkpoint['epoch']}")
        logger.info(f"Average Train Loss: {checkpoint['train_loss']}")
        logger.info(f"Top-1 Train Accuracy: {checkpoint['train_accuracy']}")
        logger.info(f"Top-1 Test Accuracy: {checkpoint['test_accuracy']}")
        start_epoch = checkpoint["epoch"] + 1
        logger.info(f"Resuming at epoch {start_epoch}")

        model.load_state_dict(checkpoint["model_state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
        if lr_scheduler:
            lr_scheduler.load_state_dict(checkpoint["lr_scheduler_state_dict"])
    else:
        logger.info("No checkpoint found, starting from scratch.")

    # Training Loop
    t = Timer()
    for epoch in range(start_epoch, epochs):
        t.start()
        logger.info(f"Commence EPOCH {epoch}")

        # Train
        train_loss, train_accuracy = train_model(device, model, train_set_loader, optimizer)
        tensorboard_summary_writer.add_scalar("train_loss", train_loss, epoch)
        tensorboard_summary_writer.add_scalar("train_accuracy", train_accuracy, epoch)

        # Test
        test_accuracy = test_model(device, model, test_set_loader, optimizer)
        tensorboard_summary_writer.add_scalar("test_accuracy", test_accuracy, epoch)

        scheduler_dict = None
        if lr_scheduler:
            lr_scheduler.step()
            scheduler_dict = lr_scheduler.state_dict()

        # Save Checkpoint
        logger.info("Saving checkpoint.")
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'lr_scheduler_state_dict': scheduler_dict,
            'train_loss': train_loss,
            'train_accuracy': train_accuracy,
            'test_accuracy': test_accuracy
        }, checkpoint_file_path)

        elapsed_time = t.stop()
        logger.info(f"End of epoch {epoch}, took {elapsed_time:0.4f} seconds.")
        logger.info(f"Average Train Loss: {train_loss}")
        logger.info(f"Top-1 Train Accuracy: {train_accuracy}")
        logger.info(f"Top-1 Test Accuracy: {test_accuracy}")

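# A minimal sketch of how the callback-style arguments to main() above might be
# wired up. The dataloader helper, directories, and hyperparameters are
# illustrative assumptions, not part of the original code.
def run_example():
    model = torchvision.models.resnet18(pretrained=False, num_classes=10)

    # main() calls these with the model / optimizer, so lambdas (or
    # functools.partial) capture the remaining hyperparameters.
    optimizer_callback = lambda m: torch.optim.SGD(m.parameters(), lr=0.1,
                                                   momentum=0.9, weight_decay=5e-4)
    lr_scheduler = lambda opt: torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=150)

    # Assumed helper returning (train_loader, test_loader)
    dataloader_func = lambda: get_data_loaders(batch_size=128, num_workers=2)

    main(dataloader_func=dataloader_func,
         model=model,
         optimizer_callback=optimizer_callback,
         output_directory="experiments/resnet18_example",
         tensorboard_log_directory="runs/resnet18_example",
         lr_scheduler=lr_scheduler,
         epochs=150)
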
def main(carrier_path, marking_network, target_network, target_checkpoint,
         batch_size=256, num_workers=1, align=True, test_set_loader=None):
    # Setup Device
    use_cuda = torch.cuda.is_available()
    logger.info(f"CUDA Available? {use_cuda}")
    device = torch.device("cuda" if use_cuda else "cpu")

    # Load Carrier
    carrier = torch.load(carrier_path).numpy()

    t = Timer()
    t.start()

    # Align spaces (Or Not)
    W = target_checkpoint["model_state_dict"]["fc.weight"].cpu().numpy()
    if align:
        logger.info("Aligning marking and target network feature space with least squares")

        marking_network.to(device)
        marking_network.eval()
        target_network.to(device)
        target_network.eval()

        # Setup Dataloader
        if not test_set_loader:
            test_set_loader = get_data_loader(batch_size, num_workers)

        logger.info("Extracting image features from marking and target networks.")
        features_marking, _ = extract_features(test_set_loader, marking_network, device, verbose=False)
        features_target, _ = extract_features(test_set_loader, target_network, device, verbose=False)
        features_marking = features_marking.numpy()
        features_target = features_target.numpy()

        # Solve features_marking @ X ~= features_target, then map the target
        # classifier weights back into the marking network's feature space.
        X, residuals, rank, s = np.linalg.lstsq(features_marking, features_target, rcond=None)
        logger.info("Norm of residual: %.4e" %
                    np.linalg.norm(np.dot(features_marking, X) - features_target)**2)
        W = np.matmul(W, X.T)

    # Computing scores
    W /= np.linalg.norm(W, axis=1, keepdims=True)
    scores = np.sum(W * carrier, axis=1)
    # print(f"SCORES: {scores}")
    logger.info("Mean p-value is at %d times sigma" %
                int(scores.mean() * np.sqrt(W.shape[0] * carrier.shape[1])))
    logger.info("Epoch of the model: %d" % target_checkpoint["epoch"])

    p_vals = [cosine_pvalue(c, d=carrier.shape[1]) for c in list(scores)]
    # print(f"Cosine P values: {p_vals}")
    # print(f"np.sum(np.log(p_vals)): {np.sum(np.log(p_vals))}")
    combined_pval = combine_pvalues(p_vals)[1]
    logger.info(f"log10(p)={np.log10(combined_pval)}")

    elapsed_time = t.stop()
    logger.info("Total took %.2f" % elapsed_time)

    return (scores, p_vals, combined_pval)

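# A minimal sketch of invoking the detection routine above. The carrier and
# checkpoint paths are placeholders, and it assumes the target checkpoint was
# produced by one of the training loops in this repo (i.e. contains
# "model_state_dict" and "epoch").
def detect_example():
    marking_network = torchvision.models.resnet18(pretrained=True)
    target_network = torchvision.models.resnet18(pretrained=False, num_classes=10)

    target_checkpoint = torch.load("experiments/resnet18_example/checkpoint.pth",
                                   map_location="cpu")
    target_network.load_state_dict(target_checkpoint["model_state_dict"])

    scores, p_vals, combined_pval = main("carriers/carrier.pth",
                                         marking_network,
                                         target_network,
                                         target_checkpoint,
                                         align=True)
    return combined_pval
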
def main(device, mp_args, experiment_name, optimizer,
         output_directory_root="experiments/resnet18_distributed",
         lr_scheduler=None, epochs=150, batch_size=512, num_workers=1):
    global_rank = mp_args.nr * mp_args.gpus + device
    dist.init_process_group(backend='nccl',
                            init_method='env://',
                            world_size=mp_args.world_size,
                            rank=global_rank)

    output_directory = os.path.join(output_directory_root, experiment_name, f"rank_{global_rank}")
    if not os.path.isdir(output_directory):
        os.makedirs(output_directory, exist_ok=True)

    # Setup regular log file + tensorboard
    logfile_path = os.path.join(output_directory, "logfile.txt")
    setup_logger_tqdm(logfile_path)

    tensorboard_log_directory = os.path.join("runs", "resnet18_distributed",
                                             experiment_name, f"rank_{global_rank}")
    tensorboard_summary_writer = SummaryWriter(log_dir=tensorboard_log_directory)

    # Datasets and Loaders
    train_set_loader, test_set_loader = get_data_loaders(mp_args.world_size, global_rank,
                                                         batch_size, num_workers)

    # Create Model & Optimizer (uses Partial Functions)
    model = torchvision.models.resnet18(pretrained=False, num_classes=10)
    model.to(device)
    optimizer = optimizer(model.parameters())
    model = nn.parallel.DistributedDataParallel(model, device_ids=[device])
    if lr_scheduler:
        lr_scheduler = lr_scheduler(optimizer)

    logger.info("=========== Commencing Training ===========")
    logger.info(f"Epoch Count: {epochs}")
    logger.info(f"Batch Size: {batch_size}")

    # Load Checkpoint
    checkpoint_file_path = os.path.join(output_directory, "checkpoint.pth")
    start_epoch = 0
    if os.path.exists(checkpoint_file_path):
        logger.info("Checkpoint Found - Loading!")

        checkpoint = torch.load(checkpoint_file_path)
        logger.info(f"Last completed epoch: {checkpoint['epoch']}")
        logger.info(f"Average Train Loss: {checkpoint['train_loss']}")
        logger.info(f"Top-1 Train Accuracy: {checkpoint['train_accuracy']}")
        logger.info(f"Top-1 Test Accuracy: {checkpoint['test_accuracy']}")
        start_epoch = checkpoint["epoch"] + 1
        logger.info(f"Resuming at epoch {start_epoch}")

        model.load_state_dict(checkpoint["model_state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
        if lr_scheduler:
            lr_scheduler.load_state_dict(checkpoint["lr_scheduler_state_dict"])
    else:
        logger.info("No checkpoint found, starting from scratch.")

    # Training Loop
    t = Timer()
    for epoch in range(start_epoch, epochs):
        t.start()
        logger.info("-" * 10)
        logger.info(f"Epoch {epoch}")
        logger.info("-" * 10)

        train_loss, train_accuracy = train_model(device, model, train_set_loader, optimizer)
        tensorboard_summary_writer.add_scalar("train_loss", train_loss, epoch)
        tensorboard_summary_writer.add_scalar("train_accuracy", train_accuracy, epoch)

        test_accuracy = test_model(device, model, test_set_loader, optimizer)
        tensorboard_summary_writer.add_scalar("test_accuracy", test_accuracy, epoch)

        scheduler_dict = None
        if lr_scheduler:
            lr_scheduler.step()
            scheduler_dict = lr_scheduler.state_dict()

        # Save Checkpoint
        logger.info("Saving checkpoint.")
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'lr_scheduler_state_dict': scheduler_dict,
            'train_loss': train_loss,
            'train_accuracy': train_accuracy,
            'test_accuracy': test_accuracy
        }, checkpoint_file_path)

        elapsed_time = t.stop()
        logger.info(f"End of epoch {epoch}, took {elapsed_time:0.4f} seconds.")
        logger.info(f"Average Train Loss: {train_loss}")
        logger.info(f"Top-1 Train Accuracy: {train_accuracy}")
        logger.info(f"Top-1 Test Accuracy: {test_accuracy}")
        logger.info("")