def calculate_p_values(marking_percentages, batch_size):
    logfile_path = "experiments/table1_imagenet/detect_radioactivity.log"
    setup_logger_tqdm(logfile_path)

    p_values = []
    for run in marking_percentages:
        run_name = f"{run}_percent"
        carrier_path = f"experiments/table1_imagenet/{run_name}/carriers.pth"
        target_checkpoint_path = f"experiments/table1_imagenet/{run_name}/marked_classifier/rank_0/checkpoint.pth"
        target_checkpoint = torch.load(target_checkpoint_path)
        target_checkpoint['model_state_dict'] = {
            k.replace("module.", ""): v
            for k, v in target_checkpoint['model_state_dict'].items()
        }

        (scores, p_vals, combined_pval) = detect_radioactivity(
            carrier_path, None, None, target_checkpoint,
            batch_size=batch_size, align=False, test_set_loader=None)
        p_values.append(combined_pval)

    return p_values
def calculate_p_values(marking_percentages, marking_checkpoint_path, table_number, align):
    logfile_path = f"experiments/table{table_number}_imagenette/detect_radioactivity.log"
    setup_logger_tqdm(logfile_path)

    p_values = []

    # Load the marking network and remove its fully connected layer
    marking_network = torchvision.models.resnet18(pretrained=False, num_classes=10)
    marking_checkpoint = torch.load(marking_checkpoint_path)
    marking_network.load_state_dict(marking_checkpoint["model_state_dict"])
    marking_network.fc = nn.Sequential()

    for run in marking_percentages:
        run_name = f"{run}_percent"
        carrier_path = f"experiments/table1_imagenette/{run_name}/carriers.pth"

        target_network = torchvision.models.resnet18(pretrained=False, num_classes=10)
        target_checkpoint_path = f"experiments/table{table_number}_imagenette/{run_name}/marked_classifier/checkpoint.pth"
        target_checkpoint = torch.load(target_checkpoint_path)
        target_network.load_state_dict(target_checkpoint["model_state_dict"])
        target_network.fc = nn.Sequential()

        (scores, p_vals, combined_pval) = detect_radioactivity(
            carrier_path, marking_network, target_network,
            target_checkpoint, align=align)
        p_values.append(combined_pval)

    return p_values
def calculate_p_values(marking_percentages):
    logfile_path = "experiments/cifar100/table1/detect_radioactivity.log"
    setup_logger_tqdm(logfile_path)

    p_values = []
    for run in marking_percentages:
        run_name = f"{run}_percent"
        carrier_path = f"experiments/cifar100/table1/{run_name}/carriers.pth"

        # target_network = torchvision.models.resnet18(pretrained=False, num_classes=10)
        target_network = resnet(num_classes=100, depth=164, block_name='bottleneck')
        target_checkpoint_path = f"experiments/cifar100/table1/{run_name}/marked_classifier/checkpoint.pth"
        target_checkpoint = torch.load(target_checkpoint_path)
        target_network.load_state_dict({
            k.replace("module.", ""): v
            for k, v in target_checkpoint["model_state_dict"].items()
        })
        target_network.fc = nn.Sequential()

        # No need to align when only retraining the logistic regression
        (scores, p_vals, combined_pval) = detect_radioactivity(
            carrier_path, None, None, target_checkpoint, align=False)
        p_values.append(combined_pval)

    return p_values
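# Hedged usage sketch (not from the original source): how one of these
# calculate_p_values helpers might be driven. The percentage list is an
# illustrative assumption; each entry must match a "{run}_percent" experiment
# directory created by the marking stage.
if __name__ == '__main__':
    marking_percentages = [1, 2, 5, 10, 20]
    p_values = calculate_p_values(marking_percentages)
    for percent, p_value in zip(marking_percentages, p_values):
        logger.info(f"{percent}% marked data -> combined p-value: {p_value}")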
def do_marking_run_multiclass(overall_marking_percentage, experiment_directory,
                              tensorboard_log_directory, marking_network,
                              training_set, mp_args):
    # Setup experiment directory
    if os.path.isdir(experiment_directory):
        error_message = f"Directory {experiment_directory} already exists. By default we assume you don't want to " \
                        "repeat the marking stage."
        logger.info(error_message)
        return

    os.makedirs(experiment_directory)
    logfile_path = os.path.join(experiment_directory, 'marking.log')
    setup_logger_tqdm(filepath=logfile_path)

    # Carriers
    marking_network_fc_feature_size = 512
    carriers = torch.randn(len(training_set.classes), marking_network_fc_feature_size)
    carriers /= torch.norm(carriers, dim=1, keepdim=True)
    torch.save(carriers, os.path.join(experiment_directory, "carriers.pth"))

    # { 0 : [(image1, original_index1), (image2, original_index2), ...], 1 : [...] }
    image_data = get_images_for_marking_multiclass(training_set,
                                                   tensorboard_log_directory,
                                                   overall_marking_percentage)

    for class_id, image_list in image_data.items():
        if image_list:
            images, original_indexes = map(list, zip(*image_list))
            epochs = 250
            batch_size = 8
            output_directory = os.path.join(experiment_directory, "marked_images")
            # augmentation = differentiable_augmentations.CenterCrop(256, 224)
            augmentation = differentiable_augmentations.RandomResizedCropFlip(256)
            tensorboard_class_log = os.path.join(tensorboard_log_directory, f"class_{class_id}")
            do_marking_dist(mp_args, images, original_indexes, output_directory,
                            marking_network, carriers, class_id, NORMALIZE_IMAGENET,
                            tensorboard_class_log, epochs=epochs, batch_size=batch_size,
                            overwrite=False, augmentation=augmentation)

    # Record marking completion
    with open(os.path.join(experiment_directory, "marking.complete"), "w") as fh:
        fh.write("1")
def main():
    logfile_path = "download_pushshift_dumps.log"
    setup_logger_tqdm(logfile_path)  # Logger will write messages using tqdm.write

    args = parser.parse_args()

    start_month, start_year = tuple(map(int, args.start_period.split(",")))
    start_date = datetime.datetime(start_year, start_month, 1)

    if args.finish_period:
        finish_month, finish_year = tuple(map(int, args.finish_period.split(",")))
        end_date = datetime.datetime(finish_year, finish_month, 1)
    else:
        end_date = datetime.datetime.now()

    logger.info("Running Script - PushShift submission dumps to sqlite")
    logger.info("Downloading and processing dumps in the following range:")
    logger.info(start_date.strftime("Start Period: %m-%Y"))
    logger.info(end_date.strftime("End Period: %m-%Y"))

    dumps_directory = os.path.join(args.output_directory, "dumps")
    if os.path.isdir(dumps_directory):
        message = f"Directory '{dumps_directory}' already exists. If there are done files" \
                  " in the directory then those particular months will be skipped. Delete" \
                  " the done files or the directory to avoid this."
        logger.info(message)
        if not cutie.prompt_yes_or_no('Do you want to continue?'):
            sys.exit(0)

    os.makedirs(dumps_directory, exist_ok=True)

    logger.info("Building PushShift submission dump file list...")
    url_list = build_file_list(start_date, end_date)

    logger.info("Getting sha256sums")
    sha256sums = get_sha256sums()

    # Download and process
    logger.info("Commencing download and processing into sqlite.")
    results = []
    for url in url_list:
        result = reddit_processing(url, sha256sums, dumps_directory, args.keep_dumps)
        results.append(result)
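# Hedged sketch (assumed, not the repo's actual helper): build_file_list could
# simply walk month by month between the two dates. The URL template is an
# illustrative assumption - PushShift changed compression formats over the years,
# so the real helper likely varies the extension by date.
def build_file_list(start_date, end_date):
    url_template = "https://files.pushshift.io/reddit/submissions/RS_{year}-{month:02d}.zst"  # assumed
    urls = []
    current = start_date
    while current <= end_date:
        urls.append(url_template.format(year=current.year, month=current.month))
        # Advance to the first day of the next month
        if current.month == 12:
            current = datetime.datetime(current.year + 1, 1, 1)
        else:
            current = datetime.datetime(current.year, current.month + 1, 1)
    return urls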
def calculate_p_values(marking_percentages, batch_size, test_set_loader, num_classes):
    logfile_path = "experiments/table2_imagenet/detect_radioactivity.log"
    setup_logger_tqdm(logfile_path)

    p_values = []

    # Load the marking network and remove its fully connected layer
    marking_network = torchvision.models.resnet18(pretrained=True)
    marking_network.fc = nn.Sequential()

    for run in marking_percentages:
        run_name = f"{run}_percent"
        carrier_path = f"experiments/table1_imagenet/{run_name}/carriers.pth"  # Reuse the table 1 carriers

        target_network = torchvision.models.resnet18(pretrained=False, num_classes=num_classes)
        target_checkpoint_path = f"experiments/table2_imagenet/{run_name}/marked_classifier/rank_0/checkpoint.pth"
        target_checkpoint = torch.load(target_checkpoint_path)
        target_checkpoint['model_state_dict'] = {
            k.replace("module.", ""): v
            for k, v in target_checkpoint['model_state_dict'].items()
        }
        target_network.load_state_dict(target_checkpoint['model_state_dict'])
        target_network.fc = nn.Sequential()

        (scores, p_vals, combined_pval) = detect_radioactivity(
            carrier_path, marking_network, target_network, target_checkpoint,
            batch_size=batch_size, align=True, test_set_loader=test_set_loader)
        p_values.append(combined_pval)

    return p_values
def do_marking_run_multiclass(overall_marking_percentage, experiment_directory,
                              tensorboard_log_directory, marking_network, training_set):
    # Setup experiment directory
    if os.path.isdir(experiment_directory):
        error_message = f"Directory {experiment_directory} already exists. By default we assume you don't want to " \
                        "repeat the marking stage."
        logger.info(error_message)
        return

    os.makedirs(experiment_directory)
    logfile_path = os.path.join(experiment_directory, 'marking.log')
    setup_logger_tqdm(filepath=logfile_path)

    # Carriers
    marking_network_fc_feature_size = 512
    carriers = torch.randn(len(training_set.classes), marking_network_fc_feature_size)
    carriers /= torch.norm(carriers, dim=1, keepdim=True)
    torch.save(carriers, os.path.join(experiment_directory, "carriers.pth"))

    # { 0 : [(image1, original_index1), (image2, original_index2), ...], 1 : [...] }
    image_data = get_images_for_marking_multiclass(training_set,
                                                   tensorboard_log_directory,
                                                   overall_marking_percentage)

    marked_images = []
    for class_id, image_list in image_data.items():
        if image_list:
            images, original_indexes = map(list, zip(*image_list))
            optimizer = lambda x: torch.optim.AdamW(x)
            epochs = 250
            batch_size = 8
            output_directory = os.path.join(experiment_directory, "marked_images")
            augmentation = differentiable_augmentations.CenterCrop(256, 224)
            tensorboard_class_log = os.path.join(tensorboard_log_directory, f"class_{class_id}")
            marked_images_temp = do_marking(output_directory, marking_network, images,
                                            original_indexes, carriers, class_id,
                                            NORMALIZE_IMAGENETTE, optimizer,
                                            tensorboard_class_log, epochs=epochs,
                                            batch_size=batch_size, overwrite=False,
                                            augmentation=augmentation)
            marked_images = marked_images + marked_images_temp

    # Show marked images in Tensorboard - center crop for the grid
    from PIL import Image as im
    tensorboard_summary_writer = SummaryWriter(log_dir=tensorboard_log_directory)
    transform = transforms.Compose([transforms.CenterCrop(256), transforms.ToTensor()])
    images_for_tensorboard = [transform(im.fromarray(x)) for x in marked_images]
    img_grid = torchvision.utils.make_grid(images_for_tensorboard, nrow=3)
    tensorboard_summary_writer.add_image('marked_images', img_grid)

    # Record marking completion
    with open(os.path.join(experiment_directory, "marking.complete"), "w") as fh:
        fh.write("1")
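# Hedged driver sketch (assumed, not from the original source): load the
# Imagenette-trained marking network and run the marking stage for each
# percentage. The checkpoint path, percentage list, and directory layout are
# illustrative assumptions following the patterns used elsewhere in this repo.
marking_network = torchvision.models.resnet18(pretrained=False, num_classes=10)
marking_checkpoint = torch.load("experiments/table1_imagenette/step1/checkpoint.pth")  # assumed path
marking_network.load_state_dict(marking_checkpoint["model_state_dict"])
training_set = torchvision.datasets.ImageFolder(os.path.join(imagenette_path, "train"))

for percentage in [1, 2, 5, 10]:
    run_name = f"{percentage}_percent"
    do_marking_run_multiclass(percentage,
                              f"experiments/table1_imagenette/{run_name}",
                              f"runs/table1_imagenette/{run_name}/marking",
                              marking_network, training_set)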
def do_marking_run(overall_marking_percentage, experiment_directory,
                   tensorboard_log_directory, augment=True):
    # Setup experiment directory
    if os.path.isdir(experiment_directory):
        error_message = f"Directory {experiment_directory} already exists. By default we assume you don't want to " \
                        "repeat the marking stage."
        logger.info(error_message)
        return

    os.makedirs(experiment_directory)
    logfile_path = os.path.join(experiment_directory, 'marking.log')
    setup_logger_tqdm(filepath=logfile_path)

    training_set = torchvision.datasets.CIFAR10(root="experiments/datasets", download=True)

    # Marking network is the resnet18 we trained on CIFAR10
    marking_network = torchvision.models.resnet18(pretrained=False, num_classes=10)
    checkpoint_path = "experiments/table2/step1/checkpoint.pth"
    marking_network_checkpoint = torch.load(checkpoint_path)
    marking_network.load_state_dict(marking_network_checkpoint["model_state_dict"])

    # Carriers
    marking_network_fc_feature_size = 512
    carriers = torch.randn(len(training_set.classes), marking_network_fc_feature_size)
    carriers /= torch.norm(carriers, dim=1, keepdim=True)
    torch.save(carriers, os.path.join(experiment_directory, "carriers.pth"))

    # Load randomly sampled images from a random class along with their original indexes.
    # Assuming each class has an equal number of images, scale class_marking_percentage
    # so the overall marking percentage is met.
    class_marking_percentage = overall_marking_percentage * len(training_set.classes)
    class_id, images, original_indexes = get_images_for_marking_cifar10(
        training_set, tensorboard_log_directory, class_marking_percentage)

    optimizer = lambda x: torch.optim.AdamW(x, lr=0.1)
    epochs = 100
    batch_size = 32
    output_directory = os.path.join(experiment_directory, "marked_images")

    # NOTE: the original only assigned `augmentation` when augment was False,
    # leaving it undefined (a NameError) otherwise. We assume a random
    # crop-and-flip default for 32x32 CIFAR10 images here.
    if augment:
        augmentation = differentiable_augmentations.RandomResizedCropFlip(32)  # assumed default
    else:
        augmentation = None

    marked_images = do_marking(output_directory, marking_network, images,
                               original_indexes, carriers, class_id, NORMALIZE_CIFAR10,
                               optimizer, tensorboard_log_directory, epochs=epochs,
                               batch_size=batch_size, overwrite=True,
                               augmentation=augmentation)

    # Show marked images in Tensorboard
    tensorboard_summary_writer = SummaryWriter(log_dir=tensorboard_log_directory)
    images_for_tensorboard = [transforms.ToTensor()(x) for x in marked_images]
    img_grid = torchvision.utils.make_grid(images_for_tensorboard, nrow=16)
    tensorboard_summary_writer.add_image('marked_images', img_grid)

    # Record marking completion
    with open(os.path.join(experiment_directory, "marking.complete"), "w") as fh:
        fh.write("1")
def main(device, mp_args, dataloader_func, model, optimizer_callback,
         output_directory, tensorboard_log_directory, epochs):
    global_rank = mp_args.nr * mp_args.gpus + device
    dist.init_process_group(backend='nccl', init_method='env://',
                            world_size=mp_args.world_size, rank=global_rank)

    output_directory = os.path.join(output_directory, f"rank_{global_rank}")
    if not os.path.isdir(output_directory):
        os.makedirs(output_directory, exist_ok=True)

    # Setup regular log file
    logfile_path = os.path.join(output_directory, "logfile.txt")
    setup_logger_tqdm(logfile_path)

    # Setup TensorBoard logging
    tensorboard_log_directory = os.path.join(tensorboard_log_directory, f"rank_{global_rank}")
    tensorboard_summary_writer = SummaryWriter(log_dir=tensorboard_log_directory)

    # Dataloaders
    train_set_loader, test_set_loader = dataloader_func(mp_args.world_size, global_rank)

    # Model & Optimizer
    model.to(device)
    optimizer = optimizer_callback(model)
    model = nn.parallel.DistributedDataParallel(model, device_ids=[device])

    logger.info(f"Epoch Count: {epochs}")

    # Load Checkpoint
    checkpoint_file_path = os.path.join(output_directory, "checkpoint.pth")
    start_epoch = 0
    if os.path.exists(checkpoint_file_path):
        logger.info("Checkpoint Found - Loading!")
        checkpoint = torch.load(checkpoint_file_path)
        logger.info(f"Last completed epoch: {checkpoint['epoch']}")
        logger.info(f"Average Train Loss: {checkpoint['train_loss']}")
        logger.info(f"Top-1 Train Accuracy: {checkpoint['train_accuracy']}")
        logger.info(f"Top-1 Test Accuracy: {checkpoint['test_accuracy']}")
        start_epoch = checkpoint["epoch"] + 1
        logger.info(f"Resuming at epoch {start_epoch}")
        model.load_state_dict(checkpoint["model_state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
    else:
        logger.info("No checkpoint found, starting from scratch.")

    # Training Loop
    t = Timer()
    # progress = tqdm(total=epochs, initial=start_epoch, desc="Epochs")
    for epoch in range(start_epoch, epochs):
        t.start()
        logger.info(f"Commence EPOCH {epoch}")

        # Train
        train_loss, train_accuracy = train_model(device, model, train_set_loader, optimizer)
        tensorboard_summary_writer.add_scalar("train_loss", train_loss, epoch)
        tensorboard_summary_writer.add_scalar("train_accuracy", train_accuracy, epoch)

        # Test
        test_accuracy = test_model(device, model, test_set_loader)
        tensorboard_summary_writer.add_scalar("test_accuracy", test_accuracy, epoch)

        # Save Checkpoint
        logger.info("Saving checkpoint.")
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'train_loss': train_loss,
            'train_accuracy': train_accuracy,
            'test_accuracy': test_accuracy
        }, checkpoint_file_path)

        elapsed_time = t.stop()
        logger.info(f"End of epoch {epoch}, took {elapsed_time:0.4f} seconds.")
        logger.info(f"Average Train Loss: {train_loss}")
        logger.info(f"Top-1 Train Accuracy: {train_accuracy}")
        logger.info(f"Top-1 Test Accuracy: {test_accuracy}")
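# Hedged launch sketch (assumed, following the mp_args wiring used elsewhere in
# this repo): spawn one process per GPU; torch.multiprocessing passes the local
# rank as main's first argument (`device`). The remaining arguments mirror
# main's signature above and are illustrative.
import torch.multiprocessing as mp

os.environ['MASTER_ADDR'] = '127.0.0.1'
os.environ['MASTER_PORT'] = '8888'
mp.spawn(main,
         args=(mp_args, dataloader_func, model, optimizer_callback,
               output_directory, tensorboard_log_directory, epochs),
         nprocs=mp_args.gpus)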
    json.dump(len(url_data), open(done_file_path, "w"))
    progress.close()
    logger.info("Done!")


parser_description = 'Scrape urls extracted from Reddit.'
parser = argparse.ArgumentParser(description=parser_description)
parser.add_argument("-dir", "--job_directory", default="")
parser.add_argument("-procs", "--process_count", type=int, default=60)
parser.add_argument("-timeout", "--request_timeout", type=int, default=30)

if __name__ == "__main__":
    logfile_name = "scrape_urls.log"
    setup_logger_tqdm(logfile_name)

    args = parser.parse_args()

    urls_directory = os.path.join(args.job_directory, "urls")
    if not os.path.exists(urls_directory):
        logger.info(f"No 'urls' directory found in '{args.job_directory}', aborting")
        sys.exit(0)

    scrapes_directory = os.path.join(args.job_directory, "scrapes")
    os.makedirs(scrapes_directory, exist_ok=True)
    logger.info(f"Scrapes outputting to: '{scrapes_directory}'")

    scrape_urls(urls_directory, scrapes_directory, args.process_count,
                args.request_timeout)
def main(experiment_name, optimizer,
         output_directory_root="experiments/resnet18_logistic_cifar10",
         epochs=60, batch_size=512, num_workers=1):
    output_directory = os.path.join(output_directory_root, experiment_name)
    if not os.path.isdir(output_directory):
        os.makedirs(output_directory, exist_ok=True)

    # Setup regular log file + tensorboard
    logfile_path = os.path.join(output_directory, "logfile.txt")
    setup_logger_tqdm(logfile_path)
    tensorboard_log_directory = os.path.join("runs", "resnet18_logistic_cifar10", experiment_name)
    tensorboard_summary_writer = SummaryWriter(log_dir=tensorboard_log_directory)

    # Choose Training Device
    use_cuda = torch.cuda.is_available()
    logger.info(f"CUDA Available? {use_cuda}")
    device = "cuda" if use_cuda else "cpu"

    # Datasets and Loaders
    train_set_loader, test_set_loader = get_data_loaders(batch_size, num_workers)

    # Create Model & Optimizer - freeze the backbone and retrain only the final layer
    model = torchvision.models.resnet18(pretrained=True)
    for param in model.parameters():
        param.requires_grad = False
    num_classes = 10
    model.fc = nn.Linear(model.fc.in_features, num_classes)
    model.to(device)
    optimizer = optimizer(model.parameters())

    logger.info("=========== Commencing Training ===========")
    logger.info(f"Epoch Count: {epochs}")
    logger.info(f"Batch Size: {batch_size}")

    # Load Checkpoint
    checkpoint_file_path = os.path.join(output_directory, "checkpoint.pth")
    start_epoch = 0
    if os.path.exists(checkpoint_file_path):
        logger.info("Checkpoint Found - Loading!")
        checkpoint = torch.load(checkpoint_file_path)
        logger.info(f"Last completed epoch: {checkpoint['epoch']}")
        logger.info(f"Average Train Loss: {checkpoint['train_loss']}")
        logger.info(f"Top-1 Train Accuracy: {checkpoint['train_accuracy']}")
        logger.info(f"Top-1 Test Accuracy: {checkpoint['test_accuracy']}")
        start_epoch = checkpoint["epoch"] + 1
        logger.info(f"Resuming at epoch {start_epoch}")
        model.load_state_dict(checkpoint["model_state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
    else:
        logger.info("No checkpoint found, starting from scratch.")

    # Training Loop
    t = Timer()
    for epoch in range(start_epoch, epochs):
        t.start()
        logger.info("-" * 10)
        logger.info(f"Epoch {epoch}")
        logger.info("-" * 10)

        train_loss, train_accuracy = train_model(device, model, train_set_loader, optimizer)
        tensorboard_summary_writer.add_scalar("train_loss", train_loss, epoch)
        tensorboard_summary_writer.add_scalar("train_accuracy", train_accuracy, epoch)

        test_accuracy = test_model(device, model, test_set_loader, optimizer)
        tensorboard_summary_writer.add_scalar("test_accuracy", test_accuracy, epoch)

        # Save Checkpoint
        logger.info("Saving checkpoint.")
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'train_loss': train_loss,
            'train_accuracy': train_accuracy,
            'test_accuracy': test_accuracy
        }, checkpoint_file_path)

        elapsed_time = t.stop()
        logger.info(f"End of epoch {epoch}, took {elapsed_time:0.4f} seconds.")
        logger.info(f"Average Train Loss: {train_loss}")
        logger.info(f"Top-1 Train Accuracy: {train_accuracy}")
        logger.info(f"Top-1 Test Accuracy: {test_accuracy}")
        logger.info("")
                    default=1, type=int, help='number of gpus per node')
parser.add_argument('-nr', '--nr', default=0, type=int,
                    help='ranking within the nodes')


class AnonObject(object):
    def __init__(self):
        pass


if __name__ == '__main__':
    setup_logger_tqdm()  # Commence logging to console

    assert torch.cuda.is_available()
    assert torch.distributed.is_available()

    args = parser.parse_args()

    os.environ['MASTER_ADDR'] = '127.0.0.1'
    os.environ['MASTER_PORT'] = '8888'

    mp_args = AnonObject()
    mp_args.nr = args.nr
    mp_args.gpus = args.gpus
    mp_args.world_size = args.gpus * args.nodes

    main(args.imagenet_path, args.batch_size_step_3, mp_args)
def do_marking_run_multiclass(overall_marking_percentage, experiment_directory,
                              tensorboard_log_directory, augment=True):
    # Setup experiment directory
    if os.path.isdir(experiment_directory):
        error_message = f"Directory {experiment_directory} already exists. By default we assume you don't want to " \
                        "repeat the marking stage."
        logger.info(error_message)
        return

    os.makedirs(experiment_directory)
    logfile_path = os.path.join(experiment_directory, 'marking.log')
    setup_logger_tqdm(filepath=logfile_path)

    training_set = torchvision.datasets.CIFAR100(root="experiments/datasets", download=True)

    # Marking network is the resnet164 we trained on CIFAR100
    # (previously a resnet18 trained on CIFAR10)
    # marking_network = torchvision.models.resnet18(pretrained=False, num_classes=10)
    marking_network = resnet(num_classes=100, depth=164, block_name='bottleneck')
    checkpoint_path = "experiments/cifar100/table1/step1/checkpoint.pth"
    marking_network_checkpoint = torch.load(checkpoint_path)
    marking_network.load_state_dict({
        k.replace("module.", ""): v
        for k, v in marking_network_checkpoint["model_state_dict"].items()
    })

    # Carriers
    # marking_network_fc_feature_size = 512
    marking_network_fc_feature_size = 256
    carriers = torch.randn(len(training_set.classes), marking_network_fc_feature_size)
    carriers /= torch.norm(carriers, dim=1, keepdim=True)
    torch.save(carriers, os.path.join(experiment_directory, "carriers.pth"))

    # { 0 : [(image1, original_index1), (image2, original_index2), ...], 1 : [...] }
    image_data = get_images_for_marking_multiclass_cifar10(
        training_set, tensorboard_log_directory, overall_marking_percentage)

    marked_images = []
    for class_id, image_list in image_data.items():
        if image_list:
            images, original_indexes = map(list, zip(*image_list))
            optimizer = lambda x: torch.optim.AdamW(x, lr=0.1)
            epochs = 100
            batch_size = 32
            output_directory = os.path.join(experiment_directory, "marked_images")

            # NOTE: the original only assigned `augmentation` when augment was
            # False, leaving it undefined (a NameError) otherwise. We assume a
            # random crop-and-flip default for 32x32 CIFAR images here.
            if augment:
                augmentation = differentiable_augmentations.RandomResizedCropFlip(32)  # assumed default
            else:
                augmentation = None

            tensorboard_class_log = os.path.join(tensorboard_log_directory, f"class_{class_id}")
            marked_images_temp = do_marking(output_directory, marking_network, images,
                                            original_indexes, carriers, class_id,
                                            NORMALIZE_CIFAR10, optimizer,
                                            tensorboard_class_log, epochs=epochs,
                                            batch_size=batch_size, overwrite=False,
                                            augmentation=augmentation)
            marked_images = marked_images + marked_images_temp

    # Show marked images in Tensorboard
    tensorboard_summary_writer = SummaryWriter(log_dir=tensorboard_log_directory)
    images_for_tensorboard = [transforms.ToTensor()(x) for x in marked_images]
    img_grid = torchvision.utils.make_grid(images_for_tensorboard, nrow=16)
    tensorboard_summary_writer.add_image('marked_images', img_grid)

    # Record marking completion
    with open(os.path.join(experiment_directory, "marking.complete"), "w") as fh:
        fh.write("1")
    on_error = on_done
    result = pool.map(process_count, progress, tasks, on_error, on_done)
    return result


parser_description = 'Generate minhashes for all documents found.'
parser = argparse.ArgumentParser(description=parser_description)
parser.add_argument("-dir", "--scrape_directory", default="")
parser.add_argument("-procs", "--process_count", type=int, default=4)

if __name__ == '__main__':
    args = parser.parse_args()

    if not os.path.isdir(args.scrape_directory):
        print("Scrape directory doesn't exist, exiting.")
        sys.exit(0)

    with redirect_stdout(open(os.devnull, "w")):
        nltk.download('punkt')

    log_file = "generate_minhashes.log"
    setup_logger_tqdm(log_file)

    logger.info("Generating document level minhashes from 5 gram sets")
    minhashes_by_file = generate_minhashes(args.scrape_directory, args.process_count)

    output_pickle_path = os.path.join(args.scrape_directory, "minhashes.pkl")
    timed_pickle_dump(minhashes_by_file, output_pickle_path, "minhashes_by_file")
def main(imagenette_path, step_3_batch_size):
    setup_logger_tqdm()  # Commence logging to console
    table_1_work(imagenette_path, step_3_batch_size)
    table_2_work(imagenette_path, step_3_batch_size)
        if document_count > (batch_count + 1) * documents_per_batch:
            batch_pickle_file_path = os.path.join(batch_directory, f"batch{batch_count}.pkl")
            timed_pickle_dump(current_batch, batch_pickle_file_path, f"batch {batch_count} minhashes")
            current_batch = []
            batch_count += 1

    if current_batch:
        batch_pickle_file_path = os.path.join(batch_directory, f"batch{batch_count}.pkl")
        timed_pickle_dump(current_batch, batch_pickle_file_path, f"batch {batch_count} minhashes")
        current_batch = None

    file_name_lookup = [file_name for file_name, documents in minhashes]
    file_name_lookup_path = os.path.join(batch_directory, "file_name_lookup.pkl")
    timed_pickle_dump(file_name_lookup, file_name_lookup_path, "Filename lookup")

    document_count_path = os.path.join(batch_directory, "document_count.pkl")
    pickle.dump(total_documents, open(document_count_path, "wb"))


parser = argparse.ArgumentParser(description='Generate batches of minhashes for cassandra lsh dedupe.')
parser.add_argument("-dir", "--directory", default="")
parser.add_argument("-batches", "--number_of_batches", type=int, required=True)

if __name__ == '__main__':
    logfile_path = "minhash_lsh_batching.log"
    setup_logger_tqdm(logfile_path)

    args = parser.parse_args()
    main(args.number_of_batches, args.directory)
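# Hedged consumer sketch (assumed, not from the original source): how a
# downstream LSH dedupe stage might read the artifacts written above. The
# batch_directory value is illustrative; the real layout of each batch entry
# depends on generate_minhashes.
import glob
import pickle

batch_directory = "batches"  # assumed location
file_name_lookup = pickle.load(open(os.path.join(batch_directory, "file_name_lookup.pkl"), "rb"))
total_documents = pickle.load(open(os.path.join(batch_directory, "document_count.pkl"), "rb"))

for batch_pickle_path in sorted(glob.glob(os.path.join(batch_directory, "batch*.pkl"))):
    batch = pickle.load(open(batch_pickle_path, "rb"))
    logger.info(f"Loaded {len(batch)} minhashes from {batch_pickle_path}")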
def main(dataloader_func, model, optimizer_callback, output_directory,
         tensorboard_log_directory, lr_scheduler=None, epochs=150):
    if not os.path.isdir(output_directory):
        os.makedirs(output_directory, exist_ok=True)

    # Setup regular log file
    logfile_path = os.path.join(output_directory, "logfile.txt")
    setup_logger_tqdm(logfile_path)

    # Setup TensorBoard logging
    tensorboard_summary_writer = SummaryWriter(log_dir=tensorboard_log_directory)

    # Choose Training Device
    use_cuda = torch.cuda.is_available()
    logger.info(f"CUDA Available? {use_cuda}")
    device = "cuda" if use_cuda else "cpu"

    # Dataloaders
    train_set_loader, test_set_loader = dataloader_func()

    # Model & Optimizer
    model.to(device)
    optimizer = optimizer_callback(model)
    if lr_scheduler:
        lr_scheduler = lr_scheduler(optimizer)

    logger.info(f"Epoch Count: {epochs}")

    # Load Checkpoint
    checkpoint_file_path = os.path.join(output_directory, "checkpoint.pth")
    start_epoch = 0
    if os.path.exists(checkpoint_file_path):
        logger.info("Checkpoint Found - Loading!")
        checkpoint = torch.load(checkpoint_file_path)
        logger.info(f"Last completed epoch: {checkpoint['epoch']}")
        logger.info(f"Average Train Loss: {checkpoint['train_loss']}")
        logger.info(f"Top-1 Train Accuracy: {checkpoint['train_accuracy']}")
        logger.info(f"Top-1 Test Accuracy: {checkpoint['test_accuracy']}")
        start_epoch = checkpoint["epoch"] + 1
        logger.info(f"Resuming at epoch {start_epoch}")
        model.load_state_dict(checkpoint["model_state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
        if lr_scheduler:
            lr_scheduler.load_state_dict(checkpoint["lr_scheduler_state_dict"])
    else:
        logger.info("No checkpoint found, starting from scratch.")

    # Training Loop
    t = Timer()
    for epoch in range(start_epoch, epochs):
        t.start()
        logger.info(f"Commence EPOCH {epoch}")

        # Train
        train_loss, train_accuracy = train_model(device, model, train_set_loader, optimizer)
        tensorboard_summary_writer.add_scalar("train_loss", train_loss, epoch)
        tensorboard_summary_writer.add_scalar("train_accuracy", train_accuracy, epoch)

        # Test
        test_accuracy = test_model(device, model, test_set_loader, optimizer)
        tensorboard_summary_writer.add_scalar("test_accuracy", test_accuracy, epoch)

        scheduler_dict = None
        if lr_scheduler:
            lr_scheduler.step()
            scheduler_dict = lr_scheduler.state_dict()

        # Save Checkpoint
        logger.info("Saving checkpoint.")
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'lr_scheduler_state_dict': scheduler_dict,
            'train_loss': train_loss,
            'train_accuracy': train_accuracy,
            'test_accuracy': test_accuracy
        }, checkpoint_file_path)

        elapsed_time = t.stop()
        logger.info(f"End of epoch {epoch}, took {elapsed_time:0.4f} seconds.")
        logger.info(f"Average Train Loss: {train_loss}")
        logger.info(f"Top-1 Train Accuracy: {train_accuracy}")
        logger.info(f"Top-1 Test Accuracy: {test_accuracy}")
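# Hedged usage sketch (assumed): main takes callbacks rather than constructed
# objects, so a call site can wire everything up with functools.partial. The
# get_data_loaders signature, hyperparameters, and paths are illustrative
# assumptions.
from functools import partial

dataloader_func = partial(get_data_loaders, 512, 1)  # batch_size, num_workers (assumed signature)
model = torchvision.models.resnet18(pretrained=False, num_classes=10)
optimizer_callback = lambda m: torch.optim.SGD(m.parameters(), lr=0.1, momentum=0.9)
lr_scheduler = partial(torch.optim.lr_scheduler.CosineAnnealingLR, T_max=150)

main(dataloader_func, model, optimizer_callback,
     "experiments/example_run", "runs/example_run",
     lr_scheduler=lr_scheduler, epochs=150)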
    for index in original_indexes:
        images.append(transforms.ToTensor()(training_set.data[index]))
    img_grid = torchvision.utils.make_grid(images, nrow=16)
    tensorboard_summary_writer.add_image('images_for_marking', img_grid)

    return image_data


if __name__ == '__main__':
    # Setup experiment directory, logging
    experiment_directory = "experiments/radioactive"
    if not os.path.isdir(experiment_directory):
        os.makedirs(experiment_directory)
    logfile_path = os.path.join(experiment_directory, 'marking.log')
    setup_logger_tqdm(filepath=logfile_path)

    # Clear old tensorboard logs - main creates extra log dirs
    our_tensorboard_logs = glob.glob('runs/radioactive*')
    for tensorboard_log in our_tensorboard_logs:
        shutil.rmtree(tensorboard_log)
    tensorboard_log_directory = "runs/radioactive"

    # Load randomly sampled images from a random class along with their original indexes
    training_set = torchvision.datasets.CIFAR10(root="experiments/datasets", download=True)
    class_marking_percentage = 10
    class_id, images, original_indexes = get_images_for_marking_cifar10(
        training_set, tensorboard_log_directory, class_marking_percentage)

    # Marking network is a pretrained resnet18
    marking_network = torchvision.models.resnet18(pretrained=True)
def main(device, mp_args, experiment_name, optimizer,
         output_directory_root="experiments/resnet18_distributed",
         lr_scheduler=None, epochs=150, batch_size=512, num_workers=1):
    global_rank = mp_args.nr * mp_args.gpus + device
    dist.init_process_group(backend='nccl', init_method='env://',
                            world_size=mp_args.world_size, rank=global_rank)

    output_directory = os.path.join(output_directory_root, experiment_name,
                                    f"rank_{global_rank}")
    if not os.path.isdir(output_directory):
        os.makedirs(output_directory, exist_ok=True)

    # Setup regular log file + tensorboard
    logfile_path = os.path.join(output_directory, "logfile.txt")
    setup_logger_tqdm(logfile_path)
    tensorboard_log_directory = os.path.join("runs", "resnet18_distributed",
                                             experiment_name, f"rank_{global_rank}")
    tensorboard_summary_writer = SummaryWriter(log_dir=tensorboard_log_directory)

    # Datasets and Loaders
    train_set_loader, test_set_loader = get_data_loaders(mp_args.world_size, global_rank,
                                                         batch_size, num_workers)

    # Create Model & Optimizer (uses Partial Functions)
    model = torchvision.models.resnet18(pretrained=False, num_classes=10)
    model.to(device)
    optimizer = optimizer(model.parameters())
    model = nn.parallel.DistributedDataParallel(model, device_ids=[device])
    if lr_scheduler:
        lr_scheduler = lr_scheduler(optimizer)

    logger.info("=========== Commencing Training ===========")
    logger.info(f"Epoch Count: {epochs}")
    logger.info(f"Batch Size: {batch_size}")

    # Load Checkpoint
    checkpoint_file_path = os.path.join(output_directory, "checkpoint.pth")
    start_epoch = 0
    if os.path.exists(checkpoint_file_path):
        logger.info("Checkpoint Found - Loading!")
        checkpoint = torch.load(checkpoint_file_path)
        logger.info(f"Last completed epoch: {checkpoint['epoch']}")
        logger.info(f"Average Train Loss: {checkpoint['train_loss']}")
        logger.info(f"Top-1 Train Accuracy: {checkpoint['train_accuracy']}")
        logger.info(f"Top-1 Test Accuracy: {checkpoint['test_accuracy']}")
        start_epoch = checkpoint["epoch"] + 1
        logger.info(f"Resuming at epoch {start_epoch}")
        model.load_state_dict(checkpoint["model_state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
        if lr_scheduler:
            lr_scheduler.load_state_dict(checkpoint["lr_scheduler_state_dict"])
    else:
        logger.info("No checkpoint found, starting from scratch.")

    # Training Loop
    t = Timer()
    for epoch in range(start_epoch, epochs):
        t.start()
        logger.info("-" * 10)
        logger.info(f"Epoch {epoch}")
        logger.info("-" * 10)

        train_loss, train_accuracy = train_model(device, model, train_set_loader, optimizer)
        tensorboard_summary_writer.add_scalar("train_loss", train_loss, epoch)
        tensorboard_summary_writer.add_scalar("train_accuracy", train_accuracy, epoch)

        test_accuracy = test_model(device, model, test_set_loader, optimizer)
        tensorboard_summary_writer.add_scalar("test_accuracy", test_accuracy, epoch)

        scheduler_dict = None
        if lr_scheduler:
            lr_scheduler.step()
            scheduler_dict = lr_scheduler.state_dict()

        # Save Checkpoint
        logger.info("Saving checkpoint.")
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'lr_scheduler_state_dict': scheduler_dict,
            'train_loss': train_loss,
            'train_accuracy': train_accuracy,
            'test_accuracy': test_accuracy
        }, checkpoint_file_path)

        elapsed_time = t.stop()
        logger.info(f"End of epoch {epoch}, took {elapsed_time:0.4f} seconds.")
        logger.info(f"Average Train Loss: {train_loss}")
        logger.info(f"Top-1 Train Accuracy: {train_accuracy}")
        logger.info(f"Top-1 Test Accuracy: {test_accuracy}")
        logger.info("")