def create_handwriting_dataloaders(config, train_json_path, val_json_path,
                                   test_json_path, unlabelled_json_path="",
                                   twohead=False):
  assert config.batchnorm_track  # recommended (for test time invariance to batch size)

  # Transforms:
  if config.sobel:
    tf1, tf2, tf3 = sobel_make_transforms(config)
  else:
    tf1, tf2, tf3 = greyscale_make_transforms(config)

  actual_dataset_root = os.path.join(config.dataset_root, config.dataset)

  if config.leave_out_unlabelled:
    train_files = [train_json_path]
  else:
    assert unlabelled_json_path != ""
    train_files = [train_json_path, unlabelled_json_path]

  # Training data:
  dataloader_list = [
    _create_hw_dataloaders(config, train_files, actual_dataset_root, tf1, tf2)
  ]
  if twohead:
    dataloader_list.append(
      _create_hw_dataloaders(config, train_files, actual_dataset_root, tf1, tf2))

  # Testing data (labelled):
  mapping_assignment_dataloader = _create_hw_mapping_loader(
    config, [val_json_path], actual_dataset_root, tf3)
  mapping_test_dataloader = _create_hw_mapping_loader(
    config, [test_json_path], actual_dataset_root, tf3)

  return dataloader_list, mapping_assignment_dataloader, mapping_test_dataloader
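# Example usage, as a minimal sketch. The config attribute values below are
# assumptions for illustration, not requirements defined in this file; set
# whatever your own config object provides.
#
#   config.sobel = False                 # use greyscale transforms
#   config.leave_out_unlabelled = True   # so unlabelled_json_path may stay ""
#   train_loaders, assign_loader, test_loader = create_handwriting_dataloaders(
#     config, "train.json", "val.json", "test.json", twohead=False)
#
# With twohead=True, dataloader_list holds two independently constructed loader
# sets over the same files, one per clustering head.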
def make_triplets_data(config):
  target_transform = None

  if "CIFAR" in config.dataset:
    config.train_partitions_head_A = [True, False]
    config.train_partitions_head_B = config.train_partitions_head_A

    config.mapping_assignment_partitions = [True, False]
    config.mapping_test_partitions = [True, False]

    if config.dataset == "CIFAR10":
      dataset_class = torchvision.datasets.CIFAR10
    elif config.dataset == "CIFAR100":
      dataset_class = torchvision.datasets.CIFAR100
    elif config.dataset == "CIFAR20":
      dataset_class = torchvision.datasets.CIFAR100
      target_transform = _cifar100_to_cifar20
    else:
      assert False

    # datasets produce either 2 or 5 channel images based on config.include_rgb
    tf1, tf2, tf3 = sobel_make_transforms(config)

  elif config.dataset == "STL10":
    assert config.mix_train
    if not config.stl_leave_out_unlabelled:
      print("adding unlabelled data for STL10")
      config.train_partitions_head_A = ["train+unlabeled", "test"]
    else:
      print("not using unlabelled data for STL10")
      config.train_partitions_head_A = ["train", "test"]

    config.train_partitions_head_B = ["train", "test"]

    config.mapping_assignment_partitions = ["train", "test"]
    config.mapping_test_partitions = ["train", "test"]

    dataset_class = torchvision.datasets.STL10

    # datasets produce either 2 or 5 channel images based on config.include_rgb
    tf1, tf2, tf3 = sobel_make_transforms(config)

  elif config.dataset == "MNIST":
    config.train_partitions_head_A = [True, False]
    config.train_partitions_head_B = config.train_partitions_head_A

    config.mapping_assignment_partitions = [True, False]
    config.mapping_test_partitions = [True, False]

    dataset_class = torchvision.datasets.MNIST

    tf1, tf2, tf3 = greyscale_make_transforms(config)

  else:
    assert False

  dataloaders = _create_dataloaders(config, dataset_class, tf1, tf2,
                                    partitions=config.train_partitions_head_A,
                                    target_transform=target_transform)
  dataloader_original = dataloaders[0]
  dataloader_positive = dataloaders[1]

  shuffled_dataloaders = _create_dataloaders(config, dataset_class, tf1, tf2,
                                             partitions=config.train_partitions_head_A,
                                             target_transform=target_transform,
                                             shuffle=True)
  dataloader_negative = shuffled_dataloaders[0]

  # since this is fully unsupervised, assign dataloader = test dataloader
  dataloader_test = _create_mapping_loader(
    config, dataset_class, tf3,
    partitions=config.mapping_test_partitions,
    target_transform=target_transform)

  return dataloader_original, dataloader_positive, dataloader_negative, \
         dataloader_test
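# How the triplet loaders are meant to be consumed, as a hedged sketch (the
# loop and triplet_loss below are illustrative and not defined in this file).
# The original and positive loaders traverse the data in the same order, so
# batch i of each contains the same images under different transforms; the
# negative loader is shuffled, so its batch i is, with high probability, a
# batch of different images.
#
#   for (x_orig, _), (x_pos, _), (x_neg, _) in zip(dataloader_original,
#                                                  dataloader_positive,
#                                                  dataloader_negative):
#     loss = triplet_loss(net(x_orig), net(x_pos), net(x_neg))  # hypothetical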
def create_basic_clustering_dataloaders(config):
  """
  My original data loading code is complex to cover all my experiments. Here is
  a simple version. Use it to replace cluster_twohead_create_dataloaders() in
  the scripts.

  This uses ImageFolder, but you could use your own subclass of
  torch.utils.data.Dataset.
  (ImageFolder data is not shuffled, so an ideally deterministic random sampler
  is needed.)

  :param config: Requires num_dataloaders and the values used by
    *make_transforms(), e.g. crop size, input size etc.
  :return: Training and testing dataloaders
  """
  # Change these according to your data:
  greyscale = False
  train_data_path = os.path.join(config.dataset_root, "train")
  test_val_data_path = os.path.join(config.dataset_root, "none")
  test_data_path = os.path.join(config.dataset_root, "none")

  assert config.batchnorm_track  # recommended (for test time invariance to batch size)

  # Transforms:
  if greyscale:
    tf1, tf2, tf3 = greyscale_make_transforms(config)
  else:
    tf1, tf2, tf3 = sobel_make_transforms(config)

  # Training data:
  # main output head (B), auxiliary overclustering head (A), same data for both
  # (note: no trailing comma after the ImageFolder call; it must be a Dataset,
  # not a one-element tuple)
  dataset_head_B = torchvision.datasets.ImageFolder(root=train_data_path,
                                                    transform=tf1)
  datasets_tf_head_B = [
    torchvision.datasets.ImageFolder(root=train_data_path, transform=tf2)
    for _ in range(config.num_dataloaders)
  ]
  dataloaders_head_B = [torch.utils.data.DataLoader(
    dataset_head_B,
    batch_size=config.dataloader_batch_sz,
    shuffle=False,
    sampler=DeterministicRandomSampler(dataset_head_B),
    num_workers=0,
    drop_last=False)] + \
    [torch.utils.data.DataLoader(
      datasets_tf_head_B[i],
      batch_size=config.dataloader_batch_sz,
      shuffle=False,
      sampler=DeterministicRandomSampler(datasets_tf_head_B[i]),
      num_workers=0,
      drop_last=False) for i in range(config.num_dataloaders)]

  dataset_head_A = torchvision.datasets.ImageFolder(root=train_data_path,
                                                    transform=tf1)
  datasets_tf_head_A = [
    torchvision.datasets.ImageFolder(root=train_data_path, transform=tf2)
    for _ in range(config.num_dataloaders)
  ]
  dataloaders_head_A = [torch.utils.data.DataLoader(
    dataset_head_A,
    batch_size=config.dataloader_batch_sz,
    shuffle=False,
    sampler=DeterministicRandomSampler(dataset_head_A),
    num_workers=0,
    drop_last=False)] + \
    [torch.utils.data.DataLoader(
      datasets_tf_head_A[i],
      batch_size=config.dataloader_batch_sz,
      shuffle=False,
      sampler=DeterministicRandomSampler(datasets_tf_head_A[i]),
      num_workers=0,
      drop_last=False) for i in range(config.num_dataloaders)]

  # Testing data (labelled):
  mapping_assignment_dataloader, mapping_test_dataloader = None, None
  if os.path.exists(test_data_path):
    mapping_assignment_dataset = torchvision.datasets.ImageFolder(
      test_val_data_path, transform=tf3)
    mapping_assignment_dataloader = torch.utils.data.DataLoader(
      mapping_assignment_dataset,
      batch_size=config.batch_sz,
      shuffle=False,
      sampler=DeterministicRandomSampler(mapping_assignment_dataset),
      num_workers=0,
      drop_last=False)

    mapping_test_dataset = torchvision.datasets.ImageFolder(test_data_path,
                                                            transform=tf3)
    mapping_test_dataloader = torch.utils.data.DataLoader(
      mapping_test_dataset,
      batch_size=config.batch_sz,
      shuffle=False,
      sampler=DeterministicRandomSampler(mapping_test_dataset),
      num_workers=0,
      drop_last=False)

  return dataloaders_head_A, dataloaders_head_B, \
         mapping_assignment_dataloader, mapping_test_dataloader
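# DeterministicRandomSampler is used above but not defined in this file. Below
# is a minimal sketch of one way it could be implemented, assuming the goal
# stated in the docstring: every loader built over a same-length dataset should
# see the identical "random" order, so the tf1 and tf2 batches stay
# image-aligned across heads and epochs. (torch is assumed to be imported
# already, as it is for the DataLoader calls above.)

class DeterministicRandomSampler(torch.utils.data.Sampler):
  def __init__(self, data_source, seed=0):
    self.num_samples = len(data_source)
    self.seed = seed

  def __iter__(self):
    # Re-seed a private generator on every traversal so all samplers over
    # same-sized datasets yield the same permutation.
    g = torch.Generator()
    g.manual_seed(self.seed)
    return iter(torch.randperm(self.num_samples, generator=g).tolist())

  def __len__(self):
    return self.num_samples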
def cluster_twohead_create_dataloaders(config):
  assert config.mode == "IID"
  assert config.twohead

  target_transform = None

  if "CIFAR" in config.dataset:
    config.train_partitions_head_A = [True, False]
    config.train_partitions_head_B = config.train_partitions_head_A

    config.mapping_assignment_partitions = [True, False]
    config.mapping_test_partitions = [True, False]

    if config.dataset == "CIFAR10":
      dataset_class = torchvision.datasets.CIFAR10
    elif config.dataset == "CIFAR100":
      dataset_class = torchvision.datasets.CIFAR100
    elif config.dataset == "CIFAR20":
      dataset_class = torchvision.datasets.CIFAR100
      target_transform = _cifar100_to_cifar20
    else:
      assert False

    # datasets produce either 2 or 5 channel images based on config.include_rgb
    tf1, tf2, tf3 = sobel_make_transforms(config)

  elif config.dataset == "STL10":
    assert config.mix_train
    if not config.leave_out_unlabelled:
      print("adding unlabelled data for STL10")
      config.train_partitions_head_A = ["train+unlabeled", "test"]
    else:
      print("not using unlabelled data for STL10")
      config.train_partitions_head_A = ["train", "test"]

    config.train_partitions_head_B = ["train", "test"]

    config.mapping_assignment_partitions = ["train", "test"]
    config.mapping_test_partitions = ["train", "test"]

    dataset_class = torchvision.datasets.STL10

    # datasets produce either 2 or 5 channel images based on config.include_rgb
    tf1, tf2, tf3 = sobel_make_transforms(config)

  elif config.dataset == "MNIST":
    config.train_partitions_head_A = [True, False]
    config.train_partitions_head_B = config.train_partitions_head_A

    config.mapping_assignment_partitions = [True, False]
    config.mapping_test_partitions = [True, False]

    dataset_class = torchvision.datasets.MNIST

    tf1, tf2, tf3 = greyscale_make_transforms(config)

  else:
    assert False

  print("Making datasets with %s and %s" % (dataset_class, target_transform))
  sys.stdout.flush()

  dataloaders_head_A = _create_dataloaders(
    config, dataset_class, tf1, tf2,
    partitions=config.train_partitions_head_A,
    target_transform=target_transform)

  dataloaders_head_B = _create_dataloaders(
    config, dataset_class, tf1, tf2,
    partitions=config.train_partitions_head_B,
    target_transform=target_transform)

  mapping_assignment_dataloader = _create_mapping_loader(
    config, dataset_class, tf3,
    partitions=config.mapping_assignment_partitions,
    target_transform=target_transform)

  mapping_test_dataloader = _create_mapping_loader(
    config, dataset_class, tf3,
    partitions=config.mapping_test_partitions,
    target_transform=target_transform)

  return dataloaders_head_A, dataloaders_head_B, \
         mapping_assignment_dataloader, mapping_test_dataloader
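# Shape of the training return values, as a hedged sketch. This assumes
# _create_dataloaders returns one base-transform (tf1) loader followed by
# config.num_dataloaders perturbed (tf2) loaders, all traversing the data in
# the same deterministic order; the loop names are illustrative only.
#
#   loaders = dataloaders_head_A           # [tf1_loader, tf2_loader_0, ...]
#   for batches in zip(*loaders):
#     x = batches[0][0]                    # base-transformed images
#     x_t = [b[0] for b in batches[1:]]    # perturbed versions of the same images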
def cluster_create_dataloaders(config):
  assert config.mode == "IID+"
  assert not config.twohead

  target_transform = None

  # separate train/test sets
  if "CIFAR" in config.dataset:
    config.train_partitions = [True]
    config.mapping_assignment_partitions = [True]
    config.mapping_test_partitions = [False]

    if config.dataset == "CIFAR10":
      dataset_class = torchvision.datasets.CIFAR10
    elif config.dataset == "CIFAR100":
      dataset_class = torchvision.datasets.CIFAR100
    elif config.dataset == "CIFAR20":
      dataset_class = torchvision.datasets.CIFAR100
      target_transform = _cifar100_to_cifar20
    else:
      assert False

    # datasets produce either 2 or 5 channel images based on config.include_rgb
    tf1, tf2, tf3 = sobel_make_transforms(config)

  elif config.dataset == "STL10":
    config.train_partitions = ["train+unlabeled"]
    config.mapping_assignment_partitions = ["train"]
    config.mapping_test_partitions = ["test"]

    dataset_class = torchvision.datasets.STL10

    # datasets produce either 2 or 5 channel images based on config.include_rgb
    tf1, tf2, tf3 = sobel_make_transforms(config)

  elif config.dataset == "MNIST":
    config.train_partitions = [True]
    config.mapping_assignment_partitions = [True]
    config.mapping_test_partitions = [False]

    dataset_class = torchvision.datasets.MNIST

    tf1, tf2, tf3 = greyscale_make_transforms(config)

  else:
    assert False

  print("Making datasets with %s and %s" % (dataset_class, target_transform))
  sys.stdout.flush()

  dataloaders = _create_dataloaders(config, dataset_class, tf1, tf2,
                                    partitions=config.train_partitions,
                                    target_transform=target_transform)

  mapping_assignment_dataloader = _create_mapping_loader(
    config, dataset_class, tf3,
    partitions=config.mapping_assignment_partitions,
    target_transform=target_transform)

  mapping_test_dataloader = _create_mapping_loader(
    config, dataset_class, tf3,
    partitions=config.mapping_test_partitions,
    target_transform=target_transform)

  return dataloaders, mapping_assignment_dataloader, mapping_test_dataloader
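# Note the difference from the two-head "IID" setup above: here ("IID+"),
# training and mapping assignment use only the train partition, and
# mapping_test uses the held-out test partition, so evaluation accuracy is
# measured on unseen data. A minimal call sketch (variable names assumed):
#
#   dataloaders, assign_loader, test_loader = cluster_create_dataloaders(config)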
assert ("MNIST" == config.dataset) dataset_class = torchvision.datasets.MNIST assert (config.train_partitions == [True]) assert (config.mapping_assignment_partitions == [True]) assert (config.mapping_test_partitions == [False]) # append to old results if not hasattr(config, "assign_set_szs_pc_acc") or given_config.rewrite: print("resetting config.assign_set_szs_pc_acc to empty") config.assign_set_szs_pc_acc = {} for pc in new_assign_set_szs_pc: print("doing %f" % pc) sysout.flush() tf1, tf2, tf3 = greyscale_make_transforms(config) mapping_assignment_dataloader = \ _create_mapping_loader(config, dataset_class, tf3, partitions=config.mapping_assignment_partitions, truncate=True, truncate_pc=pc) mapping_test_dataloader = \ _create_mapping_loader(config, dataset_class, tf3, partitions=config.mapping_test_partitions) print("num assign batches: %d" % len(mapping_assignment_dataloader)) num_imgs = len(mapping_assignment_dataloader.dataset) print("num imgs in assign dataset: %d" % num_imgs) # networks and optimisers