def validation(validation_data_loader: torch.utils.data.DataLoader,
               model: "SupervisedModel",
               local_rank: int) -> tuple:
    """
    Evaluate `model` on the batches produced by `validation_data_loader`.

    :param validation_data_loader: Validation data loader yielding `(data, targets)` batches.
    :param model: Classifier; it is called as `model(data)` and its output is fed to
        cross-entropy, i.e. it is treated as unnormalized class scores.
    :param local_rank: Device identifier that tensors are moved to with `.to(local_rank)`.
    :return: Tuple `(sum_loss, num_corrects)`. Both are one-element tensors on `local_rank`:
        the summed (not averaged) cross-entropy loss and the number of correctly
        classified samples seen by this loader.
    """
    # NOTE: the model is left in eval mode; callers must switch back to train mode.
    model.eval()

    sum_loss = torch.tensor([0.]).to(local_rank)
    num_corrects = torch.tensor([0.]).to(local_rank)

    with torch.no_grad():
        for data, targets in validation_data_loader:
            data, targets = data.to(local_rank), targets.to(local_rank)
            unnormalized_features = model(data)

            # Accumulate on-device; calling `.item()` here would force a host sync per batch.
            sum_loss += torch.nn.functional.cross_entropy(
                unnormalized_features, targets, reduction="sum")

            predicted = unnormalized_features.argmax(dim=1)
            num_corrects += (predicted == targets).sum()

    return sum_loss, num_corrects
# Validate the requested CK+ testing split before doing any work.
if test_split < 0 or test_split > 9:
    raise Exception("Testing Split must be in range 0-9.")
print('Using CK+ testing split: {}'.format(test_split))

checkpoint_dir = os.path.join(args.checkpoint_dir,
                              'checkpoints_48_' + str(test_split))
# Call form of `print` so the line parses on Python 2 and 3; the two spaces
# reproduce the output of the former statement `print 'Checkpoint dir: ', x`.
print('Checkpoint dir:  {}'.format(checkpoint_dir))

# Record this process's PID in a per-split file.
pid = os.getpid()
print('PID: {}'.format(pid))
# Text mode because a `str` is written (binary mode rejects `str` on
# Python 3); the context manager closes the file even if the write fails.
with open('pid_' + str(test_split), 'w') as pid_file:
    pid_file.write(str(pid) + '\n')

# Load model
model = SupervisedModel('experiment', './', learning_rate=1e-2)
#util.load_checkpoint(model, "./checkpoints_5/experiment-07m-20d-16h-24m-52s.pkl")
monitor = util.Monitor(model,
                       checkpoint_directory=checkpoint_dir,
                       save_steps=1000)

# Add dropout to fully-connected layer
model.fc4.dropout = 0.5
# Re-run compilation after changing the layer attribute
# (presumably required for the dropout setting to take effect -- confirm).
model._compile()

# Loading CK+ dataset
print('Loading Data')
#supervised_data_loader = SupervisedDataLoaderCrossVal(
#    data_paths.ck_plus_data_path)
#train_data_container = supervised_data_loader.load('train', train_split)
#test_data_container = supervised_data_loader.load('test', train_split)
description='Script to evaluate single checkpoint on TFD.') parser.add_argument("-s", "--split", default='0', help='Training split of TFD to use. (0-4)') parser.add_argument("checkpoint_file", help='Path to single model checkpoint (.pkl) file.') args = parser.parse_args() checkpoint_file = args.checkpoint_file fold = int(args.split) dataset_path = os.path.join(data_paths.tfd_data_path, 'npy_files/TFD_96/split_'+str(fold)) print 'Checkpoint: %s' % checkpoint_file print 'Testing on split %d\n' % fold # Load model model = SupervisedModel('evaluation', './') # Load dataset supervised_data_loader = SupervisedDataLoader(dataset_path) test_data_container = supervised_data_loader.load(2) test_data_container.X = numpy.float32(test_data_container.X) test_data_container.X /= 255.0 test_data_container.X *= 2.0 # Construct evaluator preprocessor = [util.Normer3(filter_size=5, num_channels=1)] evaluator = util.Evaluator(model, test_data_container, checkpoint_file, preprocessor) # For the inputted checkpoint, compute the overall test accuracy
args = parser.parse_args()

print('Start')
# Validate the requested TFD training split before doing any work.
train_split = int(args.split)
if train_split < 0 or train_split > 4:
    raise Exception("Training Split must be in range 0-4.")
print('Using TFD training split: {}'.format(train_split))

# Record this process's PID in a per-split file.
pid = os.getpid()
print('PID: {}'.format(pid))
# Text mode because a `str` is written (binary mode rejects `str` on
# Python 3); the context manager closes the file even if the write fails.
with open('pid_' + str(train_split), 'w') as pid_file:
    pid_file.write(str(pid) + '\n')

# Load model
model = SupervisedModel('experiment', './', learning_rate=1e-2)
monitor = util.Monitor(model,
                       checkpoint_directory='checkpoints_' + str(train_split),
                       save_steps=1000)

# Add dropout flag to fully-connected layer
model.fc4.dropout = 0.5
# Re-run compilation after changing the layer attribute
# (presumably required for the dropout setting to take effect -- confirm).
model._compile()

# Loading TFD dataset
print('Loading Data')
supervised_data_loader = SupervisedDataLoader(
    os.path.join(data_paths.tfd_data_path,
                 'npy_files/TFD_96/split_' + str(train_split)))
# Partition indices: 0 -> train, 1 -> validation, 2 -> test
# (as implied by the variable names; verify against SupervisedDataLoader).
train_data_container = supervised_data_loader.load(0)
val_data_container = supervised_data_loader.load(1)
test_data_container = supervised_data_loader.load(2)
args = parser.parse_args()

print('Start')
# Validate the requested CK+ training split before doing any work.
train_split = int(args.split)
if train_split < 0 or train_split > 9:
    raise Exception("Training Split must be in range 0-9.")
print('Using CK+ training split: {}'.format(train_split))

# Record this process's PID in a per-split file.
pid = os.getpid()
print('PID: {}'.format(pid))
# Text mode because a `str` is written (binary mode rejects `str` on
# Python 3); the context manager closes the file even if the write fails.
with open('pid_' + str(train_split), 'w') as pid_file:
    pid_file.write(str(pid) + '\n')

# Load model
model = SupervisedModel('experiment', './', learning_rate=1e-2)
monitor = util.Monitor(model,
                       checkpoint_directory='checkpoints_' + str(train_split),
                       save_steps=1000)

# Loading CK+ dataset
print('Loading Data')
supervised_data_loader = SupervisedDataLoaderCrossVal(
    data_paths.ck_plus_data_path)
train_data_container = supervised_data_loader.load('train', train_split)
test_data_container = supervised_data_loader.load('test', train_split)

# Convert the training inputs to float32, then rescale by 2/255
# (assumes raw values are 8-bit intensities in 0-255 -- TODO confirm).
X_train = train_data_container.X
X_train = numpy.float32(X_train)
X_train /= 255.0
X_train *= 2.0
args = parser.parse_args()

print('Start')
# The CK+ cross-validation split index must be one of 0..9.
train_split = int(args.split)
if train_split < 0 or train_split > 9:
    raise Exception("Training Split must be in range 0-9.")
print('Using CK+ training split: {}'.format(train_split))

# Write this process's PID to `pid_<split>`.
pid = os.getpid()
print('PID: {}'.format(pid))
# Opened in text mode since the payload is a `str` (a binary-mode file
# rejects `str` on Python 3); `with` guarantees the handle is closed.
with open('pid_' + str(train_split), 'w') as pid_file:
    pid_file.write(str(pid) + '\n')

# Load model
model = SupervisedModel('experiment', './', learning_rate=1e-2)
monitor = util.Monitor(model,
                       checkpoint_directory='checkpoints_' + str(train_split),
                       save_steps=1000)

# Loading CK+ dataset
print('Loading Data')
supervised_data_loader = SupervisedDataLoaderCrossVal(
    data_paths.ck_plus_data_path)
train_data_container = supervised_data_loader.load('train', train_split)
test_data_container = supervised_data_loader.load('test', train_split)

# Cast the training inputs to float32 and multiply by 2/255
# (presumably maps 8-bit pixel values into [0, 2] -- verify input range).
X_train = train_data_container.X
X_train = numpy.float32(X_train)
X_train /= 255.0
X_train *= 2.0
def learning(
        cfg: OmegaConf,
        training_data_loader: torch.utils.data.DataLoader,
        validation_data_loader: torch.utils.data.DataLoader,
        model: SupervisedModel,
) -> None:
    """
    Learning function including evaluation.

    Trains `model` with SGD wrapped in LARC, using linear warm-up followed by
    cosine annealing of the learning rate (both applied per optimisation step).
    After every epoch the model is validated on all ranks, the results are
    reduced to rank 0, and rank 0 logs them and keeps a single checkpoint of
    the best model seen so far.

    :param cfg: Hydra's config instance. Reads `distributed.local_rank`,
        `distributed.world_size`, `parameter.epochs`, `parameter.warmup_epochs`,
        `parameter.momentum`, `parameter.metric`, `experiment.batches`,
        `experiment.decay` and `experiment.output_model_name`.
    :param training_data_loader: Training data loader; its sampler must provide `set_epoch`
        and its dataset must expose a `data` attribute with a length.
    :param validation_data_loader: Validation data loader.
    :param model: Model.
    :return: None
    """
    local_rank = cfg["distributed"]["local_rank"]
    num_gpus = cfg["distributed"]["world_size"]
    epochs = cfg["parameter"]["epochs"]

    num_training_samples = len(training_data_loader.dataset.data)
    # Truncated towards zero because the training loader drops the last incomplete batch.
    steps_per_epoch = int(
        num_training_samples / (cfg["experiment"]["batches"] * num_gpus))
    total_steps = cfg["parameter"]["epochs"] * steps_per_epoch
    warmup_steps = cfg["parameter"]["warmup_epochs"] * steps_per_epoch
    current_step = 0

    # Lower is better for both supported metrics (loss, or 1 - accuracy).
    best_metric = np.finfo(np.float64).max
    # Path of the checkpoint currently on disk; None until the first save.
    save_fname = None

    optimizer = torch.optim.SGD(params=model.parameters(),
                                lr=calculate_initial_lr(cfg),
                                momentum=cfg["parameter"]["momentum"],
                                nesterov=False,
                                weight_decay=cfg["experiment"]["decay"])

    # https://github.com/google-research/simclr/blob/master/lars_optimizer.py#L26
    optimizer = LARC(optimizer=optimizer, trust_coefficient=0.001, clip=False)

    # The scheduler drives the wrapped SGD optimizer and only covers the post-warm-up steps.
    cos_lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer.optim,
        T_max=total_steps - warmup_steps,
    )

    for epoch in range(1, epochs + 1):
        # training
        model.train()
        training_data_loader.sampler.set_epoch(epoch)

        for data, targets in training_data_loader:
            # adjust learning rate by applying linear warming
            if current_step <= warmup_steps:
                lr = calculate_lr(cfg, warmup_steps, current_step)
                for param_group in optimizer.param_groups:
                    param_group["lr"] = lr

            optimizer.zero_grad()
            data, targets = data.to(local_rank), targets.to(local_rank)
            unnormalized_features = model(data)
            loss = torch.nn.functional.cross_entropy(unnormalized_features, targets)
            loss.backward()
            optimizer.step()

            # adjust learning rate by applying cosine annealing
            if current_step > warmup_steps:
                cos_lr_scheduler.step()

            current_step += 1

        if local_rank == 0:
            # Reports the loss of the last training batch of this epoch.
            logger_line = "Epoch:{}/{} progress:{:.3f} loss:{:.3f}, lr:{:.7f}".format(
                epoch, epochs, epoch / epochs, loss.item(),
                optimizer.param_groups[0]["lr"])

        # Validation runs on every rank after every epoch (warm-up epochs included).
        sum_val_loss, num_val_corrects = validation(validation_data_loader, model, local_rank)

        # Sum the per-rank totals onto rank 0.
        torch.distributed.barrier()
        torch.distributed.reduce(sum_val_loss, dst=0)
        torch.distributed.reduce(num_val_corrects, dst=0)

        num_val_samples = len(validation_data_loader.dataset)

        # logging and save checkpoint
        if local_rank == 0:
            validation_loss = sum_val_loss.item() / num_val_samples
            validation_acc = num_val_corrects.item() / num_val_samples

            logging.info(logger_line + " val loss:{:.3f}, val acc:{:.2f}%".format(
                validation_loss, validation_acc * 100.))

            if cfg["parameter"]["metric"] == "loss":
                metric = validation_loss
            else:
                metric = 1. - validation_acc

            if metric <= best_metric:
                # Keep only one checkpoint on disk: drop the previous best first.
                if save_fname is not None and os.path.exists(save_fname):
                    os.remove(save_fname)

                save_fname = "epoch={}-{}".format(
                    epoch, cfg["experiment"]["output_model_name"])
                torch.save(model.state_dict(), save_fname)
                # Remember the new best; without this every epoch would overwrite the checkpoint.
                best_metric = metric
def main(cfg: OmegaConf):
    """
    Entry point: set up logging, DDP and seeds, build the CIFAR data loaders
    and the DDP-wrapped model, then run `learning`.

    :param cfg: Hydra's config instance. Reads `parameter.seed`, `parameter.num_workers`,
        `distributed.local_rank`, `experiment.name`, `experiment.strength`,
        `experiment.batches` and `experiment.base_cnn` (plus whatever
        `check_hydra_conf`, `init_ddp` and `learning` read).
    :raises ValueError: if `experiment.name` is neither "cifar10" nor "cifar100"
        (case-insensitive).
    """
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.INFO)
    stream_handler.terminator = ""
    logger.addHandler(stream_handler)

    check_hydra_conf(cfg)
    init_ddp(cfg)

    # fix seed
    seed = cfg["parameter"]["seed"]
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    rank = cfg["distributed"]["local_rank"]
    logger.info("Using {}".format(rank))

    root = "~/pytorch_datasets"

    # Both datasets share the same pipeline; only the class and label count differ.
    dataset_name = cfg["experiment"]["name"].lower()
    if dataset_name == "cifar10":
        dataset_cls, num_classes = torchvision.datasets.CIFAR10, 10
    elif dataset_name == "cifar100":
        dataset_cls, num_classes = torchvision.datasets.CIFAR100, 100
    else:
        # Explicit raise instead of `assert`, which is stripped under `python -O`.
        raise ValueError(
            "Unsupported experiment.name {!r}; expected 'cifar10' or 'cifar100'.".format(
                cfg["experiment"]["name"]))

    # Training uses the SimCLR augmentation; validation only converts images to tensors.
    transform = create_simclr_data_augmentation(
        cfg["experiment"]["strength"], size=32)
    training_dataset = dataset_cls(root=root,
                                   train=True,
                                   download=True,
                                   transform=transform)
    validation_dataset = dataset_cls(
        root=root,
        train=False,
        download=True,
        transform=torchvision.transforms.Compose([
            torchvision.transforms.ToTensor(),
        ]))

    sampler = torch.utils.data.distributed.DistributedSampler(training_dataset,
                                                              shuffle=True)
    training_data_loader = DataLoader(
        dataset=training_dataset,
        sampler=sampler,
        num_workers=cfg["parameter"]["num_workers"],
        batch_size=cfg["experiment"]["batches"],
        pin_memory=True,
        drop_last=True,
    )

    validation_sampler = torch.utils.data.distributed.DistributedSampler(
        validation_dataset, shuffle=False)
    validation_data_loader = DataLoader(
        dataset=validation_dataset,
        sampler=validation_sampler,
        num_workers=cfg["parameter"]["num_workers"],
        batch_size=cfg["experiment"]["batches"],
        pin_memory=True,
        drop_last=False,
    )

    model = SupervisedModel(base_cnn=cfg["experiment"]["base_cnn"],
                            num_classes=num_classes)
    # Convert BatchNorm layers to SyncBatchNorm before moving to the device and wrapping in DDP.
    model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    model = model.to(rank)
    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[rank])

    learning(cfg, training_data_loader, validation_data_loader, model)
args = parser.parse_args()

print('Start')
# The TFD split index must be one of 0..4.
train_split = int(args.split)
if train_split < 0 or train_split > 4:
    raise Exception("Training Split must be in range 0-4.")
print('Using TFD training split: {}'.format(train_split))

# Write this process's PID to `pid_<split>`.
pid = os.getpid()
print('PID: {}'.format(pid))
# Opened in text mode since the payload is a `str` (a binary-mode file
# rejects `str` on Python 3); `with` guarantees the handle is closed.
with open('pid_' + str(train_split), 'w') as pid_file:
    pid_file.write(str(pid) + '\n')

# Load model
model = SupervisedModel('experiment', './', learning_rate=1e-2)
monitor = util.Monitor(model,
                       checkpoint_directory='checkpoints_' + str(train_split),
                       save_steps=1000)

# Loading TFD dataset
print('Loading Data')
supervised_data_loader = SupervisedDataLoader(
    os.path.join(data_paths.tfd_data_path,
                 'npy_files/TFD_96/split_' + str(train_split)))
# Partition indices: 0 -> train, 1 -> validation, 2 -> test
# (as implied by the variable names; verify against SupervisedDataLoader).
train_data_container = supervised_data_loader.load(0)
val_data_container = supervised_data_loader.load(1)
test_data_container = supervised_data_loader.load(2)

X_train = train_data_container.X
y_train = train_data_container.y
X_val = val_data_container.X