def main():
    """Run a complete training session configured from the command line.

    The function parses the command line options, creates a time-stamped run
    directory with a TensorBoard writer, stores the options in a text file,
    builds the model and its optimizer, optionally restores a checkpoint and
    then trains, validates and tests once per epoch. A checkpoint is written
    after every epoch and flagged as best whenever the validation loss
    improves.
    """
    # Command line options
    args = parser.parse_args()
    print("Command line options:")
    for arg in vars(args):
        print(arg, getattr(args, arg))

    if args.debug:
        pdb.set_trace()

    # Use the GPU when CUDA is available, otherwise fall back to the CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Launch a writer for the tensorboard summary writer instance
    save_path = 'runs/' + strftime("%Y-%m-%d_%H-%M-%S", gmtime()) + '_' + args.dataset + '_' + args.architecture

    # if we are resuming a previous training, note it in the name
    if args.resume:
        save_path = save_path + '_resumed'
    writer = SummaryWriter(save_path)

    # saving the parsed args to file; the context manager guarantees the
    # handle is released even if one of the writes fails
    log_file = os.path.join(save_path, "stdout")
    with open(log_file, "a") as log:
        for arg in vars(args):
            log.write(arg + ':' + str(getattr(args, arg)) + '\n')

    # Dataset loading
    data_init_method = getattr(datasets, args.dataset)
    dataset = data_init_method(torch.cuda.is_available(), args)

    # add command line options to TensorBoard
    args_to_tensorboard(writer, args)

    # build the model
    # NOTE(review): the class count is taken from args.num_class rather than
    # from dataset.num_classes - confirm that the two always agree.
    model = architectures.Inos_model(args.num_class, args)

    # Parallel container for multi GPU use and cast to available device
    model = torch.nn.DataParallel(model).to(device)
    print(model)

    if not args.pretrained:
        # Initialize the weights of the model with the method selected on the command line
        print("Initializing network with: " + args.weight_init)
        WeightInitializer = WeightInit(args.weight_init)
        WeightInitializer.init_model(model)

    # Define optimizer and learning rate schedule
    # NOTE(review): `criterion` is not defined in this function and is
    # presumably provided at module level - verify.
    optimizer = torch.optim.SGD(model.parameters(), args.learning_rate, momentum=0.9, weight_decay=2e-4)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[30, 60, 80, 100], gamma=0.5)

    epoch = 0
    best_prec = 0
    # infinity is a deterministic "no loss seen yet" sentinel: the first
    # validation loss always counts as an improvement
    best_loss = float('inf')

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            epoch = checkpoint['epoch']
            best_prec = checkpoint['best_prec']
            best_loss = checkpoint['best_loss']
            model.load_state_dict(checkpoint['state_dict'])
            # optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    try:
        # optimize until the final amount of epochs is reached
        while epoch < args.epochs:
            # train
            train(dataset, model, criterion, epoch, optimizer, writer, device, args)

            # evaluate on validation set
            prec, loss = validate(dataset, model, criterion, epoch, writer, device, save_path, args)

            # evaluate on test set; the returned metrics are not used here
            test(dataset, model, criterion, epoch, writer, device, save_path, args)

            # remember best prec@1 and save checkpoint
            is_best = loss < best_loss
            best_loss = min(loss, best_loss)
            best_prec = max(prec, best_prec)
            save_checkpoint(
                {
                    'epoch': epoch,
                    'arch': args.architecture,
                    'state_dict': model.state_dict(),
                    'best_prec': best_prec,
                    'best_loss': best_loss,
                    'optimizer': optimizer.state_dict()
                }, is_best, save_path)

            # increment epoch counters
            epoch += 1
            scheduler.step()
    finally:
        # close the writer even when training is interrupted
        writer.close()
def main():
    """Run (continual) training of a variational model from the command line.

    The function parses the command line options, creates a time-stamped run
    directory with a TensorBoard writer, stores the options in a text file and
    loads the dataset. With ``--incremental-data`` the task order is read from
    a file or derived from the command line, the number of epochs is multiplied
    by the number of task increments and the dataset is wrapped into its
    incremental variant. The model is then built, optionally restored from a
    checkpoint and trained; at every task boundary the dataset and the
    classifier are grown and the optimizer is re-created.

    Raises:
        ValueError: if cross-dataset training is requested without incremental
            data, or if the tasks cannot be divided evenly into increments.
    """
    # Command line options
    args = parser.parse_args()
    print("Command line options:")
    for arg in vars(args):
        print(arg, getattr(args, arg))

    if args.cross_dataset and not args.incremental_data:
        raise ValueError('cross-dataset training possible only if incremental-data flag set')

    # Use the GPU when CUDA is available, otherwise fall back to the CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Launch a writer for the tensorboard summary writer instance
    save_path = 'runs/' + strftime("%Y-%m-%d_%H-%M-%S", gmtime()) + '_' + args.dataset + '_' + args.architecture + \
        '_variational_samples_' + str(args.var_samples) + '_latent_dim_' + str(args.var_latent_dim)

    # add option specific naming to separate tensorboard log files later
    if args.autoregression:
        save_path += '_pixelcnn'

    if args.incremental_data:
        save_path += '_incremental'
        if args.train_incremental_upper_bound:
            save_path += '_upper_bound'
        if args.generative_replay:
            save_path += '_genreplay'
        if args.openset_generative_replay:
            save_path += '_opensetreplay'
    if args.cross_dataset:
        save_path += '_cross_dataset_' + args.dataset_order

    # if we are resuming a previous training, note it in the name
    if args.resume:
        save_path = save_path + '_resumed'
    writer = SummaryWriter(save_path)

    # saving the parsed args to file; the context manager guarantees the
    # handle is released even if dataset construction fails later on
    log_file = os.path.join(save_path, "stdout")
    with open(log_file, "a") as log:
        for arg in vars(args):
            log.write(arg + ':' + str(getattr(args, arg)) + '\n')

    # Dataset loading
    data_init_method = getattr(datasets, args.dataset)
    dataset = data_init_method(torch.cuda.is_available(), args)

    # get the number of classes from the class dictionary
    num_classes = dataset.num_classes

    # we set an epoch multiplier to 1 for isolated training and increase it proportional to amount of tasks in CL
    epoch_multiplier = 1

    if args.incremental_data:
        from lib.Datasets.incremental_dataset import get_incremental_dataset

        # get the method to create the incremental dataset (inherits from the chosen data loader)
        inc_dataset_init_method = get_incremental_dataset(data_init_method, args)

        # different options for class incremental vs. cross-dataset experiments
        if args.cross_dataset:
            task_order = None
            # if a task order file is specified, load the task order from it
            if args.load_task_order:
                # only accept an existing file that ends with extension '.txt'
                if os.path.isfile(args.load_task_order) and args.load_task_order.endswith('.txt'):
                    print("=> loading task order from '{}'".format(args.load_task_order))
                    # SECURITY: unpickling executes arbitrary code, only load
                    # task order files from a trusted source
                    with open(args.load_task_order, 'rb') as fp:
                        task_order = pickle.load(fp)

            # without a usable file fall back to the order given on the command line:
            # split the string at commas and remove blank spaces in dataset names
            if task_order is None:
                task_order = [name.replace(" ", "") for name in args.dataset_order.split(',')]

            # accumulate the number of classes of the datasets that form the base tasks
            num_classes = 0
            for base_task in task_order[:args.num_base_tasks] if args.num_base_tasks >= 0 else []:
                temp_dataset_init_method = getattr(datasets, base_task)
                temp_dataset = temp_dataset_init_method(torch.cuda.is_available(), args)
                num_classes += temp_dataset.num_classes
                del temp_dataset

            # multiply epochs by number of tasks
            if args.num_increment_tasks:
                epoch_multiplier = ((len(task_order) - args.num_base_tasks) / args.num_increment_tasks) + 1
            else:
                # this branch will get active if num_increment_tasks is set to zero. This is useful when training
                # any isolated upper bound with all datasets present from the start.
                epoch_multiplier = 1.0
        else:
            # class incremental
            # if specified load task order from file
            if args.load_task_order:
                if os.path.isfile(args.load_task_order):
                    print("=> loading task order from '{}'".format(args.load_task_order))
                    task_order = np.load(args.load_task_order).tolist()
                else:
                    # if no file is found a random task order is created
                    print("=> no task order found. Creating randomized task order")
                    task_order = np.random.permutation(num_classes).tolist()
            else:
                # if randomize task order is specified create a random task order, else task order is sequential
                task_order = list(range(dataset.num_classes))
                if args.randomize_task_order:
                    task_order = np.random.permutation(num_classes).tolist()

            # save the task order
            np.save(os.path.join(save_path, 'task_order.npy'), task_order)

            # set the number of classes to base tasks + 1 because base tasks is always one less.
            # E.g. if you have 2 classes it's one task. This is a little inconsistent from the naming point of view
            # but we wanted a single variable to work for both class incremental as well as cross-dataset experiments
            num_classes = args.num_base_tasks + 1

            # multiply epochs by number of tasks
            epoch_multiplier = ((len(task_order) - (args.num_base_tasks + 1)) / args.num_increment_tasks) + 1

        print("Task order: ", task_order)
        # log the task order into the text file
        with open(log_file, "a") as log:
            log.write('task_order:' + str(task_order) + '\n')
        args.task_order = task_order

        # this is a little weird, but it needs to be here because the below method pops items from task_order
        args_to_tensorboard(writer, args)

        # explicit check instead of an assert so that it also runs under `python -O`
        if not epoch_multiplier.is_integer():
            raise ValueError("uneven task division, make sure number of tasks are integers.")

        # Get the incremental dataset
        dataset = inc_dataset_init_method(torch.cuda.is_available(), device, task_order, args)
    else:
        # add command line options to TensorBoard
        args_to_tensorboard(writer, args)

    # Get a sample input from the data loader to infer color channels/size
    net_input, _ = next(iter(dataset.train_loader))
    # get the amount of color channels in the input images
    num_colors = net_input.size(1)

    # import model from architectures class
    net_init_method = getattr(architectures, args.architecture)

    # if we are not building an autoregressive model the number of output channels of the model is equivalent to
    # the amount of input channels. For an autoregressive models we set the number of output channels of the
    # non-autoregressive decoder portion according to the command line option below
    if not args.autoregression:
        args.out_channels = num_colors

    # build the model
    model = net_init_method(device, num_classes, num_colors, args)

    # optionally add the autoregressive decoder
    if args.autoregression:
        model.pixelcnn = PixelCNN(device, num_colors, args.out_channels, args.pixel_cnn_channels,
                                  num_layers=args.pixel_cnn_layers, k=args.pixel_cnn_kernel_size,
                                  padding=args.pixel_cnn_kernel_size // 2)

    # Parallel container for multi GPU use and cast to available device
    model = torch.nn.DataParallel(model).to(device)
    print(model)

    # Initialize the weights of the model with the method selected on the command line
    print("Initializing network with: " + args.weight_init)
    WeightInitializer = WeightInit(args.weight_init)
    WeightInitializer.init_model(model)

    # Define optimizer
    # NOTE(review): `criterion` is not defined in this function and is
    # presumably provided at module level - verify.
    optimizer = torch.optim.Adam(model.parameters(), args.learning_rate)

    epoch = 0
    best_prec = 0
    # infinity is a deterministic "no loss seen yet" sentinel: the first
    # validation loss of every task always counts as an improvement
    best_loss = float('inf')

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            epoch = checkpoint['epoch']
            best_prec = checkpoint['best_prec']
            best_loss = checkpoint['best_loss']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # every task is trained for args.epochs epochs
    total_epochs = args.epochs * epoch_multiplier

    try:
        # optimize until final amount of epochs is reached
        while epoch < total_epochs:
            # True on the first epoch of every task
            task_start = epoch % args.epochs == 0

            # visualize the latent space before each task increment and at the end of training if it is 2-D
            if (task_start and epoch > 0) or (epoch + 1) % total_epochs == 0:
                if model.module.latent_dim == 2:
                    print("Calculating and visualizing dataset embedding")
                    # infer the number of current tasks to plot the different classes in the embedding
                    if args.incremental_data:
                        if args.cross_dataset:
                            num_tasks = sum(dataset.num_classes_per_task[:len(dataset.seen_tasks)])
                        else:
                            num_tasks = len(dataset.seen_tasks)
                    else:
                        num_tasks = num_classes

                    zs = get_latent_embedding(model, dataset.train_loader, num_tasks, device)
                    visualize_dataset_in_2d_embedding(writer, zs, args.dataset, save_path, task=num_tasks)

            # continual learning specific part
            if args.incremental_data:
                # at the end of each task increment
                if task_start and epoch > 0:
                    print('Saving the last checkpoint from the previous task ...')
                    save_task_checkpoint(save_path, epoch // args.epochs)

                    print("Incrementing dataset ...")
                    dataset.increment_tasks(
                        model, args.batch_size, args.workers, writer, save_path,
                        is_gpu=torch.cuda.is_available(),
                        upper_bound_baseline=args.train_incremental_upper_bound,
                        generative_replay=args.generative_replay,
                        openset_generative_replay=args.openset_generative_replay,
                        openset_threshold=args.openset_generative_replay_threshold,
                        openset_tailsize=args.openset_weibull_tailsize,
                        autoregression=args.autoregression)

                    # grow the classifier and increment the variable for number of overall classes so we can use
                    # it later
                    if args.cross_dataset:
                        seen_classes = sum(dataset.num_classes_per_task[:len(dataset.seen_tasks)])
                        grow_classifier(model.module.classifier, seen_classes - model.module.num_classes,
                                        WeightInitializer)
                        model.module.num_classes = seen_classes
                    else:
                        model.module.num_classes += args.num_increment_tasks
                        grow_classifier(model.module.classifier, args.num_increment_tasks, WeightInitializer)

                    # reset moving averages etc. of the optimizer
                    optimizer = torch.optim.Adam(model.parameters(), args.learning_rate)

                # change the number of seen classes
                if task_start:
                    model.module.seen_tasks = dataset.seen_tasks

            # train
            train(dataset, model, criterion, epoch, optimizer, writer, device, args)

            # evaluate on validation set
            prec, loss = validate(dataset, model, criterion, epoch, writer, device, save_path, args)

            # remember best prec@1 and save checkpoint
            is_best = loss < best_loss
            best_loss = min(loss, best_loss)
            best_prec = max(prec, best_prec)
            save_checkpoint(
                {
                    'epoch': epoch,
                    'arch': args.architecture,
                    'state_dict': model.state_dict(),
                    'best_prec': best_prec,
                    'best_loss': best_loss,
                    'optimizer': optimizer.state_dict()
                }, is_best, save_path)

            # increment epoch counters
            epoch += 1

            # if a new task begins reset the best prec so that new best model can be stored.
            if args.incremental_data and epoch % args.epochs == 0:
                best_prec = 0
                best_loss = float('inf')
    finally:
        # close the writer even when training is interrupted
        writer.close()
def __train_val_net(self, state_list, state_space_parameters, dataset):
    """Build a generator/discriminator pair, train it and return its reward.

    The generator is built from ``state_list`` and the discriminator from
    ``state_space_parameters``. Both are trained for ``self.args.epochs``
    epochs and validated after each of them. The reward is the largest value
    of ``1 / (discriminator loss + generator loss)`` observed on the
    validation set.

    Parameters:
        state_list: states handed to ``net`` to build the generator
        state_space_parameters: parameters handed to ``net`` and ``discriminator``
        dataset: object that exposes ``val_loader`` and ``train_loader``

    Returns:
        ``[None, None]`` if the estimated memory requirement exceeds the
        available GPU memory or if the generator has none of the
        ``convT_no``/``wrnT_bb_no``/``fc_no`` layers, otherwise the tuple
        ``(reward, train_flag)`` where ``train_flag`` is always ``True``.
    """
    # TODO: for average as reward
    # reward = AverageMeter()
    # TODO: for best reward
    reward = 0.

    net_input, _ = next(iter(dataset.val_loader))

    gen_model = net(state_list, state_space_parameters, net_input, self.args.batch_norm, self.args.drop_out_drop)
    disc_model = discriminator(state_space_parameters, net_input, self.args.discriminator_classes)

    print(gen_model)
    print('-' * 80)
    print('Input size: {}'.format(gen_model.input_size))
    print('-' * 80)
    print('Estimated total gpu usage of model: {gpu_usage:.4f} GB'.format(gpu_usage=gen_model.gpu_usage))

    model_activations_gpu = gen_model.gpu_usage

    cudnn.benchmark = True

    self.WeightInitializer.init_model(gen_model)
    self.WeightInitializer.init_model(disc_model)
    gen_model = gen_model.to(self.device)
    disc_model = disc_model.to(self.device)

    # probe the GPU once so that the printed numbers are exactly the ones the decision below is based on
    gpu_avail = (self.gpu_mem_0.total_mem - self.gpu_mem_0.total_mem * self.gpu_mem_0.get_mem_util()) / 1024.
    # empirical factor of 3 per GPU plus a buffer of 1
    gpu_required = (3. / float(self.args.no_gpus) * model_activations_gpu) + 1

    print('available:{}'.format(gpu_avail))
    print('required per gpu with buffer: {}'.format(gpu_required))
    print('-' * 80)

    # skip architectures that do not fit into the available GPU memory
    if gpu_avail < gpu_required:
        del gen_model, disc_model
        return [None] * 2

    # skip generators without any of the supported layer types
    if not (gen_model.convT_no > 0 or gen_model.wrnT_bb_no > 0 or gen_model.fc_no > 0):
        del gen_model, disc_model
        return [None] * 2

    if int(self.args.no_gpus) > 1:
        gen_model = torch.nn.DataParallel(gen_model)
        disc_model = torch.nn.DataParallel(disc_model)

    # NOTE(review): `size_average` is deprecated in recent PyTorch releases in favour of `reduction='mean'`;
    # it is kept to stay compatible with the PyTorch version this project targets - confirm before changing
    criterion = nn.BCELoss(size_average=True).to(self.device)

    gen_optimizer = optim.SGD(filter(lambda p: p.requires_grad, gen_model.parameters()),
                              lr=self.args.learning_rate, momentum=self.args.momentum,
                              weight_decay=self.args.weight_decay)
    disc_optimizer = optim.SGD(filter(lambda p: p.requires_grad, disc_model.parameters()),
                               lr=self.args.learning_rate, momentum=self.args.momentum,
                               weight_decay=self.args.weight_decay)

    lr_scheduler = LearningRateScheduler(self.args.lr_wr_epochs, len(dataset.train_loader.dataset),
                                         self.args.batch_size, self.args.learning_rate,
                                         self.args.lr_wr_mul, self.args.lr_wr_min)

    # directory handed to `validate` for this architecture; exist_ok avoids the check-then-create race
    save_path_pictures = os.path.join(self.save_path, str(self.count + 1))
    os.makedirs(save_path_pictures, exist_ok=True)

    train_flag = True
    epoch = 0

    while epoch < self.args.epochs:
        # the training losses are not needed for the reward
        train(dataset, gen_model, disc_model, criterion, epoch, gen_optimizer, disc_optimizer,
              lr_scheduler, self.device, self.args)
        disc_losses_valid, gen_losses_valid = validate(dataset, gen_model, disc_model, criterion, epoch,
                                                       self.device, self.args, save_path_pictures)
        reward = max(reward, 1. / (disc_losses_valid + gen_losses_valid))

        # TODO: include early stopping criterion, plotting
        epoch += 1

    del gen_model, disc_model, criterion, disc_optimizer, gen_optimizer, lr_scheduler

    return reward, train_flag
def main():
    """Train every valid skew-normal architecture and log its results to csv.

    The function parses the command line options, creates a time-stamped run
    directory, loads the dataset and asks ``SkewNormalModels`` for all valid
    filter configurations. Starting at ``args.resume_model_id`` each
    configuration is built, trained for up to ``args.epochs`` epochs and
    validated after every epoch. After each architecture the accumulated
    results of all architectures trained so far are written to
    ``model_<id>.csv`` in the run directory.
    """
    # Command line options
    args = parser.parse_args()
    print("Command line options:")
    for arg in vars(args):
        print(arg, getattr(args, arg))

    # Use the GPU when CUDA is available, otherwise fall back to the CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    cudnn.benchmark = True
    num_GPUs = torch.cuda.device_count()

    # Create the save directory for runs if it doesn't exist (exist_ok avoids the check-then-create race)
    os.makedirs('runs', exist_ok=True)

    # Create a time-stamped save path for individual experiment
    save_path = 'runs/' + strftime("%Y-%m-%d_%H-%M-%S", gmtime()) + ';' + args.dataset + ';' + args.architecture
    os.mkdir(save_path)

    # List of values to log to csv
    columns_list = [
        'Filters', 'Parameters', 'Mean', 'Variance', 'Skew', 'BestVal', 'BestValsTrain', 'BestEpoch',
        'LastValPrec', 'LastTrainPrec', 'AllTrain', 'AllVal'
    ]
    # one entry per trained architecture; DataFrame.append no longer exists in pandas >= 2.0,
    # so the rows are collected in a list and the frame is rebuilt from it
    result_rows = []

    # Dataset loading
    data_init_method = getattr(datasets, args.dataset)
    dataset = data_init_method(torch.cuda.is_available(), args)

    # get the amount of color channels in the input images
    net_input, _ = next(iter(dataset.train_loader))
    num_colors = net_input.size(1)

    # import model from architectures class
    net_init_method = getattr(architectures, args.architecture)

    # Get the parameters for all valid skewed models
    SNModels = SkewNormalModels(depth=args.vgg_depth, num_classes=dataset.num_classes, patch_size=args.patch_size)
    skew_model_params = SNModels.get_valid_models()
    num_models = len(skew_model_params["filters"])
    print("Total number of models: ", num_models)

    # Weight-init method
    WeightInitializer = WeightInit(args.weight_init)

    # Optionally resume a previous experiment
    current_id = args.resume_model_id

    for model_id in range(current_id, num_models):
        filters = skew_model_params["filters"][model_id]
        total_params = skew_model_params["total_params"][model_id]
        mean = skew_model_params["means"][model_id]
        variance = skew_model_params["vars"][model_id]
        skew = skew_model_params["skews"][model_id]

        print("Model filters: ", filters)
        print("Model parameters: ", total_params, " mean: ", mean, " var: ", variance, " skew: ", skew)

        model = net_init_method(device, dataset.num_classes, num_colors, args, filters, custom_filters=True)

        # Parallel container for multi GPU use and cast to available device
        model = torch.nn.DataParallel(model).to(device)
        print(model)

        # Initialize the weights of the model
        print("Initializing networks with: " + args.weight_init)
        WeightInitializer.init_model(model)

        # Define criterion and optimizer
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.SGD(model.parameters(), args.learning_rate, momentum=args.momentum,
                                    weight_decay=args.weight_decay, nesterov=args.nesterov)

        # Initialize SGDWR learning rate scheduler
        lr_scheduler = LearningRateScheduler(args.lr_wr_epochs, len(dataset.train_loader.dataset),
                                             args.batch_size, args.learning_rate,
                                             args.lr_wr_mul, args.lr_wr_min)

        # Get estimated GPU memory usage of the model and split batch if too little memory is available
        batch_seq_split_size = args.batch_size
        if torch.cuda.is_available():
            GPUMemory = GPUMem(torch.cuda.is_available())

            # TODO: WEIRD factor... why is this necessary and where does it come from?
            # TODO: the + 1 Gb should be taken from the cache allocator
            mem_scale_factor = 4.0

            # probe the GPU once so that the printed numbers are exactly the ones the decision is based on
            gpu_avail = (GPUMemory.total_mem - GPUMemory.total_mem * GPUMemory.get_mem_util()) / 1024.
            gpu_required = (mem_scale_factor / float(num_GPUs) * model.module.gpu_usage) + 1.

            print('available:{}'.format(gpu_avail))
            print('required per gpu with buffer: {}'.format(gpu_required))

            if gpu_avail < gpu_required:
                # estimate how many samples fit into memory at once
                approx_small_batch_size = ((gpu_avail - 1.) * float(num_GPUs) / mem_scale_factor) // \
                    (model.module.gpu_usage / float(args.batch_size))

                # the chunk size has to divide the batch size: take the proper divisor that is closest to the
                # estimate (the smallest one on ties). Batch sizes without a divisor in this range keep the
                # full batch size instead of the raw, possibly non-positive, estimate
                divisors = [j for j in range(1, (args.batch_size // 2) + 1) if args.batch_size % j == 0]
                if divisors:
                    batch_seq_split_size = min(divisors, key=lambda j: abs(j - approx_small_batch_size))

        # Get training and validation dataset loaders
        dataset.train_loader, dataset.val_loader = dataset.get_dataset_loader(
            batch_seq_split_size, args.workers, device)

        print('sequential batch size split size:{}'.format(batch_seq_split_size))

        epoch = 0
        best_epoch = 0
        best_prec = 0
        best_val_train_prec = 0

        all_train = []
        all_val = []

        # stay defined for the csv row even if no epoch is run at all
        train_prec = None
        prec = None

        while epoch < args.epochs:
            # train for one epoch
            train_prec = train(dataset.train_loader, model, criterion, epoch, optimizer, lr_scheduler, device,
                               batch_seq_split_size, args)
            # evaluate on validation set
            prec = validate(dataset.val_loader, model, criterion, epoch, device, args)

            all_train.append(train_prec)
            all_val.append(prec)

            # remember best prec@1
            if prec > best_prec:
                best_epoch = epoch
                best_val_train_prec = train_prec
                best_prec = prec

            # if architecture doesn't train at all skip it
            if epoch == args.lr_wr_epochs - 1 and train_prec < (2 * 100.0 / dataset.num_classes):
                break

            # increment epoch counters
            epoch += 1
            lr_scheduler.scheduler_epoch += 1

        # append architecture results to csv
        result_rows.append([
            filters, total_params, mean, variance, skew, best_prec, best_val_train_prec, best_epoch,
            prec, train_prec, all_train, all_val
        ])
        df = pd.DataFrame(result_rows, columns=columns_list)
        df.to_csv(save_path + '/model_%03d' % (model_id + 1) + '.csv')

        del model
        del optimizer
def __train_val_net(self, state_list, state_space_parameters, dataset):
    """Build a network from a state list, train it and return its best precision.

    The network is trained for up to ``self.args.epochs`` epochs and validated
    after each of them. Training stops early when the validation precision at
    the end of the first warm restart cycle (epoch ``lr_wr_epochs - 1``) is
    below 15.

    Parameters:
        state_list: states handed to ``net`` to build the network
        state_space_parameters: parameters handed to ``net``
        dataset: object that exposes ``val_loader`` and ``train_loader``

    Returns:
        The tuple ``(spp_filter_size, best_prec, train_flag)`` where
        ``train_flag`` is ``False`` if training was stopped early. If the
        estimated memory requirement exceeds the available GPU memory a list
        of twelve ``None`` values is returned instead.
    """
    best_prec = 0.

    num_classes = len(dataset.val_loader.dataset.class_to_idx)
    net_input, _ = next(iter(dataset.val_loader))

    model = net(state_list, state_space_parameters, num_classes, net_input, self.args.batch_norm,
                self.args.drop_out_drop)

    print(model)
    print('-' * 80)
    print('SPP levels: {}'.format(model.spp_filter_size))
    print('-' * 80)
    print('Estimated total gpu usage of model: {gpu_usage:.4f} GB'.format(gpu_usage=model.gpu_usage))

    model_activations_gpu = model.gpu_usage

    cudnn.benchmark = True

    self.WeightInitializer.init_model(model)
    model = model.to(self.device)

    # probe the GPU once so that the printed numbers are exactly the ones the decision below is based on
    gpu_avail = (self.gpu_mem_0.total_mem - self.gpu_mem_0.total_mem * self.gpu_mem_0.get_mem_util()) / 1024.
    # empirical factor of 3 per GPU plus a buffer of 1
    gpu_required = (3. / float(self.args.no_gpus) * model_activations_gpu) + 1

    print('available:{}'.format(gpu_avail))
    print('required per gpu with buffer: {}'.format(gpu_required))
    print('-' * 80)

    # skip architectures that do not fit into the available GPU memory
    if gpu_avail < gpu_required:
        del model
        # NOTE(review): twelve values are returned here but only three at the end of the method - confirm
        # which arity the caller unpacks
        return [None] * 12

    # decide once whether the model is wrapped so that the unwrapping at the end cannot disagree with it
    # (no_gpus is converted first because it is not guaranteed to be an int)
    multi_gpu = int(self.args.no_gpus) > 1
    if multi_gpu:
        model = torch.nn.DataParallel(model)

    # NOTE(review): `size_average` is deprecated in recent PyTorch releases in favour of `reduction='mean'`;
    # it is kept to stay compatible with the PyTorch version this project targets - confirm before changing
    criterion = nn.BCELoss(size_average=True).to(self.device)

    optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()),
                          lr=self.args.learning_rate, momentum=self.args.momentum,
                          weight_decay=self.args.weight_decay)

    lr_scheduler = LearningRateScheduler(self.args.lr_wr_epochs, len(dataset.train_loader.dataset),
                                         self.args.batch_size, self.args.learning_rate,
                                         self.args.lr_wr_mul, self.args.lr_wr_min)

    train_flag = True
    epoch = 0

    while epoch < self.args.epochs:
        train(dataset, model, criterion, epoch, optimizer, lr_scheduler, self.device, self.args)
        prec = validate(dataset, model, criterion, epoch, self.device, self.args)
        best_prec = max(prec, best_prec)

        # TODO: hard-coded early stopping criterion of last prec < 15%
        if epoch == (self.args.lr_wr_epochs - 1) and float(prec) < (1.5 * 100. / 10):
            train_flag = False
            break

        epoch += 1

    # DataParallel hides the attributes of the wrapped network behind `.module`
    if multi_gpu:
        spp_filter_size = model.module.spp_filter_size
    else:
        spp_filter_size = model.spp_filter_size

    del model, criterion, optimizer, lr_scheduler

    return spp_filter_size, best_prec, train_flag
def main():
    """Run a complete training session configured from the command line.

    Depending on ``--train-var`` and ``--joint`` the matching training,
    validation and loss functions are imported. The function then creates a
    time-stamped run directory with a TensorBoard writer, stores the options
    in a text file, builds the model and its optimizer, optionally restores a
    checkpoint and trains and validates once per epoch. A checkpoint is
    written after every epoch and flagged as best whenever the validation loss
    improves.
    """
    # Command line options
    args = parser.parse_args()
    print("Command line options:")
    for arg in vars(args):
        print(arg, getattr(args, arg))

    # import the correct loss and training functions depending which model to optimize
    # TODO: these could easily be refactored into one function, but we kept it this way for modularity
    if args.train_var:
        if args.joint:
            from lib.Training.train import train_var_joint as train
            from lib.Training.validate import validate_var_joint as validate
            from lib.Training.loss_functions import var_loss_function_joint as criterion
        else:
            from lib.Training.train import train_var as train
            from lib.Training.validate import validate_var as validate
            from lib.Training.loss_functions import var_loss_function as criterion
    else:
        if args.joint:
            from lib.Training.train import train_joint as train
            from lib.Training.validate import validate_joint as validate
            from lib.Training.loss_functions import loss_function_joint as criterion
        else:
            from lib.Training.train import train as train
            from lib.Training.validate import validate as validate
            from lib.Training.loss_functions import loss_function as criterion

    # Use the GPU when CUDA is available, otherwise fall back to the CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Launch a writer for the tensorboard summary writer instance
    save_path = 'runs/' + strftime("%Y-%m-%d_%H-%M-%S", gmtime()) + '_' + args.dataset + '_' + args.architecture + \
        '_dropout_' + str(args.dropout)

    if args.train_var:
        save_path += '_variational_samples_' + str(args.var_samples) + '_latent_dim_' + str(args.var_latent_dim)

    if args.joint:
        save_path += '_joint'

    # if we are resuming a previous training, note it in the name
    if args.resume:
        save_path = save_path + '_resumed'
    writer = SummaryWriter(save_path)

    # saving the parsed args to file; the context manager guarantees the
    # handle is released even if one of the writes fails
    log_file = os.path.join(save_path, "stdout")
    with open(log_file, "a") as log:
        for arg in vars(args):
            log.write(arg + ':' + str(getattr(args, arg)) + '\n')

    # Dataset loading
    data_init_method = getattr(datasets, args.dataset)
    dataset = data_init_method(torch.cuda.is_available(), args)
    # get the number of classes from the class dictionary
    num_classes = dataset.num_classes

    # add command line options to TensorBoard
    args_to_tensorboard(writer, args)

    # Get a sample input from the data loader to infer color channels/size
    net_input, _ = next(iter(dataset.train_loader))
    # get the amount of color channels in the input images
    num_colors = net_input.size(1)

    # import model from architectures class
    net_init_method = getattr(architectures, args.architecture)

    # build the model
    model = net_init_method(device, num_classes, num_colors, args)

    # Parallel container for multi GPU use and cast to available device
    model = torch.nn.DataParallel(model).to(device)
    print(model)

    # Initialize the weights of the model with the method selected on the command line
    print("Initializing network with: " + args.weight_init)
    WeightInitializer = WeightInit(args.weight_init)
    WeightInitializer.init_model(model)

    # Define optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay)

    epoch = 0
    best_prec = 0
    # infinity is a deterministic "no loss seen yet" sentinel: the first
    # validation loss always counts as an improvement
    best_loss = float('inf')

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            epoch = checkpoint['epoch']
            best_prec = checkpoint['best_prec']
            best_loss = checkpoint['best_loss']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    try:
        # optimize until final amount of epochs is reached.
        while epoch < args.epochs:
            # train
            train(dataset, model, criterion, epoch, optimizer, writer, device, args)

            # evaluate on validation set
            prec, loss = validate(dataset, model, criterion, epoch, writer, device, args)

            # remember best prec@1 and save checkpoint
            is_best = loss < best_loss
            best_loss = min(loss, best_loss)
            best_prec = max(prec, best_prec)
            save_checkpoint(
                {
                    'epoch': epoch,
                    'arch': args.architecture,
                    'state_dict': model.state_dict(),
                    'best_prec': best_prec,
                    'best_loss': best_loss,
                    'optimizer': optimizer.state_dict()
                }, is_best, save_path)

            # increment epoch counters
            epoch += 1
    finally:
        # close the writer even when training is interrupted
        writer.close()
def _pick_split_batch_size(batch_size, approx_split_batch_size, num_train_samples):
    """
    picks the largest usable split batch size that does not exceed the estimated one

    Parameters:
        batch_size (int): original batch size; the split batch size has to be one of its factors
        approx_split_batch_size (int): upper bound on the split batch size estimated from the available gpu memory
        num_train_samples (int): number of training samples, used to rule out a terminal batch of size 1

    Returns:
        split_batch_size (int): chosen split batch size, 1 if no candidate >= 2 qualifies
    """
    # walk down from the estimate so that the first hit is the candidate closest to it
    for candidate in range(approx_split_batch_size, 1, -1):
        # a remainder of 0 means the terminal batch is a full one, hence only a remainder of exactly 1 is rejected
        if batch_size % candidate == 0 and num_train_samples % candidate != 1:
            return candidate
    return 1


def train_val_net(state_list, dataset, weight_initializer, device, args, save_path):
    """
    builds a net given a state list, and trains and validates it

    Parameters:
        state_list (list): list of states to build the net
        dataset (lib.Datasets.datasets.CODEBRIM): dataset to train and validate the net on
        weight_initializer (lib.Models.initialization.WeightInit): weight initializer for initializing the weights of
                                                                   the network
        device (torch.device): type of computational device available (cpu / gpu)
        args (argparse.Namespace): parsed command line arguments
        save_path (string): path for saving results to

    Returns:
        memfit (bool): True if the network fits the memory after batch splitting, False otherwise; when False, every
                       other returned value is None except train_flag, which is False
        spp_size: value of the spp_size attribute of the built net
        hard_best_val: maximum of hard_val_all_epochs
        hard_val_all_epochs (list): 'hard' validation value returned by val() for every epoch that was run
        soft_best_val: maximum of soft_val_all_epochs
        soft_val_all_epochs (list): 'soft' validation value returned by val() for every epoch that was run
        train_flag (bool): False if net's been early-stopped, True otherwise
        hard_best_background, hard_best_crack, hard_best_spallation, hard_best_exposed_bars,
        hard_best_efflorescence, hard_best_corrosion_stain: per-class 'hard' validation values returned by val() in
                       the epoch with the best hard_val (the latest such epoch in case of ties)
    """
    # reset the data loaders
    dataset.train_loader, dataset.val_loader, dataset.test_loader = dataset.get_dataset_loader(
        args.batch_size, args.workers, torch.cuda.is_available())

    net_input, _ = next(iter(dataset.train_loader))

    num_classes = dataset.num_classes
    batch_size = net_input.size(0)

    # gets number of available gpus and total gpu memory
    # NOTE(review): num_gpu is 0.0 when no CUDA device is visible, in which case the division by num_gpu further down
    # raises ZeroDivisionError - this function appears to assume at least one gpu is present; confirm
    num_gpu = float(torch.cuda.device_count())
    gpu_mem = GPUMem(torch.device('cuda') == device)

    # builds the net from the state list
    model = Net(state_list, num_classes, net_input, args.batch_norm, args.drop_out_drop)

    print(model)
    print('*' * 80)
    print('no. of spp scales: {}'.format(model.spp_size))
    print('*' * 80)

    # sets cudnn benchmark flag
    cudnn.benchmark = True

    # initializes weights
    weight_initializer.init_model(model)

    # puts model on gpu/cpu
    model = model.to(device)

    # gets available gpu memory
    gpu_avail = (gpu_mem.total_mem - gpu_mem.total_mem * gpu_mem.get_mem_util()) / 1024.
    print('gpu memory available:{gpu_avail:.4f}'.format(gpu_avail=gpu_avail))

    # prints estimated gpu requirement of model but actual memory requirement is higher than what's estimated (from
    # experiments)
    print("model's estimated gpu memory requirement: {gpu_mem_req:.4f} GB".format(gpu_mem_req=model.gpu_mem_req))

    # scaling factor and buffer for matching expected memory requirement with empirically observed memory requirement
    scale_factor = 4.0
    scale_buffer = 1.0
    scaled_gpu_mem_req = (scale_factor / num_gpu) * model.gpu_mem_req + scale_buffer
    print("model's empirically scaled gpu memory requirement: {scaled_gpu_mem_req:.4f}".format(
        scaled_gpu_mem_req=scaled_gpu_mem_req))

    split_batch_size = batch_size
    # splits batch into smaller batches
    if gpu_avail < scaled_gpu_mem_req:
        # estimates split batch size as per available gpu mem. (may not be a factor of original batch size)
        approx_split_batch_size = int(((gpu_avail - scale_buffer) * num_gpu / scale_factor) //
                                      (model.gpu_mem_req / float(batch_size)))

        # sets split batch size such that it's close to the estimated split batch size, is also a factor of original
        # batch size & should give a terminal batch size of more than 1
        split_batch_size = _pick_split_batch_size(batch_size, approx_split_batch_size, len(dataset.train_set))
    print('split batch size:{}'.format(split_batch_size))
    print('*' * 80)

    # returns memfit = False if model doesn't fit in memory even after splitting the batch size to as small as 1
    if split_batch_size < 2:
        return False, None, None, None, None, None, False, None, None, None, None, None, None

    # set the data loaders using the split batch size
    dataset.train_loader, dataset.val_loader, dataset.test_loader = dataset.get_dataset_loader(
        split_batch_size, args.workers, torch.cuda.is_available())

    # use data parallelism for multi-gpu machine
    model = torch.nn.DataParallel(model)

    # binary cross entropy loss criterion (nn.BCELoss), averaged over all elements
    criterion = nn.BCELoss(reduction='mean').to(device)

    # SGD optimizer with warm restarts
    optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()),
                          lr=args.learning_rate,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)

    # quarter cosine learning rate schedule for SGD with warm restarts
    # (it is constructed with the original batch size; the split batch size is handed to train() separately)
    lr_scheduler = LearningRateScheduler(args.lr_wr_epochs,
                                         len(dataset.train_loader.dataset),
                                         args.batch_size, args.learning_rate,
                                         args.lr_wr_mul, args.lr_wr_min)

    train_flag = True
    epoch = 0
    loss_val_all_epochs = []
    hard_val_all_epochs = []
    soft_val_all_epochs = []

    # per-class values of the best epoch, in the order: background, crack, spallation, exposed bars, efflorescence,
    # corrosion stain
    hard_best_per_class = (0.0, 0.0, 0.0, 0.0, 0.0, 0.0)

    while epoch < args.epochs:
        # train and validate the model
        train(dataset.train_loader, model, criterion, epoch, optimizer, lr_scheduler, device, args, split_batch_size)
        loss_val, hard_val, soft_val, hard_background, hard_crack, hard_spallation, hard_exposed_bars,\
            hard_efflorescence, hard_corrosion_stain = val(dataset.val_loader, model, criterion, device)
        if int(args.task) == 2:
            _ = val(dataset.test_loader, model, criterion, device, is_val=False)

        loss_val_all_epochs.append(loss_val)
        hard_val_all_epochs.append(hard_val)
        soft_val_all_epochs.append(soft_val)

        # the current epoch is the best one if nothing recorded so far (itself included) beats it; this has to be
        # evaluated after the append, otherwise a strictly better epoch would never be recognized as the best
        is_best = max(hard_val_all_epochs) == hard_val
        if is_best:
            hard_best_per_class = (hard_background, hard_crack, hard_spallation, hard_exposed_bars,
                                   hard_efflorescence, hard_corrosion_stain)

        if int(args.task) == 2:
            # saves model dict while training fixed net
            state = {
                'epoch': epoch,
                'arch': 'Fixed net: replay buffer - {}, index no - {}'.format(
                    args.replay_buffer_csv_path, args.fixed_net_index_no),
                'state_dict': model.state_dict(),
                'hard_val': hard_val,
                'optimizer': optimizer.state_dict()
            }
            save_checkpoint(state, is_best, save_path)

        # checks for early stopping; early-stops if the mean of the validation accuracy from the last 5 epochs up to
        # the early stopping epoch isn't at least as high as the early stopping threshold
        if epoch == (args.early_stopping_epoch - 1) and float(np.mean(hard_val_all_epochs[-5:])) <\
                (args.early_stopping_thresh * 100.):
            train_flag = False
            break

        epoch += 1

    hard_best_val = max(hard_val_all_epochs)
    soft_best_val = max(soft_val_all_epochs)

    # free up memory by deleting objects
    spp_size = model.module.spp_size
    del model, criterion, optimizer, lr_scheduler

    hard_best_background, hard_best_crack, hard_best_spallation, hard_best_exposed_bars,\
        hard_best_efflorescence, hard_best_corrosion_stain = hard_best_per_class

    return True, spp_size, hard_best_val, hard_val_all_epochs, soft_best_val, soft_val_all_epochs, train_flag,\
        hard_best_background, hard_best_crack, hard_best_spallation, hard_best_exposed_bars,\
        hard_best_efflorescence, hard_best_corrosion_stain