def main(): save_path = './runs/' + strftime("%Y-%m-%d_%H-%M-%S", gmtime()) if not os.path.exists(save_path): os.makedirs(save_path) log_file = os.path.join(save_path, "stdout") log = open(log_file, "a") args = parser.parse_args() print("Command line options:") for arg in vars(args): print(arg, getattr(args, arg)) log.write(arg + ':' + str(getattr(args, arg)) + '\n') utils.extract(args.dataset, args.raw_data) utils.resize(args.mask_data, args.resized_size) utils.resize(args.eval_data, args.resized_size) utils.deskew(args.image_mask_countdata) utils.deskew(args.eval_data) hyperparameter_list = [args.no_scales, args.no_directions, args.sigma,\ args.frequency_max, args.frequency_factor] image_mask_label_array = mask(args.mask_data, args.no_scales*args.no_directions, \ args.resized_size, hyperparameter_list) eval_accuracy = eval(args.eval_data, args.mask_data, args.resized_size, args.image_mask_label_array, \ args.no_scales*args.no_directions, log, hyperparameter_list, args.match_ite, \ args.relative_weight) print("Final Evaluation Accuracy:{eval_accuracy:.3f}".format(eval_accuracy)) log.write("Final Evaluation Accuracy:{eval_accuracy:.3f}".format(eval_accuracy)) log.close()
def main(): # Check whether GPU is available and can be used # if CUDA is found then device is set accordingly device = torch.device("cuda" if torch.cuda.is_available() else "cpu") save_path = './runs/' + strftime("%Y-%m-%d_%H-%M-%S", gmtime()) if not os.path.exists(save_path): os.makedirs(save_path) log_file = os.path.join(save_path, "stdout") log = open(log_file, "a") # TODO: gives interrupted sys call error # log_file = os.path.join(save_path, "stdout") # sys.stdout = Logger(log_file) # Command line options args = parser.parse_args() print("Command line options:") for arg in vars(args): print(arg, getattr(args, arg)) log.write(arg + ':' + str(getattr(args, arg)) + '\n') log.close() # Initialize the weights of the model print("Initializing network with: " + args.weight_init) WeightInitializer = WeightInit(args.weight_init) # Dataset loading # TODO: hard-coded file paths patch_size = args.patch_size data_init_method = getattr(datasets, args.dataset) dataset = data_init_method(torch.cuda.is_available(), args) gen = QLearner(state_space_parameters, 1, WeightInitializer, device, args, save_path, qstore = args.qstore_path,\ replaydict = args.replay_dict_path) if (args.continue_epsilon not in np.array(state_space_parameters.epsilon_schedule)[:, 0]): raise ValueError('continue-epsilon {} not in epsilon schedule!'.format( args.continue_epsilon)) for episode in state_space_parameters.epsilon_schedule: epsilon = episode[0] M = episode[1] for ite in range(1, M + 1): if epsilon == args.continue_epsilon and args.continue_ite > M: raise ValueError( 'continue-ite {} not within range of continue-epsilon {} in epsilon schedule!' .format(args.continue_ite, epsilon)) if (epsilon == args.continue_epsilon and ite >= args.continue_ite ) or (epsilon < args.continue_epsilon): print('ite:{}, epsilon:{}'.format(ite, epsilon)) gen.generate_net(epsilon, dataset) gen.replay_dictionary.to_csv(os.path.join(save_path, 'replayDictFinal.csv')) gen.qstore.save_to_csv(os.path.join(save_path, 'qValFinal.csv'))
def main(): # Command line options args = parser.parse_args() print("Command line options:") for arg in vars(args): print(arg, getattr(args, arg)) if args.cross_dataset and not args.incremental_data: raise ValueError( 'cross-dataset training possible only if incremental-data flag set' ) # Check whether GPU is available and can be used # if CUDA is found then device is set accordingly device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Launch a writer for the tensorboard summary writer instance save_path = 'runs/' + strftime("%Y-%m-%d_%H-%M-%S", gmtime()) + '_' + args.dataset + '_' + args.architecture +\ '_variational_samples_' + str(args.var_samples) + '_latent_dim_' + str(args.var_latent_dim) # add option specific naming to separate tensorboard log files later if args.autoregression: save_path += '_pixelcnn' if args.incremental_data: save_path += '_incremental' if args.train_incremental_upper_bound: save_path += '_upper_bound' if args.generative_replay: save_path += '_genreplay' if args.openset_generative_replay: save_path += '_opensetreplay' if args.cross_dataset: save_path += '_cross_dataset_' + args.dataset_order # if we are resuming a previous training, note it in the name if args.resume: save_path = save_path + '_resumed' writer = SummaryWriter(save_path) # saving the parsed args to file log_file = os.path.join(save_path, "stdout") log = open(log_file, "a") for arg in vars(args): log.write(arg + ':' + str(getattr(args, arg)) + '\n') # Dataset loading data_init_method = getattr(datasets, args.dataset) dataset = data_init_method(torch.cuda.is_available(), args) # get the number of classes from the class dictionary num_classes = dataset.num_classes # we set an epoch multiplier to 1 for isolated training and increase it proportional to amount of tasks in CL epoch_multiplier = 1 if args.incremental_data: from lib.Datasets.incremental_dataset import get_incremental_dataset # get the method to create the incremental dataste (inherits from the chosen data loader) inc_dataset_init_method = get_incremental_dataset( data_init_method, args) # different options for class incremental vs. 
cross-dataset experiments if args.cross_dataset: # if a task order file is specified, load the task order from it if args.load_task_order: # check if file exists and if file ends with extension '.txt' if os.path.isfile(args.load_task_order) and len(args.load_task_order) >= 4\ and args.load_task_order[-4:] == '.txt': print("=> loading task order from '{}'".format( args.load_task_order)) with open(args.load_task_order, 'rb') as fp: task_order = pickle.load(fp) # if no file is found default to cmd line task order else: # parse and split string at commas task_order = args.dataset_order.split(',') for i in range(len(task_order)): # remove blank spaces in dataset names task_order[i] = task_order[i].replace(" ", "") # use task order as specified in command line else: # parse and split string at commas task_order = args.dataset_order.split(',') for i in range(len(task_order)): # remove blank spaces in dataset names task_order[i] = task_order[i].replace(" ", "") # just for getting the number of classes in the first dataset num_classes = 0 for i in range(args.num_base_tasks): temp_dataset_init_method = getattr(datasets, task_order[i]) temp_dataset = temp_dataset_init_method( torch.cuda.is_available(), args) num_classes += temp_dataset.num_classes del temp_dataset # multiply epochs by number of tasks if args.num_increment_tasks: epoch_multiplier = ((len(task_order) - args.num_base_tasks) / args.num_increment_tasks) + 1 else: # this branch will get active if num_increment_tasks is set to zero. This is useful when training # any isolated upper bound with all datasets present from the start. epoch_multiplier = 1.0 else: # class incremental # if specified load task order from file if args.load_task_order: if os.path.isfile(args.load_task_order): print("=> loading task order from '{}'".format( args.load_task_order)) task_order = np.load(args.load_task_order).tolist() else: # if no file is found a random task order is created print( "=> no task order found. Creating randomized task order" ) task_order = np.random.permutation(num_classes).tolist() else: # if randomize task order is specified create a random task order, else task order is sequential task_order = [] for i in range(dataset.num_classes): task_order.append(i) if args.randomize_task_order: task_order = np.random.permutation(num_classes).tolist() # save the task order np.save(os.path.join(save_path, 'task_order.npy'), task_order) # set the number of classes to base tasks + 1 because base tasks is always one less. # E.g. if you have 2 classes it's one task. 
This is a little inconsistent from the naming point of view # but we wanted a single variable to work for both class incremental as well as cross-dataset experiments num_classes = args.num_base_tasks + 1 # multiply epochs by number of tasks epoch_multiplier = ( (len(task_order) - (args.num_base_tasks + 1)) / args.num_increment_tasks) + 1 print("Task order: ", task_order) # log the task order into the text file log.write('task_order:' + str(task_order) + '\n') args.task_order = task_order # this is a little weird, but it needs to be here because the below method pops items from task_order args_to_tensorboard(writer, args) assert epoch_multiplier.is_integer(), print( "uneven task division, make sure number of tasks are integers.") # Get the incremental dataset dataset = inc_dataset_init_method(torch.cuda.is_available(), device, task_order, args) else: # add command line options to TensorBoard args_to_tensorboard(writer, args) log.close() # Get a sample input from the data loader to infer color channels/size net_input, _ = next(iter(dataset.train_loader)) # get the amount of color channels in the input images num_colors = net_input.size(1) # import model from architectures class net_init_method = getattr(architectures, args.architecture) # if we are not building an autoregressive model the number of output channels of the model is equivalent to # the amount of input channels. For an autoregressive models we set the number of output channels of the # non-autoregressive decoder portion according to the command line option below if not args.autoregression: args.out_channels = num_colors # build the model model = net_init_method(device, num_classes, num_colors, args) # optionally add the autoregressive decoder if args.autoregression: model.pixelcnn = PixelCNN(device, num_colors, args.out_channels, args.pixel_cnn_channels, num_layers=args.pixel_cnn_layers, k=args.pixel_cnn_kernel_size, padding=args.pixel_cnn_kernel_size // 2) # Parallel container for multi GPU use and cast to available device model = torch.nn.DataParallel(model).to(device) print(model) # Initialize the weights of the model, by default according to He et al. print("Initializing network with: " + args.weight_init) WeightInitializer = WeightInit(args.weight_init) WeightInitializer.init_model(model) # Define optimizer and loss function (criterion) optimizer = torch.optim.Adam(model.parameters(), args.learning_rate) epoch = 0 best_prec = 0 best_loss = random.getrandbits(128) # optionally resume from a checkpoint if args.resume: if os.path.isfile(args.resume): print("=> loading checkpoint '{}'".format(args.resume)) checkpoint = torch.load(args.resume) epoch = checkpoint['epoch'] best_prec = checkpoint['best_prec'] best_loss = checkpoint['best_loss'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) print("=> loaded checkpoint '{}' (epoch {})".format( args.resume, checkpoint['epoch'])) else: print("=> no checkpoint found at '{}'".format(args.resume)) # optimize until final amount of epochs is reached. 
Final amount of epochs is determined through the while epoch < (args.epochs * epoch_multiplier): # visualize the latent space before each task increment and at the end of training if it is 2-D if epoch % args.epochs == 0 and epoch > 0 or (epoch + 1) % ( args.epochs * epoch_multiplier) == 0: if model.module.latent_dim == 2: print("Calculating and visualizing dataset embedding") # infer the number of current tasks to plot the different classes in the embedding if args.incremental_data: if args.cross_dataset: num_tasks = sum( dataset.num_classes_per_task[:len(dataset. seen_tasks)]) else: num_tasks = len(dataset.seen_tasks) else: num_tasks = num_classes zs = get_latent_embedding(model, dataset.train_loader, num_tasks, device) visualize_dataset_in_2d_embedding(writer, zs, args.dataset, save_path, task=num_tasks) # continual learning specific part if args.incremental_data: # at the end of each task increment if epoch % args.epochs == 0 and epoch > 0: print('Saving the last checkpoint from the previous task ...') save_task_checkpoint(save_path, epoch // args.epochs) print("Incrementing dataset ...") dataset.increment_tasks( model, args.batch_size, args.workers, writer, save_path, is_gpu=torch.cuda.is_available(), upper_bound_baseline=args.train_incremental_upper_bound, generative_replay=args.generative_replay, openset_generative_replay=args.openset_generative_replay, openset_threshold=args.openset_generative_replay_threshold, openset_tailsize=args.openset_weibull_tailsize, autoregression=args.autoregression) # grow the classifier and increment the variable for number of overall classes so we can use it later if args.cross_dataset: grow_classifier( model.module.classifier, sum(dataset.num_classes_per_task[:len(dataset. seen_tasks)]) - model.module.num_classes, WeightInitializer) model.module.num_classes = sum( dataset.num_classes_per_task[:len(dataset.seen_tasks)]) else: model.module.num_classes += args.num_increment_tasks grow_classifier(model.module.classifier, args.num_increment_tasks, WeightInitializer) # reset moving averages etc. of the optimizer optimizer = torch.optim.Adam(model.parameters(), args.learning_rate) # change the number of seen classes if epoch % args.epochs == 0: model.module.seen_tasks = dataset.seen_tasks # train train(dataset, model, criterion, epoch, optimizer, writer, device, args) # evaluate on validation set prec, loss = validate(dataset, model, criterion, epoch, writer, device, save_path, args) # remember best prec@1 and save checkpoint is_best = loss < best_loss best_loss = min(loss, best_loss) best_prec = max(prec, best_prec) save_checkpoint( { 'epoch': epoch, 'arch': args.architecture, 'state_dict': model.state_dict(), 'best_prec': best_prec, 'best_loss': best_loss, 'optimizer': optimizer.state_dict() }, is_best, save_path) # increment epoch counters epoch += 1 # if a new task begins reset the best prec so that new best model can be stored. if args.incremental_data and epoch % args.epochs == 0: best_prec = 0 best_loss = random.getrandbits(128) writer.close()
def main(): # set device device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Command line options args = parser.parse_args() print("Command line options:") for arg in vars(args): print(arg, getattr(args, arg)) # choose dataset evaluation function to import (e.g. variational will operate on z values) if args.train_var: from lib.Training.evaluate import eval_var_dataset as eval_dataset from lib.Training.evaluate import eval_var_openset_dataset as eval_openset_dataset else: from lib.Training.evaluate import eval_dataset as eval_dataset from lib.Training.evaluate import eval_openset_dataset as eval_openset_dataset # Get the dataset which has been trained and the corresponding number of classes data_init_method = getattr(datasets, args.dataset) dataset = data_init_method(torch.cuda.is_available(), args) num_classes = dataset.num_classes net_input, _ = next(iter(dataset.train_loader)) num_colors = net_input.size(1) # Split a part of the non-used dataset to use as validation set for determining open set (e.g entropy) # rejection thresholds split_perc = 0.5 split_sets = torch.utils.data.random_split(dataset.valset, [int((1 - split_perc) * len(dataset.valset)), int(split_perc * len(dataset.valset))]) # overwrite old set and create new split set to determine thresholds/priors dataset.valset = split_sets[0] dataset.threshset = split_sets[1] # overwrite old data loader and create new loader for thresh set is_gpu = torch.cuda.is_available() dataset.val_loader = torch.utils.data.DataLoader(dataset.valset, batch_size=args.batch_size, shuffle=False, num_workers=args.workers, pin_memory=is_gpu, sampler=None) dataset.threshset_loader = torch.utils.data.DataLoader(dataset.threshset, batch_size=args.batch_size, shuffle=False, num_workers=args.workers, pin_memory=is_gpu, sampler=None) # Load open set datasets openset_datasets_names = args.openset_datasets.strip().split(',') openset_datasets = [] for openset_dataset in openset_datasets_names: openset_data_init_method = getattr(datasets, openset_dataset) openset_datasets.append(openset_data_init_method(torch.cuda.is_available(), args)) # Initialize empty model net_init_method = getattr(architectures, args.architecture) model = net_init_method(device, num_classes, num_colors, args).to(device) model = torch.nn.DataParallel(model).to(device) print(model) # load model (using the resume functionality) assert(os.path.isfile(args.resume)), "=> no model checkpoint found at '{}'".format(args.resume) # Fill the random model with the parameters of the checkpoint print("=> loading checkpoint '{}'".format(args.resume)) checkpoint = torch.load(args.resume) best_prec = checkpoint['best_prec'] best_loss = checkpoint['best_loss'] print("Saved model's validation accuracy: ", best_prec) print("Saved model's validation loss: ", best_loss) model.load_state_dict(checkpoint['state_dict']) model.to(device) model.eval() # set the save path to the directory from which the model has been loaded save_path = os.path.dirname(args.resume) # start of the model evaluation on the training dataset and fitting print("Evaluating original train dataset: " + args.dataset + ". 
This may take a while...") dataset_eval_dict_train = eval_dataset(model, dataset.train_loader, num_classes, device, latent_var_samples=args.var_samples, model_var_samples=args.model_samples) print("Training accuracy: ", dataset_eval_dict_train["accuracy"]) # Get the mean of z for correctly classified data inputs mean_zs = get_means(dataset_eval_dict_train["zs_correct"]) # visualize the mean z vectors mean_zs_tensor = torch.stack(mean_zs, dim=0) visualize_means(mean_zs_tensor, num_classes, args.dataset, save_path, "z") # calculate each correctly classified example's distance to the mean z distances_to_z_means_correct_train = calc_distances_to_means(mean_zs, dataset_eval_dict_train["zs_correct"], args.distance_function) # Weibull fitting # set tailsize according to command line parameters (according to percentage of dataset size) tailsize = int(len(dataset.trainset) * args.openset_weibull_tailsize / num_classes) print("Fitting Weibull models with tailsize: " + str(tailsize)) tailsizes = [tailsize] * num_classes weibull_models, valid_weibull = fit_weibull_models(distances_to_z_means_correct_train, tailsizes) assert valid_weibull, "Weibull fit is not valid" # ------------------------------------------------------------------------------------------ # Fitting on train dataset complete. Determine rejection thresholds/priors on the created split set # ------------------------------------------------------------------------------------------ print("Evaluating original threshold split dataset: " + args.dataset + ". This may take a while...") threshset_eval_dict = eval_dataset(model, dataset.threshset_loader, num_classes, device, latent_var_samples=args.var_samples, model_var_samples=args.model_samples) # Again calculate distances to mean z print("Split set accuracy: ", threshset_eval_dict["accuracy"]) distances_to_z_means_threshset = calc_distances_to_means(mean_zs, threshset_eval_dict["zs_correct"], args.distance_function) # get Weibull outlier probabilities for thresh set outlier_probs_threshset = calc_outlier_probs(weibull_models, distances_to_z_means_threshset) threshset_classification = calc_openset_classification(outlier_probs_threshset, num_classes, num_outlier_threshs=100) # also check outlier detection based on entropy max_entropy = np.max(threshset_eval_dict["out_entropy"]) threshset_entropy_classification = calc_entropy_classification(threshset_eval_dict["out_entropy"], max_entropy, num_outlier_threshs=100) # determine rejection priors based on 5% of the split data considered as inlying if (np.array(threshset_classification["outlier_percentage"]) <= 0.05).any() == True: EVT_prior_index = np.argwhere(np.array(threshset_classification["outlier_percentage"]) <= 0.05)[0][0] EVT_prior = threshset_classification["thresholds"][EVT_prior_index] else: EVT_prior = 0.5 EVT_prior_index = 50 if (np.array(threshset_entropy_classification["entropy_outlier_percentage"]) <= 0.05).any() == True: entropy_threshold_index = np.argwhere(np.array(threshset_entropy_classification["entropy_outlier_percentage"]) <= 0.05)[0][0] entropy_threshold = threshset_entropy_classification["entropy_thresholds"][entropy_threshold_index] else: # this should never actually happen entropy_threshold = np.median(threshset_entropy_classification["entropy_thresholds"]) entropy_threshold_index = 50 print("EVT prior: " + str(EVT_prior) + "; Entropy threshold: " + str(entropy_threshold)) # ------------------------------------------------------------------------------------------ # Beginning of all testing/open set recognition on test 
and unknown sets. # ------------------------------------------------------------------------------------------ # We evaluate the validation set to later evaluate trained dataset's statistical inlier/outlier estimates. print("Evaluating original validation dataset: " + args.dataset + ". This may take a while...") dataset_eval_dict = eval_dataset(model, dataset.val_loader, num_classes, device, latent_var_samples=args.var_samples, model_var_samples=args.model_samples) # Again calculate distances to mean z print("Validation accuracy: ", dataset_eval_dict["accuracy"]) distances_to_z_means_correct = calc_distances_to_means(mean_zs, dataset_eval_dict["zs_correct"], args.distance_function) # Evaluate outlier probability of trained dataset's validation set outlier_probs_correct = calc_outlier_probs(weibull_models, distances_to_z_means_correct) dataset_classification_correct = calc_openset_classification(outlier_probs_correct, num_classes, num_outlier_threshs=100) dataset_entropy_classification_correct = calc_entropy_classification(dataset_eval_dict["out_entropy"], max_entropy, num_outlier_threshs=100) print(args.dataset + '(trained) EVT outlier percentage: ' + str(dataset_classification_correct["outlier_percentage"][EVT_prior_index])) print(args.dataset + '(trained) entropy outlier percentage: ' + str(dataset_entropy_classification_correct["entropy_outlier_percentage"][entropy_threshold_index])) # Repeat process for open set recognition on unseen datasets ( openset_dataset_eval_dicts = collections.OrderedDict() openset_outlier_probs_dict = collections.OrderedDict() openset_classification_dict = collections.OrderedDict() openset_entropy_classification_dict = collections.OrderedDict() for od, openset_dataset in enumerate(openset_datasets): print("Evaluating openset dataset: " + openset_datasets_names[od] + ". 
This may take a while...") openset_dataset_eval_dict = eval_openset_dataset(model, openset_dataset.val_loader, num_classes, device, latent_var_samples=args.var_samples, model_var_samples=args.model_samples) openset_distances_to_z_means = calc_distances_to_means(mean_zs, openset_dataset_eval_dict["zs"], args.distance_function) openset_outlier_probs = calc_outlier_probs(weibull_models, openset_distances_to_z_means) # getting outlier classification accuracies across the entire datasets openset_classification = calc_openset_classification(openset_outlier_probs, num_classes, num_outlier_threshs=100) openset_entropy_classification = calc_entropy_classification(openset_dataset_eval_dict["out_entropy"], max_entropy, num_outlier_threshs=100) # dictionary of dictionaries: per datasetname one dictionary with respective values openset_dataset_eval_dicts[openset_datasets_names[od]] = openset_dataset_eval_dict openset_outlier_probs_dict[openset_datasets_names[od]] = openset_outlier_probs openset_classification_dict[openset_datasets_names[od]] = openset_classification openset_entropy_classification_dict[openset_datasets_names[od]] = openset_entropy_classification # print outlier rejection values for all unseen unknown datasets for other_data_name, other_data_dict in openset_classification_dict.items(): print(other_data_name + ' EVT outlier percentage: ' + str(other_data_dict["outlier_percentage"][entropy_threshold_index])) for other_data_name, other_data_dict in openset_entropy_classification_dict.items(): print(other_data_name + ' entropy outlier percentage: ' + str(other_data_dict["entropy_outlier_percentage"][entropy_threshold_index])) # joint prediction uncertainty plot for all datasets if (args.train_var and args.var_samples > 1) or args.model_samples > 1: visualize_classification_uncertainty(dataset_eval_dict["out_mus_correct"], dataset_eval_dict["out_sigmas_correct"], openset_dataset_eval_dicts, "out_mus", "out_sigmas", args.dataset + ' (trained)', args.var_samples, save_path) # visualize the outlier probabilities visualize_weibull_outlier_probabilities(outlier_probs_correct, openset_outlier_probs_dict, args.dataset + ' (trained)', save_path, tailsize) visualize_classification_scores(dataset_eval_dict["out_mus_correct"], openset_dataset_eval_dicts, 'out_mus', args.dataset + ' (trained)', save_path) visualize_entropy_histogram(dataset_eval_dict["out_entropy"], openset_dataset_eval_dicts, dataset_entropy_classification_correct["entropy_thresholds"][-1], "out_entropy", args.dataset + ' (trained)', save_path) # joint plot for outlier detection accuracy for seen and both unseen datasets visualize_openset_classification(dataset_classification_correct["outlier_percentage"], openset_classification_dict, "outlier_percentage", args.dataset + ' (trained)', dataset_classification_correct["thresholds"], save_path, tailsize) visualize_entropy_classification(dataset_entropy_classification_correct["entropy_outlier_percentage"], openset_entropy_classification_dict, "entropy_outlier_percentage", args.dataset + ' (trained)', dataset_entropy_classification_correct["entropy_thresholds"], save_path)
def main(): # set device device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Command line options args = parser.parse_args() print("Command line options:") for arg in vars(args): print(arg, getattr(args, arg)) # Get the dataset which has been trained and the corresponding number of classes data_init_method = getattr(datasets, args.dataset) dataset = data_init_method(torch.cuda.is_available(), args) num_classes = dataset.num_classes net_input, _ = next(iter(dataset.train_loader)) num_colors = net_input.size(1) # Load open set dataset 1 openset_data_init_method = getattr(datasets, args.openset_dataset) openset_dataset = openset_data_init_method(torch.cuda.is_available(), args) # Load open set dataset 2 # Note: This could be easily refactored to one or flexible amount of datasets, we have kept this hard-coded # to reproduce the plots of the paper. Please feel free to refactor this. openset_data_init_method2 = getattr(datasets, args.openset_dataset2) openset_dataset2 = openset_data_init_method2(torch.cuda.is_available(), args) if not args.autoregression: args.out_channels = num_colors # Initialize empty model net_init_method = getattr(architectures, args.architecture) model = net_init_method(device, num_classes, num_colors, args) # Optional addition of autoregressive decoder portion if args.autoregression: model.pixelcnn = PixelCNN(device, num_colors, args.out_channels, args.pixel_cnn_channels, num_layers=args.pixel_cnn_layers, k=args.pixel_cnn_kernel_size, padding=args.pixel_cnn_kernel_size // 2) model = torch.nn.DataParallel(model).to(device) # load model (using the resume functionality) assert(os.path.isfile(args.resume)), "=> no model checkpoint found at '{}'".format(args.resume) # Fill the random model with the parameters of the checkpoint print("=> loading checkpoint '{}'".format(args.resume)) checkpoint = torch.load(args.resume) best_prec = checkpoint['best_prec'] best_loss = checkpoint['best_loss'] # print the saved model's validation accuracy (as a check to see if the loaded model has really been trained) print("Saved model's validation accuracy: ", best_prec) print("Saved model's validation loss: ", best_loss) model.load_state_dict(checkpoint['state_dict']) model.to(device) model.eval() # set the save path to the directory from which the model has been loaded save_path = os.path.dirname(args.resume) # start of the model evaluation on the training dataset and fitting print("Evaluating original train dataset: " + args.dataset + ". 
This may take a while...") dataset_eval_dict_train = eval_dataset(model, dataset.train_loader, dataset.num_classes, device, samples=args.var_samples) print("Training accuracy: ", dataset_eval_dict_train["accuracy"]) # Get the mean of z for correctly classified data inputs mean_zs = get_means(dataset_eval_dict_train["zs_correct"]) # visualize the mean z vectors mean_zs_tensor = torch.stack(mean_zs, dim=0) visualize_means(mean_zs_tensor, dataset.class_to_idx, args.dataset, save_path, "z") # calculate each correctly classified example's distance to the mean z distances_to_z_means_correct_train = calc_distances_to_means(mean_zs, dataset_eval_dict_train["zs_correct"], args.distance_function) # Weibull fitting # set tailsize according to command line parameters (according to percentage of dataset size) tailsize = int(len(dataset.trainset) * args.openset_weibull_tailsize / num_classes) print("Fitting Weibull models with tailsize: " + str(tailsize)) tailsizes = [tailsize] * num_classes weibull_models, valid_weibull = fit_weibull_models(distances_to_z_means_correct_train, tailsizes) assert valid_weibull, "Weibull fit is not valid" # ------------------------------------------------------------------------------------------ # Fitting on train dataset complete. Beginning of all testing/open set recognition on validation and unknown sets. # ------------------------------------------------------------------------------------------ # We evaluate the validation set to later evaluate trained dataset's statistical inlier/outlier estimates. print("Evaluating original validation dataset: " + args.dataset + ". This may take a while...") dataset_eval_dict = eval_dataset(model, dataset.val_loader, dataset.num_classes, device, samples=args.var_samples) # Again calculate distances to mean z print("Validation accuracy: ", dataset_eval_dict["accuracy"]) distances_to_z_means_correct = calc_distances_to_means(mean_zs, dataset_eval_dict["zs_correct"], args.distance_function) # Evaluate outlier probability of trained dataset's validation set outlier_probs_correct = calc_outlier_probs(weibull_models, distances_to_z_means_correct) # Repeat process for open set recognition on unseen dataset 1 ( print("Evaluating openset dataset: " + args.openset_dataset + ". This may take a while...") openset_dataset_eval_dict = eval_openset_dataset(model, openset_dataset.val_loader, openset_dataset.num_classes, device, samples=args.var_samples) openset_distances_to_z_means = calc_distances_to_means(mean_zs, openset_dataset_eval_dict["zs"], args.distance_function) openset_outlier_probs = calc_outlier_probs(weibull_models, openset_distances_to_z_means) # visualize the outlier probabilities visualize_weibull_outlier_probabilities(outlier_probs_correct, openset_outlier_probs, args.dataset, args.openset_dataset, save_path, tailsize) # getting outlier classification accuracies across the entire datasets dataset_classification_correct = calc_openset_classification(outlier_probs_correct, dataset.num_classes, num_outlier_threshs=100) openset_classification = calc_openset_classification(openset_outlier_probs, dataset.num_classes, num_outlier_threshs=100) # open set recognition on unseen dataset 2 (Lots of redundant code copy pasting, could be refactored) print("Evaluating openset dataset 2: " + args.openset_dataset2 + ". 
This may take a while...") openset_dataset_eval_dict2 = eval_openset_dataset(model, openset_dataset2.val_loader, openset_dataset2.num_classes, device, samples=args.var_samples) # joint prediction uncertainty plot for all datasets visualize_classification_uncertainty(dataset_eval_dict["out_mus_correct"], dataset_eval_dict["out_sigmas_correct"], openset_dataset_eval_dict["out_mus"], openset_dataset_eval_dict["out_sigmas"], openset_dataset_eval_dict2["out_mus"], openset_dataset_eval_dict2["out_sigmas"], args.dataset + ' (trained)', args.openset_dataset, args.openset_dataset2, args.var_samples, save_path) # get outlier probabilities of open set dataset 2 openset_distances_to_z_means2 = calc_distances_to_means(mean_zs, openset_dataset_eval_dict2["zs"], args.distance_function) openset_outlier_probs2 = calc_outlier_probs(weibull_models, openset_distances_to_z_means2) visualize_weibull_outlier_probabilities(outlier_probs_correct, openset_outlier_probs2, args.dataset, args.openset_dataset2, save_path, tailsize) # getting outlier classification accuracy for open set dataset 2 openset_classification2 = calc_openset_classification(openset_outlier_probs2, dataset.num_classes, num_outlier_threshs=100) # joint plot for outlier detection accuracy for seen and both unseen datasets visualize_openset_classification(dataset_classification_correct["outlier_percentage"], openset_classification["outlier_percentage"], openset_classification2["outlier_percentage"], args.dataset + ' (trained)', args.openset_dataset, args.openset_dataset2, dataset_classification_correct["thresholds"], save_path, tailsize)
def main(): # Command line options args = parser.parse_args() print("Command line options:") for arg in vars(args): print(arg, getattr(args, arg)) if args.debug: pdb.set_trace() # Check whether GPU is available and can be used # if CUDA is found then device is set accordingly device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Launch a writer for the tensorboard summary writer instance save_path = 'runs/' + strftime( "%Y-%m-%d_%H-%M-%S", gmtime()) + '_' + args.dataset + '_' + args.architecture # if we are resuming a previous training, note it in the name if args.resume: save_path = save_path + '_resumed' writer = SummaryWriter(save_path) # saving the parsed args to file log_file = os.path.join(save_path, "stdout") log = open(log_file, "a") for arg in vars(args): log.write(arg + ':' + str(getattr(args, arg)) + '\n') # Dataset loading data_init_method = getattr(datasets, args.dataset) dataset = data_init_method(torch.cuda.is_available(), args) # get the number of classes from the class dictionary num_classes = dataset.num_classes # we set an epoch multiplier to 1 for isolated training and increase it proportional to amount of tasks in CL epoch_multiplier = 1 # add command line options to TensorBoard args_to_tensorboard(writer, args) log.close() # build the model model = architectures.Inos_model(args.num_class, args) # Parallel container for multi GPU use and cast to available device model = torch.nn.DataParallel(model).to(device) print(model) if not args.pretrained: # Initialize the weights of the model, by default according to He et al. print("Initializing network with: " + args.weight_init) WeightInitializer = WeightInit(args.weight_init) WeightInitializer.init_model(model) # Define optimizer and loss function (criterion) optimizer = torch.optim.SGD(model.parameters(), args.learning_rate, momentum=0.9, weight_decay=2e-4) scheduler = torch.optim.lr_scheduler.MultiStepLR( optimizer, milestones=[30, 60, 80, 100], gamma=0.5) epoch = 0 best_prec = 0 best_loss = random.getrandbits(128) # optionally resume from a checkpoint if args.resume: if os.path.isfile(args.resume): print("=> loading checkpoint '{}'".format(args.resume)) checkpoint = torch.load(args.resume) epoch = checkpoint['epoch'] best_prec = checkpoint['best_prec'] best_loss = checkpoint['best_loss'] model.load_state_dict(checkpoint['state_dict']) # optimizer.load_state_dict(checkpoint['optimizer']) print("=> loaded checkpoint '{}' (epoch {})".format( args.resume, checkpoint['epoch'])) else: print("=> no checkpoint found at '{}'".format(args.resume)) # optimize until final amount of epochs is reached. Final amount of epochs is determined through the while epoch < (args.epochs * epoch_multiplier): if epoch + 2 == epoch % args.epochs: print("debug perpose") # train train(dataset, model, criterion, epoch, optimizer, writer, device, args) # evaluate on validation set prec, loss = validate(dataset, model, criterion, epoch, writer, device, save_path, args) # evaluate on test set prec_t, loss_t = test(dataset, model, criterion, epoch, writer, device, save_path, args) # remember best prec@1 and save checkpoint is_best = loss < best_loss best_loss = min(loss, best_loss) best_prec = max(prec, best_prec) save_checkpoint( { 'epoch': epoch, 'arch': args.architecture, 'state_dict': model.state_dict(), 'best_prec': best_prec, 'best_loss': best_loss, 'optimizer': optimizer.state_dict() }, is_best, save_path) # increment epoch counters epoch += 1 scheduler.step() writer.close()
def main(): # set device device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Command line options args = parser.parse_args() print("Command line options:") for arg in vars(args): print(arg, getattr(args, arg)) # Get the dataset which has been trained and the corresponding number of classes data_init_method = getattr(datasets, args.dataset) dataset = data_init_method(torch.cuda.is_available(), args) num_classes = dataset.num_classes net_input, _ = next(iter(dataset.train_loader)) num_colors = net_input.size(1) # Split a part of the non-used dataset to use as validation set for determining open set (e.g entropy) # rejection thresholds split_perc = 0.5 split_sets = torch.utils.data.random_split(dataset.valset, [int((1 - split_perc) * len(dataset.valset)), int(split_perc * len(dataset.valset))]) # overwrite old set and create new split set to determine thresholds/priors dataset.valset = split_sets[0] dataset.threshset = split_sets[1] # overwrite old data loader and create new loader for thresh set is_gpu = torch.cuda.is_available() dataset.val_loader = torch.utils.data.DataLoader(dataset.valset, batch_size=args.batch_size, shuffle=False, num_workers=args.workers, pin_memory=is_gpu, sampler=None) dataset.threshset_loader = torch.utils.data.DataLoader(dataset.threshset, batch_size=args.batch_size, shuffle=False, num_workers=args.workers, pin_memory=is_gpu, sampler=None) # Load open set datasets openset_datasets_names = args.openset_datasets.strip().split(',') openset_datasets = [] for openset_dataset in openset_datasets_names: openset_data_init_method = getattr(datasets, openset_dataset) openset_datasets.append(openset_data_init_method(torch.cuda.is_available(), args)) if not args.autoregression: args.out_channels = num_colors # Initialize empty model net_init_method = getattr(architectures, args.architecture) model = net_init_method(device, num_classes, num_colors, args) # Optional addition of autoregressive decoder portion if args.autoregression: model.pixelcnn = PixelCNN(device, num_colors, args.out_channels, args.pixel_cnn_channels, num_layers=args.pixel_cnn_layers, k=args.pixel_cnn_kernel_size, padding=args.pixel_cnn_kernel_size // 2) model = torch.nn.DataParallel(model).to(device) # load model (using the resume functionality) assert(os.path.isfile(args.resume)), "=> no model checkpoint found at '{}'".format(args.resume) # Fill the random model with the parameters of the checkpoint print("=> loading checkpoint '{}'".format(args.resume)) checkpoint = torch.load(args.resume) best_prec = checkpoint['best_prec'] best_loss = checkpoint['best_loss'] # print the saved model's validation accuracy (as a check to see if the loaded model has really been trained) print("Saved model's validation accuracy: ", best_prec) print("Saved model's validation loss: ", best_loss) model.load_state_dict(checkpoint['state_dict']) model.to(device) model.eval() # set the save path to the directory from which the model has been loaded save_path = os.path.dirname(args.resume) # start of the model evaluation on the training dataset and fitting print("Evaluating original train dataset: " + args.dataset + ". 
This may take a while...") dataset_eval_dict_train = eval_dataset(model, dataset.train_loader, dataset.num_classes, device, samples=args.var_samples, calc_reconstruction=args.calc_reconstruction, autoregression=args.autoregression) print("Training accuracy: ", dataset_eval_dict_train["accuracy"]) # Get the mean of z for correctly classified data inputs mean_zs = get_means(dataset_eval_dict_train["zs_correct"]) # visualize the mean z vectors mean_zs_tensor = torch.stack(mean_zs, dim=0) visualize_means(mean_zs_tensor, dataset.class_to_idx, args.dataset, save_path, "z") # calculate each correctly classified example's distance to the mean z distances_to_z_means_correct_train = calc_distances_to_means(mean_zs, dataset_eval_dict_train["zs_correct"], args.distance_function) # Weibull fitting # set tailsize according to command line parameters (according to percentage of dataset size) tailsize = int(len(dataset.trainset) * args.openset_weibull_tailsize / num_classes) print("Fitting Weibull models with tailsize: " + str(tailsize)) tailsizes = [tailsize] * num_classes weibull_models, valid_weibull = fit_weibull_models(distances_to_z_means_correct_train, tailsizes) assert valid_weibull, "Weibull fit is not valid" # Determine rejection thresholds/priors on the created split set print("Evaluating original threshold split dataset: " + args.dataset + ". This may take a while...") threshset_eval_dict = eval_dataset(model, dataset.threshset_loader, num_classes, device, samples=args.var_samples, calc_reconstruction=args.calc_reconstruction, autoregression=args.autoregression) # Again calculate distances to mean z print("Split set accuracy: ", threshset_eval_dict["accuracy"]) distances_to_z_means_threshset = calc_distances_to_means(mean_zs, threshset_eval_dict["zs_correct"], args.distance_function) outlier_probs_threshset = calc_outlier_probs(weibull_models, distances_to_z_means_threshset) threshset_classification = calc_openset_classification(outlier_probs_threshset, num_classes, num_outlier_threshs=100) max_entropy = np.max(threshset_eval_dict["out_entropy"]) threshset_entropy_classification = calc_entropy_classification(threshset_eval_dict["out_entropy"], max_entropy, num_outlier_threshs=100) # We have added a flag to turn off calculation of the decoder because it is computationally heavy for many samples # (repeated calculation of the decoder), whereas latent space sampling and repeated calculation of our latent based # EVT approach and even the single layer classifier is cheap. if args.calc_reconstruction: max_recon_loss = np.max(threshset_eval_dict["recon_loss_mus"]) threshset_recon_classification = calc_reconstruction_classification(threshset_eval_dict["recon_loss_mus"], max_recon_loss, num_outlier_threshs=1000) # determine the index for the corresponding rejection priors/thresholds. Although this should never happen, # we also set a default if no threshold satisfies the 95% inlier condition. 
if (np.array(threshset_classification["outlier_percentage"]) <= args.percent_validation_outliers).any() == True: EVT_prior_index = np.argwhere(np.array(threshset_classification["outlier_percentage"]) <= 0.05)[0][0] EVT_prior = threshset_classification["thresholds"][EVT_prior_index] else: EVT_prior = 0.5 EVT_prior_index = 50 if (np.array(threshset_entropy_classification["entropy_outlier_percentage"]) <= args.percent_validation_outliers).any() == True: entropy_threshold_index = np.argwhere(np.array(threshset_entropy_classification["entropy_outlier_percentage"]) <= 0.05)[0][0] entropy_threshold = threshset_entropy_classification["entropy_thresholds"][entropy_threshold_index] else: entropy_threshold = np.median(threshset_entropy_classification["entropy_thresholds"]) entropy_threshold_index = 50 if args.calc_reconstruction: if (np.array(threshset_recon_classification["reconstruction_outlier_percentage"]) <= args.percent_validation_outliers).any() == True: recon_threshold_index = np.argwhere( np.array(threshset_recon_classification["reconstruction_outlier_percentage"]) <= 0.05)[0][0] recon_threshold = threshset_recon_classification["reconstruction_thresholds"][recon_threshold_index] else: recon_threshold = np.median(threshset_recon_classification["reconstruction_thresholds"]) recon_threshold_index = 500 print("EVT prior: " + str(EVT_prior) + "; Entropy threshold: " + str(entropy_threshold)) if args.calc_reconstruction: print("Reconstruction loss threshold: " + str(recon_threshold)) # ------------------------------------------------------------------------------------------ # Fitting on train dataset complete. Beginning of all testing/open set recognition on validation and unknown sets. # ------------------------------------------------------------------------------------------ # We evaluate the validation set to later evaluate trained dataset's statistical inlier/outlier estimates. print("Evaluating original validation dataset: " + args.dataset + ". 
This may take a while...") dataset_eval_dict = eval_dataset(model, dataset.val_loader, num_classes, device, samples=args.var_samples, calc_reconstruction=args.calc_reconstruction, autoregression=args.autoregression) # Again calculate distances to mean z print("Validation accuracy: ", dataset_eval_dict["accuracy"]) distances_to_z_means_correct = calc_distances_to_means(mean_zs, dataset_eval_dict["zs_correct"], args.distance_function) # Evaluate outlier probability of trained dataset's validation set outlier_probs_correct = calc_outlier_probs(weibull_models, distances_to_z_means_correct) dataset_classification_correct = calc_openset_classification(outlier_probs_correct, num_classes, num_outlier_threshs=100) dataset_entropy_classification_correct = calc_entropy_classification(dataset_eval_dict["out_entropy"], max_entropy, num_outlier_threshs=100) if args.calc_reconstruction: dataset_recon_classification_correct = calc_reconstruction_classification(dataset_eval_dict["recon_loss_mus"], max_recon_loss, num_outlier_threshs=1000) print(args.dataset + '(trained) EVT outlier percentage: ' + str(dataset_classification_correct["outlier_percentage"][EVT_prior_index])) print(args.dataset + '(trained) entropy outlier percentage: ' + str(dataset_entropy_classification_correct["entropy_outlier_percentage"][entropy_threshold_index])) if args.calc_reconstruction: print(args.dataset + '(trained) reconstruction loss outlier percentage: ' + str(dataset_recon_classification_correct["reconstruction_outlier_percentage"][recon_threshold_index])) # ------------------------------------------------------------------------------------------ # Repeat process for open set recognition (no fitting, just testing) on all unseen datasets # ------------------------------------------------------------------------------------------ # dicitionaries to hold results openset_dataset_eval_dicts = collections.OrderedDict() openset_outlier_probs_dict = collections.OrderedDict() openset_classification_dict = collections.OrderedDict() openset_entropy_classification_dict = collections.OrderedDict() if args.calc_reconstruction: openset_recon_classification_dict = collections.OrderedDict() for od, openset_dataset in enumerate(openset_datasets): print("Evaluating openset dataset: " + openset_datasets_names[od] + ". 
This may take a while...") openset_dataset_eval_dict = eval_openset_dataset(model, openset_dataset.val_loader, num_classes, device, samples=args.var_samples, autoregression=args.autoregression, calc_reconstruction=args.calc_reconstruction) openset_distances_to_z_means = calc_distances_to_means(mean_zs, openset_dataset_eval_dict["zs"], args.distance_function) openset_outlier_probs = calc_outlier_probs(weibull_models, openset_distances_to_z_means) # getting outlier classification accuracies across the entire datasets openset_classification = calc_openset_classification(openset_outlier_probs, num_classes, num_outlier_threshs=100) openset_entropy_classification = calc_entropy_classification(openset_dataset_eval_dict["out_entropy"], max_entropy, num_outlier_threshs=100) if args.calc_reconstruction: openset_recon_classification_correct = calc_reconstruction_classification( openset_dataset_eval_dict["recon_loss_mus"], max_recon_loss, num_outlier_threshs=1000) openset_dataset_eval_dicts[openset_datasets_names[od]] = openset_dataset_eval_dict openset_outlier_probs_dict[openset_datasets_names[od]] = openset_outlier_probs openset_classification_dict[openset_datasets_names[od]] = openset_classification openset_entropy_classification_dict[openset_datasets_names[od]] = openset_entropy_classification if args.calc_reconstruction: openset_recon_classification_dict[openset_datasets_names[od]] = openset_recon_classification_correct # Print the results # TODO: maybe log this to file also for other_data_name, other_data_dict in openset_classification_dict.items(): print(other_data_name + ' EVT outlier percentage: ' + str(other_data_dict["outlier_percentage"][entropy_threshold_index])) for other_data_name, other_data_dict in openset_entropy_classification_dict.items(): print(other_data_name + ' entropy outlier percentage: ' + str(other_data_dict["entropy_outlier_percentage"][entropy_threshold_index])) if args.calc_reconstruction: for other_data_name, other_data_dict in openset_recon_classification_dict.items(): print(other_data_name + ' reconstruction loss outlier percentage: ' + str(other_data_dict["reconstruction_outlier_percentage"][recon_threshold_index])) # joint prediction uncertainty plot for all datasets if args.var_samples > 1: visualize_classification_uncertainty(dataset_eval_dict["out_mus_correct"], dataset_eval_dict["out_sigmas_correct"], openset_dataset_eval_dicts, "out_mus", "out_sigmas", args.dataset + ' (trained)', args.var_samples, save_path) # visualize the outlier probabilities visualize_weibull_outlier_probabilities(outlier_probs_correct, openset_outlier_probs_dict, args.dataset + ' (trained)', save_path, tailsize) # histograms visualize_classification_scores(dataset_eval_dict["out_mus_correct"], openset_dataset_eval_dicts, 'out_mus', args.dataset + ' (trained)', save_path) visualize_entropy_histogram(dataset_eval_dict["out_entropy"], openset_dataset_eval_dicts, dataset_entropy_classification_correct["entropy_thresholds"][-1], "out_entropy", args.dataset + ' (trained)', save_path) if args.calc_reconstruction: visualize_recon_loss_histogram(dataset_eval_dict["recon_loss_mus"], openset_dataset_eval_dicts, dataset_recon_classification_correct["reconstruction_thresholds"][-1], "recon_loss_mus", args.dataset + ' (trained)', save_path) # joint plot for outlier detection accuracy for both seen and unseen datasets visualize_openset_classification(dataset_classification_correct["outlier_percentage"], openset_classification_dict, "outlier_percentage", args.dataset + ' (trained)', 
dataset_classification_correct["thresholds"], save_path, tailsize) visualize_entropy_classification(dataset_entropy_classification_correct["entropy_outlier_percentage"], openset_entropy_classification_dict, "entropy_outlier_percentage", args.dataset + ' (trained)', dataset_entropy_classification_correct["entropy_thresholds"], save_path) if args.calc_reconstruction: visualize_reconstruction_classification(dataset_recon_classification_correct["reconstruction_outlier_percentage"], openset_recon_classification_dict, "reconstruction_outlier_percentage", args.dataset + ' (trained)', dataset_recon_classification_correct["reconstruction_thresholds"], save_path, autoregression=args.autoregression)
def main(): # Command line options args = parser.parse_args() print("Command line options:") for arg in vars(args): print(arg, getattr(args, arg)) # Check whether GPU is available and can be used # if CUDA is found then device is set accordingly device = torch.device("cuda" if torch.cuda.is_available() else "cpu") cudnn.benchmark = True num_GPUs = torch.cuda.device_count() # If save directory for runs doesn't exist then create it if not os.path.exists('runs'): os.mkdir('runs') # Create a time-stamped save path for individual experiment save_path = 'runs/' + strftime("%Y-%m-%d_%H-%M-%S", gmtime()) + \ ';' + args.dataset + ';' + args.architecture os.mkdir(save_path) # List of values to log to csv columns_list = [ 'Filters', 'Parameters', 'Mean', 'Variance', 'Skew', 'BestVal', 'BestValsTrain', 'BestEpoch', 'LastValPrec', 'LastTrainPrec', 'AllTrain', 'AllVal' ] df = pd.DataFrame(columns=columns_list) # Dataset loading data_init_method = getattr(datasets, args.dataset) dataset = data_init_method(torch.cuda.is_available(), args) # get the amount of color channels in the input images net_input, _ = next(iter(dataset.train_loader)) num_colors = net_input.size(1) # import model from architectures class net_init_method = getattr(architectures, args.architecture) # Get the parameters for all valid skewed models SNModels = SkewNormalModels(depth=args.vgg_depth, num_classes=dataset.num_classes, patch_size=args.patch_size) skew_model_params = SNModels.get_valid_models() print("Total number of models: ", len(skew_model_params["filters"])) # Weight-init method WeightInitializer = WeightInit(args.weight_init) # Optionally resume a previous experiment current_id = args.resume_model_id for i in range(len(skew_model_params["filters"]) - current_id): print("Model filters: ", skew_model_params["filters"][i + current_id]) print("Model parameters: ", skew_model_params["total_params"][i + current_id], " mean: ", skew_model_params["means"][i + current_id], " var: ", skew_model_params["vars"][i + current_id], " skew: ", skew_model_params["skews"][i + current_id]) model = net_init_method(device, dataset.num_classes, num_colors, args, skew_model_params["filters"][i + current_id], custom_filters=True) # Parallel container for multi GPU use and cast to available device model = torch.nn.DataParallel(model).to(device) print(model) # Initialize the weights of the model print("Initializing networks with: " + args.weight_init) WeightInitializer.init_model(model) # Define criterion and optimizer criterion = nn.CrossEntropyLoss() optimizer = torch.optim.SGD(model.parameters(), args.learning_rate, momentum=args.momentum, weight_decay=args.weight_decay, nesterov=args.nesterov) # Initialize SGDWR learning rate scheduler lr_scheduler = LearningRateScheduler(args.lr_wr_epochs, len(dataset.train_loader.dataset), args.batch_size, args.learning_rate, args.lr_wr_mul, args.lr_wr_min) # Get estimated GPU memory usage of the model and split batch if too little memory is available if torch.cuda.is_available(): GPUMemory = GPUMem(torch.cuda.is_available()) print('available:{}'.format( (GPUMemory.total_mem - GPUMemory.total_mem * GPUMemory.get_mem_util()) / 1024.)) print('required per gpu with buffer: {}'.format( (4. / float(num_GPUs) * model.module.gpu_usage) + 1.)) # calculate smaller chunk size to split batch into sequential computations mem_scale_factor = 4.0 # TODO: WEIRD factor... why is this necessary and where does it come from? 
# TODO: the + 1 Gb should be taken from the cache allocator if ((GPUMemory.total_mem - GPUMemory.total_mem * GPUMemory.get_mem_util()) / 1024.) < ( (mem_scale_factor / float(num_GPUs) * model.module.gpu_usage) + 1.): # code for variable batch size implementation as per gpu constraint; remove for old code approx_small_batch_size = (((GPUMemory.total_mem - GPUMemory.total_mem * GPUMemory.get_mem_util()) / 1024. - 1.) * float(num_GPUs) / mem_scale_factor) //\ (model.module.gpu_usage / float(args.batch_size)) diff = float('inf') temp_small_batch_size = approx_small_batch_size for j in range(1, (args.batch_size // 2) + 1): if args.batch_size % j == 0 and abs( j - approx_small_batch_size) < diff: diff = abs(j - approx_small_batch_size) temp_small_batch_size = j batch_seq_split_size = temp_small_batch_size else: batch_seq_split_size = args.batch_size else: batch_seq_split_size = args.batch_size # Get training and validation dataset loaders dataset.train_loader, dataset.val_loader = dataset.get_dataset_loader( batch_seq_split_size, args.workers, device) print( 'sequential batch size split size:{}'.format(batch_seq_split_size)) epoch = 0 best_epoch = 0 best_prec = 0 best_val_train_prec = 0 all_train = [] all_val = [] while epoch < args.epochs: # train for one epoch train_prec = train(dataset.train_loader, model, criterion, epoch, optimizer, lr_scheduler, device, batch_seq_split_size, args) # evaluate on validation set prec = validate(dataset.val_loader, model, criterion, epoch, device, args) all_train.append(train_prec) all_val.append(prec) # remember best prec@1 and save checkpoint is_best = prec > best_prec if is_best: best_epoch = epoch best_val_train_prec = train_prec best_prec = prec # if architecture doesn't train at all skip it if epoch == args.lr_wr_epochs - 1 and train_prec < ( 2 * 100.0 / dataset.num_classes): break # increment epoch counters epoch += 1 lr_scheduler.scheduler_epoch += 1 # append architecture results to csv df = df.append(pd.DataFrame([[ skew_model_params["filters"][i + current_id], skew_model_params["total_params"][i + current_id], skew_model_params["means"][i + current_id], skew_model_params["vars"][i + current_id], skew_model_params["skews"][i + current_id], best_prec, best_val_train_prec, best_epoch, prec, train_prec, all_train, all_val ]], columns=columns_list), ignore_index=True) df.to_csv(save_path + '/model_%03d' % (i + 1 + current_id) + '.csv') del model del optimizer
def main():
    # Command line options
    args = parser.parse_args()
    print("Command line options:")
    for arg in vars(args):
        print(arg, getattr(args, arg))

    # Check whether GPU is available and can be used
    # if CUDA is found then set flag to True
    is_gpu = torch.cuda.is_available()

    # Dataset loading
    # There's only one dataset loader right now, but it can be extended
    data_init_method = getattr(datasets, 'CUSTOM')
    dataset = data_init_method(is_gpu, args)

    # Construct the network
    net_init_method = getattr(architectures, args.architecture)
    model = net_init_method(args.batch_norm)

    # Initialize the weights of the model
    print("Initializing network with: " + args.weight_init)
    WeightInitializer = WeightInit(args.weight_init)
    WeightInitializer.init_model(model)

    if is_gpu:
        # CUDNN
        import torch.backends.cudnn as cudnn
        model = model.cuda()
        cudnn.benchmark = True

    print(model)

    # Define optimizer and loss function (criterion)
    criterion = nn.BCELoss()
    if is_gpu:
        criterion = criterion.cuda()

    # autoencoders also work with SGD but it is much harder to find the correct parameters
    # TODO: expose this to cmd line eventually
    if args.optimizer == 'ADAM':
        optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate,
                                     betas=(args.momentum, 0.999))
    else:
        optimizer = torch.optim.SGD(model.parameters(), args.learning_rate,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)

    save_path = 'runs/' + strftime("%Y-%m-%d_%H:%M:%S", gmtime()) + \
        ';' + args.architecture
    os.mkdir(save_path)

    epoch = 1
    best_loss = 100000000  # some arbitrarily large initial number

    while epoch <= args.epochs:
        model.train()
        loss = train_unsup(dataset.train_loader, model, criterion, epoch, optimizer, is_gpu, args)

        # remember best reconstruction loss and save checkpoint
        is_best = loss < best_loss
        best_loss = min(loss, best_loss)
        save_checkpoint({
            'epoch': epoch,
            'arch': args.architecture,
            'state_dict': model.state_dict(),
            'best_loss': best_loss,
            'optimizer': optimizer.state_dict(),
        }, is_best, save_path)

        # increment epoch counter
        epoch += 1
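# `save_checkpoint` is only called above, its definition lives elsewhere in the code base.
# A minimal sketch of the common "write latest, copy best" pattern such a helper typically follows;
# the file names and the exact signature are assumptions, not taken from this repository:
import os
import shutil
import torch

def save_checkpoint(state, is_best, save_path, filename='checkpoint.pth.tar'):
    # always store the most recent state dictionary
    checkpoint_file = os.path.join(save_path, filename)
    torch.save(state, checkpoint_file)
    # additionally keep a copy of the best state seen so far
    if is_best:
        shutil.copyfile(checkpoint_file, os.path.join(save_path, 'model_best.pth.tar'))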
def main():
    # set device for torch computations
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    save_path = './runs/' + strftime("%Y-%m-%d_%H-%M-%S", gmtime())
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    # parse command line arguments
    args = parser.parse_args()
    print("Command line options:")
    for arg in vars(args):
        print(arg, getattr(args, arg))

    # create log file and write parsed args to it
    log_file = os.path.join(save_path, "stdout")
    log = open(log_file, "a")
    for arg in vars(args):
        log.write(arg + ':' + str(getattr(args, arg)) + '\n')
    log.close()

    # instantiate the weight initializer
    print("Initializing network with: " + args.weight_init)
    weight_initializer = WeightInit(args.weight_init)

    # instantiate dataset object
    data_init_method = getattr(datasets, args.dataset)
    dataset = data_init_method(torch.cuda.is_available(), args)

    # instantiate a tabular Q-learner
    q_learner = QLearner(args, dataset.num_classes, save_path)

    # task 1: run (or continue) an architecture search
    if int(args.task) == 1:
        if args.continue_search is True:
            # raise exceptions if requirements to continue an incomplete search are not met
            if args.continue_epsilon not in np.array(state_space_parameters.epsilon_schedule)[:, 0]:
                raise ValueError('continue-epsilon {} not in epsilon schedule!'.format(args.continue_epsilon))
            if (args.replay_buffer_csv_path is None) or (not os.path.exists(args.replay_buffer_csv_path)):
                raise ValueError('specify correct path to replay buffer to continue search')
            if (args.q_values_csv_path is None) or (not os.path.exists(args.q_values_csv_path)):
                raise ValueError('specify correct path to Q-values to continue search')

        # iterate as per the epsilon-greedy schedule
        for episode in state_space_parameters.epsilon_schedule:
            epsilon = episode[0]
            m = episode[1]

            # raise exception if net number to continue from is greater than the number of nets for continue_epsilon
            if epsilon == args.continue_epsilon and args.continue_ite > m:
                raise ValueError('continue-ite {} not within range of continue-epsilon {} in epsilon schedule!'
                                 .format(args.continue_ite, epsilon))

            # iterate through the number of nets for an epsilon
            for ite in range(1, m + 1):
                # check conditions to generate and train an architecture
                if (epsilon == args.continue_epsilon and ite >= args.continue_ite) or \
                        (epsilon < args.continue_epsilon):
                    print('ite:{}, epsilon:{}'.format(ite, epsilon))

                    # generate net states for search
                    q_learner.generate_search_net_states(epsilon)

                    # check if net has already been trained before
                    search_net_in_replay_dict = q_learner.check_search_net_in_replay_buffer()

                    # add to the end of the replay buffer if net has already been trained before
                    if search_net_in_replay_dict:
                        q_learner.add_search_net_to_replay_buffer(search_net_in_replay_dict, verbose=True)
                    # train net if it has not been trained before
                    else:
                        # train/val search net
                        mem_fit, spp_size, hard_best_val, hard_val_all_epochs, soft_best_val, soft_val_all_epochs,\
                            train_flag, hard_best_background, hard_best_crack, hard_best_spallation,\
                            hard_best_exposed_bars, hard_best_efflorescence, hard_best_corrosion_stain =\
                            train_val_net(q_learner.state_list, dataset, weight_initializer, device, args, save_path)

                        # if the net does not fit in memory even with batch splitting, sample again
                        while mem_fit is False:
                            print("net failed mem check even with batch splitting, sampling again!")
                            q_learner.generate_search_net_states(epsilon)
                            search_net_in_replay_dict = q_learner.check_search_net_in_replay_buffer()

                            if search_net_in_replay_dict:
                                q_learner.add_search_net_to_replay_buffer(search_net_in_replay_dict)
                                break
                            else:
                                mem_fit, spp_size, hard_best_val, hard_val_all_epochs, soft_best_val,\
                                    soft_val_all_epochs, train_flag, hard_best_background, hard_best_crack,\
                                    hard_best_spallation, hard_best_exposed_bars, hard_best_efflorescence,\
                                    hard_best_corrosion_stain =\
                                    train_val_net(q_learner.state_list, dataset, weight_initializer, device,
                                                  args, save_path)

                        # add new net and performance measures to replay buffer if it fits in memory
                        # after splitting the batch
                        if mem_fit:
                            reward = q_learner.accuracies_to_reward(hard_val_all_epochs)
                            q_learner.add_search_net_to_replay_buffer(
                                search_net_in_replay_dict, spp_size=spp_size, reward=reward,
                                hard_best_val=hard_best_val, hard_val_all_epochs=hard_val_all_epochs,
                                soft_best_val=soft_best_val, soft_val_all_epochs=soft_val_all_epochs,
                                train_flag=train_flag, hard_best_background=hard_best_background,
                                hard_best_crack=hard_best_crack, hard_best_spallation=hard_best_spallation,
                                hard_best_exposed_bars=hard_best_exposed_bars,
                                hard_best_efflorescence=hard_best_efflorescence,
                                hard_best_corrosion_stain=hard_best_corrosion_stain, verbose=True)

                    # sample nets from replay buffer, update Q-values and save partially filled
                    # replay buffer and Q-values
                    q_learner.update_q_values_and_save_partial()

        # save fully filled replay buffer and final Q-values
        q_learner.save_final()

    # task 2: load single architecture config from replay buffer and train till convergence
    elif int(args.task) == 2:
        # raise exceptions if requirements to train a fixed net are not met
        if (args.replay_buffer_csv_path is None) or (not os.path.exists(args.replay_buffer_csv_path)):
            raise ValueError('wrong path specified for replay buffer')
        if int(args.fixed_net_index_no) < 0:
            raise ValueError('specify a non negative integer for fixed net index')

        # generate states for fixed net from a complete search
        q_learner.generate_fixed_net_states()

        # train/val fixed net exhaustively
        mem_fit, spp_size, hard_best_val, hard_val_all_epochs, soft_best_val, soft_val_all_epochs, train_flag,\
            hard_best_background, hard_best_crack, hard_best_spallation, hard_best_exposed_bars,\
            hard_best_efflorescence, hard_best_corrosion_stain = \
            train_val_net(q_learner.state_list, dataset, weight_initializer, device, args, save_path)

        # add fixed net and performance measures to a data frame and save it
        q_learner.add_fixed_net_to_fixed_net_buffer(
            spp_size=spp_size, hard_best_val=hard_best_val, hard_val_all_epochs=hard_val_all_epochs,
            soft_best_val=soft_best_val, soft_val_all_epochs=soft_val_all_epochs,
            hard_best_background=hard_best_background, hard_best_crack=hard_best_crack,
            hard_best_spallation=hard_best_spallation, hard_best_exposed_bars=hard_best_exposed_bars,
            hard_best_efflorescence=hard_best_efflorescence, hard_best_corrosion_stain=hard_best_corrosion_stain)

        # save fixed net buffer
        q_learner.save_final()

    # raise exception if no matching task
    else:
        raise NotImplementedError('Given task no. not implemented.')
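# The search loop above treats `state_space_parameters.epsilon_schedule` as a list of
# (epsilon, number_of_nets) pairs that moves from exploration towards exploitation.
# A purely illustrative example of such a schedule; the concrete values below are assumptions
# and not taken from this repository's state_space_parameters module:
epsilon_schedule_example = [
    (1.0, 100),  # 100 architectures sampled fully at random
    (0.5, 50),   # 50 architectures with roughly half of the layer choices taken greedily
    (0.1, 50),   # 50 architectures chosen almost entirely greedily w.r.t. the learned Q-values
]

for epsilon, m in epsilon_schedule_example:
    for ite in range(1, m + 1):
        pass  # generate, train and store one architecture per iteration, as in main() above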
def main():
    # Command line options
    args = parser.parse_args()
    print("Command line options:")
    for arg in vars(args):
        print(arg, getattr(args, arg))

    # import the correct loss and training functions depending on which model to optimize
    # TODO: these could easily be refactored into one function, but we kept it this way for modularity
    if args.train_var:
        if args.joint:
            from lib.Training.train import train_var_joint as train
            from lib.Training.validate import validate_var_joint as validate
            from lib.Training.loss_functions import var_loss_function_joint as criterion
        else:
            from lib.Training.train import train_var as train
            from lib.Training.validate import validate_var as validate
            from lib.Training.loss_functions import var_loss_function as criterion
    else:
        if args.joint:
            from lib.Training.train import train_joint as train
            from lib.Training.validate import validate_joint as validate
            from lib.Training.loss_functions import loss_function_joint as criterion
        else:
            from lib.Training.train import train as train
            from lib.Training.validate import validate as validate
            from lib.Training.loss_functions import loss_function as criterion

    # Check whether GPU is available and can be used
    # if CUDA is found then device is set accordingly
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Launch a writer for the tensorboard summary writer instance
    save_path = 'runs/' + strftime("%Y-%m-%d_%H-%M-%S", gmtime()) + '_' + args.dataset + '_' + args.architecture +\
                '_dropout_' + str(args.dropout)
    if args.train_var:
        save_path += '_variational_samples_' + str(args.var_samples) + '_latent_dim_' + str(args.var_latent_dim)
    if args.joint:
        save_path += '_joint'

    # if we are resuming a previous training, note it in the name
    if args.resume:
        save_path = save_path + '_resumed'
    writer = SummaryWriter(save_path)

    # saving the parsed args to file
    log_file = os.path.join(save_path, "stdout")
    log = open(log_file, "a")
    for arg in vars(args):
        log.write(arg + ':' + str(getattr(args, arg)) + '\n')

    # Dataset loading
    data_init_method = getattr(datasets, args.dataset)
    dataset = data_init_method(torch.cuda.is_available(), args)
    # get the number of classes from the class dictionary
    num_classes = dataset.num_classes

    # add command line options to TensorBoard
    args_to_tensorboard(writer, args)

    log.close()

    # Get a sample input from the data loader to infer color channels/size
    net_input, _ = next(iter(dataset.train_loader))
    # get the amount of color channels in the input images
    num_colors = net_input.size(1)

    # import model from architectures class
    net_init_method = getattr(architectures, args.architecture)

    # build the model
    model = net_init_method(device, num_classes, num_colors, args)

    # Parallel container for multi GPU use and cast to available device
    model = torch.nn.DataParallel(model).to(device)
    print(model)

    # Initialize the weights of the model, by default according to He et al.
    print("Initializing network with: " + args.weight_init)
    WeightInitializer = WeightInit(args.weight_init)
    WeightInitializer.init_model(model)

    # Define optimizer and loss function (criterion)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay)

    epoch = 0
    best_prec = 0
    best_loss = random.getrandbits(128)  # arbitrarily large initial value for the best (lowest) loss

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            epoch = checkpoint['epoch']
            best_prec = checkpoint['best_prec']
            best_loss = checkpoint['best_loss']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # optimize until the final amount of epochs is reached
    while epoch < args.epochs:
        # train for one epoch
        train(dataset, model, criterion, epoch, optimizer, writer, device, args)

        # evaluate on validation set
        prec, loss = validate(dataset, model, criterion, epoch, writer, device, args)

        # remember best prec@1 and best loss, and save checkpoint
        is_best = loss < best_loss
        best_loss = min(loss, best_loss)
        best_prec = max(prec, best_prec)
        save_checkpoint({
            'epoch': epoch,
            'arch': args.architecture,
            'state_dict': model.state_dict(),
            'best_prec': best_prec,
            'best_loss': best_loss,
            'optimizer': optimizer.state_dict()
        }, is_best, save_path)

        # increment epoch counters
        epoch += 1

    writer.close()