Example #1
def main():
    # Check whether GPU is available and can be used
    # if CUDA is found then device is set accordingly
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    save_path = './runs/' + strftime("%Y-%m-%d_%H-%M-%S", gmtime())
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    log_file = os.path.join(save_path, "stdout")
    log = open(log_file, "a")
    # TODO: gives interrupted sys call error
    # log_file = os.path.join(save_path, "stdout")
    # sys.stdout = Logger(log_file)

    # Command line options
    args = parser.parse_args()
    print("Command line options:")
    for arg in vars(args):
        print(arg, getattr(args, arg))
        log.write(arg + ':' + str(getattr(args, arg)) + '\n')
    log.close()

    # Initialize the weights of the model
    print("Initializing network with: " + args.weight_init)
    WeightInitializer = WeightInit(args.weight_init)

    # Dataset loading
    # TODO: hard-coded file paths
    patch_size = args.patch_size
    data_init_method = getattr(datasets, args.dataset)
    dataset = data_init_method(torch.cuda.is_available(), args)

    gen = QLearner(state_space_parameters, 1, WeightInitializer, device, args,
                   save_path, qstore=args.qstore_path,
                   replaydict=args.replay_dict_path)

    if (args.continue_epsilon
            not in np.array(state_space_parameters.epsilon_schedule)[:, 0]):
        raise ValueError('continue-epsilon {} not in epsilon schedule!'.format(
            args.continue_epsilon))

    for episode in state_space_parameters.epsilon_schedule:

        epsilon = episode[0]
        M = episode[1]

        for ite in range(1, M + 1):
            if epsilon == args.continue_epsilon and args.continue_ite > M:
                raise ValueError(
                    'continue-ite {} not within range of continue-epsilon {} in epsilon schedule!'
                    .format(args.continue_ite, epsilon))
            if (epsilon == args.continue_epsilon and ite >= args.continue_ite
                ) or (epsilon < args.continue_epsilon):
                print('ite:{}, epsilon:{}'.format(ite, epsilon))
                gen.generate_net(epsilon, dataset)

    gen.replay_dictionary.to_csv(os.path.join(save_path,
                                              'replayDictFinal.csv'))
    gen.qstore.save_to_csv(os.path.join(save_path, 'qValFinal.csv'))
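A minimal sketch of the epsilon-greedy schedule the loop above consumes: state_space_parameters.epsilon_schedule is treated as a sequence of [epsilon, number_of_nets] pairs. The concrete values below are illustrative assumptions, not the project's actual configuration.

# Hypothetical schedule of [epsilon, number_of_nets_to_sample] pairs,
# consumed the same way as the loop in the example above.
epsilon_schedule = [
    [1.0, 100],  # fully random exploration for 100 sampled nets
    [0.5, 50],   # half greedy, half random for 50 nets
    [0.1, 20],   # mostly greedy for the final 20 nets
]

for epsilon, num_nets in epsilon_schedule:
    for ite in range(1, num_nets + 1):
        print('ite:{}, epsilon:{}'.format(ite, epsilon))
        # gen.generate_net(epsilon, dataset) would be called here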
Example #2
def main():
    # Command line options
    args = parser.parse_args()
    print("Command line options:")
    for arg in vars(args):
        print(arg, getattr(args, arg))

    if args.cross_dataset and not args.incremental_data:
        raise ValueError(
            'cross-dataset training possible only if incremental-data flag set'
        )

    # Check whether GPU is available and can be used
    # if CUDA is found then device is set accordingly
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Launch a writer for the tensorboard summary writer instance
    save_path = 'runs/' + strftime("%Y-%m-%d_%H-%M-%S", gmtime()) + '_' + args.dataset + '_' + args.architecture +\
                '_variational_samples_' + str(args.var_samples) + '_latent_dim_' + str(args.var_latent_dim)

    # add option specific naming to separate tensorboard log files later
    if args.autoregression:
        save_path += '_pixelcnn'

    if args.incremental_data:
        save_path += '_incremental'
        if args.train_incremental_upper_bound:
            save_path += '_upper_bound'
        if args.generative_replay:
            save_path += '_genreplay'
        if args.openset_generative_replay:
            save_path += '_opensetreplay'
    if args.cross_dataset:
        save_path += '_cross_dataset_' + args.dataset_order

    # if we are resuming a previous training, note it in the name
    if args.resume:
        save_path = save_path + '_resumed'
    writer = SummaryWriter(save_path)

    # saving the parsed args to file
    log_file = os.path.join(save_path, "stdout")
    log = open(log_file, "a")
    for arg in vars(args):
        log.write(arg + ':' + str(getattr(args, arg)) + '\n')

    # Dataset loading
    data_init_method = getattr(datasets, args.dataset)
    dataset = data_init_method(torch.cuda.is_available(), args)
    # get the number of classes from the class dictionary
    num_classes = dataset.num_classes

    # we set the epoch multiplier to 1 for isolated training and increase it proportionally to the number of tasks in continual learning (CL)
    epoch_multiplier = 1
    if args.incremental_data:
        from lib.Datasets.incremental_dataset import get_incremental_dataset

        # get the method to create the incremental dataset (inherits from the chosen data loader)
        inc_dataset_init_method = get_incremental_dataset(
            data_init_method, args)

        # different options for class incremental vs. cross-dataset experiments
        if args.cross_dataset:
            # if a task order file is specified, load the task order from it
            if args.load_task_order:
                # check if file exists and if file ends with extension '.txt'
                if os.path.isfile(args.load_task_order) and \
                        args.load_task_order.endswith('.txt'):
                    print("=> loading task order from '{}'".format(
                        args.load_task_order))
                    with open(args.load_task_order, 'rb') as fp:
                        task_order = pickle.load(fp)
                # if no file is found default to cmd line task order
                else:
                    # parse and split string at commas
                    task_order = args.dataset_order.split(',')
                    for i in range(len(task_order)):
                        # remove blank spaces in dataset names
                        task_order[i] = task_order[i].replace(" ", "")
            # use task order as specified in command line
            else:
                # parse and split string at commas
                task_order = args.dataset_order.split(',')
                for i in range(len(task_order)):
                    # remove blank spaces in dataset names
                    task_order[i] = task_order[i].replace(" ", "")

            # just for getting the number of classes in the first dataset
            num_classes = 0
            for i in range(args.num_base_tasks):
                temp_dataset_init_method = getattr(datasets, task_order[i])
                temp_dataset = temp_dataset_init_method(
                    torch.cuda.is_available(), args)
                num_classes += temp_dataset.num_classes
                del temp_dataset

            # multiply epochs by number of tasks
            if args.num_increment_tasks:
                epoch_multiplier = ((len(task_order) - args.num_base_tasks) /
                                    args.num_increment_tasks) + 1
            else:
                # this branch becomes active if num_increment_tasks is set to zero, which is useful
                # when training an isolated upper bound with all datasets present from the start.
                epoch_multiplier = 1.0
        else:
            # class incremental
            # if specified load task order from file
            if args.load_task_order:
                if os.path.isfile(args.load_task_order):
                    print("=> loading task order from '{}'".format(
                        args.load_task_order))
                    task_order = np.load(args.load_task_order).tolist()
                else:
                    # if no file is found a random task order is created
                    print(
                        "=> no task order found. Creating randomized task order"
                    )
                    task_order = np.random.permutation(num_classes).tolist()
            else:
                # if randomize task order is specified create a random task order, else task order is sequential
                task_order = []
                for i in range(dataset.num_classes):
                    task_order.append(i)

                if args.randomize_task_order:
                    task_order = np.random.permutation(num_classes).tolist()

            # save the task order
            np.save(os.path.join(save_path, 'task_order.npy'), task_order)
            # set the number of classes to base tasks + 1 because base tasks is always one less.
            # E.g. if you have 2 classes it's one task. This is a little inconsistent from the naming point of view
            # but we wanted a single variable to work for both class incremental as well as cross-dataset experiments
            num_classes = args.num_base_tasks + 1
            # multiply epochs by number of tasks
            epoch_multiplier = (
                (len(task_order) -
                 (args.num_base_tasks + 1)) / args.num_increment_tasks) + 1

        print("Task order: ", task_order)
        # log the task order into the text file
        log.write('task_order:' + str(task_order) + '\n')
        args.task_order = task_order

        # this is a little weird, but it needs to be here because the below method pops items from task_order
        args_to_tensorboard(writer, args)

        assert epoch_multiplier.is_integer(), \
            "uneven task division, make sure the number of tasks divides evenly."

        # Get the incremental dataset
        dataset = inc_dataset_init_method(torch.cuda.is_available(), device,
                                          task_order, args)
    else:
        # add command line options to TensorBoard
        args_to_tensorboard(writer, args)

    log.close()

    # Get a sample input from the data loader to infer color channels/size
    net_input, _ = next(iter(dataset.train_loader))
    # get the amount of color channels in the input images
    num_colors = net_input.size(1)

    # import model from architectures class
    net_init_method = getattr(architectures, args.architecture)

    # if we are not building an autoregressive model, the number of output channels of the model is
    # equal to the number of input channels. For autoregressive models we set the number of output
    # channels of the non-autoregressive decoder portion according to the command line option below
    if not args.autoregression:
        args.out_channels = num_colors

    # build the model
    model = net_init_method(device, num_classes, num_colors, args)

    # optionally add the autoregressive decoder
    if args.autoregression:
        model.pixelcnn = PixelCNN(device,
                                  num_colors,
                                  args.out_channels,
                                  args.pixel_cnn_channels,
                                  num_layers=args.pixel_cnn_layers,
                                  k=args.pixel_cnn_kernel_size,
                                  padding=args.pixel_cnn_kernel_size // 2)

    # Parallel container for multi GPU use and cast to available device
    model = torch.nn.DataParallel(model).to(device)
    print(model)

    # Initialize the weights of the model, by default according to He et al.
    print("Initializing network with: " + args.weight_init)
    WeightInitializer = WeightInit(args.weight_init)
    WeightInitializer.init_model(model)

    # Define optimizer and loss function (criterion)
    optimizer = torch.optim.Adam(model.parameters(), args.learning_rate)

    epoch = 0
    best_prec = 0
    best_loss = random.getrandbits(128)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            epoch = checkpoint['epoch']
            best_prec = checkpoint['best_prec']
            best_loss = checkpoint['best_loss']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # optimize until the final number of epochs is reached. The final number of epochs is
    # determined through the epoch multiplier, i.e. the number of tasks in continual learning.
    while epoch < (args.epochs * epoch_multiplier):
        # visualize the latent space before each task increment and at the end of training if it is 2-D
        if epoch % args.epochs == 0 and epoch > 0 or (epoch + 1) % (
                args.epochs * epoch_multiplier) == 0:
            if model.module.latent_dim == 2:
                print("Calculating and visualizing dataset embedding")
                # infer the number of current tasks to plot the different classes in the embedding
                if args.incremental_data:
                    if args.cross_dataset:
                        num_tasks = sum(
                            dataset.num_classes_per_task[:len(dataset.
                                                              seen_tasks)])
                    else:
                        num_tasks = len(dataset.seen_tasks)
                else:
                    num_tasks = num_classes

                zs = get_latent_embedding(model, dataset.train_loader,
                                          num_tasks, device)
                visualize_dataset_in_2d_embedding(writer,
                                                  zs,
                                                  args.dataset,
                                                  save_path,
                                                  task=num_tasks)

        # continual learning specific part
        if args.incremental_data:
            # at the end of each task increment
            if epoch % args.epochs == 0 and epoch > 0:
                print('Saving the last checkpoint from the previous task ...')
                save_task_checkpoint(save_path, epoch // args.epochs)

                print("Incrementing dataset ...")
                dataset.increment_tasks(
                    model,
                    args.batch_size,
                    args.workers,
                    writer,
                    save_path,
                    is_gpu=torch.cuda.is_available(),
                    upper_bound_baseline=args.train_incremental_upper_bound,
                    generative_replay=args.generative_replay,
                    openset_generative_replay=args.openset_generative_replay,
                    openset_threshold=args.openset_generative_replay_threshold,
                    openset_tailsize=args.openset_weibull_tailsize,
                    autoregression=args.autoregression)

                # grow the classifier and increment the variable for number of overall classes so we can use it later
                if args.cross_dataset:
                    grow_classifier(
                        model.module.classifier,
                        sum(dataset.num_classes_per_task[:len(dataset.
                                                              seen_tasks)]) -
                        model.module.num_classes, WeightInitializer)
                    model.module.num_classes = sum(
                        dataset.num_classes_per_task[:len(dataset.seen_tasks)])
                else:
                    model.module.num_classes += args.num_increment_tasks
                    grow_classifier(model.module.classifier,
                                    args.num_increment_tasks,
                                    WeightInitializer)

                # reset moving averages etc. of the optimizer
                optimizer = torch.optim.Adam(model.parameters(),
                                             args.learning_rate)

            # change the number of seen classes
            if epoch % args.epochs == 0:
                model.module.seen_tasks = dataset.seen_tasks

        # train
        train(dataset, model, criterion, epoch, optimizer, writer, device,
              args)

        # evaluate on validation set
        prec, loss = validate(dataset, model, criterion, epoch, writer, device,
                              save_path, args)

        # remember best prec@1 and save checkpoint
        is_best = loss < best_loss
        best_loss = min(loss, best_loss)
        best_prec = max(prec, best_prec)
        save_checkpoint(
            {
                'epoch': epoch,
                'arch': args.architecture,
                'state_dict': model.state_dict(),
                'best_prec': best_prec,
                'best_loss': best_loss,
                'optimizer': optimizer.state_dict()
            }, is_best, save_path)

        # increment epoch counters
        epoch += 1

        # if a new task begins reset the best prec so that new best model can be stored.
        if args.incremental_data and epoch % args.epochs == 0:
            best_prec = 0
            best_loss = random.getrandbits(128)

    writer.close()
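The epoch multiplier in the class-incremental branch above is plain arithmetic on the task order. A small worked example with assumed numbers (10 classes, one base task covering 2 classes, 2 new classes per increment):

# Worked example of the epoch_multiplier computation from the example above.
# The concrete numbers are assumptions chosen purely for illustration.
num_base_tasks = 1            # the base task spans num_base_tasks + 1 = 2 classes
num_increment_tasks = 2       # 2 new classes are added per increment
task_order = list(range(10))  # 10 classes in total

epoch_multiplier = ((len(task_order) - (num_base_tasks + 1)) / num_increment_tasks) + 1
print(epoch_multiplier)  # (10 - 2) / 2 + 1 = 5.0, i.e. 5 tasks and 5 * args.epochs epochs
assert epoch_multiplier.is_integer()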
Example #3
def main():
    # Command line options
    args = parser.parse_args()
    print("Command line options:")
    for arg in vars(args):
        print(arg, getattr(args, arg))

    if args.debug:
        pdb.set_trace()

    # Check whether GPU is available and can be used
    # if CUDA is found then device is set accordingly
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Launch a writer for the tensorboard summary writer instance
    save_path = 'runs/' + strftime(
        "%Y-%m-%d_%H-%M-%S",
        gmtime()) + '_' + args.dataset + '_' + args.architecture

    # if we are resuming a previous training, note it in the name
    if args.resume:
        save_path = save_path + '_resumed'
    writer = SummaryWriter(save_path)

    # saving the parsed args to file
    log_file = os.path.join(save_path, "stdout")
    log = open(log_file, "a")
    for arg in vars(args):
        log.write(arg + ':' + str(getattr(args, arg)) + '\n')

    # Dataset loading
    data_init_method = getattr(datasets, args.dataset)
    dataset = data_init_method(torch.cuda.is_available(), args)
    # get the number of classes from the class dictionary
    num_classes = dataset.num_classes

    # we set the epoch multiplier to 1 for isolated training and increase it proportionally to the number of tasks in continual learning (CL)
    epoch_multiplier = 1

    # add command line options to TensorBoard
    args_to_tensorboard(writer, args)
    log.close()

    # build the model
    model = architectures.Inos_model(args.num_class, args)

    # Parallel container for multi GPU use and cast to available device
    model = torch.nn.DataParallel(model).to(device)
    print(model)

    if not args.pretrained:
        # Initialize the weights of the model, by default according to He et al.
        print("Initializing network with: " + args.weight_init)
        WeightInitializer = WeightInit(args.weight_init)
        WeightInitializer.init_model(model)

    # Define optimizer and loss function (criterion)
    optimizer = torch.optim.SGD(model.parameters(),
                                args.learning_rate,
                                momentum=0.9,
                                weight_decay=2e-4)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=[30, 60, 80, 100], gamma=0.5)

    epoch = 0
    best_prec = 0
    best_loss = random.getrandbits(128)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            epoch = checkpoint['epoch']
            best_prec = checkpoint['best_prec']
            best_loss = checkpoint['best_loss']
            model.load_state_dict(checkpoint['state_dict'])
            # optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # optimize until the final number of epochs is reached. The final number of epochs is
    # determined through the epoch multiplier (1 here, since training is isolated).
    while epoch < (args.epochs * epoch_multiplier):
        if epoch + 2 == epoch % args.epochs:
            print("debug perpose")

        # train
        train(dataset, model, criterion, epoch, optimizer, writer, device,
              args)

        # evaluate on validation set
        prec, loss = validate(dataset, model, criterion, epoch, writer, device,
                              save_path, args)

        # evaluate on test set
        prec_t, loss_t = test(dataset, model, criterion, epoch, writer, device,
                              save_path, args)

        # remember best prec@1 and save checkpoint
        is_best = loss < best_loss
        best_loss = min(loss, best_loss)
        best_prec = max(prec, best_prec)
        save_checkpoint(
            {
                'epoch': epoch,
                'arch': args.architecture,
                'state_dict': model.state_dict(),
                'best_prec': best_prec,
                'best_loss': best_loss,
                'optimizer': optimizer.state_dict()
            }, is_best, save_path)

        # increment epoch counters
        epoch += 1
        scheduler.step()

    writer.close()
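A short sketch of the learning rate schedule configured above: MultiStepLR halves the learning rate (gamma=0.5) at epochs 30, 60, 80 and 100. The base learning rate of 0.1 and the dummy parameter below are assumptions for illustration only.

# Standalone illustration of the MultiStepLR schedule used in the example above.
import torch

param = torch.nn.Parameter(torch.zeros(1))
optimizer = torch.optim.SGD([param], lr=0.1, momentum=0.9, weight_decay=2e-4)
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                 milestones=[30, 60, 80, 100],
                                                 gamma=0.5)

for epoch in range(120):
    optimizer.step()    # the actual training step would go here
    scheduler.step()
    if epoch + 1 in (30, 60, 80, 100):  # learning rate right after a milestone
        print(epoch + 1, optimizer.param_groups[0]['lr'])
# prints: 30 0.05, 60 0.025, 80 0.0125, 100 0.00625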
Example #4
def main():
    # Command line options
    args = parser.parse_args()
    print("Command line options:")
    for arg in vars(args):
        print(arg, getattr(args, arg))

    # Check whether GPU is available and can be used
    # if CUDA is found then device is set accordingly
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    cudnn.benchmark = True
    num_GPUs = torch.cuda.device_count()

    # If save directory for runs doesn't exist then create it
    if not os.path.exists('runs'):
        os.mkdir('runs')

    # Create a time-stamped save path for individual experiment
    save_path = 'runs/' + strftime("%Y-%m-%d_%H-%M-%S", gmtime()) + \
                ';' + args.dataset + ';' + args.architecture
    os.mkdir(save_path)

    # List of values to log to csv
    columns_list = [
        'Filters', 'Parameters', 'Mean', 'Variance', 'Skew', 'BestVal',
        'BestValsTrain', 'BestEpoch', 'LastValPrec', 'LastTrainPrec',
        'AllTrain', 'AllVal'
    ]
    df = pd.DataFrame(columns=columns_list)

    # Dataset loading
    data_init_method = getattr(datasets, args.dataset)
    dataset = data_init_method(torch.cuda.is_available(), args)

    # get the amount of color channels in the input images
    net_input, _ = next(iter(dataset.train_loader))
    num_colors = net_input.size(1)

    # import model from architectures class
    net_init_method = getattr(architectures, args.architecture)

    # Get the parameters for all valid skewed models
    SNModels = SkewNormalModels(depth=args.vgg_depth,
                                num_classes=dataset.num_classes,
                                patch_size=args.patch_size)
    skew_model_params = SNModels.get_valid_models()
    print("Total number of models: ", len(skew_model_params["filters"]))

    # Weight-init method
    WeightInitializer = WeightInit(args.weight_init)

    # Optionally resume a previous experiment
    current_id = args.resume_model_id
    for i in range(len(skew_model_params["filters"]) - current_id):
        print("Model filters: ", skew_model_params["filters"][i + current_id])
        print("Model parameters: ",
              skew_model_params["total_params"][i + current_id], " mean: ",
              skew_model_params["means"][i + current_id], " var: ",
              skew_model_params["vars"][i + current_id], " skew: ",
              skew_model_params["skews"][i + current_id])

        model = net_init_method(device,
                                dataset.num_classes,
                                num_colors,
                                args,
                                skew_model_params["filters"][i + current_id],
                                custom_filters=True)

        # Parallel container for multi GPU use and cast to available device
        model = torch.nn.DataParallel(model).to(device)
        print(model)

        # Initialize the weights of the model
        print("Initializing networks with: " + args.weight_init)
        WeightInitializer.init_model(model)

        # Define criterion and optimizer
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.SGD(model.parameters(),
                                    args.learning_rate,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay,
                                    nesterov=args.nesterov)

        # Initialize SGDWR learning rate scheduler
        lr_scheduler = LearningRateScheduler(args.lr_wr_epochs,
                                             len(dataset.train_loader.dataset),
                                             args.batch_size,
                                             args.learning_rate,
                                             args.lr_wr_mul, args.lr_wr_min)

        # Get estimated GPU memory usage of the model and split batch if too little memory is available
        if torch.cuda.is_available():
            GPUMemory = GPUMem(torch.cuda.is_available())
            print('available:{}'.format(
                (GPUMemory.total_mem -
                 GPUMemory.total_mem * GPUMemory.get_mem_util()) / 1024.))
            print('required per gpu with buffer: {}'.format(
                (4. / float(num_GPUs) * model.module.gpu_usage) + 1.))

            # calculate smaller chunk size to split batch into sequential computations
            mem_scale_factor = 4.0  # TODO: WEIRD factor... why is this necessary and where does it come from?
            # TODO: the + 1 Gb should be taken from the cache allocator
            if ((GPUMemory.total_mem -
                 GPUMemory.total_mem * GPUMemory.get_mem_util()) / 1024.) < (
                     (mem_scale_factor / float(num_GPUs) *
                      model.module.gpu_usage) + 1.):

                # variable batch size implementation as per GPU constraint; remove this block to fall back to the old fixed batch size
                approx_small_batch_size = (((GPUMemory.total_mem - GPUMemory.total_mem * GPUMemory.get_mem_util()) / 1024.
                                            - 1.) * float(num_GPUs) / mem_scale_factor) //\
                                          (model.module.gpu_usage / float(args.batch_size))

                diff = float('inf')
                temp_small_batch_size = approx_small_batch_size
                for j in range(1, (args.batch_size // 2) + 1):
                    if args.batch_size % j == 0 and abs(
                            j - approx_small_batch_size) < diff:
                        diff = abs(j - approx_small_batch_size)
                        temp_small_batch_size = j
                batch_seq_split_size = temp_small_batch_size
            else:
                batch_seq_split_size = args.batch_size
        else:
            batch_seq_split_size = args.batch_size

        # Get training and validation dataset loaders
        dataset.train_loader, dataset.val_loader = dataset.get_dataset_loader(
            batch_seq_split_size, args.workers, device)

        print(
            'sequential batch size split size:{}'.format(batch_seq_split_size))

        epoch = 0
        best_epoch = 0
        best_prec = 0
        best_val_train_prec = 0
        all_train = []
        all_val = []

        while epoch < args.epochs:
            # train for one epoch
            train_prec = train(dataset.train_loader, model, criterion, epoch,
                               optimizer, lr_scheduler, device,
                               batch_seq_split_size, args)
            # evaluate on validation set
            prec = validate(dataset.val_loader, model, criterion, epoch,
                            device, args)

            all_train.append(train_prec)
            all_val.append(prec)

            # remember best prec@1 and save checkpoint
            is_best = prec > best_prec
            if is_best:
                best_epoch = epoch
                best_val_train_prec = train_prec
                best_prec = prec

            # if architecture doesn't train at all skip it
            if epoch == args.lr_wr_epochs - 1 and train_prec < (
                    2 * 100.0 / dataset.num_classes):
                break

            # increment epoch counters
            epoch += 1
            lr_scheduler.scheduler_epoch += 1

        # append architecture results to csv
        df = df.append(pd.DataFrame([[
            skew_model_params["filters"][i + current_id],
            skew_model_params["total_params"][i + current_id],
            skew_model_params["means"][i + current_id],
            skew_model_params["vars"][i + current_id],
            skew_model_params["skews"][i + current_id], best_prec,
            best_val_train_prec, best_epoch, prec, train_prec, all_train,
            all_val
        ]],
                                    columns=columns_list),
                       ignore_index=True)
        df.to_csv(save_path + '/model_%03d' % (i + 1 + current_id) + '.csv')

        del model
        del optimizer
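The memory check above splits the batch into sequential chunks by searching for the divisor of the batch size closest to the estimated per-GPU capacity. The same search as a standalone helper; the function name and the example values are illustrative assumptions, not project code.

def closest_batch_divisor(batch_size, approx_small_batch_size):
    # Return the divisor of batch_size closest to approx_small_batch_size,
    # mirroring the splitting loop in the example above.
    diff = float('inf')
    best = batch_size
    for j in range(1, (batch_size // 2) + 1):
        if batch_size % j == 0 and abs(j - approx_small_batch_size) < diff:
            diff = abs(j - approx_small_batch_size)
            best = j
    return best

print(closest_batch_divisor(128, 42.7))  # -> 32, the divisor of 128 closest to 42.7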
Example #5
def main():
    # set device for torch computations
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    save_path = './runs/' + strftime("%Y-%m-%d_%H-%M-%S", gmtime())
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    # parse command line arguments
    args = parser.parse_args()
    print("Command line options:")
    for arg in vars(args):
        print(arg, getattr(args, arg))

    # create log file
    log_file = os.path.join(save_path, "stdout")

    # write parsed args to log file
    log = open(log_file, "a")
    for arg in vars(args):
        print(arg, getattr(args, arg))
        log.write(arg + ':' + str(getattr(args, arg)) + '\n')
    log.close()

    # instantiate the weight initializer
    print("Initializing network with: " + args.weight_init)
    weight_initializer = WeightInit(args.weight_init)

    # instantiate dataset object
    data_init_method = getattr(datasets, args.dataset)
    dataset = data_init_method(torch.cuda.is_available(), args)

    # instantiate a tabular Q-learner
    q_learner = QLearner(args, dataset.num_classes, save_path)

    # start new architecture search
    if int(args.task) == 1:
        if args.continue_search:
            # raise exceptions if the requirements to continue the search are not met
            if args.continue_epsilon not in np.array(
                    state_space_parameters.epsilon_schedule)[:, 0]:
                raise ValueError(
                    'continue-epsilon {} not in epsilon schedule!'.format(
                        args.continue_epsilon))
            if (args.replay_buffer_csv_path is None) or (not os.path.exists(
                    args.replay_buffer_csv_path)):
                raise ValueError(
                    'specify a correct path to the replay buffer to continue the search')
            if (args.q_values_csv_path is None) or (not os.path.exists(
                    args.q_values_csv_path)):
                raise ValueError('wrong path is specified for Q-values')

        # iterate as per the epsilon-greedy schedule
        for episode in state_space_parameters.epsilon_schedule:
            epsilon = episode[0]
            m = episode[1]

            # raise exception if the net number to continue from is greater than the number of nets for the continue_epsilon
            if epsilon == args.continue_epsilon and args.continue_ite > m:
                raise ValueError(
                    'continue-ite {} not within range of continue-epsilon {} in epsilon schedule!'
                    .format(args.continue_ite, epsilon))

            # iterate through number of nets for an epsilon
            for ite in range(1, m + 1):
                # check conditions to generate and train arc
                if (epsilon == args.continue_epsilon
                        and ite >= args.continue_ite) or (
                            epsilon < args.continue_epsilon):
                    print('ite:{}, epsilon:{}'.format(ite, epsilon))

                    # generate net states for search
                    q_learner.generate_search_net_states(epsilon)

                    # check if net already trained before
                    search_net_in_replay_dict = q_learner.check_search_net_in_replay_buffer(
                    )

                    # add to the end of the replay buffer if net already trained before
                    if search_net_in_replay_dict:
                        q_learner.add_search_net_to_replay_buffer(
                            search_net_in_replay_dict, verbose=True)
                    # train net if net not trained before
                    else:
                        # train/val search net
                        mem_fit, spp_size, hard_best_val, hard_val_all_epochs, soft_best_val, soft_val_all_epochs,\
                        train_flag, hard_best_background, hard_best_crack, hard_best_spallation,\
                        hard_best_exposed_bars, hard_best_efflorescence, hard_best_corrosion_stain =\
                            train_val_net(q_learner.state_list, dataset, weight_initializer, device, args, save_path)

                        # check if net fits memory
                        while mem_fit is False:
                            print(
                                "net failed mem check even with batch splitting, sampling again!"
                            )

                            q_learner.generate_search_net_states(epsilon)
                            net_in_replay_dict = q_learner.check_search_net_in_replay_buffer(
                            )

                            # check whether the freshly sampled net was trained before
                            if net_in_replay_dict:
                                q_learner.add_search_net_to_replay_buffer(
                                    net_in_replay_dict)
                                break
                            else:
                                mem_fit, spp_size, hard_best_val, hard_val_all_epochs, soft_best_val, \
                                soft_val_all_epochs, train_flag, hard_best_background, hard_best_crack,\
                                hard_best_spallation, hard_best_exposed_bars, hard_best_efflorescence,\
                                hard_best_corrosion_stain =\
                                    train_val_net(q_learner.state_list, dataset, weight_initializer, device, args,
                                                  save_path)

                        # add new net and performance measures to replay buffer if it fits in memory after splitting
                        # batch
                        if mem_fit:
                            reward = q_learner.accuracies_to_reward(
                                hard_val_all_epochs)
                            q_learner.add_search_net_to_replay_buffer(
                                search_net_in_replay_dict,
                                spp_size=spp_size,
                                reward=reward,
                                hard_best_val=hard_best_val,
                                hard_val_all_epochs=hard_val_all_epochs,
                                soft_best_val=soft_best_val,
                                soft_val_all_epochs=soft_val_all_epochs,
                                train_flag=train_flag,
                                hard_best_background=hard_best_background,
                                hard_best_crack=hard_best_crack,
                                hard_best_spallation=hard_best_spallation,
                                hard_best_exposed_bars=hard_best_exposed_bars,
                                hard_best_efflorescence=hard_best_efflorescence,
                                hard_best_corrosion_stain=hard_best_corrosion_stain,
                                verbose=True)
                    # sample nets from replay buffer, update Q-values and save partially filled replay buffer and
                    # Q-values
                    q_learner.update_q_values_and_save_partial()

        # save fully filled replay buffer and final Q-values
        q_learner.save_final()

    # load single architecture config from replay buffer and train till convergence
    elif int(args.task) == 2:
        # raise exceptions if requirements to continue incomplete search not met
        if (args.replay_buffer_csv_path is None) or (not os.path.exists(
                args.replay_buffer_csv_path)):
            raise ValueError('wrong path specified for replay buffer')
        if int(args.fixed_net_index_no) < 0:
            raise ValueError(
                'specify a non negative integer for fixed net index')

        # generate states for fixed net from a complete search
        q_learner.generate_fixed_net_states()

        # train/val fixed net exhaustively
        mem_fit, spp_size, hard_best_val, hard_val_all_epochs, soft_best_val, soft_val_all_epochs, train_flag,\
        hard_best_background, hard_best_crack, hard_best_spallation, hard_best_exposed_bars, hard_best_efflorescence, \
        hard_best_corrosion_stain = train_val_net(q_learner.state_list, dataset, weight_initializer, device, args,
                                                  save_path)

        # add fixed net and performance measures to a data frame and save it
        q_learner.add_fixed_net_to_fixed_net_buffer(
            spp_size=spp_size,
            hard_best_val=hard_best_val,
            hard_val_all_epochs=hard_val_all_epochs,
            soft_best_val=soft_best_val,
            soft_val_all_epochs=soft_val_all_epochs,
            hard_best_background=hard_best_background,
            hard_best_crack=hard_best_crack,
            hard_best_spallation=hard_best_spallation,
            hard_best_exposed_bars=hard_best_exposed_bars,
            hard_best_efflorescence=hard_best_efflorescence,
            hard_best_corrosion_stain=hard_best_corrosion_stain)

        # save fixed net buffer
        q_learner.save_final()

    # raise exception if no matching task
    else:
        raise NotImplementedError('Given task no. not implemented.')
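The resume logic in the search loop above skips nets until the position given by continue_epsilon and continue_ite is reached. The same condition written as a small predicate; the helper name and values are purely illustrative.

def should_train(epsilon, ite, continue_epsilon, continue_ite):
    # Train a net either when we are at the epsilon the search resumes from and
    # have reached the iteration to continue at, or once the schedule has moved
    # past that epsilon entirely.
    return (epsilon == continue_epsilon and ite >= continue_ite) or epsilon < continue_epsilon

# With continue_epsilon=0.5 and continue_ite=3, nets 1 and 2 at epsilon 0.5 are skipped:
print([should_train(0.5, ite, 0.5, 3) for ite in range(1, 6)])
# -> [False, False, True, True, True]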
Example #6
def main():
    # Command line options
    args = parser.parse_args()
    print("Command line options:")
    for arg in vars(args):
        print(arg, getattr(args, arg))

    # import the correct loss and training functions depending on which model to optimize
    # TODO: these could easily be refactored into one function, but we kept it this way for modularity
    if args.train_var:
        if args.joint:
            from lib.Training.train import train_var_joint as train
            from lib.Training.validate import validate_var_joint as validate
            from lib.Training.loss_functions import var_loss_function_joint as criterion
        else:
            from lib.Training.train import train_var as train
            from lib.Training.validate import validate_var as validate
            from lib.Training.loss_functions import var_loss_function as criterion
    else:
        if args.joint:
            from lib.Training.train import train_joint as train
            from lib.Training.validate import validate_joint as validate
            from lib.Training.loss_functions import loss_function_joint as criterion
        else:
            from lib.Training.train import train as train
            from lib.Training.validate import validate as validate
            from lib.Training.loss_functions import loss_function as criterion

    # Check whether GPU is available and can be used
    # if CUDA is found then device is set accordingly
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Launch a writer for the tensorboard summary writer instance
    save_path = 'runs/' + strftime("%Y-%m-%d_%H-%M-%S", gmtime()) + '_' + args.dataset + '_' + args.architecture +\
                '_dropout_' + str(args.dropout)

    if args.train_var:
        save_path += '_variational_samples_' + str(
            args.var_samples) + '_latent_dim_' + str(args.var_latent_dim)

    if args.joint:
        save_path += '_joint'

    # if we are resuming a previous training, note it in the name
    if args.resume:
        save_path = save_path + '_resumed'
    writer = SummaryWriter(save_path)

    # saving the parsed args to file
    log_file = os.path.join(save_path, "stdout")
    log = open(log_file, "a")
    for arg in vars(args):
        log.write(arg + ':' + str(getattr(args, arg)) + '\n')

    # Dataset loading
    data_init_method = getattr(datasets, args.dataset)
    dataset = data_init_method(torch.cuda.is_available(), args)
    # get the number of classes from the class dictionary
    num_classes = dataset.num_classes

    # add command line options to TensorBoard
    args_to_tensorboard(writer, args)

    log.close()

    # Get a sample input from the data loader to infer color channels/size
    net_input, _ = next(iter(dataset.train_loader))
    # get the amount of color channels in the input images
    num_colors = net_input.size(1)

    # import model from architectures class
    net_init_method = getattr(architectures, args.architecture)

    # build the model
    model = net_init_method(device, num_classes, num_colors, args)

    # Parallel container for multi GPU use and cast to available device
    model = torch.nn.DataParallel(model).to(device)
    print(model)

    # Initialize the weights of the model, by default according to He et al.
    print("Initializing network with: " + args.weight_init)
    WeightInitializer = WeightInit(args.weight_init)
    WeightInitializer.init_model(model)

    # Define optimizer and loss function (criterion)
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.learning_rate,
                                 weight_decay=args.weight_decay)

    epoch = 0
    best_prec = 0
    best_loss = random.getrandbits(128)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            epoch = checkpoint['epoch']
            best_prec = checkpoint['best_prec']
            best_loss = checkpoint['best_loss']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # optimize until final amount of epochs is reached.
    while epoch < args.epochs:
        # train
        train(dataset, model, criterion, epoch, optimizer, writer, device,
              args)

        # evaluate on validation set
        prec, loss = validate(dataset, model, criterion, epoch, writer, device,
                              args)

        # remember best prec@1 and save checkpoint
        is_best = loss < best_loss
        best_loss = min(loss, best_loss)
        best_prec = max(prec, best_prec)
        save_checkpoint(
            {
                'epoch': epoch,
                'arch': args.architecture,
                'state_dict': model.state_dict(),
                'best_prec': best_prec,
                'best_loss': best_loss,
                'optimizer': optimizer.state_dict()
            }, is_best, save_path)

        # increment epoch counters
        epoch += 1

    writer.close()
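A side note on the best_loss initialization used in the training loops above (Examples #2, #3 and #6): random.getrandbits(128) simply yields a very large integer that any real loss will undercut. float('inf') is the more conventional sentinel and behaves identically under the comparison and min() used above; a minimal sketch:

# Minimal sketch of the best-loss bookkeeping with the conventional sentinel.
best_loss = float('inf')
for loss in [2.3, 1.7, 1.9]:
    is_best = loss < best_loss   # True for 2.3 and 1.7, False for 1.9
    best_loss = min(loss, best_loss)
print(best_loss)  # 1.7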