Example #1
                model_load_path = os.path.join(
                    models_save_dir, algo_name + '_b' + str(b - 1) + '.pt')

            print('New train data loaded from ' + new_train_file_path)

            batch_models_save_dir = os.path.join(models_save_dir,
                                                 batch_algo_name)
            if saving_intermediate_models:
                if not os.path.exists(batch_models_save_dir):
                    os.mkdir(batch_models_save_dir)

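            # Train-time dataset for the classes introduced in batch b:
            # random-resized crops and horizontal flips, with each label
            # passed through target_transform(x, P, b) (presumably to remap
            # it into the incremental label range of the current batch).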
            new_train_dataset = ImagesListFileFolder(
                new_train_file_path,
                transforms.Compose([
                    transforms.RandomResizedCrop(224),
                    transforms.RandomHorizontalFlip(),
                    transforms.ToTensor(),
                    normalize,
                ]),
                target_transform=lambda x: target_transform(x, P, b))

            train_loader = torch.utils.data.DataLoader(
                new_train_dataset,
                batch_size=new_batch_size,
                shuffle=True,
                num_workers=num_workers,
                pin_memory=False)

            new_classes_number = len(new_train_dataset.classes)

            print("New classes number = " + str(new_classes_number))
Example #2
import sys, os, warnings, time
import numpy as np
from MyImageFolder import ImagesListFileFolder
from torchvision import transforms
import torch as th

if len(sys.argv) != 2:
    print('Arguments: images_list_file_path')
    sys.exit(-1)

train_file_path = sys.argv[1]
print('Train file path = ' + train_file_path)

#catching warnings
with warnings.catch_warnings(record=True) as warn_list:

    train_dataset = ImagesListFileFolder(train_file_path,
                                         transforms.ToTensor())

    num_classes = len(train_dataset.classes)
    print("Number of classes = " + str(num_classes))
    print("Training-set size = " + str(len(train_dataset)))

    dataloader = th.utils.data.DataLoader(train_dataset,
                                          batch_size=1,
                                          shuffle=False,
                                          num_workers=12)
    mean = th.zeros(3)
    std = th.zeros(3)
    print('==> Computing mean and std..')
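    # Per-channel statistics are accumulated image by image (batch_size=1)
    # and averaged over the whole training set after the loop.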
    cpt = 0
    for inputs, targets in dataloader:
        cpt += 1
        # NOTE: the loop body is truncated in the source; the per-channel
        # accumulation below is an assumed completion.
        mean += inputs.mean(dim=(0, 2, 3))
        std += inputs.std(dim=(0, 2, 3))
    mean /= cpt
    std /= cpt
    print('mean = ' + str(mean))
    print('std = ' + str(std))
Example #3
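    # S = number of incremental states: B classes in the first state plus P
    # new classes in each of the following states.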
    S = int((num_classes - B) / P) + 1
    print('S = ' + str(S))
    ckp_prefix = '{}_s{}_k{}'.format(normalization_dataset_name, S,
                                     memory_size)
    np.random.seed(random_seed)  # Fix the random seed
    ########################################
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    dataset_mean, dataset_std = utils.get_dataset_mean_std(
        normalization_dataset_name, datasets_mean_std_file_path)
    normalize = transforms.Normalize(mean=dataset_mean, std=dataset_std)

    trainset = ImagesListFileFolder(
        train_file_path,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

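    # Test and eval images use deterministic resize + center crop instead of
    # the random training-time augmentations above.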
    testset = ImagesListFileFolder(
        test_file_path,
        transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ]))

    # NOTE: the original evalset call is truncated at this point; the eval
    # transforms below are assumed to mirror the test set.
    evalset = ImagesListFileFolder(
        test_file_path,
        transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ]))

Example #4

        print('New train data loaded from ' + new_train_file_path)
        print('Old train data loaded from ' + old_train_file_path)
        print('New val data loaded from ' + new_val_file_path)
        print('Old val data loaded from ' + old_val_file_path)

        batch_models_save_dir = os.path.join(models_save_dir, batch_algo_name)
        if saving_intermediate_models:
            if not os.path.exists(batch_models_save_dir):
                os.mkdir(batch_models_save_dir)

        old_train_dataset = ImagesListFileFolder(
            old_train_file_path,
            transforms.Compose([
                transforms.RandomResizedCrop(224),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                normalize,
            ]))

        new_train_dataset = ImagesListFileFolder(
            new_train_file_path,
            transforms.Compose([
                transforms.RandomResizedCrop(224),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                normalize,
            ]))

        new_and_old_train_datasets = torch.utils.data.dataset.ConcatDataset(
            (old_train_dataset, new_train_dataset))
Example #5
    print('normalization dataset name = ' + str(normalization_dataset_name))
    print('dataset mean = ' + str(dataset_mean))
    print('dataset std = ' + str(dataset_std))

    normalize = transforms.Normalize(mean=dataset_mean, std=dataset_std)

    print("Number of workers = " + str(num_workers))
    print("Batch size = " + str(batch_size))
    print("Running on gpu " + str(gpu))

    print('-------> Val data')

    val_dataset = ImagesListFileFolder(val_images_list,
                                       transforms.Compose([
                                           transforms.Resize(256),
                                           transforms.CenterCrop(224),
                                           transforms.ToTensor(),
                                           normalize,
                                       ]),
                                       return_path=True)

    print("Val-set size = " + str(len(val_dataset)))

    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=batch_size,
                                             shuffle=False,
                                             num_workers=num_workers,
                                             pin_memory=False)

    print('Loading list file from ' + val_images_list)
    print('Destination directory ' + val_destination_dir)
def main(I):
    if __name__ == '__main__':
        if not os.path.exists(data_output_dir):
            os.makedirs(data_output_dir)

        if not os.path.exists(models_save_dir):
            os.makedirs(models_save_dir)

        # catching warnings
        with warnings.catch_warnings(record=True) as warn_list:
            herding = StaticHerding()

            runs_top1_acc = []
            runs_topx_acc = []

            first_run_starting_time = time.time()
            for r in range(1, num_runs + 1):
                run_data_output_dir = os.path.join(data_output_dir,
                                                   'run_' + str(r))
                if not os.path.exists(run_data_output_dir):
                    os.makedirs(run_data_output_dir)

                run_models_save_dir = os.path.join(models_save_dir,
                                                   'run_' + str(r))
                if not os.path.exists(run_models_save_dir):
                    os.makedirs(run_models_save_dir)

                run_features_destination_dir = os.path.join(
                    run_data_output_dir, 'features')
                if not os.path.exists(run_features_destination_dir):
                    os.mkdir(run_features_destination_dir)

                top1_val_accuracies = []
                topx_val_accuracies = []
                previous_model = None

                run_starting_time = time.time()
                batch_oracle_annotated_paths = {}
                undetected_classes = []
                for b in range(1, T + 1):
                    print('*' * 110)
                    print('*' * 46 +
                          ' Run {}/{} | BATCH {} '.format(r, num_runs, b) +
                          '*' * 45)
                    print('*' * 110 + '\n')

                    if b == 1:
                        model_load_path = first_model_load_path
                        new_train_file_path = path_train_batch1
                        val_file_path = path_val_batch1
                        print('Train data loaded from ' + new_train_file_path)
                        print('Val data loaded from ' + val_file_path)

                        new_train_dataset = ImagesListFileFolder(
                            new_train_file_path,
                            transforms.Compose([
                                transforms.RandomResizedCrop(224),
                                transforms.RandomHorizontalFlip(),
                                transforms.ToTensor(), normalize
                            ]),
                            return_path=True)

                        model_dsets = [new_train_dataset]

                        val_dataset = ImagesListFileFolder(
                            val_file_path,
                            transforms.Compose([
                                transforms.Resize(256),
                                transforms.CenterCrop(224),
                                transforms.ToTensor(), normalize
                            ]),
                            return_path=True)

                        val_loader = torch.utils.data.DataLoader(
                            val_dataset,
                            batch_size=val_batch_size,
                            shuffle=True,
                            num_workers=num_workers,
                            pin_memory=False)

                        old_classes_number = 0
                        new_classes_number = len(val_dataset.classes)
                        print("Classes number = " + str(new_classes_number))
                        print("Validation-set size = " + str(len(val_dataset)))

                        model = models.resnet18(pretrained=False,
                                                num_classes=base)

                        print('\nLoading model from ' + model_load_path)
                        state = torch.load(
                            model_load_path,
                            map_location=lambda storage, loc: storage)
                        model.load_state_dict(state['state_dict'])
                        model = model.cuda(gpu)

                        print('\n\n********* VALIDATION ********* ')
                        model.eval()
                        top1 = AverageMeter()
                        topx = AverageMeter()
                        top = min(5, new_classes_number)
                        N, n = get_dataset_N_n(model_dsets,
                                               model.fc.out_features)
                        # Validation on batch 1
                        for data in val_loader:
                            (inputs, labels), paths = data
                            inputs, labels = inputs.cuda(gpu), labels.cuda(gpu)
                            scores = model(Variable(inputs))

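                            # Optional thresholding/calibration of the softmax
                            # scores using the dataset statistics (N, n)
                            # returned by get_dataset_N_n above.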
                            if apply_th_train or apply_th_val_al:
                                scores = th_calibration(
                                    F.softmax(scores, dim=1), N, n)

                            prec1, prec5 = utils.accuracy(scores.data,
                                                          labels,
                                                          topk=(1, top))

                            top1.update(prec1.item(), inputs.size(0))
                            topx.update(prec5.item(), inputs.size(0))
                        # -------------------------------------------
                        print(
                            'BATCH 1 | Val : acc@1 = {:.2f}% ; acc@{} = {:.2f}%'
                            .format(top1.avg, top, topx.avg))
                        top1_val_accuracies.append(top1.avg)
                        topx_val_accuracies.append(topx.avg)

                        oracle_annotated_paths = open(new_train_file_path,
                                                      'r').readlines()
                        batch_oracle_annotated_paths[
                            b] = oracle_annotated_paths

                    else:

                        batch_algo_name = algo_name + '_b' + str(b)

                        old_train_file_path = os.path.join(
                            run_data_output_dir,
                            str(b) + '_old')
                        new_val_file_path = os.path.join(
                            dataset_files_dir, 'separated/val/batch' + str(b))
                        if b == 2:
                            old_val_file_path = path_val_batch1
                        else:
                            old_val_file_path = os.path.join(
                                dataset_files_dir,
                                'accumulated/val/batch' + str(b - 1))

                        if mode == "il":  # supervised :
                            I = 1
                            new_train_file_path = os.path.join(
                                train_files_dir, 'batch' + str(b))
                            oracle_annotated_paths = open(
                                new_train_file_path, 'r').readlines()
                            batch_oracle_annotated_paths[
                                b] = oracle_annotated_paths

                        print('Old train data loaded from ' +
                              old_train_file_path)
                        print('New val data loaded from ' + new_val_file_path)
                        print('Old val data loaded from ' + old_val_file_path)

                        # Data loaders for training
                        old_train_dataset = ImagesListFileFolder(
                            old_train_file_path,
                            transforms.Compose([
                                transforms.RandomResizedCrop(224),
                                transforms.RandomHorizontalFlip(),
                                transforms.ToTensor(), normalize
                            ]),
                            return_path=True)

                        old_val_dataset = ImagesListFileFolder(
                            old_val_file_path,
                            transforms.Compose([
                                transforms.Resize(256),
                                transforms.CenterCrop(224),
                                transforms.ToTensor(), normalize
                            ]),
                            return_path=True)

                        new_val_dataset = ImagesListFileFolder(
                            new_val_file_path,
                            transforms.Compose([
                                transforms.Resize(256),
                                transforms.CenterCrop(224),
                                transforms.ToTensor(), normalize
                            ]),
                            return_path=True)

                        val_datasets = torch.utils.data.dataset.ConcatDataset(
                            (old_val_dataset, new_val_dataset))

                        val_loader = torch.utils.data.DataLoader(
                            val_datasets,
                            batch_size=val_batch_size,
                            shuffle=True,
                            num_workers=num_workers,
                            pin_memory=False)

                        old_classes_number = len(old_train_dataset.classes)

                        # Loading the model
                        if b == 2:
                            model_load_path = first_model_load_path
                        else:
                            model_load_path = os.path.join(
                                run_models_save_dir,
                                algo_name + '_b' + str(b - 1) + '.pt')

                        model = models.resnet18(pretrained=False,
                                                num_classes=base + P * (b - 2))

                        print('\nLoading saved model from ' + model_load_path)
                        state = torch.load(
                            model_load_path,
                            map_location=lambda storage, loc: storage)
                        model.load_state_dict(state['state_dict'])

                        model.fc = nn.Linear(model.fc.in_features,
                                             base + P * (b - 1))
                        model = model.cuda(gpu)

                        # Define Loss and Optimizer
                        criterion = nn.CrossEntropyLoss()
                        optimizer = optim.SGD(model.parameters(),
                                              lr=lr,
                                              momentum=momentum,
                                              weight_decay=weight_decay)
                        scheduler = lr_scheduler.ReduceLROnPlateau(
                            optimizer, patience=patience, factor=lr_decay)

                        print("\nlr = {:.4f}".format(lr))
                        print("Old classes number = " +
                              str(old_classes_number))
                        print("Old Training-set size = " +
                              str(len(old_train_dataset)))
                        print("Validation-set size = " +
                              str(len(val_datasets)) + '\n')
                        ##############################
                        # Active learning : update batch_oracle_annotated_paths / Semi-supervised labelisation step
                        batch_oracle_annotated_paths[b] = []
                        next_new_train_file_path = os.path.join(
                            train_files_dir, 'batch' + str(b))

                        for sess in range(I):

                            if sess == 0:
                                al_model = previous_model
                            else:
                                al_model = model

                            sess_epochs = int(num_epochs / I)  # TODO: modify

                            for param_group in optimizer.param_groups:
                                param_group['lr'] = lr

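                            # Labeling budget for this active-learning session:
                            # the full budget B in supervised mode (or when
                            # there is a single session), otherwise 40% of B in
                            # the first session and 20% in later ones.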
                            if mode == "il" or I == 1:
                                sess_budget = B
                            else:
                                if sess == 0:  # take 40% of budget
                                    sess_budget = math.ceil(int(B * 40 / 100))
                                else:
                                    sess_budget = math.ceil(int(B * 20 / 100))

                            next_new_train_paths_list = open(
                                next_new_train_file_path, 'r').readlines()
                            assert (sorted(list(
                                set(next_new_train_paths_list))) == sorted(
                                    next_new_train_paths_list))
                            assert (sorted(
                                list(set(batch_oracle_annotated_paths[b]))) ==
                                    sorted(batch_oracle_annotated_paths[b]))

                            sess_new_train_paths = list(
                                set(next_new_train_paths_list) -
                                set(batch_oracle_annotated_paths[b]))
                            oracle_annotated_paths = active_learning(
                                rerun, sess, new_batch_size, b, al_model, N, n,
                                sess_budget, next_new_train_file_path,
                                sess_new_train_paths, run_data_output_dir,
                                undetected_classes)

                            batch_oracle_annotated_paths[b].extend(
                                oracle_annotated_paths)

                            new_train_file_path = os.path.join(
                                run_data_output_dir,
                                str(b) + '_new')

                            print('New train data loaded from ' +
                                  new_train_file_path)

                            new_train_dataset = ImagesListFileFolder(
                                new_train_file_path,
                                transforms.Compose([
                                    transforms.RandomResizedCrop(224),
                                    transforms.RandomHorizontalFlip(),
                                    transforms.ToTensor(), normalize
                                ]),
                                return_path=True)

                            model_dsets = [
                                old_train_dataset, new_train_dataset
                            ]

                            new_and_old_train_datasets = torch.utils.data.dataset.ConcatDataset(
                                (old_train_dataset, new_train_dataset))

                            train_loader = torch.utils.data.DataLoader(
                                new_and_old_train_datasets,
                                shuffle=True,
                                batch_size=new_batch_size,
                                num_workers=num_workers,
                                pin_memory=False)

                            new_classes_number = len(new_train_dataset.classes)
                            undetected_classes.extend(
                                list(
                                    set(
                                        range(base + P * (b - 2), base + P *
                                              (b - 1))) -
                                    set(new_train_dataset.classes)))
                            undetected_classes = sorted(
                                list(set(undetected_classes)))
                            print('undetected_classes = ' +
                                  str(undetected_classes))

                            print("New classes number = " +
                                  str(new_classes_number))
                            print("New Training-set size = " +
                                  str(len(new_train_dataset)))
                            print("Training-set size = " +
                                  str(len(new_and_old_train_datasets)))

                            N, n = get_dataset_N_n(model_dsets,
                                                   model.fc.out_features)

                            # Training
                            print("-" * 20)
                            print('\n\n********* TRAINING ********* ')
                            starting_time = time.time()

                            for epoch in range(sess_epochs):
                                top1 = AverageMeter()
                                topx = AverageMeter()
                                model.train()
                                running_loss = 0.0
                                nb_batches = 0
                                optimizer.zero_grad()
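                                # Gradient accumulation: the loss is scaled by
                                # 1/iter_size and the optimizer steps once
                                # every iter_size mini-batches.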
                                for i, data in enumerate(train_loader, 0):
                                    nb_batches += 1
                                    (inputs, labels), paths = data
                                    inputs, labels = Variable(
                                        inputs.cuda(gpu)), Variable(
                                            labels.cuda(gpu))
                                    scores = model(inputs)
                                    # scores[:, undetected_classes] = -np.Inf
                                    loss = criterion(scores, labels)
                                    loss.data /= iter_size
                                    loss.backward()
                                    running_loss += loss.data.item()
                                    if (i + 1) % iter_size == 0:
                                        optimizer.step()
                                        optimizer.zero_grad()
                                scheduler.step(loss.cpu().data.numpy())

                                # Model evaluation
                                model.eval()
                                top = min(
                                    5, old_classes_number + new_classes_number)
                                for data in val_loader:
                                    (inputs, labels), paths = data
                                    inputs, labels = inputs.cuda(
                                        gpu), labels.cuda(gpu)

                                    scores = model(Variable(inputs))
                                    # scores[:, undetected_classes] = -np.Inf

                                    if apply_th_train or apply_th_val_al:
                                        scores = th_calibration(
                                            F.softmax(scores, dim=1), N, n)

                                    prec1, prec5 = utils.accuracy(scores.data,
                                                                  labels,
                                                                  topk=(1,
                                                                        top))
                                    top1.update(prec1.item(), inputs.size(0))
                                    topx.update(prec5.item(), inputs.size(0))

                                current_elapsed_time = time.time() - starting_time
                                print(
                                    '{}/{} | lr={:.5f} |{:03}/{:03} | {} | Train : loss = {:.4f}  | Val : acc@1 = {:.2f}% ; acc@{}= {:.2f}%'
                                    .format(
                                        sess, I,
                                        optimizer.param_groups[0]['lr'],
                                        epoch + 1, num_epochs,
                                        timedelta(seconds=round(
                                            current_elapsed_time)),
                                        running_loss / nb_batches, top1.avg,
                                        top, topx.avg))

                        # Training finished
                        print('Saving model in ' + batch_algo_name + '.pt' +
                              '...')
                        state = {
                            'epoch': epoch,
                            'state_dict': model.state_dict(),
                            'optimizer': optimizer.state_dict()
                        }
                        torch.save(
                            state,
                            os.path.join(run_models_save_dir, batch_algo_name)
                            + '.pt')

                        top1_val_accuracies.append(top1.avg)
                        topx_val_accuracies.append(topx.avg)

                        print("")
                        print('TOP1 val acc = ' + str(
                            [float(str(e)[:6]) for e in top1_val_accuracies]))
                        print('TOP{} val acc = '.format(top) + str(
                            [float(str(e)[:6]) for e in topx_val_accuracies]))

                    previous_model = model

                    ########## Herding
                    new_train_dataset = ImagesListFileFolder(
                        new_train_file_path,
                        transforms.Compose([
                            transforms.Resize(256),
                            transforms.CenterCrop(224),
                            transforms.ToTensor(),
                            normalize,
                        ]),
                        return_path=True)

                    # computing number of exemplars
                    m = int(
                        math.ceil(K /
                                  (old_classes_number + new_classes_number)))
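                    # The fixed memory of K exemplars is shared equally among
                    # all classes seen so far.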

                    new_train_loader = torch.utils.data.DataLoader(
                        new_train_dataset,
                        batch_size=new_batch_size,
                        shuffle=True,
                        num_workers=num_workers,
                        pin_memory=False)

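                    # Penultimate-layer feature extractor: every ResNet-18
                    # module except the final fc classifier.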
                    features_extractor = nn.Sequential(
                        *list(model.children())[:-1])
                    features_extractor.eval()
                    features_extractor = features_extractor.cuda(gpu)

                    print(
                        '\n\n********* PREPARING OLD DATA FOR THE NEXT BATCH ********* '
                    )
                    if b == 1:
                        print(
                            '------> Features extraction of new data S{}* using model M{}'
                            .format(b, b))
                    else:
                        print(
                            '------> Features extraction of new data S{}+ using model M{}'
                            .format(b, b))

                    print('data loaded from : ' + new_train_file_path)

                    full_features = None
                    full_paths = None

                    for data in new_train_loader:
                        (inputs, labels), paths = data
                        inputs = inputs.cuda(gpu)
                        features = features_extractor(Variable(inputs))
                        np_paths = np.array(paths)
                        np_features = features.data.cpu().numpy()
                        np_features = np_features.reshape(
                            np_features.shape[0], np_features.shape[1])
                        if full_features is None:
                            full_paths = np_paths
                            full_features = np_features
                        else:
                            full_paths = np.append(full_paths, np_paths)
                            full_features = np.vstack(
                                (full_features, np_features))

                    features_dict = {}
                    for i in range(len(full_paths)):
                        if full_paths[i] in features_dict:
                            print(str(full_paths[i]) + ' is redundant ')
                        features_dict[full_paths[i]] = full_features[i]

                    #########################################################

                    images_files = open(new_train_file_path, 'r').readlines()
                    batch_features_destination_dir = os.path.join(
                        run_features_destination_dir, 'batch' + str(b))
                    if not os.path.exists(batch_features_destination_dir):
                        os.makedirs(batch_features_destination_dir)
                    features_out_file = os.path.join(
                        batch_features_destination_dir, 'features')

                    features_out = open(features_out_file, 'w')
                    for image_file in images_files:
                        image_file = image_file.strip('\n')
                        image_file = image_file.split()[0]
                        if '.jpg' in image_file or '.jpeg' in image_file or '.JPEG' in image_file or '.png' in image_file:
                            features_out.write(
                                str(' '.join([
                                    str(e)
                                    for e in list(features_dict[image_file])
                                ])) + '\n')
                        else:
                            print('image file = ' + str(image_file))
                    features_out.close()

                    print('Exemplars number per class = ' + str(m))
                    print('Choosing exemplars for new classes...')

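                    # iCaRL-style (Rebuffi et al.) herding: select m exemplars
                    # per new class from the extracted features and write them
                    # to the '<b+1>_old' list used by the next batch.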
                    herding.compute_rebuffi_herding_faster(
                        run_data_output_dir, new_train_file_path,
                        features_out_file, batch_oracle_annotated_paths[b], m,
                        str(b + 1) + '_old')

                    if b != 1:
                        print('Reducing exemplars for old classes...')
                        herding.reduce_exemplars(run_data_output_dir,
                                                 old_train_dataset.classes, m,
                                                 b, full_paths_suffix)

                    print('Old data for batch {} saved in {} '.format(
                        b + 1,
                        os.path.join(run_data_output_dir,
                                     str(b + 1) + '_old')))

                print('Current run elapsed time : {}'.format(
                    timedelta(seconds=round(time.time() - run_starting_time))))

                mean_top1 = (np.mean(np.array(top1_val_accuracies)[1:])
                             if len(top1_val_accuracies) > 1 else 0.0)
                mean_topx = (np.mean(np.array(topx_val_accuracies)[1:])
                             if len(topx_val_accuracies) > 1 else 0.0)
                print("")
                print('TOP1 validation accuracies = ' +
                      str([float(str(e)[:6]) for e in top1_val_accuracies]))
                print('TOP1 mean incremental accuracy = ' + str(mean_top1)[:6])
                print('***************')
                print('TOP{} validation accuracies = '.format(top) +
                      str([float(str(e)[:6]) for e in topx_val_accuracies]))
                print('TOP{} mean incremental accuracy = '.format(top) +
                      str(mean_topx)[:6])

                runs_top1_acc.append(mean_top1)
                runs_topx_acc.append(mean_topx)

        runs_mean_top1_acc = np.mean(np.array(runs_top1_acc))
        runs_mean_topx_acc = np.mean(np.array(runs_topx_acc))
        runs_std_top1_acc = np.std(np.array(runs_top1_acc))
        runs_std_topx_acc = np.std(np.array(runs_topx_acc))

        print('*' * 110)
        print('*' * 110)
        print('Total elapsed time : {}'.format(
            timedelta(seconds=round(time.time() - first_run_starting_time))))
        print(
            '****************************************************************')
        print('Average runs scores')
        print(
            '****************************************************************')
        print(
            'TOP1 mean incremental accuracy = {:.3f}      [+/- {:.2f}]'.format(
                runs_mean_top1_acc, runs_std_top1_acc))
        print('TOP{} mean incremental accuracy = {:.3f}      [+/- {:.2f}]'.
              format(top, runs_mean_topx_acc, runs_std_topx_acc))

        # Print warnings (Possibly corrupt EXIF files):
        if len(warn_list) > 0:
            print("\n" + str(len(warn_list)) + " Warnings\n")
            # for i in range(len(warn_list)):
            #     print("warning " + str(i) + ":")
            #     print(str(i)+":"+ str(warn_list[i].category) + ":\n     " + str(warn_list[i].message))
        else:
            print('No warnings.')