model_load_path = os.path.join(models_save_dir, algo_name + '_b' + str(b - 1) + '.pt')

print('New train data loaded from ' + new_train_file_path)

batch_models_save_dir = os.path.join(models_save_dir, batch_algo_name)
if saving_intermediate_models:
    if not os.path.exists(batch_models_save_dir):
        os.mkdir(batch_models_save_dir)

new_train_dataset = ImagesListFileFolder(
    new_train_file_path,
    transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ]),
    target_transform=lambda x: target_transform(x, P, b))

train_loader = torch.utils.data.DataLoader(
    new_train_dataset, batch_size=new_batch_size, shuffle=True,
    num_workers=num_workers, pin_memory=False)

new_classes_number = len(new_train_dataset.classes)
print("New classes number = " + str(new_classes_number))
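

# The lambda above delegates to a 'target_transform(x, P, b)' helper whose
# source is not shown in this listing.  The sketch below is an assumption
# about its role, not the project's actual implementation: a common pattern
# in class-incremental training is to shift a batch-relative label so that
# the P classes of incremental state b land after all previously seen classes
# in the classifier head.
def target_transform_sketch(x, P, b):
    # x: label assigned by the dataset for the current batch (0 .. P-1)
    # returns: index of that class in the incrementally grown classifier
    return x + P * (b - 1)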
import sys, os, warnings, time

import numpy as np
import torch as th
from torchvision import transforms

from MyImageFolder import ImagesListFileFolder

if len(sys.argv) != 2:
    print('Arguments: images_list_file_path')
    sys.exit(-1)

train_file_path = sys.argv[1]
print('Train file path = ' + train_file_path)

# catching warnings
with warnings.catch_warnings(record=True) as warn_list:
    train_dataset = ImagesListFileFolder(train_file_path, transforms.ToTensor())

    num_classes = len(train_dataset.classes)
    print("Number of classes = " + str(num_classes))
    print("Training-set size = " + str(len(train_dataset)))

    dataloader = th.utils.data.DataLoader(train_dataset, batch_size=1,
                                          shuffle=False, num_workers=12)

    mean = th.zeros(3)
    std = th.zeros(3)
    print('==> Computing mean and std..')
    cpt = 0
    for inputs, targets in dataloader:
        cpt += 1
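

# The accumulation loop above is cut off in this listing.  Below is a hedged
# sketch (illustrative only, names are not from the original script) of how
# per-channel mean/std are typically finished with a batch_size=1 loader like
# the one built above: average the per-image channel statistics over the set.
def compute_mean_std_sketch(loader):
    mean = th.zeros(3)
    std = th.zeros(3)
    n_images = 0
    for inputs, _ in loader:
        # inputs has shape (1, 3, H, W); flatten the spatial dimensions per channel
        channels = inputs.squeeze(0).view(3, -1)
        mean += channels.mean(dim=1)
        std += channels.std(dim=1)
        n_images += 1
    return mean / n_images, std / n_images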
S = int((num_classes - B) / P) + 1
print('S = ' + str(S))
ckp_prefix = '{}_s{}_k{}'.format(normalization_dataset_name, S, memory_size)

np.random.seed(random_seed)  # Fix the random seed
########################################
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

dataset_mean, dataset_std = utils.get_dataset_mean_std(normalization_dataset_name,
                                                       datasets_mean_std_file_path)
normalize = transforms.Normalize(mean=dataset_mean, std=dataset_std)

trainset = ImagesListFileFolder(
    train_file_path,
    transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ]))

testset = ImagesListFileFolder(
    test_file_path,
    transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        normalize,
    ]))

evalset = ImagesListFileFolder(
    test_file_path,
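

# 'utils.get_dataset_mean_std' is referenced above but its source is not part
# of this listing.  The sketch below is an assumption about what such a lookup
# usually does (one whitespace-separated record per dataset in a small stats
# file); the function name and file format here are illustrative only.
def get_dataset_mean_std_sketch(dataset_name, stats_file_path):
    with open(stats_file_path) as f:
        for line in f:
            fields = line.split()
            # expected record: name mean_r mean_g mean_b std_r std_g std_b
            if fields and fields[0] == dataset_name:
                values = [float(v) for v in fields[1:7]]
                return values[:3], values[3:]
    raise ValueError('no mean/std entry found for ' + dataset_name)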
print('New train data loaded from ' + new_train_file_path)
print('Old train data loaded from ' + old_train_file_path)
print('New val data loaded from ' + new_val_file_path)
print('Old val data loaded from ' + old_val_file_path)

batch_models_save_dir = os.path.join(models_save_dir, batch_algo_name)
if saving_intermediate_models:
    if not os.path.exists(batch_models_save_dir):
        os.mkdir(batch_models_save_dir)

old_train_dataset = ImagesListFileFolder(
    old_train_file_path,
    transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ]))

new_train_dataset = ImagesListFileFolder(
    new_train_file_path,
    transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ]))

new_and_old_train_datasets = torch.utils.data.dataset.ConcatDataset(
    (old_train_dataset, new_train_dataset))
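

# Usage sketch (not part of the original script): ConcatDataset simply chains
# the old and new image lists, so statistics over the rehearsal memory plus the
# new data can be gathered by walking the underlying datasets.  This assumes
# ImagesListFileFolder exposes an ImageFolder-style '.samples' list of
# (path, target) pairs, which is an assumption, not a documented fact.
def class_counts_sketch(concat_dataset):
    from collections import Counter
    counts = Counter()
    for dataset in concat_dataset.datasets:
        for _, target in dataset.samples:
            counts[target] += 1
    return counts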
print('normalization dataset name = ' + str(normalization_dataset_name))
print('dataset mean = ' + str(dataset_mean))
print('dataset std = ' + str(dataset_std))

normalize = transforms.Normalize(mean=dataset_mean, std=dataset_std)

print("Number of workers = " + str(num_workers))
print("Batch size = " + str(batch_size))
print("Running on gpu " + str(gpu))

print('-------> Val data')
val_dataset = ImagesListFileFolder(
    val_images_list,
    transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        normalize,
    ]),
    return_path=True)

print("Val-set size = " + str(len(val_dataset)))

val_loader = torch.utils.data.DataLoader(
    val_dataset, batch_size=batch_size, shuffle=False,
    num_workers=num_workers, pin_memory=False)

print('Loading list file from ' + val_images_list)
print('Destination directory ' + val_destination_dir)
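

# Hedged sketch of how a loader built with return_path=True is consumed in the
# rest of these listings (batches unpack as '(inputs, labels), paths').  The
# model, GPU id and output file below are illustrative placeholders, not values
# taken from the original script.
def dump_predictions_sketch(model, loader, out_file_path, gpu):
    model.eval()
    with torch.no_grad():
        with open(out_file_path, 'w') as out:
            for (inputs, labels), paths in loader:
                scores = model(inputs.cuda(gpu))
                preds = scores.argmax(dim=1)
                for path, label, pred in zip(paths, labels.tolist(), preds.tolist()):
                    # one line per image: path, ground-truth label, predicted label
                    out.write('{} {} {}\n'.format(path, label, pred))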
def main(I):
    """Run incremental training with I active-learning sessions per incremental state."""
    if not os.path.exists(data_output_dir):
        os.makedirs(data_output_dir)
    if not os.path.exists(models_save_dir):
        os.makedirs(models_save_dir)

    # catching warnings
    with warnings.catch_warnings(record=True) as warn_list:
        herding = StaticHerding()
        runs_top1_acc = []
        runs_topx_acc = []
        first_run_starting_time = time.time()

        for r in range(1, num_runs + 1):
            run_data_output_dir = os.path.join(data_output_dir, 'run_' + str(r))
            if not os.path.exists(run_data_output_dir):
                os.makedirs(run_data_output_dir)
            run_models_save_dir = os.path.join(models_save_dir, 'run_' + str(r))
            if not os.path.exists(run_models_save_dir):
                os.makedirs(run_models_save_dir)
            run_features_destination_dir = os.path.join(run_data_output_dir, 'features')
            if not os.path.exists(run_features_destination_dir):
                os.mkdir(run_features_destination_dir)

            top1_val_accuracies = []
            topx_val_accuracies = []
            previous_model = None
            run_starting_time = time.time()
            batch_oracle_annotated_paths = {}
            undetected_classes = []

            for b in range(1, T + 1):
                print('*' * 110)
                print('*' * 46 + ' Run {}/{} | BATCH {} '.format(r, num_runs, b) + '*' * 45)
                print('*' * 110 + '\n')

                if b == 1:
                    model_load_path = first_model_load_path
                    new_train_file_path = path_train_batch1
                    val_file_path = path_val_batch1
                    print('Train data loaded from ' + new_train_file_path)
                    print('Val data loaded from ' + val_file_path)

                    new_train_dataset = ImagesListFileFolder(
                        new_train_file_path,
                        transforms.Compose([
                            transforms.RandomResizedCrop(224),
                            transforms.RandomHorizontalFlip(),
                            transforms.ToTensor(),
                            normalize
                        ]),
                        return_path=True)
                    model_dsets = [new_train_dataset]

                    val_dataset = ImagesListFileFolder(
                        val_file_path,
                        transforms.Compose([
                            transforms.Resize(256),
                            transforms.CenterCrop(224),
                            transforms.ToTensor(),
                            normalize
                        ]),
                        return_path=True)
                    val_loader = torch.utils.data.DataLoader(
                        val_dataset, batch_size=val_batch_size, shuffle=True,
                        num_workers=num_workers, pin_memory=False)

                    old_classes_number = 0
                    new_classes_number = len(val_dataset.classes)
                    print("Classes number = " + str(new_classes_number))
                    print("Validation-set size = " + str(len(val_dataset)))

                    model = models.resnet18(pretrained=False, num_classes=base)
                    print('\nLoading model from ' + model_load_path)
                    state = torch.load(model_load_path,
                                       map_location=lambda storage, loc: storage)
                    model.load_state_dict(state['state_dict'])
                    model = model.cuda(gpu)

                    print('\n\n********* VALIDATION ********* ')
                    model.eval()
                    top1 = AverageMeter()
                    topx = AverageMeter()
                    top = min(5, new_classes_number)
                    N, n = get_dataset_N_n(model_dsets, model.fc.out_features)

                    # Validation on batch 1
                    for data in val_loader:
                        (inputs, labels), paths = data
                        inputs, labels = inputs.cuda(gpu), labels.cuda(gpu)
                        scores = model(Variable(inputs))
                        if apply_th_train or apply_th_val_al:
                            scores = th_calibration(F.softmax(scores, dim=1), N, n)
                        prec1, prec5 = utils.accuracy(scores.data, labels, topk=(1, top))
                        top1.update(prec1.item(), inputs.size(0))
                        topx.update(prec5.item(), inputs.size(0))
                    # -------------------------------------------
                    print('BATCH 1 | Val : acc@1 = {:.2f}% ; acc@{} = {:.2f}%'.format(
                        top1.avg, top, topx.avg))
                    top1_val_accuracies.append(top1.avg)
                    topx_val_accuracies.append(topx.avg)

                    oracle_annotated_paths = open(new_train_file_path, 'r').readlines()
                    batch_oracle_annotated_paths[b] = oracle_annotated_paths
                else:
                    batch_algo_name = algo_name + '_b' + str(b)
                    old_train_file_path = os.path.join(run_data_output_dir, str(b) + '_old')
                    new_val_file_path = os.path.join(dataset_files_dir,
                                                     'separated/val/batch' + str(b))
                    if b == 2:
                        old_val_file_path = path_val_batch1
                    else:
                        old_val_file_path = os.path.join(dataset_files_dir,
                                                         'accumulated/val/batch' + str(b - 1))

                    if mode == "il":  # supervised : I = 1
                        new_train_file_path = os.path.join(train_files_dir, 'batch' + str(b))
                        oracle_annotated_paths = open(new_train_file_path, 'r').readlines()
                        batch_oracle_annotated_paths[b] = oracle_annotated_paths

                    print('Old train data loaded from ' + old_train_file_path)
                    print('New val data loaded from ' + new_val_file_path)
                    print('Old val data loaded from ' + old_val_file_path)

                    # Data loaders for training
                    old_train_dataset = ImagesListFileFolder(
                        old_train_file_path,
                        transforms.Compose([
                            transforms.RandomResizedCrop(224),
                            transforms.RandomHorizontalFlip(),
                            transforms.ToTensor(),
                            normalize
                        ]),
                        return_path=True)

                    old_val_dataset = ImagesListFileFolder(
                        old_val_file_path,
                        transforms.Compose([
                            transforms.Resize(256),
                            transforms.CenterCrop(224),
                            transforms.ToTensor(),
                            normalize
                        ]),
                        return_path=True)

                    new_val_dataset = ImagesListFileFolder(
                        new_val_file_path,
                        transforms.Compose([
                            transforms.Resize(256),
                            transforms.CenterCrop(224),
                            transforms.ToTensor(),
                            normalize
                        ]),
                        return_path=True)

                    val_datasets = torch.utils.data.dataset.ConcatDataset(
                        (old_val_dataset, new_val_dataset))
                    val_loader = torch.utils.data.DataLoader(
                        val_datasets, batch_size=val_batch_size, shuffle=True,
                        num_workers=num_workers, pin_memory=False)

                    old_classes_number = len(old_train_dataset.classes)

                    # Loading the model
                    if b == 2:
                        model_load_path = first_model_load_path
                    else:
                        model_load_path = os.path.join(run_models_save_dir,
                                                       algo_name + '_b' + str(b - 1) + '.pt')

                    model = models.resnet18(pretrained=False, num_classes=base + P * (b - 2))
                    print('\nLoading saved model from ' + model_load_path)
                    state = torch.load(model_load_path,
                                       map_location=lambda storage, loc: storage)
                    model.load_state_dict(state['state_dict'])
                    model.fc = nn.Linear(model.fc.in_features, base + P * (b - 1))
                    model = model.cuda(gpu)

                    # Define Loss and Optimizer
                    criterion = nn.CrossEntropyLoss()
                    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum,
                                          weight_decay=weight_decay)
                    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, patience=patience,
                                                               factor=lr_decay)

                    print("\nlr = {:.4f}".format(lr))
                    print("Old classes number = " + str(old_classes_number))
                    print("Old Training-set size = " + str(len(old_train_dataset)))
                    print("Validation-set size = " + str(len(val_datasets)) + '\n')

                    ##############################
                    # Active learning: update batch_oracle_annotated_paths / semi-supervised labelling step
                    batch_oracle_annotated_paths[b] = []
                    next_new_train_file_path = os.path.join(train_files_dir, 'batch' + str(b))

                    for sess in range(I):
                        if sess == 0:
                            al_model = previous_model
                        else:
                            al_model = model

                        sess_epochs = int(num_epochs / I)  # TODO: modify
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr

                        if mode == "il" or I == 1:
                            sess_budget = B
                        else:
                            if sess == 0:
                                # take 40% of budget
                                sess_budget = math.ceil(int(B * 40 / 100))
                            else:
                                sess_budget = math.ceil(int(B * 20 / 100))

                        next_new_train_paths_list = open(next_new_train_file_path, 'r').readlines()
                        assert (sorted(list(set(next_new_train_paths_list)))
                                == sorted(next_new_train_paths_list))
                        assert (sorted(list(set(batch_oracle_annotated_paths[b])))
                                == sorted(batch_oracle_annotated_paths[b]))

                        sess_new_train_paths = list(set(next_new_train_paths_list)
                                                    - set(batch_oracle_annotated_paths[b]))

                        oracle_annotated_paths = active_learning(
                            rerun, sess, new_batch_size, b, al_model, N, n, sess_budget,
                            next_new_train_file_path, sess_new_train_paths,
                            run_data_output_dir, undetected_classes)

                        batch_oracle_annotated_paths[b].extend(oracle_annotated_paths)

                        new_train_file_path = os.path.join(run_data_output_dir, str(b) + '_new')
                        print('New train data loaded from ' + new_train_file_path)

                        new_train_dataset = ImagesListFileFolder(
                            new_train_file_path,
                            transforms.Compose([
                                transforms.RandomResizedCrop(224),
                                transforms.RandomHorizontalFlip(),
                                transforms.ToTensor(),
                                normalize
                            ]),
                            return_path=True)

                        model_dsets = [old_train_dataset, new_train_dataset]
                        new_and_old_train_datasets = torch.utils.data.dataset.ConcatDataset(
                            (old_train_dataset, new_train_dataset))
                        train_loader = torch.utils.data.DataLoader(
                            new_and_old_train_datasets, shuffle=True, batch_size=new_batch_size,
                            num_workers=num_workers, pin_memory=False)

                        new_classes_number = len(new_train_dataset.classes)

                        undetected_classes.extend(
                            list(set(range(base + P * (b - 2), base + P * (b - 1)))
                                 - set(new_train_dataset.classes)))
                        undetected_classes = sorted(list(set(undetected_classes)))
                        print('undetected_classes = ' + str(undetected_classes))

                        print("New classes number = " + str(new_classes_number))
                        print("New Training-set size = " + str(len(new_train_dataset)))
                        print("Training-set size = " + str(len(new_and_old_train_datasets)))

                        N, n = get_dataset_N_n(model_dsets, model.fc.out_features)

                        # Training
                        print("-" * 20)
                        print('\n\n********* TRAINING ********* ')

                        starting_time = time.time()
                        for epoch in range(sess_epochs):
                            top1 = AverageMeter()
                            topx = AverageMeter()
                            model.train()
                            running_loss = 0.0
                            nb_batches = 0
                            optimizer.zero_grad()
                            for i, data in enumerate(train_loader, 0):
                                nb_batches += 1
                                (inputs, labels), paths = data
                                inputs, labels = Variable(inputs.cuda(gpu)), Variable(labels.cuda(gpu))
                                scores = model(inputs)
                                # scores[:, undetected_classes] = -np.Inf
                                loss = criterion(scores, labels)
                                loss.data /= iter_size
                                loss.backward()
                                running_loss += loss.data.item()
                                if (i + 1) % iter_size == 0:
                                    optimizer.step()
                                    optimizer.zero_grad()
                            scheduler.step(loss.cpu().data.numpy())

                            # Model evaluation
                            model.eval()
                            top = min(5, old_classes_number + new_classes_number)
                            for data in val_loader:
                                (inputs, labels), paths = data
                                inputs, labels = inputs.cuda(gpu), labels.cuda(gpu)
                                scores = model(Variable(inputs))
                                # scores[:, undetected_classes] = -np.Inf
                                if apply_th_train or apply_th_val_al:
                                    scores = th_calibration(F.softmax(scores, dim=1), N, n)
                                prec1, prec5 = utils.accuracy(scores.data, labels, topk=(1, top))
                                top1.update(prec1.item(), inputs.size(0))
                                topx.update(prec5.item(), inputs.size(0))

                            current_elapsed_time = time.time() - starting_time
                            print('{}/{} | lr={:.5f} |{:03}/{:03} | {} | Train : loss = {:.4f} | '
                                  'Val : acc@1 = {:.2f}% ; acc@{}= {:.2f}%'.format(
                                      sess, I, optimizer.param_groups[0]['lr'], epoch + 1,
                                      num_epochs, timedelta(seconds=round(current_elapsed_time)),
                                      running_loss / nb_batches, top1.avg, top, topx.avg))

                    # Training finished
                    print('Saving model in ' + batch_algo_name + '.pt' + '...')
                    state = {
                        'epoch': epoch,
                        'state_dict': model.state_dict(),
                        'optimizer': optimizer.state_dict()
                    }
                    torch.save(state, os.path.join(run_models_save_dir, batch_algo_name) + '.pt')

                    top1_val_accuracies.append(top1.avg)
                    topx_val_accuracies.append(topx.avg)

                print("")
                print('TOP1 val acc = ' + str([float(str(e)[:6]) for e in top1_val_accuracies]))
                print('TOP{} val acc = '.format(top)
                      + str([float(str(e)[:6]) for e in topx_val_accuracies]))

                previous_model = model

                ########## Herding
                new_train_dataset = ImagesListFileFolder(
                    new_train_file_path,
                    transforms.Compose([
                        transforms.Resize(256),
                        transforms.CenterCrop(224),
                        transforms.ToTensor(),
                        normalize,
                    ]),
                    return_path=True)

                # computing number of exemplars
                m = int(math.ceil(K / (old_classes_number + new_classes_number)))

                new_train_loader = torch.utils.data.DataLoader(
                    new_train_dataset, batch_size=new_batch_size, shuffle=True,
                    num_workers=num_workers, pin_memory=False)

                features_extractor = nn.Sequential(*list(model.children())[:-1])
                features_extractor.eval()
                features_extractor = features_extractor.cuda(gpu)

                print('\n\n********* PREPARING OLD DATA FOR THE NEXT BATCH ********* ')
                if b == 1:
                    print('------> Features extraction of new data S{}* using model M{}'.format(b, b))
                else:
                    print('------> Features extraction of new data S{}+ using model M{}'.format(b, b))
                print('data loaded from : ' + new_train_file_path)

                full_features = None
                full_paths = None
                for data in new_train_loader:
                    (inputs, labels), paths = data
                    inputs = inputs.cuda(gpu)
                    features = features_extractor(Variable(inputs))
                    np_paths = np.array(paths)
                    np_features = features.data.cpu().numpy()
                    np_features = np_features.reshape(np_features.shape[0], np_features.shape[1])
                    if full_features is None:
                        full_paths = np_paths
                        full_features = np_features
                    else:
                        full_paths = np.append(full_paths, np_paths)
                        full_features = np.vstack((full_features, np_features))

                features_dict = {}
                for i in range(len(full_paths)):
                    if full_paths[i] in features_dict:
                        print(str(full_paths[i]) + ' is redundant ')
                    features_dict[full_paths[i]] = full_features[i]

                #########################################################
                images_files = open(new_train_file_path, 'r').readlines()
                batch_features_destination_dir = os.path.join(run_features_destination_dir,
                                                              'batch' + str(b))
                if not os.path.exists(batch_features_destination_dir):
                    os.makedirs(batch_features_destination_dir)

                features_out_file = os.path.join(batch_features_destination_dir, 'features')
                features_out = open(features_out_file, 'w')
                for image_file in images_files:
                    image_file = image_file.strip('\n')
                    image_file = image_file.split()[0]
                    if ('.jpg' in image_file or '.jpeg' in image_file
                            or '.JPEG' in image_file or '.png' in image_file):
                        features_out.write(
                            str(' '.join([str(e) for e in list(features_dict[image_file])])) + '\n')
                    else:
                        print('image file = ' + str(image_file))
                features_out.close()

                print('Exemplars number per class = ' + str(m))
                print('Choosing exemplars for new classes...')
                herding.compute_rebuffi_herding_faster(
                    run_data_output_dir, new_train_file_path, features_out_file,
                    batch_oracle_annotated_paths[b], m, str(b + 1) + '_old')

                if b != 1:
                    print('Reducing exemplars for old classes...')
                    herding.reduce_exemplars(run_data_output_dir, old_train_dataset.classes,
                                             m, b, full_paths_suffix)

                print('Old data for batch {} saved in {} '.format(
                    b + 1, os.path.join(run_data_output_dir, str(b + 1) + '_old')))
                print('Current run elapsed time : {}'.format(
                    timedelta(seconds=round(time.time() - run_starting_time))))

            mean_top1 = (np.mean(np.array(top1_val_accuracies)[1:])
                         if len(top1_val_accuracies) > 1 else 0.0)
            mean_topx = (np.mean(np.array(topx_val_accuracies)[1:])
                         if len(topx_val_accuracies) > 1 else 0.0)

            print("")
            print('TOP1 validation accuracies = '
                  + str([float(str(e)[:6]) for e in top1_val_accuracies]))
            print('TOP1 mean incremental accuracy = ' + str(mean_top1)[:6])
            print('***************')
            print('TOP{} validation accuracies = '.format(top)
                  + str([float(str(e)[:6]) for e in topx_val_accuracies]))
            print('TOP{} mean incremental accuracy = '.format(top) + str(mean_topx)[:6])

            runs_top1_acc.append(mean_top1)
            runs_topx_acc.append(mean_topx)
        runs_mean_top1_acc = np.mean(np.array(runs_top1_acc))
        runs_mean_topx_acc = np.mean(np.array(runs_topx_acc))
        runs_std_top1_acc = np.std(np.array(runs_top1_acc))
        runs_std_topx_acc = np.std(np.array(runs_topx_acc))

        print('*' * 110)
        print('*' * 110)
        print('Total elapsed time : {}'.format(
            timedelta(seconds=round(time.time() - first_run_starting_time))))
        print('****************************************************************')
        print('Average runs scores')
        print('****************************************************************')
        print('TOP1 mean incremental accuracy = {:.3f} [+/- {:.2f}]'.format(
            runs_mean_top1_acc, runs_std_top1_acc))
        print('TOP{} mean incremental accuracy = {:.3f} [+/- {:.2f}]'.format(
            top, runs_mean_topx_acc, runs_std_topx_acc))

    # Print warnings (possibly corrupt EXIF files):
    if len(warn_list) > 0:
        print("\n" + str(len(warn_list)) + " Warnings\n")
        # for i in range(len(warn_list)):
        #     print("warning " + str(i) + ":")
        #     print(str(i) + ":" + str(warn_list[i].category) + ":\n  " + str(warn_list[i].message))
    else:
        print('No warnings.')


if __name__ == '__main__':
    main(I)
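

# 'get_dataset_N_n' and 'th_calibration' are part of this codebase but their
# definitions are not included in these listings.  The two sketches below are
# assumptions about their intent (class-prior correction of softmax scores on
# imbalanced incremental data), not the actual implementations: here N is taken
# to be the total number of training images and n a per-class count vector
# sized to the classifier head.
def get_dataset_N_n_sketch(datasets, num_classes):
    n = np.zeros(num_classes)
    for dataset in datasets:
        # assumes an ImageFolder-style '.samples' list of (path, target) pairs
        for _, target in dataset.samples:
            n[target] += 1
    return int(n.sum()), n


def th_calibration_sketch(probs, N, n):
    # down-weight classes that are over-represented in the training data by
    # dividing each softmax score by the empirical class prior n_c / N, then
    # renormalise each row
    prior = torch.tensor(n / max(N, 1), dtype=probs.dtype, device=probs.device)
    calibrated = probs / (prior + 1e-8)
    return calibrated / calibrated.sum(dim=1, keepdim=True)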