def compute_fstar(self, model_func, loss_function, fname):
    if os.path.exists(fname):
        fstar_list = hu.load_pkl(fname)
    else:
        fstar_list = np.ones(len(self)) * -1
        for i in range(len(self)):
            batch = self[i]
            images, labels = batch['images'][None].cuda(), batch['labels'][None].cuda()

            model = model_func()
            opt = torch.optim.Adam(model.parameters())
            for j in range(10000):
                opt.zero_grad()
                closure = lambda: loss_function(model, images, labels, backwards=True)
                loss = opt.step(closure).item()

                grad_current = sps.get_grad_list(model.parameters())
                grad_norm = sps.compute_grad_norm(grad_current)
                if np.isnan(loss):
                    print('nan')
                # print(i, loss)
                if grad_norm < 1e-6:
                    break
                if j > 0 and abs(loss_old - loss) < 1e-6:
                    break
                loss_old = loss

            print("%d/%d - converged:%d - %.6f" % (i, len(self), j, loss))
            fstar_list[i] = loss
            hu.save_pkl(fname, fstar_list)

    self.fstar_list = fstar_list
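# A minimal usage sketch for compute_fstar. The names `train_set`, `get_model`,
# and `softmax_loss` are placeholders for whatever dataset instance, model
# factory, and closure-style loss the surrounding codebase provides; only the
# call signature is taken from the function itself.
#
#   train_set.compute_fstar(model_func=get_model,
#                           loss_function=softmax_loss,
#                           fname='fstar/train_fstar.pkl')
#   print(train_set.fstar_list[:5])  # per-example optimal loss values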
def __init__(self, model, nclasses, exp_dict):
    """ Constructor
    Args:
        model: architecture to train
        nclasses: number of output classes
        exp_dict: reference to dictionary with the hyperparameters
    """
    super().__init__()
    self.model = model
    self.exp_dict = exp_dict
    self.ngpu = self.exp_dict["ngpu"]

    self.embedding_propagation = EmbeddingPropagation()
    self.label_propagation = LabelPropagation()
    self.model.add_classifier(nclasses, modalities=0)
    self.nclasses = nclasses

    if self.exp_dict["rotation_weight"] > 0:
        self.model.add_classifier(4, "classifier_rot")

    best_accuracy = -1
    if self.exp_dict["pretrained_weights_root"] is not None:
        for exp_hash in os.listdir(self.exp_dict['pretrained_weights_root']):
            base_path = os.path.join(self.exp_dict['pretrained_weights_root'], exp_hash)
            exp_dict_path = os.path.join(base_path, 'exp_dict.json')
            if not os.path.exists(exp_dict_path):
                continue
            loaded_exp_dict = haven.load_json(exp_dict_path)
            pkl_path = os.path.join(base_path, 'score_list_best.pkl')
            if (loaded_exp_dict["model"]["name"] == 'pretraining' and
                    loaded_exp_dict["dataset_train"].split('_')[-1] == exp_dict["dataset_train"].split('_')[-1] and
                    loaded_exp_dict["model"]["backbone"] == exp_dict['model']["backbone"] and
                    # loaded_exp_dict["labelprop_alpha"] == exp_dict["labelprop_alpha"] and
                    # loaded_exp_dict["labelprop_scale"] == exp_dict["labelprop_scale"] and
                    os.path.exists(pkl_path)):
                accuracy = haven.load_pkl(pkl_path)[-1]["val_accuracy"]
                try:
                    self.model.load_state_dict(
                        torch.load(os.path.join(base_path, 'checkpoint_best.pth'))['model'],
                        strict=False)
                    if accuracy > best_accuracy:
                        best_path = os.path.join(base_path, 'checkpoint_best.pth')
                        best_accuracy = accuracy
                except Exception:
                    continue
        assert best_accuracy > 0.1
        print("Finetuning %s with original accuracy : %f" % (best_path, best_accuracy))
        self.model.load_state_dict(torch.load(best_path)['model'], strict=False)

    # Add optimizers here
    self.optimizer = torch.optim.SGD(self.model.parameters(),
                                     lr=self.exp_dict["lr"],
                                     momentum=0.9,
                                     weight_decay=self.exp_dict["weight_decay"],
                                     nesterov=True)
    self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        self.optimizer,
        mode="min" if "loss" in self.exp_dict["target_loss"] else "max",
        patience=self.exp_dict["patience"])
    self.model.cuda()
    if self.ngpu > 1:
        self.parallel_model = torch.nn.DataParallel(self.model,
                                                    device_ids=list(range(self.ngpu)))
def __init__(self, split, datadir, exp_dict):
    self.exp_dict = exp_dict
    self.datadir = datadir
    self.split = split
    self.n_classes = 5

    self.img_path = os.path.join(datadir, 'OpenSourceDCMs')
    self.lung_path = os.path.join(datadir, 'LungMasks')
    self.tgt_path = os.path.join(datadir, 'InfectionMasks')

    self.img_tgt_dict = []
    for tgt_name in os.listdir(self.tgt_path):
        lung_name = os.path.join(self.lung_path, tgt_name)
        scan_id, slice_id = tgt_name.split('_')
        slice_id = str(int(slice_id.replace('z', '').replace('.png', ''))).zfill(4)
        img_name = [f for f in os.listdir(os.path.join(self.img_path, 'DCM' + scan_id))
                    if 's%s' % slice_id in f][0]
        img_name = os.path.join('DCM' + scan_id, img_name)
        self.img_tgt_dict += [{'img': img_name,
                               'tgt': tgt_name,
                               'lung': lung_name}]

    # get label_meta
    fname = os.path.join(datadir, 'tmp', 'labels_array.pkl')
    if not os.path.exists(fname):
        labels_array = np.zeros((len(self.img_tgt_dict), 3))
        for i, idict in enumerate(tqdm.tqdm(self.img_tgt_dict)):
            img_name, tgt_name = idict['img'], idict['tgt']
            mask = np.array(Image.open(os.path.join(self.tgt_path, tgt_name)))
            uniques = np.unique(mask)
            if 0 in uniques:
                labels_array[i, 0] = 1
            if 127 in uniques:
                labels_array[i, 1] = 1
            if 255 in uniques:
                labels_array[i, 2] = 1
        hu.save_pkl(fname, labels_array)

    labels_array = hu.load_pkl(fname)
    # keep only slices that contain at least one non-background label
    ind_list = np.where(labels_array[:, 1:].max(axis=1))[0]
    self.img_tgt_dict = np.array(self.img_tgt_dict)[ind_list]

    if split == 'train':
        self.img_tgt_dict = self.img_tgt_dict[:300]
    elif split == 'val':
        self.img_tgt_dict = self.img_tgt_dict[300:]
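# Usage sketch, assuming `datadir` contains the OpenSourceDCMs, LungMasks and
# InfectionMasks folders read above; the class name `CovidSliceDataset` is a
# placeholder for whatever class this __init__ belongs to.
#
#   train_set = CovidSliceDataset(split='train', datadir='/data/covid', exp_dict={})
#   val_set = CovidSliceDataset(split='val', datadir='/data/covid', exp_dict={})
#   print(len(train_set.img_tgt_dict), len(val_set.img_tgt_dict))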
def trainval(exp_dict, savedir_base, datadir, reset=False, num_workers=0):
    # bookkeeping stuff
    # ==================
    pprint.pprint(exp_dict)
    exp_id = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, exp_id)
    if reset:
        hc.delete_and_backup_experiment(savedir)

    os.makedirs(savedir, exist_ok=True)
    if not os.path.exists(os.path.join(savedir, "exp_dict.json")):
        hu.save_json(os.path.join(savedir, "exp_dict.json"), exp_dict)
    print("Experiment saved in %s" % savedir)

    # BCD train
    # ==================
    # Ignore the following combinations
    if not ut.is_valid_exp(exp_dict):
        return

    score_list_fname = os.path.join(savedir, 'score_list.pkl')
    if os.path.exists(score_list_fname):
        score_list = hu.load_pkl(score_list_fname)
    else:
        score_list = train(dataset_name=exp_dict['dataset']['name'],
                           loss_name=exp_dict['dataset']['loss'],
                           block_size=exp_dict['block_size'],
                           partition_rule=exp_dict['partition'],
                           selection_rule=exp_dict['selection'],
                           update_rule=exp_dict['update'],
                           n_iters=exp_dict['max_iters'],
                           L1=exp_dict.get('l1', 0),
                           L2=0,
                           datasets_path=datadir)
        hu.save_pkl(score_list_fname, score_list)

    print('Experiment completed.')
    return score_list
def trainval(exp_dict, savedir_base, datadir, reset=False, num_workers=0):
    # bookkeeping stuff
    # ==================
    pprint.pprint(exp_dict)
    exp_id = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, exp_id)
    if reset:
        hc.delete_and_backup_experiment(savedir)

    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, "exp_dict.json"), exp_dict)
    print("Experiment saved in %s" % savedir)

    # set seed
    # ==================
    seed = 42
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Dataset
    # ==================
    # train set
    train_set = datasets.get_dataset(dataset_dict=exp_dict["dataset"],
                                     split="train",
                                     datadir=datadir,
                                     exp_dict=exp_dict,
                                     dataset_size=exp_dict['dataset_size'])
    # val set
    val_set = datasets.get_dataset(dataset_dict=exp_dict["dataset"],
                                   split="val",
                                   datadir=datadir,
                                   exp_dict=exp_dict,
                                   dataset_size=exp_dict['dataset_size'])
    # test set
    test_set = datasets.get_dataset(dataset_dict=exp_dict["dataset"],
                                    split="test",
                                    datadir=datadir,
                                    exp_dict=exp_dict,
                                    dataset_size=exp_dict['dataset_size'])

    # val_sampler = torch.utils.data.SequentialSampler(val_set)
    val_loader = DataLoader(val_set,
                            # sampler=val_sampler,
                            batch_size=1,
                            collate_fn=ut.collate_fn,
                            num_workers=num_workers)
    test_loader = DataLoader(test_set,
                             # sampler=val_sampler,
                             batch_size=1,
                             collate_fn=ut.collate_fn,
                             num_workers=num_workers)

    # Model
    # ==================
    model = models.get_model(model_dict=exp_dict['model'],
                             exp_dict=exp_dict,
                             train_set=train_set).cuda()

    # model.opt = optimizers.get_optim(exp_dict['opt'], model)
    model_path = os.path.join(savedir, "model.pth")
    score_list_path = os.path.join(savedir, "score_list.pkl")

    if os.path.exists(score_list_path):
        # resume experiment
        model.load_state_dict(hu.torch_load(model_path))
        score_list = hu.load_pkl(score_list_path)
        s_epoch = score_list[-1]['epoch'] + 1
    else:
        # restart experiment
        score_list = []
        s_epoch = 0

    # Train & Val
    # ==================
    print("Starting experiment at epoch %d" % (s_epoch))
    model.waiting = 0
    model.val_score_best = -np.inf

    train_sampler = torch.utils.data.RandomSampler(train_set,
                                                   replacement=True,
                                                   num_samples=2 * len(test_set))
    train_loader = DataLoader(train_set,
                              sampler=train_sampler,
                              collate_fn=ut.collate_fn,
                              batch_size=exp_dict["batch_size"],
                              drop_last=True,
                              num_workers=num_workers)

    for e in range(s_epoch, exp_dict['max_epoch']):
        # Validate only at the start of each cycle
        score_dict = {}
        test_dict = model.val_on_loader(test_loader,
                                        savedir_images=os.path.join(savedir, "images"),
                                        n_images=3)
        # Train the model
        train_dict = model.train_on_loader(train_loader)

        # Validate the model
        val_dict = model.val_on_loader(val_loader)
        score_dict["val_score"] = val_dict["val_score"]

        # Get new score_dict
        score_dict.update(train_dict)
        score_dict["epoch"] = e
        score_dict["waiting"] = model.waiting

        model.waiting += 1

        # Add to score_list and save checkpoint
        score_list += [score_dict]

        # Save Best Checkpoint
        score_df = pd.DataFrame(score_list)
        if score_dict["val_score"] >= model.val_score_best:
            test_dict = model.val_on_loader(test_loader,
                                            savedir_images=os.path.join(savedir, "images"),
                                            n_images=3)
            score_dict.update(test_dict)
            hu.save_pkl(os.path.join(savedir, "score_list_best.pkl"), score_list)
            # score_df.to_csv(os.path.join(savedir, "score_best_df.csv"))
            hu.torch_save(os.path.join(savedir, "model_best.pth"),
                          model.get_state_dict())
            model.waiting = 0
            model.val_score_best = score_dict["val_score"]
            print("Saved Best: %s" % savedir)

        # Report & Save
        score_df = pd.DataFrame(score_list)
        # score_df.to_csv(os.path.join(savedir, "score_df.csv"))
        print("\n", score_df.tail(), "\n")
        hu.torch_save(model_path, model.get_state_dict())
        hu.save_pkl(score_list_path, score_list)
        print("Checkpoint Saved: %s" % savedir)

        if model.waiting > 100:
            break

    print('Experiment completed at epoch %d' % e)
def trainval(exp_dict, savedir_base, data_root, reset=False, tensorboard=True):
    # bookkeeping
    # ---------------
    # get experiment directory
    exp_id = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, exp_id)

    np.random.seed(exp_dict["seed"])
    torch.manual_seed(exp_dict["seed"])

    if reset:
        # delete and backup experiment
        hc.delete_experiment(savedir, backup_flag=True)

    writer = tensorboardX.SummaryWriter(savedir) if tensorboard == 1 else None

    # create folder and save the experiment dictionary
    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, "exp_dict.json"), exp_dict)
    pprint.pprint(exp_dict)
    print("Experiment saved in %s" % savedir)

    # Dataset
    # -----------
    train_dataset, val_dataset = get_dataset(['train', 'val'], data_root, exp_dict)
    # val_dataset = get_dataset('val', exp_dict)

    # train and val loader
    if exp_dict["episodic"] == False:
        train_loader = DataLoader(train_dataset,
                                  batch_size=exp_dict['batch_size'],
                                  shuffle=True,
                                  num_workers=args.num_workers)
        val_loader = DataLoader(val_dataset,
                                batch_size=exp_dict['batch_size'],
                                shuffle=True,
                                num_workers=args.num_workers)
    else:  # to support episodes TODO: move inside each model
        from datasets.episodic_dataset import EpisodicDataLoader
        train_loader = EpisodicDataLoader(train_dataset,
                                          batch_size=exp_dict['batch_size'],
                                          shuffle=True,
                                          collate_fn=lambda x: x,
                                          num_workers=args.num_workers)
        val_loader = EpisodicDataLoader(val_dataset,
                                        batch_size=exp_dict['batch_size'],
                                        shuffle=True,
                                        collate_fn=lambda x: x,
                                        num_workers=args.num_workers)

    # Model
    # -----------
    model = get_model(exp_dict, labelset=train_dataset.raw_labelset, writer=writer)
    print("Model with:",
          sum(p.numel() for p in model.parameters() if p.requires_grad),
          "parameters")

    # Checkpoint
    # -----------
    model_path = os.path.join(savedir, "model.pth")
    score_list_path = os.path.join(savedir, "score_list.pkl")

    if os.path.exists(score_list_path):
        # resume experiment
        model.set_state_dict(hu.torch_load(model_path))
        score_list = hu.load_pkl(score_list_path)
        s_epoch = score_list[-1]['epoch'] + 1
    else:
        # restart experiment
        score_list = []
        s_epoch = 0

    # Train & Val
    # ------------
    print("Starting experiment at epoch %d" % (s_epoch))

    for e in range(s_epoch, exp_dict['max_epoch']):
        score_dict = {}

        # Train the model
        score_dict.update(model.train_on_loader(e, train_loader))

        # Validate the model
        score_dict.update(model.val_on_loader(e, val_loader))
        score_dict["epoch"] = e

        if tensorboard:
            for key, value in score_dict.items():
                writer.add_scalar(key, value, e)
            writer.flush()

        # Visualize the model
        # model.vis_on_loader(vis_loader, savedir=savedir+"/images/")

        # Add to score_list and save checkpoint
        score_list += [score_dict]

        # Report & Save
        score_df = pd.DataFrame(score_list)
        print("\n", score_df.tail())
        hu.torch_save(model_path, model.get_state_dict())
        hu.save_pkl(score_list_path, score_list)
        print("Checkpoint Saved: %s" % savedir)

        # if model.is_end():
        #     print("Early stopping")
        #     break

    print('experiment completed')

    # Cleanup
    if tensorboard == 1:
        writer.close()
def trainval(exp_dict, savedir_base, datadir_base, reset=False):
    # bookkeeping stuff
    # ==================
    pprint.pprint(exp_dict)
    exp_id = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, exp_id)
    if reset:
        hc.delete_and_backup_experiment(savedir)

    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, "exp_dict.json"), exp_dict)
    print("Experiment saved in %s" % savedir)

    # Dataset
    # ==================
    # load train and active set
    train_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                     split="train",
                                     datadir_base=datadir_base,
                                     exp_dict=exp_dict)
    active_set = ActiveLearningDataset(train_set, random_state=42)

    # val set
    val_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                   split="val",
                                   datadir_base=datadir_base,
                                   exp_dict=exp_dict)
    val_loader = DataLoader(val_set, batch_size=exp_dict["batch_size"])

    # Model
    # ==================
    model = models.get_model(model_name=exp_dict['model']['name'],
                             exp_dict=exp_dict).cuda()

    model_path = os.path.join(savedir, "model.pth")
    score_list_path = os.path.join(savedir, "score_list.pkl")

    if os.path.exists(score_list_path):
        # resume experiment
        model.set_state_dict(hu.torch_load(model_path))
        active_set.load_state_dict(hu.load_pkl(os.path.join(savedir, "active_set.pkl")))
        score_list = hu.load_pkl(score_list_path)
        inner_s_epoch = score_list[-1]['inner_epoch'] + 1
        s_cycle = score_list[-1]['cycle']
    else:
        # restart experiment
        score_list = []
        inner_s_epoch = 0
        s_cycle = 0

    # Train & Val
    # ==================
    print("Starting experiment at cycle %d epoch %d" % (s_cycle, inner_s_epoch))

    for c in range(s_cycle, exp_dict['max_cycle']):
        # Set seed
        np.random.seed(c)
        torch.manual_seed(c)
        torch.cuda.manual_seed_all(c)

        if inner_s_epoch == 0:
            active_set.label_next_batch(model)
            hu.save_pkl(os.path.join(savedir, "active_set.pkl"),
                        active_set.state_dict())

        train_loader = DataLoader(active_set,
                                  sampler=samplers.get_sampler(exp_dict['sampler']['train'],
                                                               active_set),
                                  batch_size=exp_dict["batch_size"])

        # Visualize the model
        model.vis_on_loader(vis_loader, savedir=os.path.join(savedir, "images"))

        for e in range(inner_s_epoch, exp_dict['max_epoch']):
            # Validate only at the start of each cycle
            score_dict = {}
            if e == 0:
                score_dict.update(model.val_on_loader(val_loader))

            # Train the model
            score_dict.update(model.train_on_loader(train_loader))

            # Validate the model
            score_dict["epoch"] = len(score_list)
            score_dict["inner_epoch"] = e
            score_dict["cycle"] = c
            score_dict['n_ratio'] = active_set.n_labelled_ratio
            score_dict["n_train"] = len(train_loader.dataset)
            score_dict["n_pool"] = len(train_loader.dataset.pool)

            # Add to score_list and save checkpoint
            score_list += [score_dict]

            # Report & Save
            score_df = pd.DataFrame(score_list)
            print("\n", score_df.tail(), "\n")
            hu.torch_save(model_path, model.get_state_dict())
            hu.save_pkl(score_list_path, score_list)
            print("Checkpoint Saved: %s" % savedir)

        inner_s_epoch = 0
def get_dataset(dataset_name, train_flag, datadir, exp_dict):
    if dataset_name == "mnist":
        dataset = torchvision.datasets.MNIST(
            datadir,
            train=train_flag,
            download=True,
            transform=torchvision.transforms.Compose([
                torchvision.transforms.ToTensor(),
                torchvision.transforms.Normalize((0.5,), (0.5,))
            ]))

    if dataset_name == "cifar10":
        if train_flag:
            transform_function = transforms.Compose([
                transforms.RandomCrop(32, padding=4),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                transforms.Normalize((0.4914, 0.4822, 0.4465),
                                     (0.2023, 0.1994, 0.2010)),
            ])
        else:
            transform_function = transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.4914, 0.4822, 0.4465),
                                     (0.2023, 0.1994, 0.2010)),
            ])
        dataset = torchvision.datasets.CIFAR10(root=datadir,
                                               train=train_flag,
                                               download=True,
                                               transform=transform_function)

    if dataset_name == "cifar100":
        if train_flag:
            transform_function = transforms.Compose([
                transforms.RandomCrop(32, padding=4),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                transforms.Normalize((0.4914, 0.4822, 0.4465),
                                     (0.2023, 0.1994, 0.2010)),
            ])
        else:
            transform_function = transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.4914, 0.4822, 0.4465),
                                     (0.2023, 0.1994, 0.2010)),
            ])
        dataset = torchvision.datasets.CIFAR100(root=datadir,
                                                train=train_flag,
                                                download=True,
                                                transform=transform_function)

    if dataset_name in ['syn']:
        bias = 1
        scaling = 10
        sparsity = 10
        solutionSparsity = 0.1
        n = 1000
        p = 100

        A = np.random.randn(n, p) + bias
        A = A.dot(np.diag(scaling * np.random.randn(p)))
        A = A * (np.random.rand(n, p) < (sparsity * np.log(n) / n))
        w = np.random.randn(p) * (np.random.rand(p) < solutionSparsity)

        b = np.sign(A.dot(w))
        b = b * np.sign(np.random.rand(n) - 0.1)
        labels = np.unique(b)

        A = A / np.linalg.norm(A, axis=1)[:, None].clip(min=1e-6)
        A = A * 2
        b[b == labels[0]] = 0
        b[b == labels[1]] = 1
        dataset = torch.utils.data.TensorDataset(torch.FloatTensor(A),
                                                 torch.FloatTensor(b))
        return DatasetWrapper(dataset)

    if dataset_name in ['mushrooms', 'w8a', 'rcv1', 'ijcnn']:
        sigma_dict = {"mushrooms": 0.5, "w8a": 20.0, "rcv1": 0.25, "ijcnn": 0.05}

        X, y = load_libsvm(dataset_name, data_dir=datadir)

        labels = np.unique(y)
        y[y == labels[0]] = 0
        y[y == labels[1]] = 1

        # splits used in experiments
        splits = train_test_split(X, y, test_size=0.2, shuffle=True,
                                  random_state=9513451)
        X_train, X_test, Y_train, Y_test = splits

        if train_flag:
            # fname_rbf = "%s/rbf_%s_%s_train.pkl" % (datadir, dataset_name, sigma_dict[dataset_name])
            fname_rbf = "%s/rbf_%s_%s_train.npy" % (datadir, dataset_name,
                                                    sigma_dict[dataset_name])
            if os.path.exists(fname_rbf):
                k_train_X = np.load(fname_rbf)
            else:
                k_train_X = rbf_kernel(X_train, X_train, sigma_dict[dataset_name])
                np.save(fname_rbf, k_train_X)
                print('%s saved' % fname_rbf)

            X_train = k_train_X
            X_train = torch.FloatTensor(X_train)
            Y_train = torch.FloatTensor(Y_train)
            dataset = torch.utils.data.TensorDataset(X_train, Y_train)
        else:
            fname_rbf = "%s/rbf_%s_%s_test.npy" % (datadir, dataset_name,
                                                   sigma_dict[dataset_name])
            if os.path.exists(fname_rbf):
                k_test_X = np.load(fname_rbf)
            else:
                k_test_X = rbf_kernel(X_test, X_train, sigma_dict[dataset_name])
                # hu.save_pkl(fname_rbf, k_test_X)
                np.save(fname_rbf, k_test_X)
                print('%s saved' % fname_rbf)

            X_test = k_test_X
            X_test = torch.FloatTensor(X_test)
            Y_test = torch.FloatTensor(Y_test)
            dataset = torch.utils.data.TensorDataset(X_test, Y_test)

    if dataset_name == "matrix_fac":
        fname = datadir + 'matrix_fac.pkl'
        if not os.path.exists(fname):
            data = generate_synthetic_matrix_factorization_data()
            hu.save_pkl(fname, data)

        A, y = hu.load_pkl(fname)
        X_train, X_test, y_train, y_test = train_test_split(A, y, test_size=0.2,
                                                            random_state=9513451)

        training_set = torch.utils.data.TensorDataset(
            torch.tensor(X_train, dtype=torch.float),
            torch.tensor(y_train, dtype=torch.float))
        test_set = torch.utils.data.TensorDataset(
            torch.tensor(X_test, dtype=torch.float),
            torch.tensor(y_test, dtype=torch.float))

        if train_flag:
            dataset = training_set
        else:
            dataset = test_set

    return DatasetWrapper(dataset)
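# Usage sketch: the torchvision branches need no exp_dict contents, so a plain
# call works; './data' is an assumed download/cache directory.
#
#   train_set = get_dataset('cifar10', train_flag=True, datadir='./data', exp_dict={})
#   loader = torch.utils.data.DataLoader(train_set, batch_size=128, shuffle=True)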
def get_dataset(dataset_name, train_flag, datadir, exp_dict):
    if dataset_name == "mnist":
        dataset = torchvision.datasets.MNIST(
            datadir,
            train=train_flag,
            download=True,
            transform=torchvision.transforms.Compose([
                torchvision.transforms.ToTensor(),
                torchvision.transforms.Normalize((0.5,), (0.5,))
            ]))

    if dataset_name == "cifar10":
        transform_function = transforms.Compose([
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465),
                                 (0.2023, 0.1994, 0.2010)),
        ])
        dataset = torchvision.datasets.CIFAR10(root=datadir,
                                               train=train_flag,
                                               download=True,
                                               transform=transform_function)

    if dataset_name == "cifar100":
        transform_function = transforms.Compose([
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465),
                                 (0.2023, 0.1994, 0.2010)),
        ])
        dataset = torchvision.datasets.CIFAR100(root=datadir,
                                                train=train_flag,
                                                download=True,
                                                transform=transform_function)

    if dataset_name in ["mushrooms", "w8a", "rcv1", "ijcnn"]:
        sigma_dict = {"mushrooms": 0.5, "w8a": 20.0, "rcv1": 0.25, "ijcnn": 0.05}

        X, y = load_libsvm(dataset_name, data_dir=datadir)

        labels = np.unique(y)
        y[y == labels[0]] = 0
        y[y == labels[1]] = 1

        # TODO: (amishkin) splits = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=9513451)
        splits = train_test_split(X, y, test_size=0.2, shuffle=False,
                                  random_state=42)
        X_train, X_test, Y_train, Y_test = splits

        if train_flag:
            # fname_rbf = "%s/rbf_%s_train.pkl" % (datadir, dataset_name)
            # if os.path.exists(fname_rbf):
            #     k_train_X = hu.load_pkl(fname_rbf)
            # else:
            k_train_X = rbf_kernel(X_train, X_train, sigma_dict[dataset_name])
            # hu.save_pkl(fname_rbf, k_train_X)

            X_train = k_train_X
            X_train = torch.FloatTensor(X_train)
            Y_train = torch.FloatTensor(Y_train)
            dataset = torch.utils.data.TensorDataset(X_train, Y_train)
        else:
            # fname_rbf = "%s/rbf_%s_test.pkl" % (datadir, dataset_name)
            # if os.path.exists(fname_rbf):
            #     k_test_X = hu.load_pkl(fname_rbf)
            # else:
            k_test_X = rbf_kernel(X_test, X_train, sigma_dict[dataset_name])
            # hu.save_pkl(fname_rbf, k_test_X)

            X_test = k_test_X
            X_test = torch.FloatTensor(X_test)
            Y_test = torch.FloatTensor(Y_test)
            dataset = torch.utils.data.TensorDataset(X_test, Y_test)

        return dataset

    if dataset_name == "synthetic":
        margin = exp_dict["margin"]

        X, y, _, _ = make_binary_linear(n=exp_dict["n_samples"],
                                        d=exp_dict["d"],
                                        margin=margin,
                                        y01=True,
                                        bias=True,
                                        separable=True,
                                        seed=42)
        # No shuffling to keep the support vectors inside the training set
        splits = train_test_split(X, y, test_size=0.2, shuffle=False,
                                  random_state=42)
        X_train, X_test, Y_train, Y_test = splits

        X_train = torch.FloatTensor(X_train)
        X_test = torch.FloatTensor(X_test)
        Y_train = torch.FloatTensor(Y_train)
        Y_test = torch.FloatTensor(Y_test)

        if train_flag:
            dataset = torch.utils.data.TensorDataset(X_train, Y_train)
        else:
            dataset = torch.utils.data.TensorDataset(X_test, Y_test)

        return dataset

    if dataset_name == "matrix_fac":
        fname = datadir + 'matrix_fac.pkl'
        if not os.path.exists(fname):
            data = generate_synthetic_matrix_factorization_data()
            hu.save_pkl(fname, data)

        A, y = hu.load_pkl(fname)
        X_train, X_test, y_train, y_test = train_test_split(A, y, test_size=0.2,
                                                            random_state=9513451)

        training_set = torch.utils.data.TensorDataset(
            torch.tensor(X_train, dtype=torch.float),
            torch.tensor(y_train, dtype=torch.float))
        test_set = torch.utils.data.TensorDataset(
            torch.tensor(X_test, dtype=torch.float),
            torch.tensor(y_test, dtype=torch.float))

        if train_flag:
            dataset = training_set
        else:
            dataset = test_set

        return dataset
def trainval(exp_dict, savedir_base, datadir_base, reset=False,
             num_workers=0, pin_memory=False, ngpu=1, cuda_deterministic=False):
    # bookkeeping
    # ==================
    # get experiment directory
    exp_id = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, exp_id)

    if reset:
        # delete and backup experiment
        hc.delete_experiment(savedir, backup_flag=True)

    # create folder and save the experiment dictionary
    hu.save_json(os.path.join(savedir, 'exp_dict.json'), exp_dict)
    pprint.pprint(exp_dict)
    print('Experiment saved in %s' % savedir)

    if DEVICE.type == "cuda":
        if cuda_deterministic:
            cudnn.benchmark = False
            cudnn.deterministic = True
        else:
            cudnn.benchmark = True

    # Dataset
    # ==================
    trainset = get_dataset(exp_dict['dataset'], 'train',
                           exp_dict=exp_dict,
                           datadir_base=datadir_base,
                           n_samples=exp_dict['dataset_size']['train'],
                           transform_lvl=exp_dict['dataset']['transform_lvl'],
                           colorjitter=exp_dict['dataset'].get('colorjitter'))
    valset = get_dataset(exp_dict['dataset'], 'validation',
                         exp_dict=exp_dict,
                         datadir_base=datadir_base,
                         n_samples=exp_dict['dataset_size']['train'],
                         transform_lvl=0,
                         val_transform=exp_dict['dataset']['val_transform'])
    testset = get_dataset(exp_dict['dataset'], 'test',
                          exp_dict=exp_dict,
                          datadir_base=datadir_base,
                          n_samples=exp_dict['dataset_size']['test'],
                          transform_lvl=0,
                          val_transform=exp_dict['dataset']['val_transform'])
    print("Dataset defined.")

    # define dataloaders
    if exp_dict['dataset']['name'] == 'bach':
        testloader = torch.utils.data.DataLoader(testset,
                                                 batch_size=1,
                                                 shuffle=False,
                                                 num_workers=num_workers,
                                                 pin_memory=pin_memory)
    else:
        testloader = torch.utils.data.DataLoader(testset,
                                                 batch_size=exp_dict['batch']['size'],
                                                 shuffle=False,
                                                 num_workers=num_workers,
                                                 pin_memory=pin_memory)
    print("Testloader defined.")

    # Model
    # ==================
    model = get_model(exp_dict, trainset, device=DEVICE)
    print("Model loaded")

    model_path = os.path.join(savedir, 'model.pth')
    model_best_path = os.path.join(savedir, 'model_best.pth')
    score_list_path = os.path.join(savedir, 'score_list.pkl')

    # checkpoint management
    if os.path.exists(score_list_path):
        # resume experiment
        model.load_state_dict(hu.torch_load(model_path))
        score_list = hu.load_pkl(score_list_path)
        s_epoch = len(score_list)
    else:
        # restart experiment
        score_list = []
        s_epoch = 0

    # define and log random seed for reproducibility
    assert 'fixedSeed' in exp_dict
    seed = exp_dict['fixedSeed']

    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    print("Seed defined.")

    # Train & Val
    # ==================
    print("Starting experiment at epoch %d/%d" % (s_epoch, exp_dict['niter']))

    for epoch in range(s_epoch, exp_dict['niter']):
        s_time = time.time()

        # Sample new train val
        trainloader, valloader = get_train_val_dataloader(exp_dict,
                                                          trainset, valset,
                                                          mixtrainval=exp_dict['mixTrainVal'],
                                                          num_workers=num_workers,
                                                          pin_memory=pin_memory)
        # Train & validate
        train_dict = model.train_on_loader(trainloader, valloader,
                                           epoch=epoch, exp_dict=exp_dict)

        # Test phase
        train_dict_2 = model.test_on_loader(trainloader)
        val_dict = model.test_on_loader(valloader)
        test_dict = model.test_on_loader(testloader)

        # Vis phase
        model.vis_on_loader('train', trainset,
                            savedir_images=os.path.join(savedir, 'images'),
                            epoch=epoch)

        score_dict = {}
        score_dict["epoch"] = epoch
        score_dict["test_acc"] = test_dict['acc']
        score_dict["val_acc"] = val_dict['acc']
        score_dict["train_acc"] = train_dict_2['acc']
        score_dict["train_loss"] = train_dict['loss']
        score_dict["time_taken"] = time.time() - s_time
        score_dict["netC_lr"] = train_dict['netC_lr']

        if exp_dict['model']['netA'] is not None:
            if 'transformations_mean' in train_dict:
                for i in range(len(train_dict['transformations_mean'])):
                    score_dict[str(i) + "_mean"] = train_dict['transformations_mean'][i].item()
            if 'transformations_std' in train_dict:
                for i in range(len(train_dict['transformations_std'])):
                    score_dict[str(i) + "_std"] = train_dict['transformations_std'][i].item()

        # Add to score_list and save checkpoint
        score_list += [score_dict]

        # Report & Save
        score_df = pd.DataFrame(score_list)
        print("\n", score_df.tail(), "\n")
        hu.torch_save(model_path, model.get_state_dict())
        hu.save_pkl(score_list_path, score_list)
        print("Checkpoint Saved: %s" % savedir)

        # Update best score
        if epoch == 0 or (score_dict["test_acc"] >= score_df["test_acc"][:-1].max()):
            hu.save_pkl(os.path.join(savedir, "score_list_best.pkl"), score_list)
            hu.torch_save(os.path.join(savedir, "model_best.pth"),
                          model.get_state_dict())
            print("Saved Best: %s" % savedir)

    print('experiment completed')
def trainval(exp_dict, savedir_base, datadir, reset=False, num_workers=0):
    # bookkeeping stuff
    # ==================
    savedir = os.path.join(savedir_base, hu.hash_dict(exp_dict))
    if reset:
        hc.delete_and_backup_experiment(savedir)
    os.makedirs(savedir, exist_ok=True)
    print("Experiment saved in %s" % savedir)

    # Dataset
    # ==================
    # train set
    data_transform = A.Compose(
        [
            A.Flip(p=0.3),
            A.IAAAffine(p=0.3),
            A.Rotate(p=0.3),
            A.HueSaturationValue(hue_shift_limit=10, sat_shift_limit=15,
                                 val_shift_limit=10, p=0.3),
            A.GaussianBlur(3, p=0.3),
            A.GaussNoise(30, p=0.3)
        ],
        keypoint_params=A.KeypointParams(format='xy'),
        additional_targets={
            'mask0': 'mask',
            'mask1': 'mask',
            'mask2': 'mask',
            'keypoints0': 'keypoints',
            'keypoints1': 'keypoints',
            'keypoints2': 'keypoints',
            'keypoints3': 'keypoints',
            'keypoints4': 'keypoints',
            'keypoints5': 'keypoints'
        })

    # random.seed(20201009)
    random_seed = random.randint(0, 20201009)
    train_set = HEDataset_Fast(data_dir=datadir,
                               n_classes=exp_dict["n_classes"],
                               transform=data_transform,
                               option="Train",
                               random_seed=random_seed,
                               obj_option=exp_dict["obj"],
                               patch_size=exp_dict["patch_size"],
                               bkg_option=exp_dict["bkg"])

    test_transform = A.Compose([A.Resize(1024, 1024)],
                               keypoint_params=A.KeypointParams(format='xy'),
                               additional_targets={
                                   'mask0': 'mask',
                                   'mask1': 'mask'
                               })
    # val set
    val_set = HEDataset(data_dir=datadir,
                        transform=test_transform,
                        option="Validation")
    val_loader = DataLoader(val_set, batch_size=1, num_workers=num_workers)

    # test set
    test_set = HEDataset(data_dir=datadir,
                         transform=test_transform,
                         option="Test")
    test_loader = DataLoader(test_set, batch_size=1, num_workers=num_workers)

    # Model
    # ==================
    # torch.manual_seed(20201009)
    model = models.get_model(exp_dict['model'],
                             exp_dict=exp_dict,
                             train_set=train_set).cuda()

    model_path = os.path.join(savedir, "model.pth")
    score_list_path = os.path.join(savedir, "score_list.pkl")

    if os.path.exists(score_list_path):
        # resume experiment
        model.load_state_dict(hu.torch_load(model_path))
        score_list = hu.load_pkl(score_list_path)
        s_epoch = score_list[-1]['epoch'] + 1
    else:
        # restart experiment
        score_list = []
        s_epoch = 0

    # Train & Val
    # ==================
    print("Starting experiment at epoch %d" % (s_epoch))

    # train_sampler = torch.utils.data.RandomSampler(
    #     train_set, replacement=True, num_samples=2*len(val_set))
    train_loader = DataLoader(train_set,
                              batch_size=exp_dict["batch_size"],
                              shuffle=True,
                              num_workers=num_workers)

    for e in range(s_epoch, exp_dict['max_epoch']):
        # Validate only at the start of each cycle
        score_dict = {}

        # Train the model
        train_dict = model.train_on_loader(train_loader)

        # Validate and Visualize the model
        val_dict = model.val_on_loader(val_loader,
                                       savedir_images=os.path.join(savedir, "images"),
                                       n_images=7)
        score_dict.update(val_dict)

        # Get new score_dict
        score_dict.update(train_dict)
        score_dict["epoch"] = len(score_list)

        # Add to score_list and save checkpoint
        score_list += [score_dict]

        # Report & Save
        score_df = pd.DataFrame(score_list)
        print("\n", score_df.tail(), "\n")
        hu.torch_save(model_path, model.get_state_dict())
        hu.save_pkl(score_list_path, score_list)
        print("Checkpoint Saved: %s" % savedir)

        # Save Best Checkpoint
        if e == 0 or (score_dict.get("val_score", 0) >
                      score_df["val_score"][:-1].fillna(0).max()):
            hu.save_pkl(os.path.join(savedir, "score_list_best.pkl"), score_list)
            hu.torch_save(os.path.join(savedir, "model_best.pth"),
                          model.get_state_dict())
            print("Saved Best: %s" % savedir)

    # if s_epoch == exp_dict['max_epoch']:
    #     e = s_epoch
    model.load_state_dict(hu.torch_load(os.path.join(savedir, "model_best.pth")))
    test_dict = model.test_on_loader(test_loader)
    hu.save_pkl(os.path.join(savedir, 'test_iou.pkl'), test_dict)
    print('Test IoU:{}'.format(test_dict["test_iou"]))
    print('Experiment completed at epoch %d' % e)
def __getitem__(self, index):
    # index = 0
    img_path = self.dataset.images[index]
    name = os.path.split(img_path)[-1].split('.')[0]
    img_pil = Image.open(img_path).convert("RGB")
    W, H = img_pil.size

    points_list = self.point_dict[name]
    points_mask = np.zeros((H, W))
    for p in points_list:
        if p['y'] >= H or p['x'] >= W:
            continue
        points_mask[int(p['y']), int(p['x'])] = p['cls']

    if self.supervision == 'full':
        mask_path = self.dataset.masks[index]
        if '.mat' in mask_path:
            mask_pil = Image.fromarray(
                hu.load_mat(mask_path)['GTcls'][0]['Segmentation'][0])
        else:
            mask_pil = Image.open(mask_path)
            # mask_pil = hu.load_mat(mask_path)

        inst_path = self.dataset.masks[index].replace('SegmentationClass',
                                                      'SegmentationObject')
        if '.mat' in inst_path:
            inst_pil = None
        else:
            inst_pil = Image.open(inst_path)

    elif self.supervision == 'seam':
        path_base = os.path.join(self.datadir, 'seam')
        os.makedirs(path_base, exist_ok=True)
        mask_path = os.path.join(path_base, 'masks', '%s_dict.pkl' % name)
        if not os.path.exists(mask_path):
            ut.generate_seam_segmentation(self, path_base=path_base)

        # mask_path = self.dataset.masks[index]
        mask_dict = hu.load_pkl(mask_path)
        if self.exp_dict.get('split_inst', False):
            blob_list, color_mask, inst_mask = get_blob_list_v2(
                mask_dict, points_mask, img_pil)
        else:
            raise NotImplementedError('split_inst=False is not supported')

        # if points_mask.sum() > 0:
        #     assert inst_mask.sum() > 1
        #     assert (inst_mask != 0).sum() == (color_mask != 0).sum()
        # hu.save_image('tmp.jpg', hi.mask_on_image(img_pil, inst_mask, add_bbox=True))
        mask_pil = Image.fromarray(color_mask)
        inst_pil = Image.fromarray(inst_mask)

    elif self.supervision == 'top_rpn':
        pm = proposals.ProposalManager(region_mode='rpn', n_regions=100)
        bbox_yxyx = pm.get_top_bbox_yxyx(img_pil, points_mask=points_mask)

        mask = np.zeros((H, W), dtype='uint8')
        inst = np.zeros((H, W), dtype='uint8')
        for i, b in enumerate(bbox_yxyx):
            y1, x1, y2, x2 = map(int, b)
            assert (y2 <= H and x2 <= W)
            mask[y1:y2, x1:x2] = points_list[i]['cls']
            inst[y1:y2, x1:x2] = i + 1

        mask_pil = Image.fromarray(mask)
        inst_pil = Image.fromarray(inst)

    elif self.supervision == 'points_sharpmask':
        region_list = datasets.get_sharpmask(name)
        mask_list, c_list = datasets.get_mask_list(img_pil, points_list, region_list)

        mask = np.zeros((H, W), dtype='uint8')
        inst = np.zeros((H, W), dtype='uint8')
        for i, m in enumerate(mask_list):
            mask[m == 1] = c_list[i]
            inst[m == 1] = i + 1

        mask_pil = Image.fromarray(mask)
        inst_pil = Image.fromarray(inst)

    elif self.supervision == 'points_irn':
        region_list = datasets.get_irn_regions(name)
        mask_list, c_list = datasets.get_mask_list(img_pil, points_list, region_list)

        mask = np.zeros((H, W), dtype='uint8')
        inst = np.zeros((H, W), dtype='uint8')
        for i, m in enumerate(mask_list):
            mask[m == 1] = c_list[i]
            inst[m == 1] = i + 1

        mask_pil = Image.fromarray(mask)
        inst_pil = Image.fromarray(inst)

    elif self.supervision == 'points_slic':
        region_list = datasets.get_superpixels(img_pil, points_list)
        mask_list, c_list = datasets.get_mask_list(img_pil, points_list, region_list)

        mask = np.zeros((H, W), dtype='uint8')
        inst = np.zeros((H, W), dtype='uint8')
        for i, m in enumerate(mask_list):
            mask[m == 1] = c_list[i]
            inst[m == 1] = i + 1

        mask_pil = Image.fromarray(mask)
        inst_pil = Image.fromarray(inst)

    else:
        raise ValueError('%s not found' % self.supervision)

    images = torchvision.transforms.ToTensor()(np.array(img_pil))
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]
    images = transforms.Normalize(mean=mean, std=std)(images)

    masks = torch.as_tensor(np.array(mask_pil))

    # add one random background point (class 0) if background pixels exist
    y_list, x_list = np.where(masks == 0)
    if len(y_list) > 0:
        yi, xi = datasets.get_random(y_list, x_list)
        points_list += [{'cls': 0, 'x': xi, 'y': yi}]

    batch = {
        "images": images,
        "img_pil": img_pil,
        'points': torch.as_tensor(points_mask),
        'point_list': points_list,
        # 'inst': inst,
        # 'flipped': flipped,
        "masks": masks,
        # "original": inv_transform(images),
        "meta": {
            "index": index,
            'hash': hu.hash_dict({'id': index, 'split': self.split}),
            "name": self.dataset.images[index],
            "size": images.shape[-2:],
            "image_id": index,
            "split": self.split
        }
    }
    return batch
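# Usage sketch: indexing the dataset returns the batch dict built above. The
# keys follow that dict; `dataset` itself is a placeholder instance.
#
#   batch = dataset[0]
#   print(batch['meta']['name'], batch['images'].shape,
#         int(batch['points'].sum()))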
def __init__(self, split, datadir, exp_dict, mode='counting'):
    self.exp_dict = exp_dict

    if self.exp_dict['dataset']['mode'] == 'crowded':
        self.path = os.path.join(datadir, 'counting-crowded_n=100000_2020-Oct-19.h5py')
    elif self.exp_dict['dataset']['mode'] == 'fixed_scale':
        self.path = os.path.join(datadir, 'counting-fix-scale_n=100000_2020-Oct-19.h5py')
    elif self.exp_dict['dataset']['mode'] in ['no_overlap', 'overlap']:
        self.path = os.path.join(datadir, 'counting_n=100000_2020-Oct-19.h5py')
    else:
        raise ValueError('unknown dataset mode: %s' % self.exp_dict['dataset']['mode'])

    path_id = hu.hash_str(self.path)
    train_meta_fname = os.path.join(datadir, 'train_meta_list_v1_%s.pkl' % path_id)
    val_meta_fname = os.path.join(datadir, 'val_meta_list_v1_%s.pkl' % path_id)

    if not os.path.exists(train_meta_fname):
        meta = load_attributes_h5(self.path)
        meta_list, splits = meta
        for i, m in enumerate(meta_list):
            meta_list[i] = json.loads(m)
        for i, m in enumerate(meta_list):
            meta_list[i]['index'] = i
        meta_list = np.array(meta_list)

        train_split = splits['stratified_char'][:, 0]
        val_split = splits['stratified_char'][:, 1]
        # test_split = splits['stratified_char'][:, 2]

        train_meta_list = meta_list[train_split][:10000]
        val_meta_list = meta_list[val_split][:10000]

        hu.save_pkl(train_meta_fname, train_meta_list)
        hu.save_pkl(val_meta_fname, val_meta_list)

    # self.transform = None
    # load_minibatch_h5(self.path, [indices])
    # self.img_list = glob.glob(self.path + "/*.jpeg")
    self.split = split
    if split == 'train':
        self.meta_list = np.array(hu.load_pkl(train_meta_fname))
        n = int(0.9 * len(self.meta_list))
        self.meta_list = self.meta_list[:n]
    elif split == 'val':
        self.meta_list = np.array(hu.load_pkl(train_meta_fname))
        n = int(0.9 * len(self.meta_list))
        self.meta_list = self.meta_list[n:]
    elif split == 'test':
        self.meta_list = np.array(hu.load_pkl(val_meta_fname))

    if self.exp_dict['dataset']['mode'] == 'no_overlap':
        self.meta_list = [m for m in self.meta_list if m['overlap_score'] == 0]
    elif self.exp_dict['dataset']['mode'] == 'overlap':
        self.meta_list = [m for m in self.meta_list if m['overlap_score'] > 0]
    elif self.exp_dict['dataset']['mode'] in ['crowded', 'fixed_scale']:
        self.meta_list = self.meta_list
    else:
        raise ValueError('unknown dataset mode: %s' % self.exp_dict['dataset']['mode'])

    symbol_dict = {}
    for i in range(len(self.meta_list)):
        meta = self.meta_list[i]
        for s in meta['symbols']:
            if s['char'] not in symbol_dict:
                symbol_dict[s['char']] = []
            symbol_dict[s['char']] += [i]

    self.n_classes = 2
    self.symbol_dict = symbol_dict
    self.img_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406],
                             [0.229, 0.224, 0.225])
    ])
def trainval(exp_dict, savedir_base, data_root, reset=False,
             wandb='None', wandb_key='None'):
    # bookkeeping
    # ---------------
    # get experiment directory
    exp_id = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, exp_id)

    if reset:
        # delete and backup experiment
        hc.delete_experiment(savedir, backup_flag=True)

    # create folder and save the experiment dictionary
    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, "exp_dict.json"), exp_dict)
    print(exp_dict)
    print("Experiment saved in %s" % savedir)

    model_name = exp_dict['model'] + \
        "_lr_" + str(exp_dict['lr']) + \
        "_hs_" + str(exp_dict['backbone']['hidden_size']) + \
        "_pa_" + str(exp_dict['patience'])
    if exp_dict['model'] == 'MAML':
        model_name += "_ilr_" + str(exp_dict['inner_lr']) + \
            "_nii_" + str(exp_dict['n_inner_iter'])
    # TODO add seed

    if wandb != 'None':
        # https://docs.wandb.com/quickstart
        import wandb as logger
        if wandb_key != 'None':
            logger.login(key=wandb_key)
        logger.init(project=wandb, group=model_name)
        logger.config.update(exp_dict)

    # Dataset
    # -----------
    train_dataset = get_dataset('train', data_root, exp_dict)
    val_dataset = get_dataset('val', data_root, exp_dict)
    test_dataset = get_dataset('test', data_root, exp_dict)
    if 'ood' in exp_dict['dataset']['task']:
        ood_dataset = get_dataset('ood', data_root, exp_dict)
        ood = True
    else:
        ood = False

    # train and val loader
    if exp_dict["episodic"] == False:
        train_loader = DataLoader(train_dataset,
                                  batch_size=exp_dict['batch_size'],
                                  shuffle=True,
                                  num_workers=args.num_workers)
        val_loader = DataLoader(val_dataset,
                                batch_size=exp_dict['batch_size'],
                                shuffle=True,
                                num_workers=args.num_workers)
        test_loader = DataLoader(test_dataset,
                                 batch_size=exp_dict['batch_size'],
                                 shuffle=True,
                                 num_workers=args.num_workers)
    else:  # to support episodes TODO: move inside each model
        from datasets.episodic_dataset import EpisodicDataLoader
        train_loader = EpisodicDataLoader(train_dataset,
                                          batch_size=exp_dict['batch_size'],
                                          shuffle=True,
                                          collate_fn=lambda x: x,
                                          num_workers=args.num_workers)
        val_loader = EpisodicDataLoader(val_dataset,
                                        batch_size=exp_dict['batch_size'],
                                        shuffle=True,
                                        collate_fn=lambda x: x,
                                        num_workers=args.num_workers)
        test_loader = EpisodicDataLoader(test_dataset,
                                         batch_size=exp_dict['batch_size'],
                                         shuffle=True,
                                         collate_fn=lambda x: x,
                                         num_workers=args.num_workers)
        if ood:
            ood_loader = EpisodicDataLoader(ood_dataset,
                                            batch_size=exp_dict['batch_size'],
                                            shuffle=True,
                                            collate_fn=lambda x: x,
                                            num_workers=args.num_workers)

    # Model
    # -----------
    model = get_model(exp_dict)

    # Checkpoint
    # -----------
    model_path = os.path.join(savedir, "model.pth")
    score_list_path = os.path.join(savedir, "score_list.pkl")

    if os.path.exists(score_list_path):
        # resume experiment
        model.set_state_dict(hu.torch_load(model_path))
        score_list = hu.load_pkl(score_list_path)
        s_epoch = score_list[-1]['epoch'] + 1
    else:
        # restart experiment
        score_list = []
        s_epoch = 0

    patience_counter = 0

    # Train & Val
    # ------------
    print("Starting experiment at epoch %d" % (s_epoch))

    for e in range(s_epoch, exp_dict['max_epoch']):
        score_dict = {}

        # Train the model
        score_dict.update(model.train_on_loader(train_loader))

        # Validate and Test the model
        score_dict.update(model.val_on_loader(
            val_loader, mode='val',
            savedir=os.path.join(savedir_base, exp_dict['dataset']['name'])))
        score_dict.update(model.val_on_loader(test_loader, mode='test'))
        if ood:
            score_dict.update(model.val_on_loader(ood_loader, mode='ood'))
        score_dict["epoch"] = e

        # Visualize the model
        # model.vis_on_loader(vis_loader, savedir=savedir+"/images/")

        # Test error at best validation:
        if score_dict["val_accuracy"] > model.best_val:
            score_dict["test_accuracy_at_best_val"] = score_dict["test_accuracy"]
            if ood:
                score_dict["ood_accuracy_at_best_val"] = score_dict["ood_accuracy"]
            model.best_val = score_dict["val_accuracy"]
            patience_counter = 0

        # Add to score_list and save checkpoint
        score_list += [score_dict]

        # Report & Save
        score_df = pd.DataFrame(score_list)
        print("\n", score_df.tail())
        hu.torch_save(model_path, model.get_state_dict())
        hu.save_pkl(score_list_path, score_list)
        print("Checkpoint Saved: %s" % savedir)

        if wandb != 'None':
            for key, values in score_dict.items():
                logger.log({key: values})

        patience_counter += 1
        # Patience:
        if patience_counter > exp_dict['patience'] * 3:
            print('training done, out of patience')
            break

    print('experiment completed')
def trainval(exp_dict, savedir_base, datadir, reset=False, num_workers=0):
    # bookkeeping stuff
    # ==================
    pprint.pprint(exp_dict)
    exp_id = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, exp_id)
    if reset:
        hc.delete_and_backup_experiment(savedir)

    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, "exp_dict.json"), exp_dict)
    print("Experiment saved in %s" % savedir)

    # Dataset
    # ==================
    # train set
    train_set = datasets.get_dataset(dataset_dict=exp_dict["dataset"],
                                     split="train",
                                     datadir=datadir,
                                     exp_dict=exp_dict,
                                     dataset_size=exp_dict['dataset_size'])
    # val set
    val_set = datasets.get_dataset(dataset_dict=exp_dict["dataset"],
                                   split="val",
                                   datadir=datadir,
                                   exp_dict=exp_dict,
                                   dataset_size=exp_dict['dataset_size'])

    val_sampler = torch.utils.data.SequentialSampler(val_set)
    val_loader = DataLoader(val_set,
                            sampler=val_sampler,
                            batch_size=1,
                            num_workers=num_workers)

    # Model
    # ==================
    model = models.get_model(model_dict=exp_dict['model'],
                             exp_dict=exp_dict,
                             train_set=train_set).cuda()

    # model.opt = optimizers.get_optim(exp_dict['opt'], model)
    model_path = os.path.join(savedir, "model.pth")
    score_list_path = os.path.join(savedir, "score_list.pkl")

    if os.path.exists(score_list_path):
        # resume experiment
        model.load_state_dict(hu.torch_load(model_path))
        score_list = hu.load_pkl(score_list_path)
        s_epoch = score_list[-1]['epoch'] + 1
    else:
        # restart experiment
        score_list = []
        s_epoch = 0

    # Train & Val
    # ==================
    print("Starting experiment at epoch %d" % (s_epoch))

    train_sampler = torch.utils.data.RandomSampler(train_set,
                                                   replacement=True,
                                                   num_samples=2 * len(val_set))
    train_loader = DataLoader(train_set,
                              sampler=train_sampler,
                              batch_size=exp_dict["batch_size"],
                              drop_last=True,
                              num_workers=num_workers)

    for e in range(s_epoch, exp_dict['max_epoch']):
        # Validate only at the start of each cycle
        score_dict = {}

        # Train the model
        train_dict = model.train_on_loader(train_loader)

        # Validate and Visualize the model
        val_dict = model.val_on_loader(val_loader,
                                       savedir_images=os.path.join(savedir, "images"),
                                       n_images=3)
        score_dict.update(val_dict)
        # model.vis_on_loader(
        #     vis_loader, savedir=os.path.join(savedir, "images"))

        # Get new score_dict
        score_dict.update(train_dict)
        score_dict["epoch"] = len(score_list)

        # Add to score_list and save checkpoint
        score_list += [score_dict]

        # Report & Save
        score_df = pd.DataFrame(score_list)
        print("\n", score_df.tail(), "\n")
        hu.torch_save(model_path, model.get_state_dict())
        hu.save_pkl(score_list_path, score_list)
        print("Checkpoint Saved: %s" % savedir)

        # Save Best Checkpoint
        if e == 0 or (score_dict.get("val_score", 0) >
                      score_df["val_score"][:-1].fillna(0).max()):
            hu.save_pkl(os.path.join(savedir, "score_list_best.pkl"), score_list)
            hu.torch_save(os.path.join(savedir, "model_best.pth"),
                          model.get_state_dict())
            print("Saved Best: %s" % savedir)

    print('Experiment completed at epoch %d' % e)
# lcfcn loss with_affinity=True
# hash_id = '84ced18cf5c1fb3ad5820cc1b55a38fa'

# point level
# hash_id = 'd7040c9534b08e765f48c6cb034b26b2'

# LCFCN
# hash_id = 'bcba046296675e9e3af5cd9f353d217b'

for hash_id in hash_list:
    exp_dict = hu.load_json(os.path.join(savedir_base, hash_id, 'exp_dict.json'))
    fname = '.tmp/train_dict_%s.pkl' % hash_id
    datadir = '/mnt/public/datasets/DeepFish/'

    if os.path.exists(fname) and 0:
        train_dict = hu.load_pkl(fname)
    else:
        split = 'train'
        exp_dict['model']['count_mode'] = 0
        train_set = datasets.get_dataset(dataset_dict=exp_dict["dataset"],
                                         split=split,
                                         datadir=datadir,
                                         exp_dict=exp_dict,
                                         dataset_size=exp_dict['dataset_size'])
        train_loader = DataLoader(train_set,
                                  # sampler=val_sampler,
                                  batch_size=1,
                                  collate_fn=ut.collate_fn,
                                  num_workers=0)
def train(exp_dict, savedir_base, reset, compute_fid=False):
    # Book keeping
    pprint.pprint(exp_dict)
    exp_id = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, exp_id)
    if reset:
        ut.rmtree(savedir)

    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, 'exp_dict.json'), exp_dict)
    print('Experiment saved in %s' % savedir)

    device = torch.device('cuda:' + exp_dict['gpu']
                          if torch.cuda.is_available() else 'cpu')

    # 1. Load dataset and loader
    train_set, test_set, num_channels, num_train_classes, num_test_classes = \
        datasets.get_dataset(exp_dict['dataset'],
                             dataset_path=savedir_base,
                             image_size=exp_dict['image_size'])
    train_loader, test_loader = \
        dataloaders.get_dataloader(exp_dict['dataloader'],
                                   train_set, test_set, exp_dict)

    # 2. Fetch model to train
    model = models.get_model(exp_dict['model'],
                             num_train_classes, num_test_classes,
                             num_channels, device, exp_dict)

    # 3. Resume experiment or start from scratch
    score_list_path = os.path.join(savedir, 'score_list.pkl')
    if os.path.exists(score_list_path):
        # Resume experiment if it exists
        model_path = os.path.join(savedir, 'model_state_dict.pth')
        model.load_state_dict(hu.torch_load(model_path))
        score_list = hu.load_pkl(score_list_path)

        meta_dict_path = os.path.join(savedir, 'meta_dict.pkl')
        meta_dict = hu.load_pkl(meta_dict_path)
        print('Resuming experiment at episode %d epoch %d' %
              (meta_dict['episode'], meta_dict['epoch']))
    else:
        # Start experiment from scratch
        meta_dict = {'episode': 1, 'epoch': 1}
        score_list = []

        # Remove TensorBoard logs from previous runs
        ut.rmtree(os.path.join(savedir, 'tensorboard_logs'))
        print('Starting experiment at episode %d epoch %d' %
              (meta_dict['episode'], meta_dict['epoch']))

    # 4. Train and eval loop
    s_epoch = meta_dict['epoch']
    for e in range(s_epoch, exp_dict['num_epochs'] + 1):
        # 0. Initialize dicts
        score_dict = {'epoch': e}
        meta_dict['epoch'] = e

        # 1. Train on loader
        train_dict = model.train_on_loader(train_loader)

        # 1b. Compute FID
        if compute_fid == 1:
            if e % 20 == 0 or e == 1 or e == exp_dict['num_epochs']:
                print('Starting FID computation...')
                train_dict['fid'] = fid(model, train_loader.dataset,
                                        train_loader.sampler, savedir)

        score_dict.update(train_dict)

        # 2. Eval on loader
        eval_dict = model.val_on_loader(test_loader, savedir, e)
        score_dict.update(eval_dict)

        # 3. Report and save model state, optimizer state, and scores
        score_list += [score_dict]
        score_df = pd.DataFrame(score_list)
        print('\n', score_df.tail(), '\n')

        if e % 10 == 0:
            hu.torch_save(os.path.join(savedir, 'model_state_dict.pth'),
                          model.get_state_dict())
            hu.save_pkl(os.path.join(savedir, 'score_list.pkl'), score_list)
            hu.save_pkl(os.path.join(savedir, 'meta_dict.pkl'), meta_dict)
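# Usage sketch for train(). The exp_dict keys mirror the ones accessed above;
# all values are illustrative, not the authors' settings.
#
#   exp_dict = {'dataset': 'mnist', 'image_size': 32, 'gpu': '0',
#               'dataloader': '...', 'model': '...', 'num_epochs': 200}
#   train(exp_dict, savedir_base='./results', reset=False, compute_fid=1)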
def trainval(exp_dict, savedir, args):
    """
    exp_dict: dictionary defining the hyperparameters of the experiment
    savedir: the directory where the experiment will be saved
    args: arguments passed through the command line
    """
    # set seed
    # ==================
    seed = 42
    np.random.seed(seed)
    torch.manual_seed(seed)

    # helen commented out the following lines to hard code that the device was
    # 'cpu' to resolve errors
    # if args.use_cuda:
    #     device = 'cuda'
    #     torch.cuda.manual_seed_all(seed)
    #     assert torch.cuda.is_available(), 'cuda is not available, please run with "-c 0"'
    # else:
    device = 'cpu'

    print('Running on device: %s' % device)

    # Dataset
    # Load val set and train set
    val_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                   split="val",
                                   transform=exp_dict.get("transform"),
                                   datadir=args.datadir)
    train_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                     split="train",
                                     transform=exp_dict.get("transform"),
                                     datadir=args.datadir)

    # Load train loader, val loader, and vis loader
    train_loader = DataLoader(train_set,
                              sampler=RandomSampler(train_set,
                                                    replacement=True,
                                                    num_samples=max(min(500, len(train_set)),
                                                                    len(val_set))),
                              batch_size=exp_dict["batch_size"])
    val_loader = DataLoader(val_set, shuffle=False, batch_size=exp_dict["batch_size"])
    vis_loader = DataLoader(val_set,
                            sampler=ut.SubsetSampler(train_set, indices=[0, 1, 2]),
                            batch_size=1)

    # Create model, opt, wrapper
    model_original = models.get_model(exp_dict["model"], exp_dict=exp_dict).cuda()
    opt = torch.optim.Adam(model_original.parameters(), lr=1e-5, weight_decay=0.0005)
    model = wrappers.get_wrapper(exp_dict["wrapper"], model=model_original, opt=opt).cuda()

    score_list = []

    # Checkpointing
    # =============
    # score_list_path = os.path.join(savedir, "score_list.pkl")
    # helen commented out these three lines and hard coded the model and opt
    # paths to resolve errors
    # model_path = os.path.join(savedir, "model_state_dict.pth")
    # opt_path = os.path.join(savedir, "opt_state_dict.pth")
    score_list_path = '/Users/helenpropson/Documents/git/marepesca/results/testresults/score_list.pkl'  # helen added this
    model_path = '/Users/helenpropson/Documents/git/marepesca/results/testresults/model_state_dict.pth'  # helen added this
    opt_path = '/Users/helenpropson/Documents/git/marepesca/results/testresults/opt_state_dict.pth'  # helen added this

    # helen hard coded that the experiment would resume instead of restarting from epoch 0
    # if os.path.exists(score_list_path):
    # resume experiment
    score_list = hu.load_pkl(score_list_path)  # helen changed this from ut.load_pkl to hu.load_pkl to resolve error
    model.load_state_dict(torch.load(model_path))
    opt.load_state_dict(torch.load(opt_path))
    s_epoch = score_list[-1]["epoch"] + 1
    # else:
    #     # restart experiment
    #     score_list = []
    #     s_epoch = 0

    # *************** helen added this code
    im = Image.open("/Users/helenpropson/Documents/git/marepesca/tank.jpg")
    # im.show()  # this line will display the image you are running the model on if uncommented

    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]
    normalize_transform = transforms.Normalize(mean=mean, std=std)
    # transformations we will use on our image
    data_transform = transforms.Compose([transforms.ToTensor(),
                                         normalize_transform])

    im_new = data_transform(im)  # transforms the image into a tensor and normalizes it
    im_final = im_new.unsqueeze(0)  # adds another dimension so image is the correct shape for the model

    print("now trying helen's code")  # print statement for debugging
    # model.vis_on_batch_helen(im_final, f'im_new')  # uncomment this line to run model on image
    # *************** this is the end of helen's code

    # Run training and validation
    for epoch in range(s_epoch, exp_dict["max_epoch"]):
        score_dict = {"epoch": epoch}

        # visualize
        model.vis_on_loader(vis_loader, savedir=os.path.join(savedir, "images"))
        print("after vis_on_loader")  # helen added this print statement as an update while iterating

        # validate
        score_dict.update(model.val_on_loader(val_loader))
        print("after validate")

        # train
        score_dict.update(model.train_on_loader(train_loader))
        print("after train")

        # Add score_dict to score_list
        score_list += [score_dict]

        # Report and save
        print(pd.DataFrame(score_list).tail())
        hu.save_pkl(score_list_path, score_list)
        hu.torch_save(model_path, model.state_dict())
        hu.torch_save(opt_path, opt.state_dict())
        print("Saved in %s" % savedir)
def test(exp_dict, savedir_base, datadir, num_workers=0, model_path=None, scan_id=None):
    # bookkeeping stuff
    # ==================
    pprint.pprint(exp_dict)
    exp_id = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, exp_id)
    os.makedirs(savedir, exist_ok=True)

    hu.save_json(os.path.join(savedir, "exp_dict.json"), exp_dict)
    print("Experiment saved in %s" % savedir)

    # Dataset
    # ==================
    # val set
    test_set = datasets.get_dataset(dataset_dict=exp_dict["dataset"],
                                    split="val",
                                    datadir=datadir,
                                    exp_dict=exp_dict,
                                    dataset_size=exp_dict['dataset_size'])
    if str(scan_id) != 'None':
        test_set.active_data = test_set.get_scan(scan_id)

    test_sampler = torch.utils.data.SequentialSampler(test_set)
    test_loader = DataLoader(test_set,
                             sampler=test_sampler,
                             batch_size=1,
                             collate_fn=ut.collate_fn,
                             num_workers=num_workers)

    # Model
    # ==================
    # chk = torch.load('best_model.ckpt')
    model = models.get_model_for_onnx_export(model_dict=exp_dict['model'],
                                             exp_dict=exp_dict,
                                             train_set=test_set).cuda()
    epoch = -1
    if str(model_path) != 'None':
        model.load_state_dict(hu.torch_load(model_path))
    else:
        try:
            exp_dict_train = copy.deepcopy(exp_dict)
            del exp_dict_train['test_mode']
            savedir_train = os.path.join(savedir_base, hu.hash_dict(exp_dict_train))
            model_path = os.path.join(savedir_train, "model_best.pth")
            score_list = hu.load_pkl(os.path.join(savedir_train, 'score_list_best.pkl'))
            epoch = score_list[-1]['epoch']
            print('Loaded model at epoch %d with score %.3f' %
                  (epoch, score_list[-1]['val_score']))
            model.load_state_dict(hu.torch_load(model_path))
        except Exception:
            pass

    s_time = time.time()
    savedir_images = os.path.join(savedir, 'images')

    # delete image folder if exists
    if os.path.exists(savedir_images):
        shutil.rmtree(savedir_images)
    os.makedirs(savedir_images, exist_ok=True)

    # for i in range(20):
    #     score_dict = model.train_on_loader(test_loader)
    score_dict = model.val_on_loader(test_loader,
                                     savedir_images=savedir_images,
                                     n_images=30000,
                                     save_preds=True)
    score_dict['epoch'] = epoch
    score_dict["time"] = time.time() - s_time
    score_dict["saved_at"] = hu.time_to_montreal()

    # save test_score_list
    test_path = os.path.join(savedir, "score_list.pkl")
    if os.path.exists(test_path):
        test_score_list = [sd for sd in hu.load_pkl(test_path)
                           if sd['epoch'] != epoch]
    else:
        test_score_list = []

    # append score_dict to last result
    test_score_list += [score_dict]
    hu.save_pkl(test_path, test_score_list)
    print('Final Score is ', str(score_dict["val_score"]) + "\n")
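# Usage sketch for test(). With model_path=None it looks up the matching
# training run's model_best.pth under savedir_base, and scan_id optionally
# restricts evaluation to a single scan; all paths are assumed.
#
#   test(exp_dict, savedir_base='./results', datadir='./data',
#        num_workers=0, model_path=None, scan_id=None)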
def __init__(self, model, n_classes, exp_dict, pretrained_savedir=None, savedir_base=None):
    """ Constructor
    Args:
        model: architecture to train
        exp_dict: reference to dictionary with the global state of the application
    """
    super().__init__()
    self.model = model
    self.exp_dict = exp_dict
    self.ngpu = self.exp_dict["ngpu"]
    self.predict_method = exp_dict['predict_method']

    self.model.add_classifier(n_classes, modalities=0)
    self.nclasses = n_classes

    best_accuracy = -1
    self.label = exp_dict['model']['backbone'] + "_" + \
        exp_dict['dataset_test'].split('_')[1].replace('-imagenet', '')

    print('=============')
    print('dataset:', exp_dict["dataset_train"].split('_')[-1])
    print('backbone:', exp_dict['model']["backbone"])
    print('n_classes:', exp_dict['n_classes'])
    print('support_size_train:', exp_dict['support_size_train'])

    if pretrained_savedir is None:
        # find the best checkpoint
        savedir_base = exp_dict["finetuned_weights_root"]
        if not os.path.exists(savedir_base):
            raise ValueError("Please set the variable named "
                             "'finetuned_weights_root' with the path of the folder "
                             "with the episodic finetuning experiments")
        for exp_hash in os.listdir(savedir_base):
            base_path = os.path.join(savedir_base, exp_hash)
            exp_dict_path = os.path.join(base_path, 'exp_dict.json')
            if not os.path.exists(exp_dict_path):
                continue
            loaded_exp_dict = hu.load_json(exp_dict_path)
            pkl_path = os.path.join(base_path, 'score_list_best.pkl')

            if exp_dict['support_size_train'] in [2, 3, 4]:
                support_size_needed = 1
            else:
                support_size_needed = exp_dict['support_size_train']

            if (loaded_exp_dict["model"]["name"] == 'finetuning' and
                    loaded_exp_dict["dataset_train"].split('_')[-1] == exp_dict["dataset_train"].split('_')[-1] and
                    loaded_exp_dict["model"]["backbone"] == exp_dict['model']["backbone"] and
                    loaded_exp_dict['n_classes'] == exp_dict["n_classes"] and
                    loaded_exp_dict['support_size_train'] == support_size_needed and
                    loaded_exp_dict["embedding_prop"] == exp_dict["embedding_prop"]):
                model_path = os.path.join(base_path, 'checkpoint_best.pth')
                try:
                    print("Attempting to load ", model_path)
                    accuracy = hu.load_pkl(pkl_path)[-1]["val_accuracy"]
                    self.model.load_state_dict(torch.load(model_path)['model'],
                                               strict=False)
                    if accuracy > best_accuracy:
                        best_path = os.path.join(base_path, 'checkpoint_best.pth')
                        best_accuracy = accuracy
                except Exception as e:
                    print(e)

    assert best_accuracy > 0.1
    print("Finetuning %s with original accuracy : %f" % (best_path, best_accuracy))
    self.model.load_state_dict(torch.load(best_path)['model'], strict=False)
    self.best_accuracy = best_accuracy
    self.acc_sum = 0.0
    self.n_count = 0
    self.model.cuda()
if points.sum() == 0: continue savedir_image = os.path.join('.tmp/qualitative/%d.png' % (i)) img = hu.denormalize(batch['images'], mode='rgb') img_org = np.array( hu.save_image(savedir_image, img, mask=batch['masks'].numpy(), return_image=True)) img_list = [img_org] with torch.no_grad(): for hash_id in hash_list: score_path = os.path.join(savedir_base, hash_id, 'score_list_best.pkl') score_list = hu.load_pkl(score_path) exp_dict = hu.load_json( os.path.join(savedir_base, hash_id, 'exp_dict.json')) print(i, exp_dict['model']['loss'], exp_dict['model'].get('with_affinity'), 'score:', score_list[-1]['test_class1']) model = models.get_model(model_dict=exp_dict['model'], exp_dict=exp_dict, train_set=test_set).cuda() model_path = os.path.join(savedir_base, hash_id, 'model_best.pth') model.load_state_dict(hu.torch_load(model_path), with_opt=False)
def trainval(exp_dict, savedir_base, reset=False): # bookkeeping # --------------- # get experiment directory exp_id = hu.hash_dict(exp_dict) savedir = os.path.join(savedir_base, exp_id) if reset: # delete and backup experiment hc.delete_experiment(savedir, backup_flag=True) # create folder and save the experiment dictionary os.makedirs(savedir, exist_ok=True) hu.save_json(os.path.join(savedir, 'exp_dict.json'), exp_dict) pprint.pprint(exp_dict) print('Experiment saved in %s' % savedir) # set seed # --------------- seed = 42 + exp_dict['runs'] np.random.seed(seed) torch.manual_seed(seed) torch.cuda.manual_seed_all(seed) # Dataset # ----------- # train loader train_set = datasets.get_dataset(dataset_name=exp_dict["dataset"], train_flag=True, datadir=savedir_base, exp_dict=exp_dict) train_loader = torch.utils.data.DataLoader( train_set, drop_last=True, shuffle=True, batch_size=exp_dict["batch_size"]) # val set val_set = datasets.get_dataset(dataset_name=exp_dict["dataset"], train_flag=False, datadir=savedir_base, exp_dict=exp_dict) # Model # ----------- model = models.get_model(exp_dict["model"], train_set=train_set).cuda() # Choose loss and metric function loss_function = metrics.get_metric_function(exp_dict["loss_func"]) # Compute fstar # ------------- if exp_dict['opt'].get('fstar_flag'): ut.compute_fstar(train_set, loss_function, savedir_base, exp_dict) # Load Optimizer n_batches_per_epoch = len(train_set) / float(exp_dict["batch_size"]) opt = optimizers.get_optimizer(opt_dict=exp_dict["opt"], params=model.parameters(), n_batches_per_epoch=n_batches_per_epoch) # Checkpoint # ----------- model_path = os.path.join(savedir, 'model.pth') score_list_path = os.path.join(savedir, 'score_list.pkl') opt_path = os.path.join(savedir, 'opt_state_dict.pth') if os.path.exists(score_list_path): # resume experiment score_list = hu.load_pkl(score_list_path) model.load_state_dict(torch.load(model_path)) opt.load_state_dict(torch.load(opt_path)) s_epoch = score_list[-1]['epoch'] + 1 else: # restart experiment score_list = [] s_epoch = 0 # Train & Val # ------------ print('Starting experiment at epoch %d/%d' % (s_epoch, exp_dict['max_epoch'])) for e in range(s_epoch, exp_dict['max_epoch']): # Set seed seed = e + exp_dict['runs'] np.random.seed(seed) torch.manual_seed(seed) torch.cuda.manual_seed_all(seed) score_dict = {} # Compute train loss over train set score_dict["train_loss"] = metrics.compute_metric_on_dataset( model, train_set, metric_name=exp_dict["loss_func"]) # Compute val acc over val set score_dict["val_acc"] = metrics.compute_metric_on_dataset( model, val_set, metric_name=exp_dict["acc_func"]) # Train over train loader model.train() print("%d - Training model with %s..." 
% (e, exp_dict["loss_func"])) # train and validate s_time = time.time() for batch in tqdm.tqdm(train_loader): images, labels = batch["images"].cuda(), batch["labels"].cuda() opt.zero_grad() # closure def closure(): return loss_function(model, images, labels, backwards=True) opt.step(closure) e_time = time.time() # Record metrics score_dict["epoch"] = e score_dict["step_size"] = opt.state["step_size"] score_dict["step_size_avg"] = opt.state["step_size_avg"] score_dict["n_forwards"] = opt.state["n_forwards"] score_dict["n_backwards"] = opt.state["n_backwards"] score_dict["grad_norm"] = opt.state["grad_norm"] score_dict["batch_size"] = train_loader.batch_size score_dict["train_epoch_time"] = e_time - s_time score_list += [score_dict] # Report and save print(pd.DataFrame(score_list).tail()) hu.save_pkl(score_list_path, score_list) hu.torch_save(model_path, model.state_dict()) hu.torch_save(opt_path, opt.state_dict()) print("Saved: %s" % savedir) print('Experiment completed')
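# Minimal sketch of the closure pattern used in the loop above. Line-search
# optimizers (torch.optim.LBFGS here, SLS-style optimizers in the experiments)
# may re-evaluate the loss inside step(), so the forward/backward pass is
# wrapped in a closure that step() can call as many times as it needs.
import torch

model = torch.nn.Linear(10, 1)
opt = torch.optim.LBFGS(model.parameters())
x, y = torch.randn(32, 10), torch.randn(32, 1)

def closure():
    opt.zero_grad()
    loss = torch.nn.functional.mse_loss(model(x), y)
    loss.backward()
    return loss

loss = opt.step(closure)  # step() invokes closure(), possibly several times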
def trainval(exp_dict, savedir_base, reset=False): # bookkeeping # --------------- # get experiment directory exp_id = hu.hash_dict(exp_dict) savedir = os.path.join(savedir_base, exp_id) if reset: # delete and backup experiment hc.delete_experiment(savedir, backup_flag=True) # create folder and save the experiment dictionary os.makedirs(savedir, exist_ok=True) hu.save_json(os.path.join(savedir, 'exp_dict.json'), exp_dict) pprint.pprint(exp_dict) print('Experiment saved in %s' % savedir) # Dataset # ----------- # train loader train_loader = datasets.get_loader(dataset_name=exp_dict['dataset'], datadir=savedir_base, split='train') # val loader val_loader = datasets.get_loader(dataset_name=exp_dict['dataset'], datadir=savedir_base, split='val') # Model # ----------- model = models.get_model(model_name=exp_dict['model']) # Checkpoint # ----------- model_path = os.path.join(savedir, 'model.pth') score_list_path = os.path.join(savedir, 'score_list.pkl') if os.path.exists(score_list_path): # resume experiment model.set_state_dict(hu.torch_load(model_path)) score_list = hu.load_pkl(score_list_path) s_epoch = score_list[-1]['epoch'] + 1 else: # restart experiment score_list = [] s_epoch = 0 # Train & Val # ------------ print('Starting experiment at epoch %d' % (s_epoch)) for e in range(s_epoch, 10): score_dict = {} # Train the model train_dict = model.train_on_loader(train_loader) # Validate the model val_dict = model.val_on_loader(val_loader) # Get metrics score_dict['train_loss'] = train_dict['train_loss'] score_dict['val_acc'] = val_dict['val_acc'] score_dict['epoch'] = e # Add to score_list and save checkpoint score_list += [score_dict] # Report & Save score_df = pd.DataFrame(score_list) print(score_df.tail()) hu.torch_save(model_path, model.get_state_dict()) hu.save_pkl(score_list_path, score_list) print('Checkpoint Saved: %s' % savedir) print('experiment completed')
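# Hypothetical launcher for the minimal trainval() above: each exp_dict in the
# grid hashes to its own savedir, so runs resume independently. The grid values
# and output root are illustrative assumptions.
import itertools

grid = {'dataset': ['mnist', 'cifar10'], 'model': ['mlp', 'resnet18']}
exp_list = [dict(zip(grid, vals)) for vals in itertools.product(*grid.values())]
for exp_dict in exp_list:
    trainval(exp_dict, savedir_base='/tmp/results')  # assumed output root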
def trainval(exp_dict, savedir_base, data_root, reset=False, test_only=False):
    # bookkeeping
    # ---------------

    # get experiment directory
    exp_id = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, exp_id)

    np.random.seed(exp_dict["seed"])
    torch.manual_seed(exp_dict["seed"])

    if reset:
        # delete and backup experiment
        hc.delete_experiment(savedir, backup_flag=True)

    # create folder and save the experiment dictionary
    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, "exp_dict.json"), exp_dict)
    pprint.pprint(exp_dict)
    print("Experiment saved in %s" % savedir)

    # Dataset
    # -----------
    # train and val loader
    if not exp_dict["episodic"]:
        if int(test_only) == 0:
            train_dataset, val_dataset, test_dataset = get_dataset(
                ['train', 'val', 'test'], data_root, exp_dict)
            train_loader = DataLoader(train_dataset,
                                      batch_size=exp_dict['batch_size'],
                                      shuffle=True,
                                      num_workers=args.num_workers)
            val_loader = DataLoader(val_dataset,
                                    batch_size=exp_dict['batch_size'],
                                    shuffle=True,
                                    num_workers=args.num_workers)
            test_loader = DataLoader(test_dataset,
                                     batch_size=exp_dict['batch_size'],
                                     shuffle=True,
                                     num_workers=args.num_workers)
            if hasattr(train_dataset, "mask"):
                # assert((train_dataset.mask == val_dataset.mask).all())
                # assert((train_dataset.mask == test_dataset.mask).all())
                np.save(os.path.join(savedir, "mask.npy"), train_dataset.mask)
        else:
            # the original called get_dataset(['test'], exp_dict); the data_root
            # argument was missing and has been added here
            test_dataset, = get_dataset(['test'], data_root, exp_dict)
            test_loader = DataLoader(test_dataset,
                                     batch_size=exp_dict['batch_size'],
                                     shuffle=True,
                                     num_workers=args.num_workers)
    else:  # to support episodes TODO: move inside each model
        from datasets.episodic_dataset import EpisodicDataLoader
        # the original never built the datasets in this branch; load them here
        # so the episodic loaders have something to wrap
        train_dataset, val_dataset = get_dataset(['train', 'val'], data_root, exp_dict)
        train_loader = EpisodicDataLoader(train_dataset,
                                          batch_size=exp_dict['batch_size'],
                                          shuffle=True,
                                          collate_fn=lambda x: x,
                                          num_workers=args.num_workers)
        val_loader = EpisodicDataLoader(val_dataset,
                                        batch_size=exp_dict['batch_size'],
                                        shuffle=True,
                                        collate_fn=lambda x: x,
                                        num_workers=args.num_workers)

    # Model
    # -----------
    model = get_model(exp_dict)
    print("Parameters: ", sum([torch.numel(v) for v in model.parameters()]))

    # Checkpoint
    # -----------
    model_path = os.path.join(savedir, "model.pth")
    score_list_path = os.path.join(savedir, "score_list.pkl")

    if os.path.exists(score_list_path):
        # resume experiment
        print("Resuming from", model_path)
        model.set_state_dict(hu.torch_load(model_path))
        score_list = hu.load_pkl(score_list_path)
        s_epoch = score_list[-1]['epoch'] + 1
    else:
        # restart experiment
        score_list = []
        s_epoch = 0

    if int(test_only) == 0:
        # Train & Val
        # ------------
        print("Starting experiment at epoch %d" % s_epoch)

        for e in range(s_epoch, exp_dict['max_epoch']):
            score_dict = {}

            # Train the model
            score_dict.update(model.train_on_loader(train_loader))

            # Validate the model
            score_dict.update(
                model.val_on_loader(val_loader,
                                    savedir=os.path.join(
                                        savedir_base,
                                        exp_dict['dataset']['name'])))
            score_dict["epoch"] = e

            # Visualize the model
            # model.vis_on_loader(vis_loader, savedir=savedir+"/images/")

            # Add to score_list and save checkpoint
            score_list += [score_dict]

            # Report & Save
            score_df = pd.DataFrame(score_list)
            print("\n", score_df.tail())
            hu.torch_save(model_path, model.get_state_dict())
            hu.save_pkl(score_list_path, score_list)
            print("Checkpoint Saved: %s" % savedir)

            if model.is_end():
                print("Early stopping")
                break
        print('Experiment completed')

        print("Testing...")
        score_dict = model.test_on_loader(train_loader, tag="train")
        score_dict.update(model.test_on_loader(val_loader, tag="val"))
        score_dict.update(model.test_on_loader(test_loader, tag="test"))
        # Report & Save
score_list_path = os.path.join(savedir, "score_list_test.pkl") hu.save_pkl(score_list_path, score_dict) else: print("Testing...") score_dict = model.test_on_loader(test_loader, "test") # Report & Save score_list_path = os.path.join(savedir, "score_list_test.pkl") hu.save_pkl(score_list_path, score_dict)
def trainval(exp_dict, savedir, args):
    """
    exp_dict: dictionary defining the hyperparameters of the experiment
    savedir: the directory where the experiment will be saved
    args: arguments passed through the command line
    """
    # set seed
    # ==================
    seed = 42
    np.random.seed(seed)
    torch.manual_seed(seed)
    if args.use_cuda:
        device = 'cuda'
        torch.cuda.manual_seed_all(seed)
        assert torch.cuda.is_available(), 'cuda is not available, please run with "-c 0"'
    else:
        device = 'cpu'

    print('Running on device: %s' % device)

    # Dataset
    # Load val set and train set
    val_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                   split="val",
                                   transform=exp_dict.get("transform"),
                                   datadir=args.datadir)
    train_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                     split="train",
                                     transform=exp_dict.get("transform"),
                                     datadir=args.datadir)

    # Load train loader, val loader, and vis loader
    train_loader = DataLoader(train_set,
                              sampler=RandomSampler(
                                  train_set,
                                  replacement=True,
                                  num_samples=max(min(500, len(train_set)), len(val_set))),
                              batch_size=exp_dict["batch_size"])

    val_loader = DataLoader(val_set,
                            shuffle=False,
                            batch_size=exp_dict["batch_size"])
    # the original passed train_set to the sampler although the loader iterates
    # val_set; the sampler only yields fixed indices, so val_set is the
    # consistent choice
    vis_loader = DataLoader(val_set,
                            sampler=ut.SubsetSampler(val_set, indices=[0, 1, 2]),
                            batch_size=1)

    # Create model, opt, wrapper
    model_original = models.get_model(exp_dict["model"], exp_dict=exp_dict).cuda()
    opt = torch.optim.Adam(model_original.parameters(), lr=1e-5, weight_decay=0.0005)

    model = wrappers.get_wrapper(exp_dict["wrapper"], model=model_original, opt=opt).cuda()

    # Checkpointing
    # =============
    score_list_path = os.path.join(savedir, "score_list.pkl")
    model_path = os.path.join(savedir, "model_state_dict.pth")
    opt_path = os.path.join(savedir, "opt_state_dict.pth")

    if os.path.exists(score_list_path):
        # resume experiment
        score_list = hu.load_pkl(score_list_path)
        model.load_state_dict(torch.load(model_path))
        opt.load_state_dict(torch.load(opt_path))
        s_epoch = score_list[-1]["epoch"] + 1
    else:
        # restart experiment
        score_list = []
        s_epoch = 0

    # Run training and validation
    for epoch in range(s_epoch, exp_dict["max_epoch"]):
        score_dict = {"epoch": epoch}

        # visualize
        model.vis_on_loader(vis_loader, savedir=os.path.join(savedir, "images"))

        # validate
        score_dict.update(model.val_on_loader(val_loader))

        # train
        score_dict.update(model.train_on_loader(train_loader))

        # Add score_dict to score_list
        score_list += [score_dict]

        # Report and save
        print(pd.DataFrame(score_list).tail())
        hu.save_pkl(score_list_path, score_list)
        hu.torch_save(model_path, model.state_dict())
        hu.torch_save(opt_path, opt.state_dict())
        print("Saved in %s" % savedir)
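# A plausible implementation of ut.SubsetSampler, used above for the vis
# loader (an assumption -- the actual utility may differ): it yields a fixed
# list of indices, so visualization always shows the same few images.
import torch

class SubsetSampler(torch.utils.data.Sampler):
    def __init__(self, dataset, indices):
        self.indices = indices  # fixed indices to draw from the dataset

    def __iter__(self):
        return iter(self.indices)

    def __len__(self):
        return len(self.indices)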
def trainval(exp_dict, savedir_base, reset=False, num_workers=0, run_ssl=False): # bookkeeping # --------------- # get experiment directory exp_id = hu.hash_dict(exp_dict) savedir = os.path.join(savedir_base, exp_id) if reset: # delete and backup experiment hc.delete_experiment(savedir, backup_flag=True) # create folder and save the experiment dictionary os.makedirs(savedir, exist_ok=True) hu.save_json(os.path.join(savedir, 'exp_dict.json'), exp_dict) pprint.pprint(exp_dict) print('Experiment saved in %s' % savedir) # load datasets # ========================== train_set = datasets.get_dataset( dataset_name=exp_dict["dataset_train"], data_root=exp_dict["dataset_train_root"], split="train", transform=exp_dict["transform_train"], classes=exp_dict["classes_train"], support_size=exp_dict["support_size_train"], query_size=exp_dict["query_size_train"], n_iters=exp_dict["train_iters"], unlabeled_size=exp_dict["unlabeled_size_train"]) val_set = datasets.get_dataset( dataset_name=exp_dict["dataset_val"], data_root=exp_dict["dataset_val_root"], split="val", transform=exp_dict["transform_val"], classes=exp_dict["classes_val"], support_size=exp_dict["support_size_val"], query_size=exp_dict["query_size_val"], n_iters=exp_dict["val_iters"], unlabeled_size=exp_dict["unlabeled_size_val"]) test_set = datasets.get_dataset( dataset_name=exp_dict["dataset_test"], data_root=exp_dict["dataset_test_root"], split="test", transform=exp_dict["transform_val"], classes=exp_dict["classes_test"], support_size=exp_dict["support_size_test"], query_size=exp_dict["query_size_test"], n_iters=exp_dict["test_iters"], unlabeled_size=exp_dict["unlabeled_size_test"]) # get dataloaders # ========================== train_loader = torch.utils.data.DataLoader( train_set, batch_size=exp_dict["batch_size"], shuffle=True, num_workers=num_workers, collate_fn=ut.get_collate(exp_dict["collate_fn"]), drop_last=True) val_loader = torch.utils.data.DataLoader(val_set, batch_size=1, shuffle=False, num_workers=num_workers, collate_fn=lambda x: x, drop_last=True) test_loader = torch.utils.data.DataLoader(test_set, batch_size=1, shuffle=False, num_workers=num_workers, collate_fn=lambda x: x, drop_last=True) # create model and trainer # ========================== # Create model, opt, wrapper backbone = backbones.get_backbone( backbone_name=exp_dict['model']["backbone"], exp_dict=exp_dict) model = models.get_model(model_name=exp_dict["model"]['name'], backbone=backbone, n_classes=exp_dict["n_classes"], exp_dict=exp_dict) if run_ssl: # runs the SSL experiments score_list_path = os.path.join(savedir, 'score_list.pkl') if not os.path.exists(score_list_path): test_dict = model.test_on_loader(test_loader, max_iter=None) hu.save_pkl(score_list_path, [test_dict]) return # Checkpoint # ----------- checkpoint_path = os.path.join(savedir, 'checkpoint.pth') score_list_path = os.path.join(savedir, 'score_list.pkl') if os.path.exists(score_list_path): # resume experiment model.load_state_dict(hu.torch_load(checkpoint_path)) score_list = hu.load_pkl(score_list_path) s_epoch = score_list[-1]['epoch'] + 1 else: # restart experiment score_list = [] s_epoch = 0 # Run training and validation for epoch in range(s_epoch, exp_dict["max_epoch"]): score_dict = {"epoch": epoch} score_dict.update(model.get_lr()) # train score_dict.update(model.train_on_loader(train_loader)) # validate score_dict.update(model.val_on_loader(val_loader)) score_dict.update(model.test_on_loader(test_loader)) # Add score_dict to score_list score_list += [score_dict] # Report score_df = 
pd.DataFrame(score_list)
        print(score_df.tail())

        # Save checkpoint
        hu.save_pkl(score_list_path, score_list)
        hu.torch_save(checkpoint_path, model.get_state_dict())
        print("Saved: %s" % savedir)

        # on the first epoch there is no previous score to compare against,
        # so treat it as the best so far (the original comparison against an
        # empty slice was always False here)
        if len(score_df) == 1:
            is_best = True
        elif "accuracy" in exp_dict["target_loss"]:
            is_best = score_dict[exp_dict["target_loss"]] >= score_df[
                exp_dict["target_loss"]][:-1].max()
        else:
            is_best = score_dict[exp_dict["target_loss"]] <= score_df[
                exp_dict["target_loss"]][:-1].min()

        # Save best checkpoint
        if is_best:
            hu.save_pkl(os.path.join(savedir, "score_list_best.pkl"), score_list)
            hu.torch_save(os.path.join(savedir, "checkpoint_best.pth"),
                          model.get_state_dict())
            print("Saved Best: %s" % savedir)

        # Check for end of training conditions
        if model.is_end_of_training():
            break
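# Toy illustration of the best-checkpoint rule above: higher-is-better for
# accuracy-style targets, lower-is-better for losses. The scores are made up.
import pandas as pd

score_list = [{'val_accuracy': 0.61}, {'val_accuracy': 0.68}, {'val_accuracy': 0.65}]
score_df = pd.DataFrame(score_list)
target = 'val_accuracy'
is_best = score_list[-1][target] >= score_df[target][:-1].max()
print(is_best)  # False: the latest 0.65 does not beat the earlier 0.68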
def newminimum(exp_id, savedir_base, datadir, name, exp_dict, metrics_flag=True):
    # bookkeeping
    # ---------------

    # get experiment directory
    old_modeldir = os.path.join(savedir_base, exp_id)
    savedir = os.path.join(savedir_base, exp_id, name)

    old_exp_dict = hu.load_json(os.path.join(old_modeldir, 'exp_dict.json'))

    # TODO: compare the exp dicts for possible errors:
    # the optimizers have to be the same
    # same network, same dataset

    # create folder and save the experiment dictionary
    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, 'exp_dict.json'), exp_dict)
    pprint.pprint(exp_dict)
    print('Experiment saved in %s' % savedir)

    # set seed
    # ---------------
    seed = 42 + exp_dict['runs']
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Dataset
    # -----------

    # Load Train Dataset
    train_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                     train_flag=True,
                                     datadir=datadir,
                                     exp_dict=exp_dict)

    train_loader = torch.utils.data.DataLoader(train_set,
                                               drop_last=True,
                                               shuffle=True,
                                               batch_size=exp_dict["batch_size"])

    # Load Val Dataset
    val_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                   train_flag=False,
                                   datadir=datadir,
                                   exp_dict=exp_dict)

    # Model
    # -----------
    model = models.get_model(exp_dict["model"], train_set=train_set)

    # Choose loss and metric function
    loss_function = metrics.get_metric_function(exp_dict["loss_func"])

    # Load Optimizer
    n_batches_per_epoch = len(train_set) / float(exp_dict["batch_size"])
    opt = optimizers.get_optimizer(opt=exp_dict["opt"],
                                   params=model.parameters(),
                                   n_batches_per_epoch=n_batches_per_epoch)

    # Checkpoint
    # -----------
    model_path = os.path.join(savedir, 'model.pth')
    score_list_path = os.path.join(savedir, 'score_list.pkl')
    opt_path = os.path.join(savedir, 'opt_state_dict.pth')

    old_model_path = os.path.join(old_modeldir, 'model.pth')
    old_score_list_path = os.path.join(old_modeldir, 'score_list.pkl')
    old_opt_path = os.path.join(old_modeldir, 'opt_state_dict.pth')

    score_list = hu.load_pkl(old_score_list_path)
    model.load_state_dict(torch.load(old_model_path))
    opt.load_state_dict(torch.load(old_opt_path))
    s_epoch = score_list[-1]['epoch'] + 1

    # save the current parameters as the reference minimum for comparison
    minimum = []
    for param in model.parameters():
        minimum.append(param.clone())

    # Train & Val
    # ------------
    print('Starting experiment at epoch %d/%d' % (s_epoch, exp_dict['max_epoch']))

    for epoch in range(s_epoch, exp_dict['max_epoch']):
        # Set seed
        np.random.seed(exp_dict['runs'] + epoch)
        torch.manual_seed(exp_dict['runs'] + epoch)
        # torch.cuda.manual_seed_all(exp_dict['runs'] + epoch)  # not needed: no cuda available

        score_dict = {"epoch": epoch}

        if metrics_flag:
            # 1. Compute train loss over train set
            score_dict["train_loss"] = metrics.compute_metric_on_dataset(
                model, train_set, metric_name='softmax_loss')
            # metric_name=exp_dict["loss_func"])
            # TODO: which loss should be used? (normal or with regularizer?)

            # 2. Compute val acc over val set
            score_dict["val_acc"] = metrics.compute_metric_on_dataset(
                model, val_set, metric_name=exp_dict["acc_func"])

        # 3. Train over train loader
        model.train()
        print("%d - Training model with %s..."
% (epoch, exp_dict["loss_func"]))

        s_time = time.time()
        for images, labels in tqdm.tqdm(train_loader):
            # images, labels = images.cuda(), labels.cuda()  # no cuda available
            opt.zero_grad()
            # only works with the custom regularized loss function
            loss = loss_function(model, images, labels, minimum, 0.1)
            loss.backward()
            opt.step()

        e_time = time.time()

        # Record metrics
        score_dict["step_size"] = opt.state["step_size"]
        score_dict["n_forwards"] = opt.state["n_forwards"]
        score_dict["n_backwards"] = opt.state["n_backwards"]
        score_dict["batch_size"] = train_loader.batch_size
        score_dict["train_epoch_time"] = e_time - s_time

        score_list += [score_dict]

        # Report and save
        print(pd.DataFrame(score_list).tail())
        hu.save_pkl(score_list_path, score_list)
        hu.torch_save(model_path, model.state_dict())
        hu.torch_save(opt_path, opt.state_dict())
        print("Saved: %s" % savedir)

        with torch.no_grad():
            print('Current distance: %f' % metrics.computedistance(minimum, model))

    print('Experiment completed')
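# A plausible implementation of the distance reported above (an assumption --
# the actual metrics.computedistance may differ): the l2 distance between the
# current parameters and the stored minimum.
import torch

def compute_distance(minimum, model):
    total = 0.0
    for p_min, p in zip(minimum, model.parameters()):
        # accumulate squared parameter-wise differences
        total += (p - p_min).pow(2).sum().item()
    return total ** 0.5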
def __init__(self, model, n_classes, exp_dict): """ Constructor Args: model: architecture to train exp_dict: reference to dictionary with the global state of the application """ super().__init__() self.model = model self.exp_dict = exp_dict self.ngpu = self.exp_dict["ngpu"] self.predict_method = exp_dict['predict_method'] self.model.add_classifier(n_classes, modalities=0) self.nclasses = n_classes if self.exp_dict["rotation_weight"] > 0: self.model.add_classifier(4, "classifier_rot") best_accuracy = -1 self.label = exp_dict['model']['backbone'] + "_" + exp_dict[ 'dataset_test'].split('_')[1].replace('-imagenet', '') if self.exp_dict["pretrained_weights_root"] == 'tinder': best_scores = np.load( '/mnt/datasets/public/research/adaptron_laplace/best_scores.npy', allow_pickle=True) for r in best_scores: backbone_best = r[3] dataset_best = r[4] savedir_best = r[-1] best_accuracy = r[0] shot_best = r[2] if (exp_dict['model']['backbone'] == backbone_best and exp_dict['dataset_test'] == dataset_best and 5 == shot_best): self.best_accuracy = best_accuracy self.model.load_state_dict( torch.load( os.path.join(savedir_best, 'checkpoint_best.pth'))['model']) break elif self.exp_dict["pretrained_weights_root"] == 'csv': best_scores = np.load( '/mnt/datasets/public/research/adaptron_laplace/best_scores.npy', allow_pickle=True) for r in best_scores: backbone_best = r[3] dataset_best = r[4] savedir_best = r[-1] best_accuracy = r[0] shot_best = r[2] if (exp_dict['model']['backbone'] == backbone_best and exp_dict['dataset_test'] == dataset_best and exp_dict['support_size_test'] == shot_best): self.best_accuracy = best_accuracy self.model.load_state_dict( torch.load( os.path.join(savedir_best, 'checkpoint_best.pth'))['model']) break elif self.exp_dict["pretrained_weights_root"] == 'hdf5': fdir = '/mnt/datasets/public/research/adaptron_laplace/embeddings/finetuned' fpos = "%s_1shot_fine_*/test.h5" % (self.label) embeddings_fname = glob.glob(os.path.join(fdir, fpos))[0] self.best_accuracy = float( embeddings_fname.split('/')[-2].split('_')[-1]) / 100. 
self.sampler = oracle.Sampler(embeddings_fname=embeddings_fname, n_classes=exp_dict['classes_test'], distract_flag=exp_dict.get( 'distract_flag', False)) elif self.exp_dict["pretrained_weights_root"] is not None: for exp_hash in os.listdir( self.exp_dict['pretrained_weights_root']): base_path = os.path.join( self.exp_dict['pretrained_weights_root'], exp_hash) exp_dict_path = os.path.join(base_path, 'exp_dict.json') if not os.path.exists(exp_dict_path): continue loaded_exp_dict = haven.load_json(exp_dict_path) pkl_path = os.path.join(base_path, 'score_list_best.pkl') if not os.path.exists(pkl_path): continue if (loaded_exp_dict["model"]["name"] == 'finetuning' and loaded_exp_dict["dataset_train"].split('_')[-1] == exp_dict["dataset_train"].split('_')[-1] and loaded_exp_dict["model"]["backbone"] == exp_dict['model']["backbone"] and loaded_exp_dict["labelprop_alpha"] == exp_dict["labelprop_alpha"] and loaded_exp_dict["labelprop_scale"] == exp_dict["labelprop_scale"] and loaded_exp_dict["support_size_train"] == exp_dict["support_size_train"]): accuracy = haven.load_pkl(pkl_path)[-1]["val_accuracy"] try: self.model.load_state_dict(torch.load( os.path.join(base_path, 'checkpoint_best.pth'))['model'], strict=False) if accuracy > best_accuracy: best_path = os.path.join(base_path, 'checkpoint_best.pth') best_accuracy = accuracy best_score_list = haven.load_pkl(pkl_path) except Exception as e: print(str(e)) assert (best_accuracy > 0.1) self.best_accuracy = best_score_list[-1]['test_accuracy'] print("Finetuning %s with original accuracy : %f" % (base_path, best_accuracy)) self.model.load_state_dict(torch.load(best_path)['model'], strict=False) else: raise ValueError('weights are not defined') self.acc_sum = 0.0 self.n_count = 0 self.model.cuda()
'6d4af38d64b23586e71a198de2608333': 'LCFCN', '84ced18cf5c1fb3ad5820cc1b55a38fa': 'LCFCN+Affinity_(ours)', '63f29eec3dbe1e03364f198ed7d4b414': 'Point-level_Loss ', '017e7441c2f581b6fee9e3ac6f574edc': 'Cross_entropy_Loss+pseudo-mask' } datadir = '/mnt/public/datasets/DeepFish/' score_list = [] for hash_id in hash_list: fname = os.path.join('/mnt/public/predictions/habitat/%s.pkl' % hash_id) exp_dict = hu.load_json( os.path.join(savedir_base, hash_id, 'exp_dict.json')) if os.path.exists(fname): print('FOUND:', fname) val_dict = hu.load_pkl(fname) else: train_set = datasets.get_dataset( dataset_dict=exp_dict["dataset"], split='train', datadir=datadir, exp_dict=exp_dict, dataset_size=exp_dict['dataset_size']) test_set = datasets.get_dataset( dataset_dict=exp_dict["dataset"], split='test', datadir=datadir, exp_dict=exp_dict, dataset_size=exp_dict['dataset_size'])
def trainval(exp_dict, savedir_base, reset=False):
    # bookkeeping
    # ---------------

    # get experiment directory
    exp_id = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, exp_id)

    if reset:
        # delete and backup experiment
        hc.delete_experiment(savedir, backup_flag=True)

    # create folder and save the experiment dictionary
    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, "exp_dict.json"), exp_dict)
    print(exp_dict)
    print("Experiment saved in %s" % savedir)

    # Set Seed
    # -------
    seed = exp_dict.get('seed')
    np.random.seed(seed)
    torch.manual_seed(seed)

    # Dataset
    # -----------
    train_dataset = get_dataset('train', exp_dict['dataset'])
    val_dataset = get_dataset('test', exp_dict['dataset'])

    # train and val loaders; with batch_size 1 an identity collate keeps
    # episodes intact. The original lambda had a precedence bug (the
    # conditional was parsed inside the lambda body), fixed here with
    # parentheses.
    collate = (lambda x: x) if exp_dict['batch_size'] == 1 else default_collate
    train_loader = DataLoader(train_dataset,
                              batch_size=exp_dict['batch_size'],
                              shuffle=True,
                              collate_fn=collate,  # to handle episodes
                              num_workers=args.num_workers)
    val_loader = DataLoader(val_dataset,
                            batch_size=exp_dict['batch_size'],
                            collate_fn=collate,
                            shuffle=True,
                            num_workers=args.num_workers)

    # Model
    # -----------
    model = get_model(exp_dict)

    # Checkpoint
    # -----------
    model_path = os.path.join(savedir, "model.pth")
    score_list_path = os.path.join(savedir, "score_list.pkl")

    if os.path.exists(score_list_path):
        # resume experiment
        model.set_state_dict(hu.torch_load(model_path))
        score_list = hu.load_pkl(score_list_path)
        s_epoch = score_list[-1]['epoch'] + 1
    else:
        # restart experiment
        score_list = []
        s_epoch = 0

    # Train & Val
    # ------------
    print("Starting experiment at epoch %d" % s_epoch)

    for e in range(s_epoch, exp_dict['max_epoch']):
        score_dict = {}

        # Train the model
        score_dict.update(model.train_on_loader(train_loader))

        # Validate the model
        savepath = os.path.join(savedir_base, exp_dict['dataset']['name'])
        score_dict.update(model.val_on_loader(val_loader, savedir=savepath))
        model.on_train_end(savedir=savedir, epoch=e)
        score_dict["epoch"] = e

        # Visualize the model
        # model.vis_on_loader(vis_loader, savedir=savedir+"/images/")

        # Add to score_list and save checkpoint
        score_list += [score_dict]

        # Report & Save
        score_df = pd.DataFrame(score_list)
        print("\n", score_df.tail())
        hu.torch_save(model_path, model.get_state_dict())
        hu.save_pkl(score_list_path, score_list)
        print("Checkpoint Saved: %s" % savedir)

    print('Experiment completed')
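# Demonstration of the precedence pitfall fixed above: without parentheses,
# the conditional binds inside the lambda body, so a single lambda is always
# passed as collate_fn, and when the condition is False it returns the
# default_collate function object instead of a collated batch.
from torch.utils.data.dataloader import default_collate

cond = False
f_buggy = lambda x: x if cond else default_collate
f_fixed = (lambda x: x) if cond else default_collate

batch = [1, 2, 3]
print(f_buggy(batch))  # <function default_collate ...> -- not a batch
print(f_fixed(batch))  # tensor([1, 2, 3])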