def __call__(self, X, y):
    """Build cross-validation folds for (X, y), then run ``cross_validate``.

    Splitting strategy:
      * ``self.n_folds`` is not None      -> stratified K-fold CV.
      * ``self.n_class_samples`` not None -> ``self.n_tests`` repeated random
        splits with ``n_class_samples`` training samples per class.
      * ``self.n_test_samples`` not None  -> additionally caps the number of
        test samples per class in each split.

    Assumes each datapoint occupies a column of X.
    """
    n_classes = len(set(y))

    if self.n_folds is not None:
        # Delegate fold generation to stratified K-fold.
        self.folds = StratifiedKFold(y, n_folds=self.n_folds,
                                     shuffle=False, random_state=None)
    elif self.n_class_samples is not None:
        # Normalize scalar per-class counts to one int entry per class.
        # Hoisted out of the loop below: the conversion is loop-invariant
        # (and was idempotent when repeated), so do it exactly once.
        if not isinstance(self.n_class_samples, list):
            self.n_class_samples = (
                np.ones(n_classes) * self.n_class_samples).astype(int)
        if self.n_test_samples is not None:
            self.n_test_samples = (
                np.ones(n_classes) * self.n_test_samples).astype(int)

        self.folds = []
        for _ in range(self.n_tests):
            # split_dataset returns (train_idx, test_idx, ...) index arrays.
            data_idx = split_dataset(self.n_class_samples,
                                     self.n_test_samples, y)
            self.folds.append((data_idx[0], data_idx[1]))

    self.cross_validate(X, y)
def load_data(model_type, data_path, image_path, heatmaps_path, input_size,
              class_names, batch_size, num_workers, rseed, heatmaps_threshold):
    """Build train/valid/test DataLoaders for the eye-gaze dataset.

    Args:
        model_type: kept for interface compatibility; not used in this variant.
        data_path: file used by ``split_dataset`` to produce the three splits.
        image_path: root folder of the input images.
        heatmaps_path: folder with the static gaze heatmaps.
        input_size: square side length images/heatmaps are resized to.
        class_names: label names passed through to ``EyegazeDataset``.
        batch_size: batch size for all three loaders.
        num_workers: worker-process count for all three loaders.
        rseed: random seed forwarded to ``split_dataset``.
        heatmaps_threshold: threshold forwarded to ``EyegazeDataset``.

    Returns:
        (train_dl, valid_dl, test_dl) DataLoaders.
    """
    # ImageNet normalization statistics.
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]

    train_file, valid_file, test_file = split_dataset(data_path, random_state=rseed)

    # Resize via imgaug, then tensorize and normalize.
    seq = iaa.Sequential([iaa.Resize((input_size, input_size))])
    image_transform = transforms.Compose([
        seq.augment_image,
        transforms.ToTensor(),
        transforms.Normalize(mean=mean, std=std),
    ])
    heatmap_static_transform = transforms.Compose([
        transforms.Resize([input_size, input_size]),
        transforms.Grayscale(num_output_channels=1),
        transforms.ToTensor(),
    ])

    def _make_dataset(data_file):
        # All three splits share the exact same dataset configuration.
        return EyegazeDataset(data_file, image_path, class_names,
                              static_heatmap_path=heatmaps_path,
                              heatmaps_threshold=heatmaps_threshold,
                              heatmap_static_transform=heatmap_static_transform,
                              image_transform=image_transform)

    train_dl = DataLoader(_make_dataset(train_file), batch_size=batch_size,
                          shuffle=True, drop_last=False, num_workers=num_workers)
    valid_dl = DataLoader(_make_dataset(valid_file), batch_size=batch_size,
                          shuffle=True, drop_last=False, num_workers=num_workers)
    # Fix: honour the num_workers argument (was hard-coded to 32).
    test_dl = DataLoader(_make_dataset(test_file), batch_size=batch_size,
                         shuffle=False, drop_last=False, num_workers=num_workers)
    return train_dl, valid_dl, test_dl
def __call__(self, X, y):
    """Generate cross-validation folds for (X, y) and evaluate them.

    Behaviour (mutually exclusive, checked in order):
      * ``n_folds`` set      -> stratified K-fold splitting;
      * ``n_class_samples``  -> repeated random splits using a fixed number
                                of training samples per class;
      * ``n_test_samples``   -> limits test samples per class in each split.

    Each datapoint is assumed to live in a column of X.
    """
    num_labels = len(set(y))

    if self.n_folds is not None:
        # Stratified K-fold handles fold construction for us.
        self.folds = StratifiedKFold(y, n_folds=self.n_folds,
                                     shuffle=False, random_state=None)
    elif self.n_class_samples is not None:
        self.folds = []
        for _ in range(self.n_tests):
            # Broadcast scalar per-class counts to one entry per class.
            if type(self.n_class_samples) is not list:
                self.n_class_samples = (
                    np.ones(num_labels) * self.n_class_samples).astype(int)
            if self.n_test_samples is not None:
                self.n_test_samples = (
                    np.ones(num_labels) * self.n_test_samples).astype(int)
            split = split_dataset(self.n_class_samples,
                                  self.n_test_samples, y)
            self.folds.append((split[0], split[1]))

    self.cross_validate(X, y)
def load_data(model_type, data_path, image_path, heatmaps_path, input_size,
              class_names, batch_size, num_workers, rseed):
    """Build train/valid/test DataLoaders for the eye-gaze dataset.

    Args:
        model_type: 'temporal' selects the heatmap-augmented pipeline with a
            custom collate function; any other value builds image-only datasets.
        data_path: file used by ``split_dataset`` to produce the three splits.
        image_path: root folder of the input images.
        heatmaps_path: folder with the gaze heatmaps (temporal branch only).
        input_size: square side length images/heatmaps are resized to.
        class_names: label names passed through to ``EyegazeDataset``.
        batch_size: batch size for all three loaders.
        num_workers: worker-process count for all three loaders.
        rseed: random seed forwarded to ``split_dataset``.

    Returns:
        (train_dl, valid_dl, test_dl) DataLoaders.
    """
    # ImageNet normalization statistics.
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]

    train_file, valid_file, test_file = split_dataset(data_path, random_state=rseed)

    # Resize via imgaug, then tensorize and normalize.
    seq = iaa.Sequential([iaa.Resize((input_size, input_size))])
    image_transform = transforms.Compose([
        seq.augment_image,
        transforms.ToTensor(),
        transforms.Normalize(mean=mean, std=std),
    ])

    if model_type in ['temporal']:
        # Temporal heatmaps: greyscale, replicated to 3 channels so the
        # ImageNet normalization applies.
        heatmap_temporal_transform = transforms.Compose([
            transforms.Resize([input_size, input_size]),
            transforms.Grayscale(num_output_channels=1),
            transforms.ToTensor(),
            transforms.Lambda(lambda x: x.repeat(3, 1, 1)),
            transforms.Normalize(mean=mean, std=std),
        ])
        heatmap_static_transform = transforms.Compose([
            transforms.Resize([input_size, input_size]),
            transforms.ToTensor(),
        ])
        static_heatmap_path = heatmaps_path

        def _make_dataset(data_file):
            # All three splits share the same dataset configuration.
            return EyegazeDataset(
                data_file, image_path, class_names,
                heatmaps_path=heatmaps_path,
                static_heatmap_path=static_heatmap_path,
                heatmap_temporal_transform=heatmap_temporal_transform,
                heatmap_static_transform=heatmap_static_transform,
                image_transform=image_transform)

        # drop_last=True for batchnorm issue:
        # https://discuss.pytorch.org/t/error-expected-more-than-1-value-per-channel-when-training/26274
        # this did not resolve the issue for all cases
        train_dl = DataLoader(_make_dataset(train_file), batch_size=batch_size,
                              shuffle=True, num_workers=num_workers,
                              collate_fn=collate_fn, drop_last=True)
        valid_dl = DataLoader(_make_dataset(valid_file), batch_size=batch_size,
                              shuffle=True, num_workers=num_workers,
                              collate_fn=collate_fn, drop_last=True)
        # Fix: honour the num_workers argument (was hard-coded to 32).
        test_dl = DataLoader(_make_dataset(test_file), batch_size=batch_size,
                             shuffle=False, collate_fn=collate_fn,
                             num_workers=num_workers)
    else:
        def _make_dataset(data_file):
            return EyegazeDataset(data_file, image_path, class_names,
                                  image_transform=image_transform)

        train_dl = DataLoader(_make_dataset(train_file), batch_size=batch_size,
                              shuffle=True, drop_last=True,
                              num_workers=num_workers)
        valid_dl = DataLoader(_make_dataset(valid_file), batch_size=batch_size,
                              shuffle=True, drop_last=True,
                              num_workers=num_workers)
        # Fixes: keep the last (possibly short) batch when evaluating on the
        # test set (was drop_last=True, silently discarding samples, and
        # inconsistent with the temporal branch), and honour num_workers
        # (was hard-coded to 32).
        test_dl = DataLoader(_make_dataset(test_file), batch_size=batch_size,
                             shuffle=False, drop_last=False,
                             num_workers=num_workers)
    return train_dl, valid_dl, test_dl
# NOTE(review): this fragment begins mid-expression — the leading
# `if ... else "Use CPU as target nlg torch device")` is the tail of a
# conditional logger.info() argument whose start lies outside this chunk,
# and the text has been flattened onto a single line. Code kept byte-identical.
# Visible behaviour: builds SLU/NLG/LM vocabs and the SLU/NLG evaluators;
# when training, reads train/valid splits, splits the train set into
# labeled/unlabeled portions (labeled examples are also appended to the
# unlabeled pool); loads the test split; then dumps hyper-parameters to
# params.json when training, or restores them from it when testing.
if opt.deviceIds[1] >= 0 else "Use CPU as target nlg torch device") ##### Vocab and Dataset Reader ##### slu_vocab, nlg_vocab = Vocab(dataset=opt.dataset, task='slu'), Vocab(dataset=opt.dataset, task='nlg') lm_vocab = Vocab(dataset=opt.dataset, task='lm') slu_evaluator, nlg_evaluator = Evaluator.get_evaluator_from_task( task='slu', vocab=slu_vocab), Evaluator.get_evaluator_from_task(task='nlg', vocab=nlg_vocab) if not opt.testing: train_dataset, dev_dataset = read_dataset( opt.dataset, choice='train'), read_dataset(opt.dataset, choice='valid') labeled_dataset, unlabeled_dataset = split_dataset(train_dataset, opt.labeled) logger.info( "Labeled/Unlabeled train and dev dataset size is: %s/%s and %s" % (len(labeled_dataset), len(unlabeled_dataset), len(dev_dataset))) unlabeled_dataset = labeled_dataset + unlabeled_dataset test_dataset = read_dataset(opt.dataset, choice='test') logger.info("Test dataset size is: %s" % (len(test_dataset))) ##### Model Construction and Init ##### if not opt.testing: params = vars(opt) json.dump(params, open(os.path.join(exp_path, 'params.json'), 'w'), indent=4) else: params = json.load(open(os.path.join(exp_path, 'params.json'), 'r'))
# NOTE(review): top-level script fragment (depends on `opt`, `logger`, `task`
# defined outside this chunk), flattened onto a single line in the source.
# Code kept byte-identical. Visible behaviour: logs seed/device, builds the
# task vocab and evaluator, logs vocab sizes; when training, reads the
# train/valid splits and keeps only the labeled portion of the train set;
# loads the test split; then (when training) copies vocab sizes, PAD/BOS/EOS
# ids and other derived settings onto `opt` and snapshots it via vars().
logger.info("Random seed is set to: %d" % (opt.seed)) logger.info( "Use GPU with index %s" % (opt.deviceId) if opt.deviceId >= 0 else "Use CPU as target torch device") ##### Vocab and Dataset Reader ##### vocab = Vocab(dataset=opt.dataset, task=task) logger.info("Vocab size for input utterance is: %s" % (len(vocab.word2id))) logger.info("Vocab size for output slot label is: %s" % (len(vocab.slot2id))) logger.info("Vocab size for output intent is: %s" % (len(vocab.int2id))) evaluator = Evaluator.get_evaluator_from_task(task=task, vocab=vocab) if not opt.testing: train_dataset, dev_dataset = read_dataset( opt.dataset, choice='train'), read_dataset(opt.dataset, choice='valid') train_dataset, _ = split_dataset(train_dataset, opt.labeled) logger.info("Train and dev dataset size is: %s and %s" % (len(train_dataset), len(dev_dataset))) test_dataset = read_dataset(opt.dataset, choice='test') logger.info("Test dataset size is: %s" % (len(test_dataset))) ##### Model Construction and Init ##### if not opt.testing: opt.vocab_size, opt.slot_num, opt.intent_num = len(vocab.word2id), len( vocab.slot2id), len(vocab.int2id) opt.pad_token_idxs = { "word": vocab.word2id[PAD], "slot": vocab.slot2id[PAD] } opt.start_idx, opt.end_idx = vocab.slot2id[BOS], vocab.slot2id[EOS] params = vars(opt)
def main(image_folder, model_folder, building_folder, figure_folder, info_csv,
         stats_json_file, train_results, batch_size, ideal_batch_size, epochs,
         device_name):
    """Train a U-Net building-segmentation model and emit metrics/figures.

    Args:
        image_folder: folder of input TIF images.
        model_folder: destination folder for model checkpoints/weights.
        building_folder: folder with building masks.
        figure_folder: destination folder for evaluation plots.
        info_csv: CSV with per-building metadata.
        stats_json_file: JSON cache of per-channel mean/std statistics.
        train_results: path of the pickle file receiving training metrics.
        batch_size: actual DataLoader batch size.
        ideal_batch_size: target effective batch size; gradient accumulation
            steps are derived as ideal_batch_size // batch_size.
        epochs: number of training epochs.
        device_name: torch device string, e.g. 'cuda' or 'cpu'.
    """
    # Configure folders
    image_folder = Path(image_folder)
    model_folder = Path(model_folder)
    building_folder = Path(building_folder)
    stats_json_file = Path(stats_json_file)
    figure_folder = Path(figure_folder)

    # Seed for reproducibility
    torch.manual_seed(0)

    # Retrieve information on buildings
    df = pd.read_csv(info_csv)

    # Image dimensions are taken from the first TIF in the folder
    n_ch, h, w = get_tif_dims(list(image_folder.iterdir())[0])

    mean_channels, std_channels = _get_channel_stats(stats_json_file,
                                                     image_folder, n_ch)

    # Pad so both dimensions become multiples of 8 (presumably required by
    # the U-Net's downsampling path — confirm).
    # Fix: use modular arithmetic so an already-divisible dimension gets 0
    # padding; previously `8 - h % 8` padded by a full, unnecessary 8.
    pad_h, pad_w = (-h) % 8, (-w) % 8
    top_p, bottom_p = _split_padding(pad_h)
    left_p, right_p = _split_padding(pad_w)
    img_pad = (left_p, top_p, right_p, bottom_p)

    # Compose transforms (pad + normalize + rotation range 360)
    transforms = SemSegCompose(img_pad, mean_channels, std_channels, 360)

    # Make dataset and train/val split
    ds = MulPanSharpenDataset(image_folder, building_folder, df,
                              transforms=transforms)
    logger.info(f"N° of images: {len(ds)}")
    logger.info(f"Type of img: {ds.label}")
    train_ds, val_ds = split_dataset(ds, train_size=0.8)
    logger.info(f"Train set size: {len(train_ds)}")
    logger.info(f"Val set size: {len(val_ds)}")
    train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    val_dl = DataLoader(val_ds, batch_size=batch_size, shuffle=True)
    logger.info(f"N° of iterations per batch (train): {len(train_dl)}")
    logger.info(f"N° of iterations per batch (val): {len(val_dl)}")

    # Get model
    logger.info("Getting U-Net model")
    model = torch.hub.load('mateuszbuda/brain-segmentation-pytorch', 'unet',
                           in_channels=n_ch, out_channels=2, init_features=32,
                           pretrained=False)
    device = torch.device(device_name)
    logger.info(f"Mounting model on {device}")
    model = model.to(device)

    # Class-weighted loss to counter the background/building imbalance
    logger.info("Defining cross-entropy loss with 11/89 ratio")
    criterion = nn.CrossEntropyLoss(weight=torch.tensor([.11, .89]))
    criterion = criterion.to(device)

    logger.info("Defining Adam optimizer")
    optimizer = optim.Adam(model.parameters(), lr=1e-3)

    results = train(model, train_dl, val_dl, criterion, optimizer, epochs,
                    device, 2, model_folder, ideal_batch_size // batch_size)

    # Saves model metrics as pickle file
    with open(train_results, "wb") as f:
        pickle.dump(results, f)

    # Saves model weights
    torch.save(model.state_dict(), model_folder / 'unet_model')
    logger.info(f"Saved model at {model_folder / 'unet_model'}")

    # Fix: use the module logger consistently (was a bare logging.info call).
    logger.info(f"Metrics evaluation. Check {figure_folder} for results.")

    # Plot metrics
    plot_loss(results, figure_folder)
    logger.info("Loss curve created.")
    plot_last_cm(results, figure_folder)
    logger.info("Last confusion matrix created")
    plot_correct_preds(results, figure_folder)
    logger.info("Evolution of correct predictions created;")
    plot_accuracy(results, figure_folder)
    logger.info("Accuracy plot created.")
    plot_iou(results, figure_folder)
    logger.info("IoU evolution plot created.")
    for i in range(5):
        generate_masks(ds, model, figure_folder, i, 5)
    logger.info("Created model results.")


def _get_channel_stats(stats_json_file, image_folder, n_ch):
    """Return per-channel (mean, std), computing and caching to JSON on first run."""
    if stats_json_file.exists():
        return load_stats(stats_json_file)
    stats = compute_mean_std(image_folder, n_ch)
    with open(stats_json_file, 'w') as file:
        json.dump(stats, file)
    return stats['mean'], stats["std"]


def _split_padding(total):
    """Split a total padding amount into (before, after) with before <= after."""
    return total // 2, total - total // 2
#!/usr/bin/env python3
"""Split the Recipe1M recipe-id CSV into per-partition CSV files."""

# Standard import
import os
import json

# Third-party import
import torch

# Local import
from invco import DATASET_DIR, ROOT_DIR
from utils.dataset import Recipe1M
from utils.dataset import split_dataset


if '__main__' == __name__:
    source_csv = F'{DATASET_DIR}/recipeid.csv'
    output_dir = F'{ROOT_DIR}/csv'
    # The 4th column (index 3) of each row names the destination CSV file.
    split_dataset(source_csv, output_dir, lambda r: r[4 - 1] + '.csv')