from pathlib import Path
from functools import partial

import numpy as np
import pandas as pd
from PIL import Image

from sklearn.model_selection import StratifiedKFold

from torch.utils.data import Dataset, DataLoader
from torch.utils.data.dataloader import default_collate
from torch.utils.data.sampler import SubsetRandomSampler

from torchvision.transforms import (Compose, ToTensor, Normalize, CenterCrop,
                                    RandomResizedCrop, RandomHorizontalFlip,
                                    RandomVerticalFlip)

import catboost as cat

# Project-local helpers assumed importable from the repo's own modules:
# TransformedDataset, TrainvalFilesDataset, FilesFromCsvDataset, read_image,
# RandomMultiImageAugDataset, basic_random_half_blend, get_reduced_train_indices,
# get_metafeatures, get_imsize_and_targets.


def get_trainval_indices(dataset, fold_index=0, n_splits=5, xy_transforms=None,
                         batch_size=32, n_workers=8, seed=None):
    """Return (train_indices, val_indices) for the `fold_index`-th stratified fold."""
    trainval_split = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

    if xy_transforms is not None:
        targets_dataset = TransformedDataset(dataset, transforms=xy_transforms)
    else:
        targets_dataset = dataset

    # Collect all targets through a DataLoader so label extraction is parallelized
    n_samples = len(targets_dataset)
    x = np.zeros((n_samples, 1))  # dummy features; only `y` drives the stratification
    y = np.zeros((n_samples,), dtype=np.uint8)  # uint8 is enough for 128 classes

    def id_collate_fn(_x):
        return _x

    data_loader = DataLoader(targets_dataset, batch_size=batch_size,
                             num_workers=n_workers, collate_fn=id_collate_fn)
    for i, dp in enumerate(data_loader):
        y[i * batch_size:(i + 1) * batch_size] = [p[1] for p in dp]

    # Iterate the splits until the requested fold is reached
    train_index = None
    test_index = None
    for i, (train_index, test_index) in enumerate(trainval_split.split(x, y)):
        if i == fold_index:
            break
    return train_index, test_index
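# A minimal usage sketch of get_trainval_indices, not from the original code:
# ToyDataset below is a hypothetical stand-in for any map-style dataset
# yielding (sample, label) pairs.

class ToyDataset(Dataset):
    """Hypothetical 10-class dataset used only to demonstrate the split."""
    def __len__(self):
        return 1000

    def __getitem__(self, i):
        return i, i % 10  # (sample, label)


train_index, val_index = get_trainval_indices(ToyDataset(), fold_index=0,
                                              n_splits=5, n_workers=0, seed=2018)
# With n_splits=5 this gives an ~800/200 train/validation split, stratified by label.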
def get_trainval_data_loaders(dataset, train_index, val_index,
                              train_transforms=None, val_transforms=None,
                              train_batch_size=32, val_batch_size=32,
                              collate_fn=default_collate, num_workers=8, pin_memory=True):
    assert isinstance(dataset, Dataset)
    if train_transforms is not None:
        assert isinstance(train_transforms, (list, tuple))
        train_transforms = Compose(train_transforms)
    if val_transforms is not None:
        assert isinstance(val_transforms, (list, tuple))
        val_transforms = Compose(val_transforms)

    train_sampler = SubsetRandomSampler(train_index)
    val_sampler = SubsetRandomSampler(val_index)

    # Decode images and shift labels from 1-based to 0-based
    dataset = TransformedDataset(dataset, transforms=read_image,
                                 target_transforms=lambda l: l - 1)
    train_aug_dataset = TransformedDataset(dataset, transforms=train_transforms)
    val_aug_dataset = TransformedDataset(dataset, transforms=val_transforms)

    # pin_memory was hardcoded to True for the train loader; use the parameter for both
    train_batches = DataLoader(train_aug_dataset, batch_size=train_batch_size,
                               sampler=train_sampler, num_workers=num_workers,
                               collate_fn=collate_fn, pin_memory=pin_memory,
                               drop_last=True)
    val_batches = DataLoader(val_aug_dataset, batch_size=val_batch_size,
                             sampler=val_sampler, num_workers=num_workers,
                             collate_fn=collate_fn, pin_memory=pin_memory,
                             drop_last=True)
    return train_batches, val_batches
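# Hedged example of wiring the two helpers together (not from the original code):
# files_dataset is assumed to yield (image_path, 1-based label) pairs, matching
# what read_image and the l - 1 target shift above expect; the path is hypothetical.
files_dataset = TrainvalFilesDataset("input/train_400x400")
train_index, val_index = get_trainval_indices(files_dataset, fold_index=0, seed=2018)
train_loader, val_loader = get_trainval_data_loaders(
    files_dataset, train_index, val_index,
    train_transforms=[RandomHorizontalFlip(p=0.5), ToTensor()],
    val_transforms=[ToTensor()],
    train_batch_size=64, val_batch_size=64, num_workers=4)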
def get_data_loader(dataset_or_path, data_transform=None, target_transform=None,
                    sample_indices=None, sampler=None, collate_fn=default_collate,
                    batch_size=16, num_workers=8, pin_memory=True):
    assert isinstance(dataset_or_path, Dataset) or \
        (isinstance(dataset_or_path, (str, Path)) and Path(dataset_or_path).exists()), \
        "dataset_or_path should be either a Dataset or a path to images, but given {}".format(dataset_or_path)
    assert sample_indices is None or sampler is None, \
        "Only one of sample_indices and sampler can be specified"

    if data_transform is not None and isinstance(data_transform, (list, tuple)):
        data_transform = Compose(data_transform)

    if isinstance(dataset_or_path, (str, Path)) and Path(dataset_or_path).exists():
        dataset = TrainvalFilesDataset(dataset_or_path)
    else:
        dataset = dataset_or_path

    # Default to a random permutation over the whole dataset
    if sample_indices is None and sampler is None:
        sample_indices = np.arange(len(dataset))
    if sample_indices is not None:
        sampler = SubsetRandomSampler(sample_indices)

    # Decode images and shift labels from 1-based to 0-based
    dataset = TransformedDataset(dataset, transforms=read_image,
                                 target_transforms=lambda l: l - 1)
    if data_transform is not None or target_transform is not None:
        dataset = TransformedDataset(dataset, transforms=data_transform,
                                     target_transforms=target_transform)

    data_loader = DataLoader(dataset, batch_size=batch_size, sampler=sampler,
                             collate_fn=collate_fn, num_workers=num_workers,
                             pin_memory=pin_memory)
    return data_loader
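# Sketch of calling get_data_loader in its path form (the path is hypothetical):
# a string or Path gets wrapped in TrainvalFilesDataset automatically, while a
# Dataset instance is used as-is.
loader_from_path = get_data_loader("input/val_400x400",
                                   data_transform=[CenterCrop(size=350), ToTensor()],
                                   batch_size=32, num_workers=4)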
TRAIN_TRANSFORMS = [
    RandomResizedCrop(350, scale=(0.8, 1.0)),
    RandomHorizontalFlip(p=0.5),
]
common_transform = [
    ToTensor(),
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
]

BATCH_SIZE = 180
NUM_WORKERS = 15
n_classes = 128

dataset = FilesFromCsvDataset("output/filtered_train_dataset.csv")
dataset = TransformedDataset(dataset, transforms=read_image, target_transforms=lambda l: l - 1)
dataset = TransformedDataset(dataset, transforms=Compose(TRAIN_TRANSFORMS))
# Randomly blend pairs of same-class images as an extra augmentation
dataset = RandomMultiImageAugDataset(dataset, n_classes,
                                     aug_fn=partial(basic_random_half_blend, alpha=0.3))
dataset = TransformedDataset(dataset, transforms=Compose(common_transform))

TRAIN_LOADER = get_data_loader(dataset, data_transform=None,
                               batch_size=BATCH_SIZE, num_workers=NUM_WORKERS,
                               pin_memory=True)

val_dataset = FilesFromCsvDataset("output/filtered_val_dataset.csv")
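# basic_random_half_blend is project-local and not shown here; a plausible sketch
# of such a blend, offered as an assumption rather than the project's actual
# implementation:
def half_blend_sketch(img_a, img_b, alpha=0.3):
    """Blend img_b into img_a with weight alpha; both are same-size PIL images."""
    return Image.blend(img_a, img_b, alpha)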
TRAIN_TRANSFORMS = [
    # ... (earlier entries omitted in the source)
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
]
VAL_TRANSFORMS = [
    CenterCrop(size=350),
    RandomHorizontalFlip(p=0.5),
    RandomVerticalFlip(p=0.5),
    ToTensor(),
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
]

BATCH_SIZE = 32
NUM_WORKERS = 8

# DATASET_PATH and SEED come from the script's configuration (not shown).
# Cap the number of samples per class to rebalance the training set.
dataset = TrainvalFilesDataset(DATASET_PATH / "train_400x400")
dataset = TransformedDataset(dataset, transforms=lambda x: x, target_transforms=lambda y: y - 1)
reduced_train_indices = get_reduced_train_indices(dataset, max_n_samples_per_class=1250, seed=SEED)
del dataset

TRAIN_LOADER = get_data_loader(dataset_or_path=DATASET_PATH / "train_400x400",
                               data_transform=TRAIN_TRANSFORMS,
                               sample_indices=reduced_train_indices,
                               batch_size=BATCH_SIZE, num_workers=NUM_WORKERS,
                               pin_memory=True)

VAL_LOADER = get_data_loader(dataset_or_path=DATASET_PATH / "val_400x400",
                             data_transform=VAL_TRANSFORMS,
                             batch_size=BATCH_SIZE, num_workers=NUM_WORKERS,
                             pin_memory=True)
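# get_reduced_train_indices is project-local; a minimal sketch of the
# class-capping idea it presumably implements (an assumption):
def reduced_train_indices_sketch(dataset, max_n_samples_per_class=1250, seed=None):
    """Keep at most max_n_samples_per_class randomly chosen indices per class."""
    rng = np.random.RandomState(seed)
    by_class = {}
    for i in range(len(dataset)):
        _, y = dataset[i]
        by_class.setdefault(y, []).append(i)
    kept = []
    for indices in by_class.values():
        indices = np.asarray(indices)
        rng.shuffle(indices)
        kept.extend(indices[:max_n_samples_per_class].tolist())
    return np.array(sorted(kept))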
DEBUG = True
SEED = 2018
OUTPUT_PATH = "output"

meta_features_path = Path("output")
meta_features_list = [
    meta_features_path / "val_probas_inceptionresnetv2_350_resized_crop" / "20180428_1622" / "probas.csv",
    meta_features_path / "val_probas_inceptionv4_350_resized_crop" / "20180428_1633" / "probas.csv",
]
meta_features = get_metafeatures(meta_features_list)

dataset = FilesFromCsvDataset("output/filtered_val_dataset.csv")
dataset = TransformedDataset(dataset,
                             transforms=lambda x: (x, Image.open(x).size),
                             target_transforms=lambda l: l - 1)
df_imsize_targets = get_imsize_and_targets(dataset)

# Meta-features plus image sizes as inputs; drop rows with missing meta-features
X = pd.concat([meta_features, df_imsize_targets[['width', 'height']]], axis=1)
X.dropna(inplace=True)
# Align targets with the rows that survived dropna before converting to arrays
Y = df_imsize_targets.loc[X.index, 'target'].values
X = X.values

# Cross-validation parameters
CV_SPLIT = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
MODEL = cat.CatBoostClassifier
SCORINGS = ["neg_log_loss", ]
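# A minimal sketch of evaluating the meta-model with these parameters, assuming
# the standard scikit-learn interface that CatBoostClassifier exposes; the
# hyperparameters below are placeholders, not the original configuration.
from sklearn.model_selection import cross_validate

estimator = MODEL(loss_function="MultiClass", iterations=200, verbose=False)
cv_results = cross_validate(estimator, X, Y, cv=CV_SPLIT, scoring=SCORINGS)
print("mean neg_log_loss:", cv_results["test_neg_log_loss"].mean())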