def _get_train_open_fn(train_folder):
    return ReaderCompose(
        readers=[
            RowsReader(
                reader=ReaderCompose(
                    readers=[
                        ImageReader(
                            row_key="Image", dict_key="Image",
                            datapath=train_folder),
                        TextReader(row_key="Id", dict_key="Id"),
                        TextReader(row_key="Image", dict_key="ImageFile"),
                    ]))
        ],
        mixins=[
            SiameseLabelMixin(
                dict_first_id_key="Id0", dict_second_id_key="Id1")
        ],
    )
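# Reading the composition above (a behavioral sketch, not verified against
# this legacy API): RowsReader presumably applies the inner ReaderCompose to
# each row of a sampled pair, suffixing the output keys with the row index
# ("Id0", "Id1", ...), and SiameseLabelMixin then derives the siamese
# same/different target by comparing "Id0" with "Id1".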
def create_reader(root, num_classes):
    # ReaderCompose collects different Readers into one pipeline
    open_fn = ReaderCompose([
        # Reads an image from the `datapath` folder
        # using the key `input_key="filepath"` (here it should be the filename)
        # and writes it to the output dictionary under `output_key="features"`
        ImageReader(input_key="filepath", output_key="features",
                    datapath=root),
        # Reads a number from our dataframe
        # by the key `input_key="label"` as np.int64
        # and writes it to the output dictionary under `output_key="targets"`
        ScalarReader(input_key="label", output_key="targets",
                     default_value=-1, dtype=np.int64),
        # Same as above, but with one-hot encoding
        ScalarReader(input_key="label", output_key="targets_one_hot",
                     default_value=-1, dtype=np.int64,
                     one_hot_classes=num_classes),
    ])
    return open_fn
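# Usage sketch (hypothetical paths and labels): each dataset record is a
# dict with "filepath" and "label" keys, and the composed reader turns it
# into a model-ready sample dict.
open_fn = create_reader(root="data/images", num_classes=10)
sample = open_fn({"filepath": "cats/001.jpg", "label": 3})
# sample["features"]        -> the loaded image
# sample["targets"]         -> np.int64 scalar (3)
# sample["targets_one_hot"] -> one-hot vector of length 10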
def get_loaders(train_transforms_fn, valid_transforms_fn, config,
                batch_size: int = 8, num_workers: int = 20,
                sampler=None) -> OrderedDict:
    train_data, valid_data = get_datasets(config)

    open_fn = ReaderCompose([
        ImageReader(input_key="filepath", output_key="features",
                    rootpath=config.root_images),
        ScalarReader(input_key="disease_type", output_key="targets",
                     default_value=-1, dtype=np.int64),
        ScalarReader(input_key="disease_type", output_key="targets_one_hot",
                     default_value=-1, dtype=np.int64,
                     one_hot_classes=config.num_classes),
    ])

    train_loader = utils.get_loader(
        train_data,
        open_fn=open_fn,
        dict_transform=train_transforms_fn,
        batch_size=batch_size,
        num_workers=num_workers,
        # shuffle data only if a Sampler is not specified (PyTorch requirement)
        shuffle=sampler is None,
        sampler=sampler,
        drop_last=True,
    )

    valid_loader = utils.get_loader(
        valid_data,
        open_fn=open_fn,
        dict_transform=valid_transforms_fn,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=False,
        sampler=None,
        drop_last=True,
    )

    loaders = OrderedDict()
    loaders["train"] = train_loader
    loaders["valid"] = valid_loader
    return loaders
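# Usage sketch, assuming `config` exposes `root_images`, `num_classes`, and
# whatever `get_datasets(config)` needs, and that the transform fns are
# dict-to-dict callables as elsewhere in these snippets:
loaders = get_loaders(train_transforms_fn, valid_transforms_fn, config,
                      batch_size=16, num_workers=4)
batch = next(iter(loaders["train"]))
# batch["features"], batch["targets"], batch["targets_one_hot"] are batched tensors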
def get_open_fn(data_root, num_class):
    open_fn = ReaderCompose([
        ImageReader(input_key="filepath", output_key="features",
                    datapath=data_root),
        ScalarReader(input_key="label", output_key="targets",
                     default_value=-1, dtype=np.int64),
        ScalarReader(input_key="label", output_key="targets_one_hot",
                     default_value=-1, dtype=np.int64,
                     one_hot_classes=num_class),
    ])
    return open_fn
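# Wiring sketch: ListDataset applies the open_fn to each record dict, so a
# plain PyTorch DataLoader can consume it. The records and the
# `catalyst.data` import path are assumptions for illustration.
from torch.utils.data import DataLoader
from catalyst.data import ListDataset

records = [{"filepath": "img_000.png", "label": 0},
           {"filepath": "img_001.png", "label": 1}]
dataset = ListDataset(records, open_fn=get_open_fn("data/train", num_class=2))
loader = DataLoader(dataset, batch_size=2, shuffle=True)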
def __init__(
    self,
    rootpath: str,
    target_key: str = "targets",
    dir2class: Optional[Mapping[str, int]] = None,
    dict_transform: Optional[Callable[[Dict], Dict]] = None,
) -> None:
    """Constructor method for the :class:`ImageFolderDataset` class.

    Args:
        rootpath: root directory of the dataset
        target_key: key used to store the target label
        dir2class (Mapping[str, int], optional): mapping from folder name
            to class index
        dict_transform (Callable[[Dict], Dict], optional): transforms
            to apply to the sample dict
    """
    files = glob.iglob(f"{rootpath}/**/*")
    images = sorted(filter(utils.has_image_extension, files))

    if dir2class is None:
        dirs = sorted({Path(f).parent.name for f in images})
        dir2class = {dirname: index for index, dirname in enumerate(dirs)}

    super().__init__(
        filenames=images,
        open_fn=ReaderCompose([
            ImageReader(input_key="image", rootpath=rootpath),
            ScalarReader(
                input_key=target_key,
                output_key=target_key,
                dtype=int,
                default_value=-1,
            ),
        ]),
        label_fn=lambda fn: dir2class[Path(fn).parent.name],
        features_key="image",
        target_key=target_key,
        dict_transform=dict_transform,
    )
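# Usage sketch, assuming an ImageNet-style layout rootpath/<class_dir>/<image>,
# e.g. data/train/cat/001.jpg, data/train/dog/002.jpg:
dataset = ImageFolderDataset(rootpath="data/train")
sample = dataset[0]
# sample["image"]   -> loaded image
# sample["targets"] -> integer class index derived from the folder name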
def get_datasets(
    self,
    stage: str,
    datapath: str = None,
    in_csv: str = None,
    in_csv_train: str = None,
    in_csv_valid: str = None,
    in_csv_infer: str = None,
    train_folds: str = None,
    valid_folds: str = None,
    tag2class: str = None,
    class_column: str = None,
    tag_column: str = None,
    folds_seed: int = 42,
    n_folds: int = 5,
    one_hot_classes: int = None,
    num_frames: int = None,
    num_segments: int = None,
    time_window: int = None,
    uniform_time_sample: bool = False,
):
    datasets = collections.OrderedDict()
    if tag2class is not None:
        with open(tag2class) as fin:
            tag2class = json.load(fin)

    df, df_train, df_valid, df_infer = read_csv_data(
        in_csv=in_csv,
        in_csv_train=in_csv_train,
        in_csv_valid=in_csv_valid,
        in_csv_infer=in_csv_infer,
        train_folds=train_folds,
        valid_folds=valid_folds,
        tag2class=tag2class,
        class_column=class_column,
        tag_column=tag_column,
        seed=folds_seed,
        n_folds=n_folds,
    )
    df_valid = preprocess_valid_data(df_valid)

    open_fn = [
        ScalarReader(input_key="class", output_key="targets",
                     default_value=-1, dtype=np.int64)
    ]
    if one_hot_classes:
        open_fn.append(
            ScalarReader(input_key="class", output_key="targets_one_hot",
                         default_value=-1, dtype=np.int64,
                         one_hot_classes=one_hot_classes))
    open_fn_val = open_fn.copy()

    open_fn.append(
        VideoImageReader(input_key="filepath", output_key="features",
                         datapath=datapath, num_frames=num_frames,
                         num_segments=num_segments, time_window=time_window,
                         uniform_time_sample=uniform_time_sample))
    # validation gets its own reader with `with_offset=True`
    open_fn_val.append(
        VideoImageReader(input_key="filepath", output_key="features",
                         datapath=datapath, num_frames=num_frames,
                         num_segments=num_segments, time_window=time_window,
                         uniform_time_sample=uniform_time_sample,
                         with_offset=True))

    open_fn = ReaderCompose(readers=open_fn)
    open_fn_val = ReaderCompose(readers=open_fn_val)

    for source, mode in zip((df_train, df_valid, df_infer),
                            ("train", "valid", "infer")):
        if len(source) > 0:
            dataset = ListDataset(
                source,
                open_fn=open_fn_val if mode == "valid" else open_fn,
                dict_transform=self.get_transforms(stage=stage, mode=mode),
            )
            dataset_dict = {"dataset": dataset}
            if mode == "train":
                labels = [x["class"] for x in df_train]
                sampler = BalanceClassSampler(labels, mode="upsampling")
                dataset_dict["sampler"] = sampler
            datasets[mode] = dataset_dict

    return datasets
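# Direct-call sketch (Catalyst would normally invoke get_datasets from a
# config; `experiment` is a hypothetical instance of this class, and the
# CSV paths and video parameters are placeholders):
datasets = experiment.get_datasets(
    stage="stage1",
    datapath="data/videos",
    in_csv_train="train.csv",
    in_csv_valid="valid.csv",
    num_frames=16,
    num_segments=1,
    time_window=8,
    one_hot_classes=10,
)
train_dataset = datasets["train"]["dataset"]
train_sampler = datasets["train"]["sampler"]  # BalanceClassSampler over "class"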
def prepare_loaders(*,
                    mode: str,
                    stage: str = None,
                    n_workers: int = None,
                    batch_size: int = None,
                    datapath=None,
                    in_csv=None,
                    in_csv_train=None,
                    in_csv_valid=None,
                    in_csv_infer=None,
                    train_folds=None,
                    valid_folds=None,
                    tag2class=None,
                    class_column=None,
                    tag_column=None,
                    folds_seed=42,
                    n_folds=5):
    loaders = collections.OrderedDict()

    df, df_train, df_valid, df_infer = parse_in_csvs(
        in_csv=in_csv,
        in_csv_train=in_csv_train,
        in_csv_valid=in_csv_valid,
        in_csv_infer=in_csv_infer,
        train_folds=train_folds,
        valid_folds=valid_folds,
        tag2class=tag2class,
        class_column=class_column,
        tag_column=tag_column,
        folds_seed=folds_seed,
        n_folds=n_folds,
    )

    open_fn = ReaderCompose(readers=[
        ImageReader(input_key="filepath", output_key="image",
                    datapath=datapath),
        ScalarReader(input_key="class", output_key="targets",
                     default_value=-1, dtype=np.int64),
    ])

    if len(df_train) > 0:
        labels = [x["class"] for x in df_train]
        sampler = BalanceClassSampler(labels, mode="upsampling")

        train_loader = UtilsFactory.create_loader(
            data_source=df_train,
            open_fn=open_fn,
            dict_transform=DataSource.prepare_transforms(
                mode="train", stage=stage),
            dataset_cache_prob=-1,
            batch_size=batch_size,
            workers=n_workers,
            shuffle=sampler is None,
            sampler=sampler,
        )

        print("Train samples", len(train_loader) * batch_size)
        print("Train batches", len(train_loader))
        loaders["train"] = train_loader

    if len(df_valid) > 0:
        sampler = None

        valid_loader = UtilsFactory.create_loader(
            data_source=df_valid,
            open_fn=open_fn,
            dict_transform=DataSource.prepare_transforms(
                mode="valid", stage=stage),
            dataset_cache_prob=-1,
            batch_size=batch_size,
            workers=n_workers,
            shuffle=False,
            sampler=sampler,
        )

        print("Valid samples", len(valid_loader) * batch_size)
        print("Valid batches", len(valid_loader))
        loaders["valid"] = valid_loader

    if len(df_infer) > 0:
        infer_loader = UtilsFactory.create_loader(
            data_source=df_infer,
            open_fn=open_fn,
            dict_transform=DataSource.prepare_transforms(
                mode="infer", stage=None),
            dataset_cache_prob=-1,
            batch_size=batch_size,
            workers=n_workers,
            shuffle=False,
            sampler=None,
        )

        print("Infer samples", len(infer_loader) * batch_size)
        print("Infer batches", len(infer_loader))
        loaders["infer"] = infer_loader

    return loaders
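# Usage sketch (CSV path, fold values, and column names are hypothetical;
# this function targets the old UtilsFactory-era Catalyst API):
loaders = prepare_loaders(
    mode="train",
    stage="stage1",
    n_workers=4,
    batch_size=64,
    datapath="data/images",
    in_csv="data/dataset.csv",
    train_folds=[0, 1, 2, 3],
    valid_folds=[4],
    class_column="class",
)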
def get_datasets(self,
                 stage: str,
                 datapath: str = None,
                 in_csv: str = None,
                 in_csv_train: str = None,
                 in_csv_valid: str = None,
                 in_csv_infer: str = None,
                 train_folds: str = None,
                 valid_folds: str = None,
                 tag2class: str = None,
                 class_column: str = None,
                 tag_column: str = None,
                 folds_seed: int = 42,
                 n_folds: int = 5,
                 one_hot_classes: int = None,
                 image_size: int = 224):
    datasets = collections.OrderedDict()
    if tag2class is not None:
        with open(tag2class) as fin:
            tag2class = json.load(fin)

    df, df_train, df_valid, df_infer = read_csv_data(
        in_csv=in_csv,
        in_csv_train=in_csv_train,
        in_csv_valid=in_csv_valid,
        in_csv_infer=in_csv_infer,
        train_folds=train_folds,
        valid_folds=valid_folds,
        tag2class=tag2class,
        class_column=class_column,
        tag_column=tag_column,
        seed=folds_seed,
        n_folds=n_folds,
    )

    open_fn = [
        ImageReader(input_key="filepath", output_key="image",
                    datapath=datapath),
        ScalarReader(input_key="class", output_key="targets",
                     default_value=-1, dtype=np.int64),
    ]
    if one_hot_classes:
        open_fn.append(
            ScalarReader(input_key="class", output_key="targets_one_hot",
                         default_value=-1, dtype=np.int64,
                         one_hot_classes=one_hot_classes))
    open_fn = ReaderCompose(readers=open_fn)

    for source, mode in zip((df_train, df_valid, df_infer),
                            ("train", "valid", "infer")):
        if len(source) > 0:
            dataset = ListDataset(
                source,
                open_fn=open_fn,
                dict_transform=self.get_transforms(
                    stage=stage,
                    mode=mode,
                    image_size=image_size,
                    one_hot_classes=one_hot_classes),
            )
            if mode == "train":
                labels = [x["class"] for x in source]
                sampler = BalanceClassSampler(labels, mode="upsampling")
                dataset = {"dataset": dataset, "sampler": sampler}
            datasets[mode] = dataset

    return datasets
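# Standalone sketch of the sampler used above: with mode="upsampling",
# BalanceClassSampler oversamples minority classes so that every class
# contributes the same number of samples per epoch.
from catalyst.data.sampler import BalanceClassSampler

labels = [0, 0, 0, 0, 1]  # toy imbalanced labels
sampler = BalanceClassSampler(labels, mode="upsampling")
len(sampler)  # 8: both classes are drawn 4 times per epoch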
df_with_labels["rust"] * 2 + df_with_labels["scab"] * 3 df_with_labels.head(10) train_data, valid_data = split_dataframe_train_test(df_with_labels, test_size=0.3, random_state=config.seed) train_data, valid_data = train_data.to_dict('records'), valid_data.to_dict( 'records') open_fn = ReaderCompose([ ImageReader(input_key="filepath", output_key="features", rootpath=config.root_images), ScalarReader(input_key="disease_type", output_key="targets", default_value=-1, dtype=np.int64), ScalarReader(input_key="disease_type", output_key="targets_one_hot", default_value=-1, dtype=np.int64, one_hot_classes=config.num_classes) ]) train_transforms = compose( [pre_transforms(config.size), hard_transforms(), post_transforms()]) valid_transforms = compose([pre_transforms(config.size), post_transforms()]) show_transforms = compose([pre_transforms(config.size), hard_transforms()])
def create_dataloaders(
    train_file: str,
    valid_file: str,
    root_folder: str,
    meta_info_file: str,
    num_classes: int,
    one_hot_encoding: bool,
    bs: int,
    num_workers: int,
    augmenters: Dict = None,
):
    train_data = _prepare(train_file, root_folder)
    valid_data = _prepare(valid_file, root_folder)

    train_augmenter = augmenters["train"]
    valid_augmenter = augmenters["valid"]

    # Apply the train augmenter to the raw waveform under the "features" key.
    train_transforms_fn = transforms.Compose([
        Augmentor(
            dict_key="features",
            augment_fn=lambda x: train_augmenter(samples=x,
                                                 sample_rate=16000))
    ])
    # For the validation part of the dataset we apply only the (typically
    # lighter) validation augmenter.
    valid_transforms_fn = transforms.Compose([
        Augmentor(
            dict_key="features",
            augment_fn=lambda x: valid_augmenter(samples=x,
                                                 sample_rate=16000))
    ])

    compose = [
        AudioReader(
            input_key="filepath",
            output_key="features",
        ),
        ScalarReader(input_key="label", output_key="targets",
                     default_value=-1, dtype=np.int64),
    ]
    if one_hot_encoding:
        compose.append(
            ScalarReader(
                input_key="label",
                output_key="targets_one_hot",
                default_value=-1,
                dtype=np.int64,
                one_hot_classes=num_classes,
            ))
    open_fn = ReaderCompose(compose)

    train_loader = catalyst_utils.get_loader(
        train_data,
        open_fn=open_fn,
        dict_transform=train_transforms_fn,
        batch_size=bs,
        num_workers=num_workers,
        shuffle=True,  # no sampler is used here, so let the loader shuffle
    )
    valid_loader = catalyst_utils.get_loader(
        valid_data,
        open_fn=open_fn,
        dict_transform=valid_transforms_fn,
        batch_size=bs,
        num_workers=1,
        shuffle=False,
    )

    loaders = OrderedDict()
    loaders["train"] = train_loader
    loaders["valid"] = valid_loader
    return loaders
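# Usage sketch with audiomentations (an assumption: any callable accepting
# `samples` and `sample_rate` keyword arguments works as an augmenter here;
# the file and path arguments are hypothetical):
from audiomentations import AddGaussianNoise, Compose as AudioCompose

augmenters = {
    "train": AudioCompose([AddGaussianNoise(p=0.5)]),
    "valid": lambda samples, sample_rate: samples,  # no-op for validation
}
loaders = create_dataloaders(
    train_file="train.csv",
    valid_file="valid.csv",
    root_folder="data/audio",
    meta_info_file="meta.csv",
    num_classes=10,
    one_hot_encoding=True,
    bs=32,
    num_workers=4,
    augmenters=augmenters,
)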