def _get_train_open_fn(train_folder):
    return ReaderCompose(
        readers=[RowsReader(reader=ReaderCompose(
            readers=[
                ImageReader(row_key="Image", dict_key="Image", datapath=train_folder),
                TextReader(row_key="Id", dict_key="Id"),
                TextReader(row_key="Image", dict_key="ImageFile"),
            ]))],
        mixins=[SiameseLabelMixin(
            dict_first_id_key="Id0", dict_second_id_key="Id1")],
    )
def create_reader(root, num_classes):

    # ReaderCompose collects different Readers into one pipeline
    open_fn = ReaderCompose([

        # Reads images from the `datapath` folder
        # using the key `input_key="filepath"` (the value should be the filename)
        # and writes them to the output dictionary under `output_key="features"`
        ImageReader(input_key="filepath", output_key="features",
                    datapath=root),

        # Reads a number from our dataframe
        # by the key `input_key="label"` as np.int64
        # and writes it to the output dictionary under `output_key="targets"`
        ScalarReader(input_key="label",
                     output_key="targets",
                     default_value=-1,
                     dtype=np.int64),

        # Same as above, but with one-hot encoding
        ScalarReader(input_key="label",
                     output_key="targets_one_hot",
                     default_value=-1,
                     dtype=np.int64,
                     one_hot_classes=num_classes)
    ])

    return open_fn
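
For reference, the composed `open_fn` is just a callable over a single record: each reader pulls one field from the record and writes one output key. A minimal sketch of calling it, with a hypothetical record and paths:

# Hypothetical record with the keys create_reader expects.
record = {"filepath": "img_001.jpg", "label": 3}

open_fn = create_reader(root="data/images", num_classes=10)
sample = open_fn(record)
# sample["features"]        -> the decoded image array
# sample["targets"]         -> the label as np.int64
# sample["targets_one_hot"] -> one-hot vector of length num_classes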
Example #3
def get_loaders(train_transforms_fn,
                valid_transforms_fn,
                config,
                batch_size: int = 8,
                num_workers: int = 20,
                sampler=None) -> OrderedDict:
    train_data, valid_data = get_datasets(config)

    open_fn = ReaderCompose([
        ImageReader(input_key="filepath",
                    output_key="features",
                    rootpath=config.root_images),
        ScalarReader(input_key="disease_type",
                     output_key="targets",
                     default_value=-1,
                     dtype=np.int64),
        ScalarReader(input_key="disease_type",
                     output_key="targets_one_hot",
                     default_value=-1,
                     dtype=np.int64,
                     one_hot_classes=config.num_classes)
    ])

    train_loader = utils.get_loader(
        train_data,
        open_fn=open_fn,
        dict_transform=train_transforms_fn,
        batch_size=batch_size,
        num_workers=num_workers,
        # shuffle only when no sampler is given (a PyTorch requirement)
        shuffle=sampler is None,
        sampler=sampler,
        drop_last=True,
    )

    valid_loader = utils.get_loader(
        valid_data,
        open_fn=open_fn,
        dict_transform=valid_transforms_fn,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=False,
        sampler=None,
        drop_last=True,
    )

    loaders = OrderedDict()
    loaders["train"] = train_loader
    loaders["valid"] = valid_loader

    return loaders
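
A usage sketch for the function above; `get_transforms` is a hypothetical helper, and `config` is assumed to carry the `root_images` and `num_classes` fields used inside the function:

# Hypothetical setup; substitute your own transforms and config.
train_transforms_fn, valid_transforms_fn = get_transforms()
loaders = get_loaders(
    train_transforms_fn,
    valid_transforms_fn,
    config,
    batch_size=16,
    num_workers=4,
)
batch = next(iter(loaders["train"]))
features, targets = batch["features"], batch["targets"]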
def get_open_fn(data_root, num_class):
    open_fn = ReaderCompose([
        ImageReader(input_key="filepath",
                    output_key="features",
                    datapath=data_root),
        ScalarReader(input_key="label",
                     output_key="targets",
                     default_value=-1,
                     dtype=np.int64),
        ScalarReader(input_key="label",
                     output_key="targets_one_hot",
                     default_value=-1,
                     dtype=np.int64,
                     one_hot_classes=num_class)
    ])
    return open_fn
Example #5
    def __init__(
        self,
        rootpath: str,
        target_key: str = "targets",
        dir2class: Optional[Mapping[str, int]] = None,
        dict_transform: Optional[Callable[[Dict], Dict]] = None,
    ) -> None:
        """Constructor method for the :class:`ImageFolderDataset` class.

        Args:
            rootpath: root directory of dataset
            target_key: key to use to store target label
            dir2class (Mapping[str, int], optional): mapping from folder name
                to class index
            dict_transform (Callable[[Dict], Dict], optional): transforms
                to use on dict
        """
        files = glob.iglob(f"{rootpath}/**/*")
        images = sorted(filter(utils.has_image_extension, files))

        if dir2class is None:
            dirs = sorted({Path(f).parent.name for f in images})
            dir2class = {dirname: index for index, dirname in enumerate(dirs)}

        super().__init__(
            filenames=images,
            open_fn=ReaderCompose(
                [
                    ImageReader(input_key="image", rootpath=rootpath),
                    ScalarReader(
                        input_key=target_key,
                        output_key=target_key,
                        dtype=int,
                        default_value=-1,
                    ),
                ]
            ),
            label_fn=lambda fn: dir2class[Path(fn).parent.name],
            features_key="image",
            target_key=target_key,
            dict_transform=dict_transform,
        )
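
A minimal usage sketch, assuming the conventional `rootpath/<class_dir>/<image>` layout (the path below is a placeholder):

dataset = ImageFolderDataset(rootpath="data/flowers/train")
sample = dataset[0]
image, target = sample["image"], sample["targets"]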
Example #6
    def get_datasets(
        self,
        stage: str,
        datapath: str = None,
        in_csv: str = None,
        in_csv_train: str = None,
        in_csv_valid: str = None,
        in_csv_infer: str = None,
        train_folds: str = None,
        valid_folds: str = None,
        tag2class: str = None,
        class_column: str = None,
        tag_column: str = None,
        folds_seed: int = 42,
        n_folds: int = 5,
        one_hot_classes: bool = None,
        num_frames: int = None,
        num_segments: int = None,
        time_window: int = None,
        uniform_time_sample: bool = False,
    ):
        datasets = collections.OrderedDict()
        tag2class = (
            json.load(open(tag2class)) if tag2class is not None else None
        )

        df, df_train, df_valid, df_infer = read_csv_data(
            in_csv=in_csv,
            in_csv_train=in_csv_train,
            in_csv_valid=in_csv_valid,
            in_csv_infer=in_csv_infer,
            train_folds=train_folds,
            valid_folds=valid_folds,
            tag2class=tag2class,
            class_column=class_column,
            tag_column=tag_column,
            seed=folds_seed,
            n_folds=n_folds)

        df_valid = preprocess_valid_data(df_valid)

        open_fn = [
            ScalarReader(input_key="class",
                         output_key="targets",
                         default_value=-1,
                         dtype=np.int64)
        ]
        if one_hot_classes:
            open_fn.append(
                ScalarReader(input_key="class",
                             output_key="targets_one_hot",
                             default_value=-1,
                             dtype=np.int64,
                             one_hot_classes=one_hot_classes))

        open_fn_val = open_fn.copy()
        open_fn.append(
            VideoImageReader(input_key="filepath",
                             output_key="features",
                             datapath=datapath,
                             num_frames=num_frames,
                             num_segments=num_segments,
                             time_window=time_window,
                             uniform_time_sample=uniform_time_sample))
        open_fn_val.append(
            VideoImageReader(input_key="filepath",
                             output_key="features",
                             datapath=datapath,
                             num_frames=num_frames,
                             num_segments=num_segments,
                             time_window=time_window,
                             uniform_time_sample=uniform_time_sample,
                             with_offset=True))

        open_fn = ReaderCompose(readers=open_fn)
        open_fn_val = ReaderCompose(readers=open_fn_val)

        for source, mode in zip((df_train, df_valid, df_infer),
                                ("train", "valid", "infer")):
            if len(source) > 0:
                dataset = ListDataset(
                    source,
                    open_fn=open_fn_val if mode == "valid" else open_fn,
                    dict_transform=self.get_transforms(stage=stage, mode=mode),
                )
                dataset_dict = {"dataset": dataset}
                if mode == "train":
                    labels = [x["class"] for x in source]
                    sampler = BalanceClassSampler(labels, mode="upsampling")
                    dataset_dict["sampler"] = sampler

                datasets[mode] = dataset_dict
        return datasets
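
The returned dict mixes plain datasets with `{"dataset": ..., "sampler": ...}` entries; a sketch of feeding both shapes into PyTorch loaders:

from torch.utils.data import DataLoader

loaders = {}
for mode, entry in datasets.items():
    if isinstance(entry, dict):  # entry carries its own sampler (may be None)
        loaders[mode] = DataLoader(
            entry["dataset"], sampler=entry.get("sampler"), batch_size=8)
    else:
        loaders[mode] = DataLoader(
            entry, batch_size=8, shuffle=(mode == "train"))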
Example #7
    def prepare_loaders(*,
                        mode: str,
                        stage: str = None,
                        n_workers: int = None,
                        batch_size: int = None,
                        datapath=None,
                        in_csv=None,
                        in_csv_train=None,
                        in_csv_valid=None,
                        in_csv_infer=None,
                        train_folds=None,
                        valid_folds=None,
                        tag2class=None,
                        class_column=None,
                        tag_column=None,
                        folds_seed=42,
                        n_folds=5):
        loaders = collections.OrderedDict()

        df, df_train, df_valid, df_infer = parse_in_csvs(
            in_csv=in_csv,
            in_csv_train=in_csv_train,
            in_csv_valid=in_csv_valid,
            in_csv_infer=in_csv_infer,
            train_folds=train_folds,
            valid_folds=valid_folds,
            tag2class=tag2class,
            class_column=class_column,
            tag_column=tag_column,
            folds_seed=folds_seed,
            n_folds=n_folds)

        open_fn = [
            ImageReader(input_key="filepath",
                        output_key="image",
                        datapath=datapath),
            ScalarReader(input_key="class",
                         output_key="targets",
                         default_value=-1,
                         dtype=np.int64)
        ]
        open_fn = ReaderCompose(readers=open_fn)

        if len(df_train) > 0:
            labels = [x["class"] for x in df_train]
            sampler = BalanceClassSampler(labels, mode="upsampling")

            train_loader = UtilsFactory.create_loader(
                data_source=df_train,
                open_fn=open_fn,
                dict_transform=DataSource.prepare_transforms(mode="train",
                                                             stage=stage),
                dataset_cache_prob=-1,
                batch_size=batch_size,
                workers=n_workers,
                shuffle=sampler is None,
                sampler=sampler)

            print("Train samples", len(train_loader) * batch_size)
            print("Train batches", len(train_loader))
            loaders["train"] = train_loader

        if len(df_valid) > 0:
            sampler = None

            valid_loader = UtilsFactory.create_loader(
                data_source=df_valid,
                open_fn=open_fn,
                dict_transform=DataSource.prepare_transforms(mode="valid",
                                                             stage=stage),
                dataset_cache_prob=-1,
                batch_size=batch_size,
                workers=n_workers,
                shuffle=False,
                sampler=sampler)

            print("Valid samples", len(valid_loader) * batch_size)
            print("Valid batches", len(valid_loader))
            loaders["valid"] = valid_loader

        if len(df_infer) > 0:
            infer_loader = UtilsFactory.create_loader(
                data_source=df_infer,
                open_fn=open_fn,
                dict_transform=DataSource.prepare_transforms(mode="infer",
                                                             stage=None),
                dataset_cache_prob=-1,
                batch_size=batch_size,
                workers=n_workers,
                shuffle=False,
                sampler=None)

            print("Infer samples", len(infer_loader) * batch_size)
            print("Infer batches", len(infer_loader))
            loaders["infer"] = infer_loader

        return loaders
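
A hypothetical invocation of the method above, assuming `prepare_loaders` is exposed as a static method of `DataSource` (as its body suggests); the paths and column names are placeholders:

loaders = DataSource.prepare_loaders(
    mode="train",
    stage="stage1",
    n_workers=4,
    batch_size=64,
    datapath="data/images",
    in_csv="data/dataset.csv",
    tag2class={"cat": 0, "dog": 1},
    class_column="class",
    tag_column="tag",
)
train_loader = loaders["train"]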
Example #8
    def get_datasets(self,
                     stage: str,
                     datapath: str = None,
                     in_csv: str = None,
                     in_csv_train: str = None,
                     in_csv_valid: str = None,
                     in_csv_infer: str = None,
                     train_folds: str = None,
                     valid_folds: str = None,
                     tag2class: str = None,
                     class_column: str = None,
                     tag_column: str = None,
                     folds_seed: int = 42,
                     n_folds: int = 5,
                     one_hot_classes: int = None,
                     image_size: int = 224):
        datasets = collections.OrderedDict()
        tag2class = (
            json.load(open(tag2class)) if tag2class is not None else None
        )

        df, df_train, df_valid, df_infer = read_csv_data(
            in_csv=in_csv,
            in_csv_train=in_csv_train,
            in_csv_valid=in_csv_valid,
            in_csv_infer=in_csv_infer,
            train_folds=train_folds,
            valid_folds=valid_folds,
            tag2class=tag2class,
            class_column=class_column,
            tag_column=tag_column,
            seed=folds_seed,
            n_folds=n_folds)

        open_fn = [
            ImageReader(input_key="filepath",
                        output_key="image",
                        datapath=datapath),
            ScalarReader(input_key="class",
                         output_key="targets",
                         default_value=-1,
                         dtype=np.int64)
        ]

        if one_hot_classes:
            open_fn.append(
                ScalarReader(input_key="class",
                             output_key="targets_one_hot",
                             default_value=-1,
                             dtype=np.int64,
                             one_hot_classes=one_hot_classes))

        open_fn = ReaderCompose(readers=open_fn)

        for source, mode in zip((df_train, df_valid, df_infer),
                                ("train", "valid", "infer")):
            if len(source) > 0:
                dataset = ListDataset(
                    source,
                    open_fn=open_fn,
                    dict_transform=self.get_transforms(
                        stage=stage,
                        mode=mode,
                        image_size=image_size,
                        one_hot_classes=one_hot_classes),
                )
                if mode == "train":
                    labels = [x["class"] for x in source]
                    sampler = BalanceClassSampler(labels, mode="upsampling")
                    dataset = {"dataset": dataset, "sampler": sampler}
                datasets[mode] = dataset

        return datasets
                                 df_with_labels["rust"] * 2 + df_with_labels["scab"] * 3
df_with_labels.head(10)

train_data, valid_data = split_dataframe_train_test(df_with_labels,
                                                    test_size=0.3,
                                                    random_state=config.seed)
train_data = train_data.to_dict("records")
valid_data = valid_data.to_dict("records")

open_fn = ReaderCompose([
    ImageReader(input_key="filepath",
                output_key="features",
                rootpath=config.root_images),
    ScalarReader(input_key="disease_type",
                 output_key="targets",
                 default_value=-1,
                 dtype=np.int64),
    ScalarReader(input_key="disease_type",
                 output_key="targets_one_hot",
                 default_value=-1,
                 dtype=np.int64,
                 one_hot_classes=config.num_classes)
])

train_transforms = compose(
    [pre_transforms(config.size),
     hard_transforms(),
     post_transforms()])
valid_transforms = compose([pre_transforms(config.size), post_transforms()])

show_transforms = compose([pre_transforms(config.size), hard_transforms()])
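
A sketch tying the pieces above together with the same `utils.get_loader` call used in the earlier examples; the batch size and worker count are arbitrary:

from catalyst import utils

train_loader = utils.get_loader(
    train_data,
    open_fn=open_fn,
    dict_transform=train_transforms,
    batch_size=16,
    num_workers=4,
    shuffle=True,
)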
Example #10
def create_dataloders(
    train_file: str,
    valid_file: str,
    root_folder: str,
    meta_info_file: str,
    num_classes: int,
    one_hot_encoding: bool,
    bs: int,
    num_workers: int,
    augmenters: Dict = None,
):

    train_data = _prepare(train_file, root_folder)
    valid_data = _prepare(valid_file, root_folder)

    train_augmenter = augmenters['train']
    valid_augmenter = augmenters['valid']

    train_transforms_fn = transforms.Compose([
        Augmentor(
            dict_key="features",
            augment_fn=lambda x: train_augmenter(samples=x, sample_rate=16000))
    ])

    # Similarly for the validation part of the dataset,
    # but applying only the (lighter) validation augmenter.
    valid_transforms_fn = transforms.Compose([
        Augmentor(
            dict_key="features",
            augment_fn=lambda x: valid_augmenter(samples=x, sample_rate=16000))
    ])

    compose = [
        AudioReader(
            input_key="filepath",
            output_key="features",
        ),
        ScalarReader(input_key="label",
                     output_key="targets",
                     default_value=-1,
                     dtype=np.int64),
    ]

    if one_hot_encoding:
        compose.append(
            ScalarReader(
                input_key="label",
                output_key="targets_one_hot",
                default_value=-1,
                dtype=np.int64,
                one_hot_classes=num_classes,
            ))

    open_fn = ReaderCompose(compose)

    train_loader = catalyst_utils.get_loader(
        train_data,
        open_fn=open_fn,
        dict_transform=train_transforms_fn,
        batch_size=bs,
        num_workers=num_workers,
        shuffle=True,  # no sampler is passed here, so the data can be shuffled
    )

    valid_loader = catalyst_utils.get_loader(
        valid_data,
        open_fn=open_fn,
        dict_transform=valid_transforms_fn,
        batch_size=bs,
        num_workers=1,
        shuffle=False,
    )

    loaders = OrderedDict()
    loaders["train"] = train_loader
    loaders["valid"] = valid_loader

    return loaders
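
A hypothetical invocation; the file paths are placeholders, and the augmenters are sketched with audiomentations (any callable accepting `samples` and `sample_rate` would do):

from audiomentations import AddGaussianNoise, Compose

# Placeholder augmenters; the validation pipeline applies no perturbation.
augmenters = {
    "train": Compose([AddGaussianNoise(p=0.5)]),
    "valid": Compose([]),
}

loaders = create_dataloders(
    train_file="data/train.csv",
    valid_file="data/valid.csv",
    root_folder="data/audio",
    meta_info_file="data/meta.csv",
    num_classes=10,
    one_hot_encoding=True,
    bs=32,
    num_workers=4,
    augmenters=augmenters,
)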