Example #1
import torch
# default_collate_fn used below is assumed to be torch's default_collate
from torch.utils.data.dataloader import default_collate as default_collate_fn


def get_loader(
    data_source,
    open_fn,
    dict_transform=None,
    dataset_cache_prob=-1,
    sampler=None,
    collate_fn=default_collate_fn,
    batch_size=32,
    num_workers=4,
    shuffle=False,
    drop_last=False
):
    from catalyst.data import ListDataset

    dataset = ListDataset(
        data_source,
        open_fn=open_fn,
        dict_transform=dict_transform,
        cache_prob=dataset_cache_prob
    )
    loader = torch.utils.data.DataLoader(
        dataset=dataset,
        sampler=sampler,
        collate_fn=collate_fn,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=shuffle,
        pin_memory=torch.cuda.is_available(),
        drop_last=drop_last,
    )
    return loader
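
As a quick illustration, here is a hypothetical call to this helper. The records list and open_example function below are invented for the example, and the call relies on torch's default_collate being able to batch dicts of tensors (which it can):

import torch

# Hypothetical annotations: one dict per sample.
records = [
    {"feature": [0.0, 1.0], "label": 0},
    {"feature": [1.0, 0.0], "label": 1},
]

def open_example(record: dict) -> dict:
    # Convert one annotation dict into the tensors the network consumes.
    return {
        "features": torch.tensor(record["feature"], dtype=torch.float32),
        "targets": torch.tensor(record["label"], dtype=torch.int64),
    }

loader = get_loader(records, open_fn=open_example, batch_size=2, num_workers=0)
batch = next(iter(loader))  # {"features": 2x2 tensor, "targets": 2-element tensor}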
Example #2
from typing import Callable, Iterable

import torch
# default_collate_fn used below is assumed to be torch's default_collate
from torch.utils.data.dataloader import default_collate as default_collate_fn


def get_loader(data_source: Iterable[dict],
               open_fn: Callable,
               dict_transform: Callable = None,
               sampler=None,
               collate_fn: Callable = default_collate_fn,
               batch_size: int = 32,
               num_workers: int = 4,
               shuffle: bool = False,
               drop_last: bool = False):
    """
    Creates a DataLoader from a given data source and its open/transform params

    Args:
        data_source (Iterable[dict]): an iterable containing your
            data annotations
            (for example, paths to images, labels, bboxes, etc.)
        open_fn (Callable): function that can open your
            annotation dict and
            transform it into the data needed by your network
            (for example, open an image by its path,
            or tokenize a read string)
        dict_transform (Callable): transforms to apply to the dict
            (for example, normalize the image, add blur, crop/resize, etc.)
        sampler (Sampler, optional): defines the strategy to draw samples from
            the dataset
        collate_fn (callable, optional): merges a list of samples to form a
            mini-batch of Tensor(s).  Used when using batched loading from a
            map-style dataset
        batch_size (int, optional): how many samples per batch to load
        num_workers (int, optional): how many subprocesses to use for data
            loading. ``0`` means that the data will be loaded
            in the main process
        shuffle (bool, optional): set to ``True`` to have the data reshuffled
            at every epoch (default: ``False``).
        drop_last (bool, optional): set to ``True`` to drop
            the last incomplete batch, if the dataset size is not divisible
            by the batch size. If ``False`` and the size of dataset
            is not divisible by the batch size, then the last batch
            will be smaller. (default: ``False``)

    Returns:
        DataLoader with ``catalyst.data.ListDataset``
    """
    from catalyst.data import ListDataset

    dataset = ListDataset(
        list_data=data_source,
        open_fn=open_fn,
        dict_transform=dict_transform,
    )
    loader = torch.utils.data.DataLoader(
        dataset=dataset,
        sampler=sampler,
        collate_fn=collate_fn,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=shuffle,
        pin_memory=torch.cuda.is_available(),
        drop_last=drop_last,
    )
    return loader
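
The dict_transform argument described above receives the whole sample dict and returns it modified. A minimal sketch, where the "image" key and the scaling are assumptions made for illustration:

def normalize_image(sample: dict) -> dict:
    # Dict-level transform: scale the "image" entry to [0, 1].
    sample["image"] = sample["image"] / 255.0
    return sample

# loader = get_loader(data_source, open_fn=open_fn,
#                     dict_transform=normalize_image)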
Example #3
    def get_datasets(
        self,
        stage: str,
        datapath: str = None,
        in_csv: str = None,
        in_csv_train: str = None,
        in_csv_valid: str = None,
        in_csv_infer: str = None,
        train_folds: str = None,
        valid_folds: str = None,
        tag2class: str = None,
        class_column: str = None,
        tag_column: str = None,
        folds_seed: int = 42,
        n_folds: int = 5,
        image_size: int = 256,
    ):
        datasets = collections.OrderedDict()
        tag2class = (json.load(open(tag2class))
                     if tag2class is not None else None)

        df, df_train, df_valid, df_infer = read_csv_data(
            in_csv=in_csv,
            in_csv_train=in_csv_train,
            in_csv_valid=in_csv_valid,
            in_csv_infer=in_csv_infer,
            train_folds=train_folds,
            valid_folds=valid_folds,
            tag2class=tag2class,
            class_column=class_column,
            tag_column=tag_column,
            seed=folds_seed,
            n_folds=n_folds,
        )

        open_fn = ReaderCompose(readers=[
            ImageReader(
                input_key="images", output_key="image", datapath=datapath),
            MaskReader(input_key="masks", output_key="mask",
                       datapath=datapath),
            ScalarReader(
                input_key="name",
                output_key="name",
                default_value=-1,
                dtype=str,
            ),
        ])

        for mode, source in zip(("train", "valid", "infer"),
                                (df_train, df_valid, df_infer)):
            if source is not None and len(source) > 0:
                datasets[mode] = ListDataset(
                    list_data=source,
                    open_fn=open_fn,
                    dict_transform=self.get_transforms(stage=stage,
                                                       mode=mode,
                                                       image_size=image_size),
                )

        return datasets
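
To make the reader composition above concrete: each row from read_csv_data becomes a dict, and ReaderCompose merges the outputs of the individual readers into one sample dict. The file names below are hypothetical:

# One annotation row, as a dict (file names are invented):
row = {"images": "train/0001.png", "masks": "train/0001_mask.png", "name": "0001"}

# open_fn(row) would then yield roughly:
# {
#     "image": <array loaded from datapath/train/0001.png>,
#     "mask":  <array loaded from datapath/train/0001_mask.png>,
#     "name":  "0001",
# }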
Example #4
    def get_dataset(self, datapath, csv_path, info_json_path, stage, mode,
                    image_size, min_multiple, detection_pixel_threshold,
                    detection_area_threshold):
        df_as_list = dataframe_to_list(pd.read_csv(csv_path, sep=','))
        with open(info_json_path, "r") as json_file:
            dataset_info = json.load(json_file)
        return ListDataset(
            list_data=df_as_list,
            open_fn=self.get_open_fn(dataset_info, datapath),
            dict_transform=self.get_transforms(
                stage=stage,
                mode=mode,
                image_size=image_size,
                min_multiple=min_multiple,
                detection_pixel_threshold=detection_pixel_threshold,
                detection_area_threshold=detection_area_threshold))
Example #5
    def get_datasets(
        self,
        stage: str,
        datapath: str = None,
        in_csv: str = None,
        in_csv_train: str = None,
        in_csv_valid: str = None,
        in_csv_infer: str = None,
        train_folds: str = None,
        valid_folds: str = None,
        tag2class: str = None,
        class_column: str = None,
        tag_column: str = None,
        folds_seed: int = 42,
        n_folds: int = 5,
        one_hot_classes: int = None,
        balance_strategy: str = "upsampling",
    ):
        datasets = collections.OrderedDict()
        tag2class = safitty.load(tag2class) if tag2class is not None else None

        df, df_train, df_valid, df_infer = read_csv_data(
            in_csv=in_csv,
            in_csv_train=in_csv_train,
            in_csv_valid=in_csv_valid,
            in_csv_infer=in_csv_infer,
            train_folds=train_folds,
            valid_folds=valid_folds,
            tag2class=tag2class,
            class_column=class_column,
            tag_column=tag_column,
            seed=folds_seed,
            n_folds=n_folds,
        )

        open_fn = [
            ImageReader(
                input_key="filepath", output_key="image", rootpath=datapath
            )
        ]

        if stage.startswith('infer'):
            open_fn.append(ScalarReader(
                input_key="filepath",
                output_key="filepath",
                default_value=-1,
                dtype=str,
            ))
        else:
            open_fn.append(ScalarReader(
                input_key="class",
                output_key="targets",
                default_value=-1,
                dtype=np.int64,
            ))

            if one_hot_classes:
                open_fn.append(
                    ScalarReader(
                        input_key="class",
                        output_key="targets_one_hot",
                        default_value=-1,
                        dtype=np.int64,
                        one_hot_classes=one_hot_classes,
                    )
                )

        open_fn = ReaderCompose(readers=open_fn)

        for source, mode in zip(
            (df_train, df_valid, df_infer), ("train", "valid", "infer")
        ):
            if source is not None and len(source) > 0:
                dataset = ListDataset(
                    source,
                    open_fn=open_fn,
                    dict_transform=self.get_transforms(
                        stage=stage, dataset=mode
                    ),
                )
                if mode == "train":
                    labels = [x["class"] for x in source]
                    sampler = BalanceClassSampler(
                        labels, mode=balance_strategy
                    )
                    dataset = {"dataset": dataset, "sampler": sampler}
                datasets[mode] = dataset

        if stage == 'infer':
            datasets['infer'] = datasets['valid']
            del datasets['valid']
            if 'train' in datasets:
                del datasets['train']

        return datasets
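
For the train split, this method returns a {"dataset": ..., "sampler": ...} pair rather than a bare dataset, and the experiment runner presumably unpacks it. A minimal sketch of how such a pair could be turned into a loader (the make_train_loader helper is hypothetical, not part of catalyst):

import torch

def make_train_loader(train_entry, batch_size=32):
    # With an explicit sampler, shuffle must stay False: the
    # BalanceClassSampler already controls the sampling order.
    return torch.utils.data.DataLoader(
        dataset=train_entry["dataset"],
        sampler=train_entry["sampler"],
        batch_size=batch_size,
        shuffle=False,
    )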
Example #6
    def get_datasets(
        self,
        stage: str,
        datapath: Optional[str] = None,
        in_csv: Optional[str] = None,
        in_csv_train: Optional[str] = None,
        in_csv_valid: Optional[str] = None,
        in_csv_infer: Optional[str] = None,
        train_folds: Optional[str] = None,
        valid_folds: Optional[str] = None,
        tag2class: Optional[str] = None,
        class_column: Optional[str] = None,
        tag_column: Optional[str] = None,
        folds_seed: int = 42,
        n_folds: int = 5,
    ):
        """Returns the datasets for a given stage and epoch.

        Args:
            stage (str): stage name of interest,
                like "pretrain" / "train" / "finetune" / etc
            datapath (str): path to folder with images and masks
            in_csv (Optional[str]): path to CSV annotation file. Look at
                :func:`catalyst.contrib.utils.pandas.read_csv_data` for details
            in_csv_train (Optional[str]): path to CSV annotation file
                with train samples
            in_csv_valid (Optional[str]): path to CSV annotation file
                with the validation samples
            in_csv_infer (Optional[str]): path to CSV annotation file
                with test samples
            train_folds (Optional[str]): folds to use for training
            valid_folds (Optional[str]): folds to use for validation
            tag2class (Optional[str]): path to JSON file with mapping from
                class name (tag) to index
            class_column (Optional[str]): name of the class index column
                in the CSV file
            tag_column (Optional[str]): name of the class name (tag) column
                in the CSV file
            folds_seed (int): random seed to use
            n_folds (int): number of folds to split the data into

        Returns:
            Dict: dictionary with datasets for the current stage.
        """
        datasets = collections.OrderedDict()
        tag2class = (
            json.load(open(tag2class)) if tag2class is not None else None
        )

        df, df_train, df_valid, df_infer = read_csv_data(
            in_csv=in_csv,
            in_csv_train=in_csv_train,
            in_csv_valid=in_csv_valid,
            in_csv_infer=in_csv_infer,
            train_folds=train_folds,
            valid_folds=valid_folds,
            tag2class=tag2class,
            class_column=class_column,
            tag_column=tag_column,
            seed=folds_seed,
            n_folds=n_folds,
        )

        open_fn = ReaderCompose(
            readers=[
                ImageReader(
                    input_key="images", output_key="image", rootpath=datapath
                ),
                MaskReader(
                    input_key="masks", output_key="mask", rootpath=datapath
                ),
                ScalarReader(
                    input_key="name",
                    output_key="name",
                    dtype=str,
                    default_value=-1,
                ),
            ]
        )

        for mode, source in zip(
            ("train", "valid", "infer"), (df_train, df_valid, df_infer)
        ):
            if source is not None and len(source) > 0:
                datasets[mode] = ListDataset(
                    list_data=source,
                    open_fn=open_fn,
                    dict_transform=self.get_transforms(
                        stage=stage, dataset=mode
                    ),
                )

        return datasets
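
Per the docstring, tag2class points at a JSON file mapping a class name (tag) to a class index. A plausible file and how it is loaded (the class names here are invented for illustration):

import json

# Hypothetical contents of tag2class.json:
#     {"cat": 0, "dog": 1, "bird": 2}
with open("tag2class.json") as f:
    tag2class = json.load(f)  # {"cat": 0, "dog": 1, "bird": 2}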
Example #7
    def get_datasets(
        self,
        stage: str,
        datapath: str = None,
        in_csv: str = None,
        in_csv_train: str = None,
        in_csv_valid: str = None,
        in_csv_infer: str = None,
        train_folds: str = None,
        valid_folds: str = None,
        tag2class: str = None,
        class_column: str = None,
        tag_column: str = None,
        folds_seed: int = 42,
        n_folds: int = 5,
        image_size: int = 256,
    ):
        datasets = collections.OrderedDict()
        tag2class = (json.load(open(tag2class))
                     if tag2class is not None else None)

        df, df_train, df_valid, df_infer = read_csv_data(
            in_csv=in_csv,
            in_csv_train=in_csv_train,
            in_csv_valid=in_csv_valid,
            in_csv_infer=in_csv_infer,
            train_folds=train_folds,
            valid_folds=valid_folds,
            tag2class=tag2class,
            class_column=class_column,
            tag_column=tag_column,
            seed=folds_seed,
            n_folds=n_folds,
        )

        import cv2
        import os

        def encode_fn_lambda(fname, datapath):
            # Read the mask image, convert it to grayscale, and binarize it
            # to {0, 1}, returning an HxWx1 array
            return (cv2.cvtColor(cv2.imread(os.path.join(datapath, fname)),
                                 cv2.COLOR_BGR2GRAY) // 255)[:, :, None]

        open_fn = ReaderCompose(readers=[
            ImageReader(
                input_key="images", output_key="image", datapath=datapath),
            LambdaReader(input_key="masks",
                         output_key="mask",
                         datapath=datapath,
                         encode_fn=encode_fn_lambda),
            # MaskReader(
            #     input_key="masks", output_key="mask", datapath=datapath
            # ),
            ScalarReader(
                input_key="name",
                output_key="name",
                default_value=-1,
                dtype=str,
            ),
        ])

        for mode, source in zip(("train", "valid", "infer"),
                                (df_train, df_valid, df_infer)):
            if source is not None and len(source) > 0:
                datasets[mode] = ListDataset(
                    list_data=source,
                    open_fn=open_fn,
                    dict_transform=self.get_transforms(stage=stage,
                                                       mode=mode,
                                                       image_size=image_size),
                )

        # fff = datasets["train"][0]

        return datasets
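
A small self-contained check of the mask-encoding logic above (assuming encode_fn_lambda were lifted to module scope; the temporary file and values are purely illustrative):

import os
import tempfile

import cv2
import numpy as np

# Write a tiny black-and-white mask to disk and read it back.
datapath = tempfile.mkdtemp()
mask = np.zeros((4, 4), dtype=np.uint8)
mask[:2] = 255  # top half "on"
cv2.imwrite(os.path.join(datapath, "m.png"), mask)

encoded = encode_fn_lambda("m.png", datapath)
assert encoded.shape == (4, 4, 1)          # single-channel, HxWx1
assert set(np.unique(encoded)) <= {0, 1}   # values binarized to {0, 1}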