def get_loader(
    data_source,
    open_fn,
    dict_transform=None,
    dataset_cache_prob=-1,
    sampler=None,
    collate_fn=default_collate_fn,
    batch_size=32,
    num_workers=4,
    shuffle=False,
    drop_last=False
):
    """Wrap ``data_source`` in a ``ListDataset`` and return a DataLoader.

    Args:
        data_source: passed to ``ListDataset`` as its first positional
            argument
        open_fn: forwarded to ``ListDataset`` as ``open_fn``
        dict_transform: forwarded to ``ListDataset`` as ``dict_transform``
        dataset_cache_prob: forwarded to ``ListDataset`` as ``cache_prob``
        sampler: forwarded to the ``DataLoader``
        collate_fn: forwarded to the ``DataLoader``
        batch_size: forwarded to the ``DataLoader``
        num_workers: forwarded to the ``DataLoader``
        shuffle: forwarded to the ``DataLoader``
        drop_last: forwarded to the ``DataLoader``

    Returns:
        ``torch.utils.data.DataLoader`` built over the ``ListDataset``;
        ``pin_memory`` is switched on whenever CUDA is available.
    """
    # Function-scope import, kept local exactly as in the original code.
    from catalyst.data import ListDataset

    list_dataset = ListDataset(
        data_source,
        open_fn=open_fn,
        dict_transform=dict_transform,
        cache_prob=dataset_cache_prob,
    )

    loader_kwargs = {
        "dataset": list_dataset,
        "sampler": sampler,
        "collate_fn": collate_fn,
        "batch_size": batch_size,
        "num_workers": num_workers,
        "shuffle": shuffle,
        "pin_memory": torch.cuda.is_available(),
        "drop_last": drop_last,
    }
    return torch.utils.data.DataLoader(**loader_kwargs)
def get_loader(
    data_source: Iterable[dict],
    open_fn: Callable,
    dict_transform: Callable = None,
    sampler=None,
    collate_fn: Callable = default_collate_fn,
    batch_size: int = 32,
    num_workers: int = 4,
    shuffle: bool = False,
    drop_last: bool = False,
):
    """
    Builds a DataLoader over a ``catalyst.data.ListDataset`` made from
    the given annotations and their open/transform functions.

    Args:
        data_source (Iterable[dict]): an iterable with your data
            annotations (for example paths to images, labels, bboxes, etc)
        open_fn (Callable): function that opens one annotation dict and
            turns it into the data your network needs (for example
            opens an image by path, or tokenizes a read string)
        dict_transform (callable): transforms applied to the opened dict
            (for example normalize image, add blur, crop/resize/etc)
        sampler (Sampler, optional): strategy used to draw samples
            from the dataset
        collate_fn (callable, optional): merges a list of samples into a
            mini-batch of Tensor(s). Used for batched loading
            from a map-style dataset
        batch_size (int, optional): number of samples per batch
        num_workers (int, optional): number of subprocesses used for
            data loading. ``0`` means the data is loaded
            in the main process
        shuffle (bool, optional): set to ``True`` to reshuffle the data
            at every epoch (default: ``False``).
        drop_last (bool, optional): set to ``True`` to drop the last
            incomplete batch when the dataset size is not divisible
            by the batch size. If ``False``, the last batch
            is simply smaller in that case. (default: ``False``)

    Returns:
        DataLoader with ``catalyst.data.ListDataset``
    """
    # Function-scope import, kept local exactly as in the original code.
    from catalyst.data import ListDataset

    list_dataset = ListDataset(
        list_data=data_source,
        open_fn=open_fn,
        dict_transform=dict_transform,
    )

    loader_kwargs = {
        "dataset": list_dataset,
        "sampler": sampler,
        "collate_fn": collate_fn,
        "batch_size": batch_size,
        "num_workers": num_workers,
        "shuffle": shuffle,
        # Page-locked host memory is only requested when CUDA is present.
        "pin_memory": torch.cuda.is_available(),
        "drop_last": drop_last,
    }
    return torch.utils.data.DataLoader(**loader_kwargs)
def get_datasets(
    self,
    stage: str,
    datapath: str = None,
    in_csv: str = None,
    in_csv_train: str = None,
    in_csv_valid: str = None,
    in_csv_infer: str = None,
    train_folds: str = None,
    valid_folds: str = None,
    tag2class: str = None,
    class_column: str = None,
    tag_column: str = None,
    folds_seed: int = 42,
    n_folds: int = 5,
    image_size: int = 256,
):
    """Build the image/mask datasets for ``stage``.

    Args:
        stage (str): stage name, forwarded to ``self.get_transforms``
        datapath (str): forwarded to the image and mask readers
        in_csv (str): forwarded to ``read_csv_data``
        in_csv_train (str): forwarded to ``read_csv_data``
        in_csv_valid (str): forwarded to ``read_csv_data``
        in_csv_infer (str): forwarded to ``read_csv_data``
        train_folds (str): forwarded to ``read_csv_data``
        valid_folds (str): forwarded to ``read_csv_data``
        tag2class (str): path to a JSON file; its parsed content is
            forwarded to ``read_csv_data`` (``None`` skips loading)
        class_column (str): forwarded to ``read_csv_data``
        tag_column (str): forwarded to ``read_csv_data``
        folds_seed (int): forwarded to ``read_csv_data`` as ``seed``
        n_folds (int): forwarded to ``read_csv_data``
        image_size (int): forwarded to ``self.get_transforms``

    Returns:
        collections.OrderedDict: ``ListDataset`` objects keyed by
        ``"train"`` / ``"valid"`` / ``"infer"``; missing or empty splits
        are omitted.
    """
    datasets = collections.OrderedDict()

    if tag2class is not None:
        # Context manager so the mapping file is closed right after parsing.
        with open(tag2class) as tag2class_file:
            tag2class = json.load(tag2class_file)

    # The first returned element (the full frame) is not needed here.
    _, df_train, df_valid, df_infer = read_csv_data(
        in_csv=in_csv,
        in_csv_train=in_csv_train,
        in_csv_valid=in_csv_valid,
        in_csv_infer=in_csv_infer,
        train_folds=train_folds,
        valid_folds=valid_folds,
        tag2class=tag2class,
        class_column=class_column,
        tag_column=tag_column,
        seed=folds_seed,
        n_folds=n_folds,
    )

    open_fn = ReaderCompose(
        readers=[
            ImageReader(
                input_key="images", output_key="image", datapath=datapath
            ),
            MaskReader(
                input_key="masks", output_key="mask", datapath=datapath
            ),
            ScalarReader(
                input_key="name",
                output_key="name",
                default_value=-1,
                dtype=str,
            ),
        ]
    )

    for mode, source in zip(
        ("train", "valid", "infer"), (df_train, df_valid, df_infer)
    ):
        # Guard against a split that was not produced at all (None) as
        # well as an empty one; len(None) would raise TypeError.
        if source is not None and len(source) > 0:
            datasets[mode] = ListDataset(
                list_data=source,
                open_fn=open_fn,
                dict_transform=self.get_transforms(
                    stage=stage, mode=mode, image_size=image_size
                ),
            )

    return datasets
def get_dataset(
    self,
    datapath,
    csv_path,
    info_json_path,
    stage,
    mode,
    image_size,
    min_multiple,
    detection_pixel_threshold,
    detection_area_threshold,
):
    """Create a ``ListDataset`` from a CSV annotation file.

    Args:
        datapath: forwarded to ``self.get_open_fn``
        csv_path: comma-separated file read with ``pandas.read_csv``
        info_json_path: JSON file whose parsed content is forwarded
            to ``self.get_open_fn``
        stage: forwarded to ``self.get_transforms``
        mode: forwarded to ``self.get_transforms``
        image_size: forwarded to ``self.get_transforms``
        min_multiple: forwarded to ``self.get_transforms``
        detection_pixel_threshold: forwarded to ``self.get_transforms``
        detection_area_threshold: forwarded to ``self.get_transforms``

    Returns:
        ``ListDataset`` over the CSV rows converted by
        ``dataframe_to_list``.
    """
    frame = pd.read_csv(csv_path, sep=',')
    records = dataframe_to_list(frame)

    with open(info_json_path, "r") as info_file:
        info = json.load(info_file)

    reader = self.get_open_fn(info, datapath)
    transform = self.get_transforms(
        stage=stage,
        mode=mode,
        image_size=image_size,
        min_multiple=min_multiple,
        detection_pixel_threshold=detection_pixel_threshold,
        detection_area_threshold=detection_area_threshold,
    )

    return ListDataset(
        list_data=records,
        open_fn=reader,
        dict_transform=transform,
    )
def get_datasets(
    self,
    stage: str,
    datapath: str = None,
    in_csv: str = None,
    in_csv_train: str = None,
    in_csv_valid: str = None,
    in_csv_infer: str = None,
    train_folds: str = None,
    valid_folds: str = None,
    tag2class: str = None,
    class_column: str = None,
    tag_column: str = None,
    folds_seed: int = 42,
    n_folds: int = 5,
    one_hot_classes: int = None,
    balance_strategy: str = "upsampling",
):
    """Build the classification datasets for ``stage``.

    Args:
        stage (str): stage name; a name starting with ``"infer"`` makes
            the readers emit ``"filepath"`` instead of ``"targets"``,
            and the exact name ``"infer"`` re-keys the result (see Returns)
        datapath (str): forwarded to ``ImageReader`` as ``rootpath``
        in_csv (str): forwarded to ``read_csv_data``
        in_csv_train (str): forwarded to ``read_csv_data``
        in_csv_valid (str): forwarded to ``read_csv_data``
        in_csv_infer (str): forwarded to ``read_csv_data``
        train_folds (str): forwarded to ``read_csv_data``
        valid_folds (str): forwarded to ``read_csv_data``
        tag2class (str): loaded with ``safitty.load`` when not ``None``
            and forwarded to ``read_csv_data``
        class_column (str): forwarded to ``read_csv_data``
        tag_column (str): forwarded to ``read_csv_data``
        folds_seed (int): forwarded to ``read_csv_data`` as ``seed``
        n_folds (int): forwarded to ``read_csv_data``
        one_hot_classes (int): when truthy (and not at an infer stage),
            an extra reader emits ``"targets_one_hot"``
        balance_strategy (str): ``mode`` of the ``BalanceClassSampler``
            attached to the train split

    Returns:
        collections.OrderedDict: entries keyed by split name. The train
        entry is a dict ``{"dataset": ..., "sampler": ...}``; other
        entries are ``ListDataset`` objects. When ``stage == "infer"``
        the ``"valid"`` entry (if any) is moved to ``"infer"`` and the
        ``"train"`` entry is dropped.
    """
    datasets = collections.OrderedDict()
    tag2class = safitty.load(tag2class) if tag2class is not None else None

    # The first returned element (the full frame) is not needed here.
    _, df_train, df_valid, df_infer = read_csv_data(
        in_csv=in_csv,
        in_csv_train=in_csv_train,
        in_csv_valid=in_csv_valid,
        in_csv_infer=in_csv_infer,
        train_folds=train_folds,
        valid_folds=valid_folds,
        tag2class=tag2class,
        class_column=class_column,
        tag_column=tag_column,
        seed=folds_seed,
        n_folds=n_folds,
    )

    readers = [
        ImageReader(
            input_key="filepath", output_key="image", rootpath=datapath
        )
    ]

    if stage.startswith("infer"):
        readers.append(
            ScalarReader(
                input_key="filepath",
                output_key="filepath",
                default_value=-1,
                # Builtin ``str``: the ``np.str`` alias it replaces was
                # removed from NumPy and was the same object anyway.
                dtype=str,
            )
        )
    else:
        readers.append(
            ScalarReader(
                input_key="class",
                output_key="targets",
                default_value=-1,
                dtype=np.int64,
            )
        )
        if one_hot_classes:
            readers.append(
                ScalarReader(
                    input_key="class",
                    output_key="targets_one_hot",
                    default_value=-1,
                    dtype=np.int64,
                    one_hot_classes=one_hot_classes,
                )
            )

    open_fn = ReaderCompose(readers=readers)

    for source, mode in zip(
        (df_train, df_valid, df_infer), ("train", "valid", "infer")
    ):
        if source is not None and len(source) > 0:
            dataset = ListDataset(
                source,
                open_fn=open_fn,
                dict_transform=self.get_transforms(
                    stage=stage, dataset=mode
                ),
            )
            if mode == "train":
                labels = [x["class"] for x in source]
                sampler = BalanceClassSampler(labels, mode=balance_strategy)
                dataset = {"dataset": dataset, "sampler": sampler}
            datasets[mode] = dataset

    # NOTE(review): the reader selection above uses ``startswith("infer")``
    # while this remapping requires the exact name -- confirm that stages
    # such as "infer_xyz" are meant to keep their train/valid keys.
    if stage == "infer":
        # Serve the validation split under the "infer" key. Guarded so a
        # run without a validation split no longer dies with KeyError.
        if "valid" in datasets:
            datasets["infer"] = datasets.pop("valid")
        datasets.pop("train", None)

    return datasets
def get_datasets(
    self,
    stage: str,
    datapath: Optional[str] = None,
    in_csv: Optional[str] = None,
    in_csv_train: Optional[str] = None,
    in_csv_valid: Optional[str] = None,
    in_csv_infer: Optional[str] = None,
    train_folds: Optional[str] = None,
    valid_folds: Optional[str] = None,
    tag2class: Optional[str] = None,
    class_column: Optional[str] = None,
    tag_column: Optional[str] = None,
    folds_seed: int = 42,
    n_folds: int = 5,
):
    """Returns the datasets for a given stage.

    Args:
        stage (str): stage name of interest,
            like "pretrain" / "train" / "finetune" / etc
        datapath (str): path to folder with images and masks
        in_csv (Optional[str]): path to CSV annotation file. Look at
            :func:`catalyst.contrib.utils.pandas.read_csv_data` for details
        in_csv_train (Optional[str]): path to CSV annotation file
            with train samples
        in_csv_valid (Optional[str]): path to CSV annotation file
            with the validation samples
        in_csv_infer (Optional[str]): path to CSV annotation file
            with test samples
        train_folds (Optional[str]): folds to use for training
        valid_folds (Optional[str]): folds to use for validation
        tag2class (Optional[str]): path to JSON file with mapping from
            class name (tag) to index
        class_column (Optional[str]): name of class index column in the CSV
        tag_column (Optional[str]): name of class name in the CSV file
        folds_seed (int): random seed to use
        n_folds (int): number of folds on which data will be split

    Returns:
        Dict: dictionary with datasets for current stage, keyed by
        ``"train"`` / ``"valid"`` / ``"infer"``; missing or empty splits
        are omitted.
    """
    datasets = collections.OrderedDict()

    if tag2class is not None:
        # Context manager so the mapping file is closed right after parsing.
        with open(tag2class) as tag2class_file:
            tag2class = json.load(tag2class_file)

    # The first returned element (the full frame) is not needed here.
    _, df_train, df_valid, df_infer = read_csv_data(
        in_csv=in_csv,
        in_csv_train=in_csv_train,
        in_csv_valid=in_csv_valid,
        in_csv_infer=in_csv_infer,
        train_folds=train_folds,
        valid_folds=valid_folds,
        tag2class=tag2class,
        class_column=class_column,
        tag_column=tag_column,
        seed=folds_seed,
        n_folds=n_folds,
    )

    open_fn = ReaderCompose(
        readers=[
            ImageReader(
                input_key="images", output_key="image", rootpath=datapath
            ),
            MaskReader(
                input_key="masks", output_key="mask", rootpath=datapath
            ),
            ScalarReader(
                input_key="name",
                output_key="name",
                dtype=str,
                default_value=-1,
            ),
        ]
    )

    for mode, source in zip(
        ("train", "valid", "infer"), (df_train, df_valid, df_infer)
    ):
        if source is not None and len(source) > 0:
            datasets[mode] = ListDataset(
                list_data=source,
                open_fn=open_fn,
                dict_transform=self.get_transforms(
                    stage=stage, dataset=mode
                ),
            )

    return datasets
def get_datasets(
    self,
    stage: str,
    datapath: str = None,
    in_csv: str = None,
    in_csv_train: str = None,
    in_csv_valid: str = None,
    in_csv_infer: str = None,
    train_folds: str = None,
    valid_folds: str = None,
    tag2class: str = None,
    class_column: str = None,
    tag_column: str = None,
    folds_seed: int = 42,
    n_folds: int = 5,
    image_size: int = 256,
):
    """Build the image/mask datasets for ``stage``, reading masks via OpenCV.

    Args:
        stage (str): stage name, forwarded to ``self.get_transforms``
        datapath (str): forwarded to the image and mask readers
        in_csv (str): forwarded to ``read_csv_data``
        in_csv_train (str): forwarded to ``read_csv_data``
        in_csv_valid (str): forwarded to ``read_csv_data``
        in_csv_infer (str): forwarded to ``read_csv_data``
        train_folds (str): forwarded to ``read_csv_data``
        valid_folds (str): forwarded to ``read_csv_data``
        tag2class (str): path to a JSON file; its parsed content is
            forwarded to ``read_csv_data`` (``None`` skips loading)
        class_column (str): forwarded to ``read_csv_data``
        tag_column (str): forwarded to ``read_csv_data``
        folds_seed (int): forwarded to ``read_csv_data`` as ``seed``
        n_folds (int): forwarded to ``read_csv_data``
        image_size (int): forwarded to ``self.get_transforms``

    Returns:
        collections.OrderedDict: ``ListDataset`` objects keyed by
        ``"train"`` / ``"valid"`` / ``"infer"``; missing or empty splits
        are omitted.
    """
    # Function-scope imports, kept local exactly as in the original code.
    import cv2
    import os

    datasets = collections.OrderedDict()

    if tag2class is not None:
        # Context manager so the mapping file is closed right after parsing.
        with open(tag2class) as tag2class_file:
            tag2class = json.load(tag2class_file)

    # The first returned element (the full frame) is not needed here.
    _, df_train, df_valid, df_infer = read_csv_data(
        in_csv=in_csv,
        in_csv_train=in_csv_train,
        in_csv_valid=in_csv_valid,
        in_csv_infer=in_csv_infer,
        train_folds=train_folds,
        valid_folds=valid_folds,
        tag2class=tag2class,
        class_column=class_column,
        tag_column=tag_column,
        seed=folds_seed,
        n_folds=n_folds,
    )

    def encode_fn_lambda(fname, datapath):
        """Read a mask file as grayscale and add a trailing channel axis."""
        bgr = cv2.imread(os.path.join(datapath, fname))
        gray = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)
        # Floor division by 255 leaves 1 only where the pixel equals 255
        # -- assumes 8-bit masks, TODO confirm.
        return (gray // 255)[:, :, None]

    open_fn = ReaderCompose(
        readers=[
            ImageReader(
                input_key="images", output_key="image", datapath=datapath
            ),
            LambdaReader(
                input_key="masks",
                output_key="mask",
                datapath=datapath,
                encode_fn=encode_fn_lambda,
            ),
            ScalarReader(
                input_key="name",
                output_key="name",
                default_value=-1,
                dtype=str,
            ),
        ]
    )

    for mode, source in zip(
        ("train", "valid", "infer"), (df_train, df_valid, df_infer)
    ):
        # Guard against a split that was not produced at all (None) as
        # well as an empty one; len(None) would raise TypeError.
        if source is not None and len(source) > 0:
            datasets[mode] = ListDataset(
                list_data=source,
                open_fn=open_fn,
                dict_transform=self.get_transforms(
                    stage=stage, mode=mode, image_size=image_size
                ),
            )

    return datasets