def create_loader(
    data_source,
    open_fn,
    dict_transform=None,
    dataset_cache_prob=-1,
    sampler=None,
    collate_fn=default_collate_fn,
    batch_size=32,
    num_workers=4,
    shuffle=False,
    drop_last=False,
):
    """Wrap ``data_source`` in a ``ListDataset`` and build a DataLoader on it.

    Args:
        data_source: passed to ``ListDataset`` as its first positional
            argument.
        open_fn: forwarded to ``ListDataset`` as ``open_fn``.
        dict_transform: forwarded to ``ListDataset`` as ``dict_transform``.
        dataset_cache_prob: forwarded to ``ListDataset`` as ``cache_prob``.
        sampler: forwarded to ``torch.utils.data.DataLoader``.
        collate_fn: forwarded to ``torch.utils.data.DataLoader``.
        batch_size: forwarded to ``torch.utils.data.DataLoader``.
        num_workers: forwarded to ``torch.utils.data.DataLoader``.
        shuffle: forwarded to ``torch.utils.data.DataLoader``.
        drop_last: forwarded to ``torch.utils.data.DataLoader``.

    Returns:
        ``torch.utils.data.DataLoader`` over the freshly created
        ``ListDataset``; ``pin_memory`` is set to the result of
        ``torch.cuda.is_available()``.
    """
    list_dataset = ListDataset(
        data_source,
        open_fn=open_fn,
        dict_transform=dict_transform,
        cache_prob=dataset_cache_prob,
    )

    # Pinned memory is requested only when a CUDA device is reported.
    use_pinned_memory = torch.cuda.is_available()

    return torch.utils.data.DataLoader(
        dataset=list_dataset,
        sampler=sampler,
        collate_fn=collate_fn,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=shuffle,
        pin_memory=use_pinned_memory,
        drop_last=drop_last,
    )
def get_loader(
    data_source: Iterable[dict],
    open_fn: Callable,
    dict_transform: Callable = None,
    sampler=None,
    collate_fn: Callable = default_collate_fn,
    batch_size: int = 32,
    num_workers: int = 4,
    shuffle: bool = False,
    drop_last: bool = False,
):
    """Build a DataLoader from a data source and its open/transform params.

    Args:
        data_source: an iterable containing your data annotations
            (for example paths to images, labels, bboxes, etc.)
        open_fn: function that opens one annotation dict and turns it
            into the data needed by your network (for example opens an
            image by path, or tokenizes a read string)
        dict_transform: transforms to apply to the dict
            (for example normalize image, add blur, crop/resize/etc.)
        sampler (Sampler, optional): defines the strategy to draw
            samples from the dataset
        collate_fn (callable, optional): merges a list of samples to form
            a mini-batch of Tensor(s). Used when using batched loading
            from a map-style dataset
        batch_size (int, optional): how many samples per batch to load
        num_workers (int, optional): how many subprocesses to use for
            data loading. ``0`` means that the data will be loaded
            in the main process
        shuffle (bool, optional): set to ``True`` to have the data
            reshuffled at every epoch (default: ``False``)
        drop_last (bool, optional): set to ``True`` to drop the last
            incomplete batch, if the dataset size is not divisible by
            the batch size. If ``False`` and the size of the dataset is
            not divisible by the batch size, then the last batch will be
            smaller (default: ``False``)

    Returns:
        DataLoader with ``catalyst.data.ListDataset``
    """
    # Imported at call time, as in the original implementation.
    from catalyst.data.dataset import ListDataset

    list_dataset = ListDataset(
        list_data=data_source,
        open_fn=open_fn,
        dict_transform=dict_transform,
    )

    # Pinned memory is requested only when a CUDA device is reported.
    use_pinned_memory = torch.cuda.is_available()

    return torch.utils.data.DataLoader(
        dataset=list_dataset,
        sampler=sampler,
        collate_fn=collate_fn,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=shuffle,
        pin_memory=use_pinned_memory,
        drop_last=drop_last,
    )
def get_datasets(
    self,
    stage: str,
    datapath: str = None,
    in_csv: str = None,
    in_csv_train: str = None,
    in_csv_valid: str = None,
    in_csv_infer: str = None,
    train_folds: str = None,
    valid_folds: str = None,
    tag2class: str = None,
    class_column: str = None,
    tag_column: str = None,
    folds_seed: int = 42,
    n_folds: int = 5,
    one_hot_classes: bool = None,
    num_frames: int = None,
    num_segments: int = None,
    time_window: int = None,
    uniform_time_sample: bool = False,
):
    """Build the train/valid/infer video datasets for the given stage.

    Args:
        stage: stage name, forwarded to ``self.get_transforms``.
        datapath: forwarded to ``VideoImageReader`` as ``datapath``.
        in_csv: forwarded to ``read_csv_data``.
        in_csv_train: forwarded to ``read_csv_data``.
        in_csv_valid: forwarded to ``read_csv_data``.
        in_csv_infer: forwarded to ``read_csv_data``.
        train_folds: forwarded to ``read_csv_data``.
        valid_folds: forwarded to ``read_csv_data``.
        tag2class: path to a JSON file; when given, its parsed content
            is forwarded to ``read_csv_data`` in place of the path.
        class_column: forwarded to ``read_csv_data``.
        tag_column: forwarded to ``read_csv_data``.
        folds_seed: forwarded to ``read_csv_data`` as ``seed``.
        n_folds: forwarded to ``read_csv_data``.
        one_hot_classes: when truthy, an extra ``ScalarReader`` producing
            ``"targets_one_hot"`` is added and this value is forwarded as
            its ``one_hot_classes`` argument.
            NOTE(review): annotated as ``bool`` but looks like a class
            count -- confirm against ``ScalarReader``.
        num_frames: forwarded to ``VideoImageReader``.
        num_segments: forwarded to ``VideoImageReader``.
        time_window: forwarded to ``VideoImageReader``.
        uniform_time_sample: forwarded to ``VideoImageReader``.

    Returns:
        ``collections.OrderedDict`` keyed by mode (``"train"``,
        ``"valid"``, ``"infer"``). Modes whose data source is empty are
        omitted. Every value is a dict with a ``"dataset"`` entry; the
        train entry additionally carries a ``"sampler"``
        (``BalanceClassSampler`` in ``"upsampling"`` mode).
    """
    datasets = collections.OrderedDict()

    # Use a context manager so the JSON file handle is closed
    # deterministically instead of being left to the garbage collector.
    if tag2class is not None:
        with open(tag2class) as tag2class_file:
            tag2class = json.load(tag2class_file)

    _, df_train, df_valid, df_infer = read_csv_data(
        in_csv=in_csv,
        in_csv_train=in_csv_train,
        in_csv_valid=in_csv_valid,
        in_csv_infer=in_csv_infer,
        train_folds=train_folds,
        valid_folds=valid_folds,
        tag2class=tag2class,
        class_column=class_column,
        tag_column=tag_column,
        seed=folds_seed,
        n_folds=n_folds,
    )
    df_valid = preprocess_valid_data(df_valid)

    # Target readers are shared between the train/infer and valid pipelines.
    readers = [
        ScalarReader(
            input_key="class",
            output_key="targets",
            default_value=-1,
            dtype=np.int64,
        )
    ]
    if one_hot_classes:
        readers.append(
            ScalarReader(
                input_key="class",
                output_key="targets_one_hot",
                default_value=-1,
                dtype=np.int64,
                one_hot_classes=one_hot_classes,
            )
        )
    # Shallow copy taken before the video readers are appended, so each
    # pipeline gets its own video reader below.
    readers_valid = readers.copy()

    video_reader_kwargs = dict(
        input_key="filepath",
        output_key="features",
        datapath=datapath,
        num_frames=num_frames,
        num_segments=num_segments,
        time_window=time_window,
        uniform_time_sample=uniform_time_sample,
    )
    readers.append(VideoImageReader(**video_reader_kwargs))
    # The validation reader differs only by ``with_offset=True``.
    readers_valid.append(
        VideoImageReader(with_offset=True, **video_reader_kwargs)
    )

    open_fn = ReaderCompose(readers=readers)
    open_fn_val = ReaderCompose(readers=readers_valid)

    sources_by_mode = (
        ("train", df_train),
        ("valid", df_valid),
        ("infer", df_infer),
    )
    for mode, source in sources_by_mode:
        if len(source) > 0:
            dataset = ListDataset(
                source,
                open_fn=open_fn_val if mode == "valid" else open_fn,
                dict_transform=self.get_transforms(stage=stage, mode=mode),
            )
            dataset_dict = {"dataset": dataset}
            if mode == "train":
                # ``source`` is ``df_train`` in this branch.
                labels = [x["class"] for x in source]
                dataset_dict["sampler"] = BalanceClassSampler(
                    labels, mode="upsampling"
                )
            datasets[mode] = dataset_dict

    return datasets
def get_datasets(
    self,
    stage: str,
    datapath: str = None,
    in_csv: str = None,
    in_csv_train: str = None,
    in_csv_valid: str = None,
    in_csv_infer: str = None,
    train_folds: str = None,
    valid_folds: str = None,
    tag2class: str = None,
    class_column: str = None,
    tag_column: str = None,
    folds_seed: int = 42,
    n_folds: int = 5,
    one_hot_classes: int = None,
    image_size: int = 224,
):
    """Build the train/valid/infer image datasets for the given stage.

    Args:
        stage: stage name, forwarded to ``self.get_transforms``.
        datapath: forwarded to ``ImageReader`` as ``datapath``.
        in_csv: forwarded to ``read_csv_data``.
        in_csv_train: forwarded to ``read_csv_data``.
        in_csv_valid: forwarded to ``read_csv_data``.
        in_csv_infer: forwarded to ``read_csv_data``.
        train_folds: forwarded to ``read_csv_data``.
        valid_folds: forwarded to ``read_csv_data``.
        tag2class: path to a JSON file; when given, its parsed content
            is forwarded to ``read_csv_data`` in place of the path.
        class_column: forwarded to ``read_csv_data``.
        tag_column: forwarded to ``read_csv_data``.
        folds_seed: forwarded to ``read_csv_data`` as ``seed``.
        n_folds: forwarded to ``read_csv_data``.
        one_hot_classes: when truthy, an extra ``ScalarReader`` producing
            ``"targets_one_hot"`` is added; the value is also forwarded
            to ``self.get_transforms``.
        image_size: forwarded to ``self.get_transforms``.

    Returns:
        ``collections.OrderedDict`` keyed by mode (``"train"``,
        ``"valid"``, ``"infer"``). Modes whose data source is empty are
        omitted. The train value is a dict with ``"dataset"`` and
        ``"sampler"`` (``BalanceClassSampler`` in ``"upsampling"`` mode);
        the other values are the bare ``ListDataset``.
    """
    datasets = collections.OrderedDict()

    # Use a context manager so the JSON file handle is closed
    # deterministically instead of being left to the garbage collector.
    if tag2class is not None:
        with open(tag2class) as tag2class_file:
            tag2class = json.load(tag2class_file)

    _, df_train, df_valid, df_infer = read_csv_data(
        in_csv=in_csv,
        in_csv_train=in_csv_train,
        in_csv_valid=in_csv_valid,
        in_csv_infer=in_csv_infer,
        train_folds=train_folds,
        valid_folds=valid_folds,
        tag2class=tag2class,
        class_column=class_column,
        tag_column=tag_column,
        seed=folds_seed,
        n_folds=n_folds,
    )

    readers = [
        ImageReader(
            input_key="filepath",
            output_key="image",
            datapath=datapath,
        ),
        ScalarReader(
            input_key="class",
            output_key="targets",
            default_value=-1,
            dtype=np.int64,
        ),
    ]
    if one_hot_classes:
        readers.append(
            ScalarReader(
                input_key="class",
                output_key="targets_one_hot",
                default_value=-1,
                dtype=np.int64,
                one_hot_classes=one_hot_classes,
            )
        )
    open_fn = ReaderCompose(readers=readers)

    sources_by_mode = (
        ("train", df_train),
        ("valid", df_valid),
        ("infer", df_infer),
    )
    for mode, source in sources_by_mode:
        if len(source) > 0:
            dataset = ListDataset(
                source,
                open_fn=open_fn,
                dict_transform=self.get_transforms(
                    stage=stage,
                    mode=mode,
                    image_size=image_size,
                    one_hot_classes=one_hot_classes,
                ),
            )
            if mode == "train":
                # Only the train entry is wrapped in a dict, so that the
                # class-balancing sampler travels with its dataset.
                labels = [x["class"] for x in source]
                sampler = BalanceClassSampler(labels, mode="upsampling")
                datasets[mode] = {"dataset": dataset, "sampler": sampler}
            else:
                datasets[mode] = dataset

    return datasets