def __init__(self, root, mode, seq_len):
    """Load a pickled sequence-MNIST split and keep the first ``seq_len`` frames.

    Args:
        root: directory containing the pickle files.
        mode: either 'train' or 'valid'.
        seq_len: number of leading timesteps kept from each sequence.
            (Docstring fix: this parameter was previously documented as
            ``len``, which does not match the signature.)
    """
    assert mode in ['train', 'valid'], 'Invalid dataset mode'
    Dataset.__init__(self)
    path = {
        'train': 'seq_mnist_train.pickle',
        'valid': 'seq_mnist_validation.pickle'
    }[mode]
    # Path to the pickle file
    path = os.path.join(root, path)
    # Load dataset; latin1 encoding keeps Python-2-era pickles readable.
    with open(path, 'rb') as f:
        dataset = pickle.load(f, encoding='latin1')
    # (T, N, H, W) — truncated to the first seq_len timesteps
    self.imgs = dataset['imgs'][:seq_len]
    # (N, 2), numbers in the digits
    self.labels = dataset['labels']
    # (T, N, 2, 4), the last dimension being [x, y, w, h]
    self.coords = dataset['coords']
    # (1, N, 3), bool
    self.nums = dataset['nums']
    # (1, N, 3) -> (1, N): count of active digits per sample
    self.nums = np.sum(self.nums, axis=-1)
def __init__(self, data_paths, input_transform=None, target_transform=None,
             cache=False, data_root='/'):
    """Generic paired input/target path dataset.

    Args:
        data_paths: list of [input_path, label_path] pairs; stored sorted.
        input_transform: optional transform applied to the input sample.
        target_transform: optional transform applied to the target sample.
        cache: enable in-memory caching (leave off for large datasets).
        data_root: root directory used when resolving relative paths.
    """
    Dataset.__init__(self)
    # Normalise the root to a Path so later concatenation is easy.
    self.data_root = data_root if isinstance(data_root, Path) else Path(data_root)
    self.data_paths = sorted(data_paths)
    self.input_transform = input_transform
    self.target_transform = target_transform
    # Per-key (loader, transform) pairs used when assembling one sample.
    self.data_loader_dict = {
        'input': (self.load_input, self.input_transform),
        'target': (self.load_target, self.target_transform),
    }
    # For large datasets caching should stay disabled.
    self.cache = cache
    self.cache_dict = defaultdict(dict)
    self.loading_key_order = ['input', 'target']
def __init__(self, config, transforms, train=True, datakeys=None):
    """Image dataset driven by a config object.

    Args:
        config: object providing ``datapath`` and ``reconstr_dim``.
        transforms: transformations applied to loaded samples.
        train: whether the dataset is in train mode.
        datakeys: keys to return per sample; defaults to ["images"].
    """
    Dataset.__init__(self)
    LoggingParent.__init__(self)
    # Bug fix: a caller-supplied ``datakeys`` used to be silently dropped
    # (self.datakeys was only assigned in the None branch).
    if datakeys is None:
        datakeys = ["images"]
    self.datakeys = datakeys
    # None marks an invalid data root; the assert below then fails loudly.
    if not path.isdir(config.datapath):
        self.basepath = None
    else:
        self.basepath = config.datapath
    self.spatial_size = config.reconstr_dim
    self.transforms = transforms
    self.train = train
    self.datadict = {"img_path": []}
    self._read_data()
    # Convert the collected lists to numpy arrays for fancy indexing.
    self.datadict = {
        key: np.asarray(self.datadict[key]) for key in self.datadict
    }
    assert self.basepath is not None
    assert self.datadict["img_path"].shape[0] > 0
    self._output_dict = {"images": self._get_img}
    self.logger.info(
        f'Constructed {self.__class__.__name__} in {"train" if self.train else "test"}-mode; dataset consists of {self.__len__()} samples.'
    )
def __init__(self, dataframe, sequences, transform=None):
    """Wrap a patient dataframe of NIfTI sequences as a torch dataset.

    Args:
        dataframe: pandas dataframe indexed by patient identifier.
        sequences: sequence names to load per patient.
        transform: optional callable applied to each sample.
    """
    Dataset.__init__(self)
    NiftiDataset.__init__(self)
    self.df = dataframe
    # Patient ids come straight from the dataframe index.
    self.patients = dataframe.index.values
    self.sequences = sequences
    self.transform = transform
def __init__(self, objs, atts, gqa_data_path, dset, with_atts, att_categories):
    """GQA objects/attributes dataset backed by an LMDB of descriptors.

    Args:
        objs: object vocabulary.
        atts: attribute vocabulary.
        gqa_data_path: JSON cache of the dataset; created here if absent.
        dset: split identifier forwarded to get_objs_and_atts_datasets.
        with_atts: whether attribute supervision is used.
        att_categories: optional {category: [attribute, ...]} mapping.
    """
    Dataset.__init__(self)
    self.objs = objs
    self.atts = atts
    self.with_atts = with_atts
    # NOTE(review): ``gqa_descriptors_file`` is a module-level name, not a
    # parameter — confirm it is defined before instantiation.
    self.gqa_env = lmdb.open(gqa_descriptors_file,
                             subdir=False,
                             readonly=True,
                             lock=False,
                             readahead=False,
                             meminit=False)
    self.gqa_txn = self.gqa_env.begin(write=False)
    self.gqa_curs = self.gqa_txn.cursor()
    if os.path.isfile(gqa_data_path):
        # Bug fix: json.load(open(...)) never closed the file handle;
        # use a context manager so it is released deterministically.
        with open(gqa_data_path) as in_f:
            self.gqa_data = list(json.load(in_f).items())
    else:
        gqa_data_dict = get_objs_and_atts_datasets(self.objs, self.atts, dset)
        with open(gqa_data_path, 'w') as out_f:
            json.dump(gqa_data_dict, out_f, indent=2)
        self.gqa_data = list(gqa_data_dict.items())
    self.categorize_atts = False
    if att_categories is not None:
        self.categorize_atts = True
        self.att_categories = att_categories
        # Map each attribute to its (category, index-within-category).
        self.att_to_category = \
            {x: (key, idx) for key, value in att_categories.items()
             for idx, x in enumerate(value)}
    # Normalised distribution over the number of labels per sample.
    self.num_labels_distribution = [
        x / sum(CC_NUM_LABELS_ORDERED) for x in CC_NUM_LABELS_ORDERED
    ]
def __init__(
    self,
    img_tensor_paths=None,
    heatmap_infos=None,
    label_name_to_value=LABEL_NAME_TO_VALUE,
    is_sigma_fixed=IS_SIGMA_FIXED,
    sigma_fixed=SIGMA_FIXED,
    sigma_scale=SIGMA_SCALE,
    heatmap_types=HEATMAP_TYPES_HANDLED,
    heatmap_labels=HEATMAP_LABELS,
):
    """Heatmap dataset over image tensors and their heatmap descriptions.

    Paths and infos fall back to the module-level defaults when omitted;
    the two lists must have the same length (paired per image).
    """
    Dataset.__init__(self)
    # Fall back to the module-level defaults when nothing is supplied.
    img_tensor_paths = IMG_000_PATHS if img_tensor_paths is None else img_tensor_paths
    heatmap_infos = HEATMAPS_000_INFOS if heatmap_infos is None else heatmap_infos
    assert len(img_tensor_paths) == len(heatmap_infos)
    self.img_tensor_paths = img_tensor_paths
    self.heatmap_infos = heatmap_infos
    self.label_name_to_value = label_name_to_value
    self.is_sigma_fixed = is_sigma_fixed
    self.sigma_fixed = sigma_fixed
    self.sigma_scale = sigma_scale
    self.heatmap_types = heatmap_types
    self.heatmap_labels = heatmap_labels
def __init__(self, index_tensor, data_tensor, target_tensor, sensitive_tensor):
    """Tensor dataset of (index, data, target, sensitive) columns.

    Args:
        index_tensor: per-sample indices; first dim is the sample count.
        data_tensor: input features.
        target_tensor: prediction targets.
        sensitive_tensor: sensitive attributes (e.g. for fairness metrics).

    Raises:
        AssertionError: if any tensor disagrees on the sample count.
    """
    Dataset.__init__(self)
    # Consistency fix: previously only data/target sizes were checked, so a
    # mismatched index or sensitive tensor failed much later at indexing
    # time. Validate all four up front, matching the sibling datasets.
    assert data_tensor.size(0) == target_tensor.size(0)
    assert index_tensor.size(0) == data_tensor.size(0)
    assert sensitive_tensor.size(0) == data_tensor.size(0)
    self.data_tensor = data_tensor
    self.target_tensor = target_tensor
    self.sensitive_tensor = sensitive_tensor
    self.index_tensor = index_tensor
def __init__(self, split, transform=None, texture_dataset=None):
    """Texture-description image dataset for one split.

    Args:
        split: name of the split to serve.
        transform: optional image transform.
        texture_dataset: shared TextureDescriptionData; a fresh instance
            (phid_format=None) is created when omitted.
    """
    Dataset.__init__(self)
    self.dataset = (TextureDescriptionData(phid_format=None)
                    if texture_dataset is None else texture_dataset)
    self.split = split
    self.transform = transform
def __init__(
        self,
        datadir,  # JSON format
        batch_size,
        feature2idx,
        qual_features,
        binary_features,
        quant_features,
        dimred_dict,
        labelcol,
        label2idx,
        assigned_partitions=None,
        interested_partitions=None,
        h5dir=None,
        filename_fmt='data_{0:09d}.h5',
        device='cpu'):
    """Batched HDF5-backed dataset built from JSON records.

    Args:
        datadir: directory of JSON source files.
        batch_size: samples per generated HDF5 batch file.
        feature2idx / qual_features / binary_features / quant_features:
            feature bookkeeping forwarded to store_parameter.
        dimred_dict: dimensionality-reduction settings per feature.
        labelcol / label2idx: label column name and label index mapping.
        assigned_partitions: optional partition assignment per record.
        interested_partitions: partitions to include; defaults to all ([]).
        h5dir: directory for HDF5 files; a TemporaryDirectory when None.
        filename_fmt: format string for generated file names.
        device: torch device string.
    """
    Dataset.__init__(self)
    # Bug fix: the old ``interested_partitions=[]`` default was a mutable
    # list shared across every instance; use a None sentinel instead.
    if interested_partitions is None:
        interested_partitions = []
    if h5dir is None:
        self.h5tempdir = tempfile.TemporaryDirectory(
        )  # storing the context so that it is not removed after exiting the constructor
        h5dir = self.h5tempdir.name
    self.store_parameter(h5dir, batch_size, feature2idx, qual_features,
                         binary_features, quant_features, dimred_dict,
                         labelcol, label2idx, assigned_partitions,
                         interested_partitions, filename_fmt, device)
    self.reshuffle_batch = False
    self.datadir = datadir
    self.prepare_h5_files()
def __init__(self,
             img_dir=FACADE_ROT_IMAGES_TENSORS_DIR,
             add_targets_fn=None,
             img_to_num_rot=None,
             caching=False,
             init_caching=False,
             device=None):
    """Dataset of pre-rotated facade image tensors stored on disk.

    Args:
        img_dir: directory containing the saved image/label tensor files.
        add_targets_fn: optional callable producing auxiliary targets.
        img_to_num_rot: per-image rotation counts; defaults to
            create_img_to_num_rot(NUM_IMAGES, NUM_ROTATIONS).
        caching: lazily cache loaded items in RAM.
        init_caching: eagerly load and cache every (image, rotation) pair.
        device: target device; must be None when any caching is enabled.
    """
    Dataset.__init__(self)
    self.dir_path = img_dir
    self.aux_targets_fn = add_targets_fn
    if img_to_num_rot is None:
        img_to_num_rot = create_img_to_num_rot(NUM_IMAGES, NUM_ROTATIONS)
    self.img_to_num_rot = img_to_num_rot
    # None means "no cache"; replaced by a dict below when caching is on.
    self.cached_images = None
    self.device = device
    # The cache lives in host RAM, so caching plus a non-None device is
    # rejected up front.
    assert not (device is not None and (init_caching or caching)), 'cannot cache on GPU -> GPU_RAM'
    # checking all files exist
    for idx, num_rot in enumerate(self.img_to_num_rot):
        for rot_idx in range(num_rot):
            for is_img in [True, False]:
                fname = self.get_filename(idx, rot_idx, is_img)
                assert os.path.isfile(
                    fname), 'file ({}) does not exist'.format(fname)
    if caching or init_caching:
        self.cached_images = dict()
    if init_caching:
        # Eagerly fill the cache with every rotation of every image.
        # NOTE(review): this iterates NUM_ROTATIONS per image even though
        # img_to_num_rot may specify fewer rotations — confirm intended.
        for img_idx in tqdm(
                list(range(FacadeRandomRotDataset.__len__(self)))):
            for rot_idx in range(NUM_ROTATIONS):
                img, lbl = self.get_rot_item(img_idx, rot_idx)
                self.cached_images[(img_idx, rot_idx)] = (img, lbl)
def __init__(self, data_type, year, datadir, batch_size, im_processor,
             cfg, processes=5, shuffle=True, dst_size=None):
    """COCO detection dataset wrapper.

    Args:
        data_type: split name, e.g. 'train' or 'val'.
        year: dataset year; combined with data_type into the imdb name.
        datadir: root directory holding data/annotations/.
        batch_size: batch size forwarded to ImageDataset.
        im_processor: image preprocessing callable forwarded to ImageDataset.
        cfg: configuration object forwarded to ImageDataset.
        processes: number of worker processes.
        shuffle: whether samples are shuffled.
        dst_size: optional target image size.
    """
    self.imdb_name = '%s%s' % (data_type, year)
    ImageDataset.__init__(self, 'coco_' + self.imdb_name, datadir,
                          batch_size, im_processor, cfg, processes,
                          shuffle, dst_size)
    Dataset.__init__(self)
    # Official COCO annotation file for this split/year.
    anno_path = os.path.join(datadir, 'data', 'annotations',
                             'instances_%s%s.json' % (data_type, year))
    self.coco = COCO(annotation_file=anno_path)
    self.year = str(year)
    self._load_class_ids()
    self._image_ids = self._get_image_ids()
    print('load annotations and image_names')
    st = time.time()
    self._annotations, self._image_names = self._load_annotation()
    print('done, time=%5.2f' % (time.time() - st))
    # NOTE(review): double underscore in '_image__indexes' looks like a
    # typo but is kept — callers elsewhere may rely on this exact name.
    self._image__indexes = np.arange(len(self._image_names))
def __init__(self, opt, aligned=True):
    """Paired/unpaired source-target patch dataset for transfer-function training.

    Args:
        opt: options object; ``opt.network`` is a dict of network settings,
            ``opt["normalization"]`` holds per-domain normalization specs,
            and ``opt.datapath`` maps domain names to paths.
        aligned: whether source and target patches are spatially aligned.
    """
    Dataset.__init__(self)
    self.imgA = []
    self.imgB = []
    self.size_in = opt.network["input_patch_size"]
    self.opt = opt
    self.filenamesA = None
    self.filenamesB = None
    if opt.isTrain:
        # only need a name when training
        self.name = opt.name
    self.batch_size = opt.network["batch_size"]
    self.resizeA = opt.resizeA
    self.netG = opt.network["netG"]
    self.model = opt.network["model"]
    self.shift_dict = {}
    # only for stn and when opt.stn_adjust_image use
    self.stn_adjust_dict = {}
    # No upsampling by default; output patch matches input patch.
    self.up_scale = (1, 1, 1)
    self.size_out = self.size_in
    self.aligned = aligned
    # Resolve normalization functions by name from the preprocessing module.
    module_name = "aics_transfer_function.util.preprocessing"
    norm_module = importlib.import_module(module_name)
    func_name_src = self.opt["normalization"]["source"]["method"]
    self.source_norm = getattr(norm_module, func_name_src)
    self.source_norm_param = self.opt["normalization"]["source"]["params"]
    # Target-side normalization only exists when a target path is configured.
    # NOTE(review): ``opt`` is accessed both by attribute (opt.network) and
    # by subscript (self.opt["normalization"]) — presumably a hybrid
    # mapping/namespace object; confirm against its class.
    if "target" in self.opt.datapath and self.opt.datapath[
            "target"] is not None:
        func_name_tar = self.opt["normalization"]["target"]["method"]
        self.target_norm = getattr(norm_module, func_name_tar)
        self.target_norm_param = self.opt["normalization"]["target"][
            "params"]
def __init__(self, index_tensor, x, f):
    """Dataset of (index, x, f) tensor triples with matching first dimensions.

    Raises:
        AssertionError: if the tensors disagree on the sample count.
    """
    Dataset.__init__(self)
    # All three tensors must describe the same number of samples.
    n = index_tensor.size(0)
    assert n == x.size(0)
    assert x.size(0) == f.size(0)
    self.index_tensor = index_tensor
    self.x = x
    self.f = f
def __init__(
    self,
    phase: str,
    data_root: str = "/data/yjwa/sparse_torch/MinkowskiEngine/nnbar_overlay",
):
    """Sparse detector-hit dataset for the given split.

    Args:
        phase: split name forwarded to load_data.
        data_root: directory containing the overlay data files.
    """
    Dataset.__init__(self)
    self.phase = phase
    # load_data returns hit coordinates, hit values, and per-event labels.
    self.w_xy, self.w_val, self.label = self.load_data(data_root, phase)
def __init__(
    self,
    data_dir,
    normalize_images=True,
    split=None,
    return_mesh=False,
    voxel_size=32,
    num_samples=5000,
    sample_online=False,
    in_memory=False,
    return_id_str=False,
    input_views=[0, 6, 7],
):
    """Multi-view mesh/voxel dataset indexed by a summary.json manifest.

    Args:
        data_dir: root directory with summary.json and <sid>/<mid>/ folders.
        normalize_images: forwarded to get_transform.
        split: optional {synset_id: model ids} restriction (list or dict).
        return_mesh: whether meshes are returned; required for online sampling.
        voxel_size: voxel grid resolution.
        num_samples: number of surface samples per model.
        sample_online: sample points at load time instead of from samples.pt.
        in_memory: preload samples.pt tensors for all kept models.
        return_id_str: also return the (sid, mid) identifier string.
        input_views: image view ids to load per model.
            NOTE(review): mutable default list — shared across instances if
            a caller ever mutates self.image_ids; consider a tuple default.
    """
    # call the PyTorch Dataset interface in this way
    # since the immediate parent is MeshVoxDataset
    Dataset.__init__(self)
    if not return_mesh and sample_online:
        raise ValueError("Cannot sample online without returning mesh")
    self.data_dir = data_dir
    self.return_mesh = return_mesh
    self.voxel_size = voxel_size
    self.num_samples = num_samples
    self.sample_online = sample_online
    self.return_id_str = return_id_str
    self.synset_ids = []
    self.model_ids = []
    # mid -> preloaded sample tensors (only filled when in_memory is set).
    self.mid_to_samples = {}
    # TODO: get the image ids from parameters
    self.image_ids = input_views
    self.transform = self.get_transform(normalize_images)
    summary_json = os.path.join(data_dir, "summary.json")
    with open(summary_json, "r") as f:
        summary = json.load(f)
    for sid in summary:
        logger.info("Starting synset %s" % sid)
        # None means "keep every model in this synset".
        allowed_mids = None
        if split is not None:
            if sid not in split:
                logger.info("Skipping synset %s" % sid)
                continue
            elif isinstance(split[sid], list):
                allowed_mids = set(split[sid])
            elif isinstance(split, dict):
                allowed_mids = set(split[sid].keys())
        for mid, num_imgs in summary[sid].items():
            if allowed_mids is not None and mid not in allowed_mids:
                continue
            if not sample_online and in_memory:
                # Preload the precomputed surface samples for this model.
                samples_path = os.path.join(data_dir, sid, mid, "samples.pt")
                samples = torch.load(samples_path)
                self.mid_to_samples[mid] = samples
            self.synset_ids.append(sid)
            self.model_ids.append(mid)
def __init__(self, data_type, datadir, batch_size, im_processor, cfg,
             processes=5, shuffle=True, dst_size=None):
    """OpenImages detection dataset wrapper.

    Args:
        data_type: split name, used in the imdb name and annotation path.
        datadir: dataset root directory.
        batch_size, im_processor, cfg, processes, shuffle, dst_size:
            forwarded to ImageDataset.
    """
    self.imdb_name = 'OpenImage_%s' % (data_type)
    self.data_type = data_type
    ImageDataset.__init__(self, self.imdb_name, datadir, batch_size,
                          im_processor, cfg, processes, shuffle, dst_size)
    Dataset.__init__(self)
    # Annotations live under <datadir>/json_data/<split>_annotation.json.
    annotation_file = os.path.join(datadir, 'json_data',
                                   '%s_annotation.json' % (data_type))
    self._classes = cfg.label_names
    self._annotations, self._image_names = self._load_annotation(annotation_file)
    self._image__indexes = np.arange(len(self._image_names))
def __init__(self, xmls_folder, height, img_folder, config):
    """Dataset of XML-described targets zero-padded to a fixed length.

    Args:
        xmls_folder: directory of XML annotation files.
        height: image height forwarded to BaseDataset.
        img_folder: directory of images.
        config: configuration forwarded to BaseDataset.
    """
    Dataset.__init__(self)
    BaseDataset.__init__(self, xmls_folder, height, img_folder, config)
    # Pad every raw target vector with zeros up to self.max_output entries.
    padded = []
    for raw in self.raw_Y:
        target = torch.zeros(self.max_output)
        target[torch.arange(len(raw))] = torch.tensor(raw, dtype=torch.float32)
        padded.append(target)
    self.Y = torch.stack(padded)
def __init__(self, index_tensor, x, g, g_label=None):
    """Dataset of (index, x, g[, g_label]) tensors.

    Args:
        index_tensor: per-sample indices.
        x: input features.
        g: group tensor.
        g_label: optional group labels; may be None.

    Raises:
        AssertionError: if any provided tensor disagrees on sample count.
    """
    Dataset.__init__(self)
    assert index_tensor.size(0) == x.size(0)
    assert x.size(0) == g.size(0)
    # Bug fix: g_label defaults to None, yet the size assertion was
    # unconditional and raised AttributeError ('NoneType' has no attribute
    # 'size') whenever the default was used. Only check when provided.
    if g_label is not None:
        assert x.size(0) == g_label.size(0)
    self.index_tensor = index_tensor
    self.x = x
    self.g = g
    self.g_label = g_label
def __init__(self, dataframe, sequences, transform=None, preprocess_config=None):
    """NIfTI patient dataset with an optional preprocessing configuration.

    Args:
        dataframe: pandas dataframe indexed by patient identifier.
        sequences: sequence names to load per patient.
        transform: optional callable applied to each sample.
        preprocess_config: optional preprocessing configuration file.
    """
    Dataset.__init__(self)
    NiftiDataset.__init__(self)
    self.df = dataframe
    self.sequences = sequences
    self.patients = self.df.index.values
    # Removed leftover debug prints of self.df['seg'][0..2]: they spammed
    # stdout and crashed on dataframes without a 'seg' column or with
    # fewer than three rows.
    self.transform = transform
    self.config_file = preprocess_config
def __init__(
    self,
    phase: str,
    data_root: str = "modelnet40h5",
    num_points=2048,
):
    """ModelNet40 HDF5 dataset.

    Args:
        phase: 'train', 'val' or 'test'; 'val' reuses the test files since
            there is no separate validation split.
        data_root: directory containing the HDF5 files.
        num_points: number of points served per cloud.
    """
    Dataset.__init__(self)
    # No dedicated validation file: map both 'val' and 'test' onto "test".
    self.phase = "test" if phase in ("val", "test") else "train"
    self.data, self.label = self.load_data(data_root, self.phase)
    self.num_points = num_points
def __init__(self, dataset, dataset_path, training, validation,
             transform=None, downsample_training=False):
    """
    Parameters
    ----------
    dataset: str
        Kitti2012 or kitti2015 dataset
    dataset_path: str
        Kitti dataset path
    training: bool
        Loads training images
    validation: bool
        Loads validation data
    transform: torchvision.transforms
        Transform to be applied to all pairs
    downsample_training: bool
        Downsample during training. Some networks don't need big images
        to converge faster

    Returns
    -------
    None
    """
    # Docstring fixes only in the text above ("dont't" -> "don't",
    # "all pair" -> "all pairs"); behavior is unchanged.
    Dataset.__init__(self)
    self.dataset = dataset
    self.dataset_path = dataset_path
    self.training = training
    self.validation = validation
    self.transform = transform
    self.downsample_training = downsample_training
    # Load list of images
    tr_l, tr_r, tr_l_disp, test_l, test_r, test_l_disp = lt.dataloader(
        dataset_path)
    self.l_im_paths = []
    self.r_im_paths = []
    self.l_disp_paths = []
    # Requesting both splits concatenates them, training paths first.
    if self.training:
        self.l_im_paths = self.l_im_paths + tr_l
        self.r_im_paths = self.r_im_paths + tr_r
        self.l_disp_paths = self.l_disp_paths + tr_l_disp
    if self.validation:
        self.l_im_paths = self.l_im_paths + test_l
        self.r_im_paths = self.r_im_paths + test_r
        self.l_disp_paths = self.l_disp_paths + test_l_disp
def __init__(self, base_dir, config,):
    """Dataset over all jpg/jpeg/png files directly inside ``base_dir``.

    Args:
        base_dir: directory scanned (non-recursively) for image files.
        config: configuration object, stored as-is.
    """
    Dataset.__init__(self)
    LoggingParent.__init__(self)
    self.logger.info(f"Initialize GoogleImgDataset with basepath {base_dir}")
    self.config = config
    # Keep only regular files whose name ends in an accepted extension.
    image_files = []
    for candidate in glob(path.join(base_dir, "*")):
        if path.isfile(candidate) and candidate.endswith(("jpg", "jpeg", "png")):
            image_files.append(candidate)
    self.datadict = {"img_path": np.asarray(image_files)}
    # Map [0, 1] image tensors into the [-1, 1] range.
    self.transforms = tt.Compose(
        [
            tt.ToTensor(),
            tt.Lambda(lambda x: (x * 2.0) - 1.0),
        ])
    self.logger.info(f"Initialized Dataset with {self.__len__()} images")
def __init__(self, file, pipeline=None):  # cvs file and pipeline object
    """Load a TSV file, run each instance through ``pipeline``, and tensorize.

    Args:
        file: path to the tab-separated input file.
        pipeline: ordered preprocessing callables applied to each instance.
            Bug fix: the previous ``pipeline=[]`` default was a mutable
            list shared across calls; None now stands in for "no steps".
    """
    Dataset.__init__(self)
    if pipeline is None:
        pipeline = []
    data = []
    with open(file, "r", encoding='utf-8') as f:
        # list of splitted lines : line is also list
        lines = csv.reader(f, delimiter='\t', quotechar=None)
        for instance in self.get_instances(lines):  # instance : tuple of fields
            for proc in pipeline:  # a bunch of pre-processing
                instance = proc(instance)
            data.append(instance)
    # To Tensors: transpose to per-field columns, one long tensor per field.
    self.tensors = [torch.tensor(x, dtype=torch.long) for x in zip(*data)]
def __init__(self, mode='train', transform=None, preload=False, name=None,
             data=None, mask_threshold=0):
    """Dataset that loads its data from disk unless it is supplied directly.

    Args:
        mode: split to serve ('train' by default).
        transform: optional sample transform.
        preload: forwarded to load_data when loading from disk.
        name: dataset name; falls back to ``mode`` when omitted.
        data: ready-made data; skips load_data entirely when given.
        mask_threshold: threshold forwarded to load_data.
    """
    Dataset.__init__(self)
    self.mode = mode
    self.transform = transform
    self.name = mode if name is None else name
    self.mask_threshold = mask_threshold
    # Only touch the disk when no pre-built data was handed in.
    self.data = (load_data(self.name, self.mode, preload, self.mask_threshold)
                 if data is None else data)
def __init__(self, dataframe, sequences, transform=None, brainmask=True,
             segmentation=True):
    """NIfTI patient dataset with optional brain-mask and segmentation.

    Args:
        dataframe: pandas dataframe indexed by patient identifier.
        sequences: sequence names to load per patient.
        transform: optional callable applied to each sample.
        brainmask: whether brain masks are loaded.
        segmentation: whether segmentations are loaded.
    """
    Dataset.__init__(self)
    NiftiDataset.__init__(self)
    self.df = dataframe
    self.sequences = sequences
    # Patient ids come straight from the dataframe index.
    self.patients = dataframe.index.values
    self.transform = transform
    self.brainmask = brainmask
    self.segmentation = segmentation
def __init__(
    self,
    phase: str,
    data_root: str = "modelnet40h5",
    translation_max: float = 0.25,
    num_points=2048,
):
    """ModelNet40 dataset with random-translation augmentation.

    Args:
        phase: 'train', 'val' or 'test'; 'val' reuses the test files.
        data_root: directory containing the HDF5 files.
        translation_max: maximum random translation magnitude.
        num_points: number of points served per cloud.
    """
    Dataset.__init__(self)
    # Fetch the HDF5 files if they are not present yet.
    download_modelnet40_dataset()
    # No dedicated validation split exists.
    self.phase = "test" if phase in ("val", "test") else "train"
    self.data, self.label = self.load_data(data_root, self.phase)
    self.transform = CoordinateTransformation(trans=translation_max)
    self.num_points = num_points
def __init__(self, imdb_name, datadir, batch_size, im_processor,
             processes=3, shuffle=True, dst_size=None, classes=None,
             n_classes=None):
    """Pascal VOC detection dataset wrapper.

    Args:
        imdb_name: identifier of the form '<prefix>_<year>_<imageset>',
            e.g. 'voc_2007_trainval'; year and image set are parsed from it.
        datadir: directory containing VOCdevkit<year>/.
        batch_size, im_processor, processes, shuffle, dst_size:
            forwarded to ImageDataset.
        classes: optional class-name tuple; defaults to the 20 VOC classes.
        n_classes: optionally keep only the first n_classes classes.
    """
    ImageDataset.__init__(self, imdb_name, datadir, batch_size,
                          im_processor, processes, shuffle, dst_size)
    Dataset.__init__(self)
    # Parse '<prefix>_<year>_<imageset>' into its components.
    meta = imdb_name.split('_')
    self._year = meta[1]
    self._image_set = meta[2]
    self._devkit_path = os.path.join(datadir,
                                     'VOCdevkit{}'.format(self._year))
    self._data_path = os.path.join(self._devkit_path,
                                   'VOC{}'.format(self._year))
    assert os.path.exists(
        self._devkit_path), 'VOCdevkit path does not exist: {}'.format(
            self._devkit_path)
    assert os.path.exists(
        self._data_path), 'Path does not exist: {}'.format(self._data_path)
    # Default to the 20 standard VOC categories when none are supplied.
    if classes is None:
        self._classes = ('aeroplane', 'bicycle', 'bird', 'boat', 'bottle',
                         'bus', 'car', 'cat', 'chair', 'cow', 'diningtable',
                         'dog', 'horse', 'motorbike', 'person',
                         'pottedplant', 'sheep', 'sofa', 'train',
                         'tvmonitor')
    else:
        self._classes = classes
    # Optionally truncate to the first n_classes categories.
    if n_classes is not None:
        self._classes = self._classes[:n_classes]
    # Class name -> contiguous integer index.
    self._class_to_ind = dict(
        list(zip(self.classes, list(range(self.num_classes)))))
    self._image_ext = '.jpg'
    # Random salt used to build unique competition ids for result files.
    self._salt = str(uuid.uuid4())
    self._comp_id = 'comp4'
    # PASCAL specific config options
    self.config = {'cleanup': True, 'use_salt': True}
    self.load_dataset()
def __init__(self, opt):
    """Audio/vision dataset: reads the split file listing audio paths.

    Args:
        opt: options object providing ``splitPath`` (directory of split
            files) and ``mode`` (split name, used as '<mode>.txt').
    """
    Dataset.__init__(self)
    self.opt = opt
    self.audios = []
    # load audio files here
    with open(os.path.join(opt.splitPath, opt.mode + ".txt"), 'r') as cur_f:
        audio_files = cur_f.readlines()
    # Bug fix: the old ``audio_file[:-1]`` blindly dropped the final
    # character, corrupting the last entry when the file has no trailing
    # newline; strip the newline explicitly instead.
    self.audios = [audio_file.rstrip('\n') for audio_file in audio_files]
    # ImageNet normalisation for the vision branch.
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    vision_transform_list = [transforms.ToTensor(), normalize]
    self.vision_transform = transforms.Compose(vision_transform_list)
def __init__(self, data_root, max_dataset_size=float('inf'), is_train=True,
             tumor_threshold=0.2, is_3d=False):
    """Tensor dataset rooted at <data_root>/train or <data_root>/test.

    Args:
        data_root: directory containing the 'train' and 'test' subfolders.
        max_dataset_size: cap on the number of loaded items.
        is_train: selects the 'train' subfolder (otherwise 'test').
        tumor_threshold: minimum tumor fraction, stored for later filtering.
        is_3d: whether volumes are treated as 3D.
    """
    Dataset.__init__(self)
    self.tumor_threshold = tumor_threshold
    # Split selection: training data lives in 'train', evaluation in 'test'.
    subdir = 'train' if is_train else 'test'
    self.dir = os.path.join(data_root, subdir)
    self.is_3d = is_3d
    self.dataset = self.make_dataset_from_tensor(self.dir, max_dataset_size)
    self.size = len(self.dataset)
def __init__(self, path):
    """Genre-classification CSV dataset (GTZAN-style).

    Reads the CSV, shuffles the rows, maps the ten genre names to integer
    labels 0-9, and splits features/labels, holding out the last 200 rows.

    Args:
        path: path to the comma-separated file; label is the last column.
    """
    Dataset.__init__(self)
    self.path = path
    self.dataset = pd.read_csv(path, delimiter=",")
    # Shuffle all rows (frac=1 keeps every row, in random order).
    self.dataset = self.dataset.sample(frac=1)
    # Map genre names to integer class labels in a single pass instead of
    # ten chained replace() calls.
    genre_to_label = {
        'blues': 0, 'classical': 1, 'country': 2, 'disco': 3, 'hiphop': 4,
        'jazz': 5, 'metal': 6, 'pop': 7, 'reggae': 8, 'rock': 9,
    }
    self.dataset.replace(to_replace=genre_to_label, inplace=True)
    # Bug fix: DataFrame.as_matrix() was deprecated in pandas 0.23 and
    # removed in 1.0 — use to_numpy(). Last 200 rows are held out here.
    self.data = self.dataset.iloc[:-200, :-1].to_numpy()
    self.label = self.dataset.iloc[:-200, -1:].to_numpy()
def __init__(self, config):
    """Configurable dataset: pulls loader settings and transforms from config.

    Args:
        config: configuration object consumed by Configurable.__init__.
    """
    Dataset.__init__(self)
    Configurable.__init__(self, config)
    # Dataloader parameters, straight from the stored configuration.
    cfg = self.config
    self.shuffle = cfg.shuffle
    self.batch_size = cfg.batch_size
    self.num_workers = cfg.num_workers
    self.pin_memory = cfg.pin_memory
    # Honour config-provided labels only when the class did not set its own.
    if not self.labels and cfg.labels:
        self.labels = cfg.labels
    # Build the input and target transformation pipelines.
    self.transform = self.load_transforms(cfg.transforms)
    self.target_transform = self.load_transforms(cfg.target_transforms)