def process(self, mode='r', projections='ixh', shape=None, **kwargs):
    """ Detect available projections in the cube and store handlers to them in attributes. """
    if mode == 'a':
        mode = 'r+' if os.path.exists(self.path) else 'w-'
    self.mode = mode

    if self.mode in ['r', 'r+']:
        self.file = h5py.File(self.path, mode=mode)
    elif self.mode == 'w-':
        # TODO: create a new HDF5 file with the required projections
        pass

    # Check available projections
    self.available_axis = [axis for axis, name in self.AXIS_TO_NAME.items() if name in self.file]
    self.available_names = [self.AXIS_TO_NAME[axis] for axis in self.available_axis]

    # Save cube handlers to instance
    self.axis_to_cube = {}
    for axis in self.available_axis:
        name = self.AXIS_TO_NAME[axis]
        cube = self.file[name]
        self.axis_to_cube[axis] = cube
        setattr(self, name, cube)

    # Parse attributes from meta / set defaults
    self.add_attributes(**kwargs)
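# A minimal standalone sketch of the projection-detection step above. The AXIS_TO_NAME
# mapping and the 'cube.hdf5' file name are illustrative assumptions; the real mapping
# lives on the class that owns process().
import h5py

AXIS_TO_NAME = {0: 'cube', 1: 'cube_x', 2: 'cube_h'}  # hypothetical projection names

with h5py.File('cube.hdf5', 'r') as file:  # hypothetical file
    available_axis = [axis for axis, name in AXIS_TO_NAME.items() if name in file]
    available_names = [AXIS_TO_NAME[axis] for axis in available_axis]
    print(available_names)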
def train_dataloader(self):
    _train_transforms = None
    if self.hparams.augment:
        _train_transforms = transforms.Compose([
            transforms.RandomResizedCrop(size=(224, 224), scale=(0.85, 1.0)),
            transforms.RandomGrayscale(p=0.08),
            # lambda x: x if np.random.random_sample() > 0.08 else x.filter(ImageFilter.GaussianBlur(radius=1)),
            # lambda x: x if np.random.random_sample() > 0.08 else x.filter(ImageFilter.GaussianBlur(radius=3)),
            self.gaussianBlur1,
            self.gaussianBlur3,
            transforms.Resize((224, 224), Image.BICUBIC),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

    _data_train = RTGENEH5Dataset(h5_file=h5pickle.File(self.hparams.hdf5_file, mode="r"),
                                  subject_list=self._train_subjects,
                                  transform=_train_transforms)
    return DataLoader(_data_train,
                      batch_size=self.hparams.batch_size,
                      shuffle=True,
                      num_workers=self.hparams.num_io_workers,
                      pin_memory=False)
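# A standalone sketch of the same augmentation pipeline applied to a single PIL image.
# The named gaussian_blur_1 function stands in for self.gaussianBlur1 (a picklable
# replacement for the commented-out lambdas above); 'face.png' is a hypothetical input.
import numpy as np
from PIL import Image, ImageFilter
from torchvision import transforms

def gaussian_blur_1(img):
    # Blur with probability 0.08, mirroring the commented-out lambda
    return img if np.random.random_sample() > 0.08 else img.filter(ImageFilter.GaussianBlur(radius=1))

pipeline = transforms.Compose([
    transforms.RandomResizedCrop(size=(224, 224), scale=(0.85, 1.0)),
    transforms.RandomGrayscale(p=0.08),
    gaussian_blur_1,
    transforms.Resize((224, 224), Image.BICUBIC),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

tensor = pipeline(Image.open('face.png').convert('RGB'))  # hypothetical image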
def __init__(self, h5file, datasetImage='/train/image', datasetProf='/train/prof', transform=None):
    """
    Args:
        h5file (string): Path to the h5 file
        datasetImage (string): dataset for images
        datasetProf (string): dataset for professors
        transform (callable, optional): Optional transform to be applied on a sample.
    """
    self.isLoaded = False
    self.alphabet = "0123456789abcdefghijklmnopqrstuvwxyz"
    try:
        h5 = h5pickle.File(h5file, 'r', skip_cache=False)
        self.images = h5[datasetImage]
        self.prof = h5[datasetProf]
        self.length, self.width, self.height, self.channels = self.images.shape
        self.prof_length = self.prof.shape[1]
        self.isLoaded = True
    except OSError as e:
        print(" ** Error using H5 file", e)
        return
    except ValueError as e:
        print(" ** Value error", e)
        return

    # self.length = min(self.length, 512)
    self.reject = int(np.max(self.prof))  # not implemented yet
    self.transform = transform
def test_repickling(self):
    # Pickling a dataset, unpickling it, and pickling it again should not raise
    f = h5pickle.File(self.file, 'r', skip_cache=True)
    dataset = f['a']
    dataset_pickled = pickle.dumps(dataset, protocol=pickle.HIGHEST_PROTOCOL)
    dataset_unpickled = pickle.loads(dataset_pickled)
    dataset_repickled = pickle.dumps(dataset_unpickled, protocol=pickle.HIGHEST_PROTOCOL)
def process(self, **kwargs):
    """ Put info from `.hdf5` groups to attributes. No passing through data whatsoever. """
    _ = kwargs

    # self.file_hdf5 = SafeIO(self.path, opener=h5pickle.File, mode='r')
    self.file_hdf5 = h5pickle.File(self.path, mode='r')
    self.add_attributes()
def test_dataloader(self):
    _data_test = RTGENEH5Dataset(h5_file=h5pickle.File(self.hparams.hdf5_file, mode="r"),
                                 subject_list=self._test_subjects)
    return DataLoader(_data_test,
                      batch_size=self.hparams.batch_size,
                      shuffle=True,
                      num_workers=self.hparams.num_io_workers,
                      pin_memory=False)
def test_readonly_skip_cache(self):
    f = h5pickle.File(self.file, 'r', skip_cache=True)
    self.assertEqual(f['a'][()], 1, 'can read from file')

    g = pickle.loads(pickle.dumps(f['a']))
    self.assertEqual(g[()], 1, 'reading from dataset should give correct result')

    f.close()
    g.file.close()
def test_create_writable_file(self):
    f = h5pickle.File(self.file, 'w', skip_cache=True)

    got_oserror = False
    try:
        g = pickle.loads(pickle.dumps(f))  # We expect an error here, since we cannot open on multiple processes
    except OSError:
        got_oserror = True
    self.assertTrue(got_oserror, 'Forbidden to open write-only file twice')

    f.close()
    os.remove(self.file)
def test_3(self):
    f = h5pickle.File(self.file, 'r', skip_cache=True)

    protocol = 3
    h = pickle.loads(pickle.dumps(f, protocol=protocol))
    self.assertEqual(f['a'][()], 1, 'reading from file dataset should give correct result')

    g = pickle.loads(pickle.dumps(f['a'], protocol=protocol))
    self.assertEqual(g[()], 1, 'reading from dataset should give correct result')

    f.close()
    h.close()
    g.file.close()
def test_readonly(self):
    f = h5pickle.File(self.file, 'a', skip_cache=False)
    g = pickle.loads(pickle.dumps(f))

    # Due to the caching mechanism f and g should be the same
    self.assertEqual(id(f), id(g), 'pickling and unpickling should use cache')
    self.assertEqual(g['a'][()], 1, 'reading from dataset should give correct result')

    # skip_cache should be propagated (but they're the same object, so the check is somewhat redundant)
    self.assertFalse(g.skip_cache, 'skip_cache should be propagated')

    f.close()
def test_readonly_skip_cache(self):
    f = h5pickle.File(self.file, 'a', skip_cache=True)
    self.assertEqual(f['a'][()], 1, 'can read from file')

    g = pickle.loads(pickle.dumps(f))
    self.assertEqual(g['a'][()], 1, 'reading from dataset should give correct result')

    # Since the cache is skipped, we expect two different handles
    self.assertNotEqual(id(f), id(g), 'pickling and unpickling should create new file handle')
    self.assertTrue(g.skip_cache, 'skip_cache should be propagated')

    f.close()
    g.close()
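# A standalone sketch of the caching behaviour exercised by the tests above: with
# skip_cache=True, unpickling opens a fresh handle; with skip_cache=False, the cached
# handle is reused. 'example.h5' is a hypothetical file containing a dataset 'a'.
import pickle
import h5pickle

f = h5pickle.File('example.h5', 'r', skip_cache=True)
g = pickle.loads(pickle.dumps(f))
assert id(f) != id(g)            # a new handle was opened
assert g['a'][()] == f['a'][()]  # both handles read the same data
f.close()
g.close()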
def __init__(self, h5_path, folds, do_transform=False, pathology=False, test=False):
    self.h5_path = h5_path
    self.folds = folds
    self.test = test
    self.f = h5py.File(self.h5_path, 'r')

    self.fold_names = []
    for fold in self.folds:
        for i in range(self.f[str(fold)].shape[0]):
            self.fold_names.append((i, str(fold)))

    self.pathology = pathology
    self.do_transform = do_transform
    self.transform = Compose([
        RandomAffine(degrees=(-20, 20), translate=(0.1, 0.1))
    ])
def __init__(self, h5_path, folds, do_transform=False, classes=4):
    self.folds = folds
    self.f = h5py.File(h5_path, 'r')

    self.fold_names = []
    self.classes = [[] for _ in range(classes)]
    for fold in folds:
        for i in range(self.f[str(fold)].shape[0]):
            self.fold_names.append((i, str(fold)))
            label = self.f["{0}_label".format(fold)][i, ...]
            self.classes[np.argmax(label)].append((i, str(fold)))

    self.do_transform = do_transform
    self.transform = Compose([
        RandomResizedCrop(512),
        RandomHorizontalFlip()
    ])
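# A sketch of the HDF5 layout the constructor above expects: one dataset per fold plus a
# matching '<fold>_label' dataset with one-hot labels. File name, shapes, and the number
# of samples are illustrative assumptions.
import h5py
import numpy as np

with h5py.File('folds.h5', 'w') as f:
    for fold in range(2):
        f.create_dataset(str(fold), data=np.random.rand(8, 512, 512).astype('float32'))
        one_hot_labels = np.eye(4)[np.random.randint(0, 4, size=8)]
        f.create_dataset('{0}_label'.format(fold), data=one_hot_labels)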
def create_file_from_iterable(cls, src, shape, window, stride, dst=None,
                              agg=None, projection='ixh', threshold=None):
    """ Aggregate multiple chunks into file with 3D cube.

    Parameters
    ----------
    src : iterable
        Each item is a tuple (position, array) where position is a 3D coordinate of the left upper array corner.
    shape : tuple
        Shape of the resulting array.
    window : tuple
        Chunk shape.
    stride : tuple
        Stride for chunks. Values in overlapped regions will be aggregated.
    dst : str or None, optional
        Path to the resulting .hdf5. If None, function will return array with predictions.
    agg : 'mean', 'min', 'max' or None, optional
        The way to aggregate values in overlapped regions. None means that a new chunk will rewrite
        the previous value in the cube.
    projection : str, optional
        Projections to create in the hdf5 file, by default 'ixh'.
    threshold : float or None, optional
        If not None, threshold to transform values into [0, 1]. Default is None.
    """
    shape = np.array(shape)
    window = np.array(window)
    stride = np.array(stride)

    if dst is None:
        dst = np.zeros(shape)
    else:
        file_hdf5 = h5py.File(dst, 'a')
        dst = file_hdf5.create_dataset('cube', shape)
        cube_hdf5_x = file_hdf5.create_dataset('cube_x', shape[[1, 2, 0]])
        cube_hdf5_h = file_hdf5.create_dataset('cube_h', shape[[2, 0, 1]])

    lower_bounds = [make_axis_grid((0, shape[i]), stride[i], shape[i], window[i]) for i in range(3)]
    lower_bounds = np.stack(np.meshgrid(*lower_bounds), axis=-1).reshape(-1, 3)
    upper_bounds = lower_bounds + window
    grid = np.stack([lower_bounds, upper_bounds], axis=-1)

    for position, chunk in src:
        slices = tuple([slice(position[i], position[i] + chunk.shape[i]) for i in range(3)])
        _chunk = dst[slices]
        if agg in ('max', 'min'):
            chunk = np.maximum(chunk, _chunk) if agg == 'max' else np.minimum(chunk, _chunk)
        elif agg == 'mean':
            grid_mask = np.logical_and(
                grid[..., 1] >= np.expand_dims(position, axis=0),
                grid[..., 0] < np.expand_dims(position + window, axis=0)
            ).all(axis=1)
            agg_map = np.zeros_like(chunk)
            for chunk_slc in grid[grid_mask]:
                _slices = [slice(
                    max(chunk_slc[i, 0], position[i]) - position[i],
                    min(chunk_slc[i, 1], position[i] + window[i]) - position[i]
                ) for i in range(3)]
                agg_map[tuple(_slices)] += 1
            chunk /= agg_map
            chunk = _chunk + chunk
        dst[slices] = chunk

    if isinstance(dst, np.ndarray):
        if threshold is not None:
            dst = (dst > threshold).astype(int)
    else:
        for i in range(0, dst.shape[0], window[0]):
            slide = dst[i:i + window[0]]
            if threshold is not None:
                slide = (slide > threshold).astype(int)
                dst[i:i + window[0]] = slide
            cube_hdf5_x[:, :, i:i + window[0]] = slide.transpose((1, 2, 0))
            cube_hdf5_h[:, i:i + window[0]] = slide.transpose((2, 0, 1))
    return dst
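# A minimal sketch of driving create_file_from_iterable with a generator of
# (position, chunk) pairs. `CubeWriter` is a hypothetical owner class for the classmethod,
# and the shapes, strides, and chunk values are illustrative assumptions.
import numpy as np

def chunk_iterator():
    # Two non-overlapping chunks stacked along the first axis
    for start in (0, 8):
        yield np.array([start, 0, 0]), np.ones((8, 16, 16), dtype=np.float32)

prediction = CubeWriter.create_file_from_iterable(
    chunk_iterator(), shape=(16, 16, 16), window=(8, 16, 16), stride=(8, 16, 16),
    dst=None, agg='mean', threshold=0.5)  # dst=None returns an in-memory array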
def make_sgy(self, path_hdf5=None, path_spec=None, postfix='', remove_hdf5=False,
             zip_result=True, path_segy=None, pbar=False):
    """ Convert POST-STACK HDF5 cube to SEG-Y format with supplied spec.

    Parameters
    ----------
    path_hdf5 : str
        Path to load hdf5 file from.
    path_spec : str
        Path to load segy file from with geometry spec.
    path_segy : str
        Path to store converted cube. By default, new cube is stored right next to original.
    postfix : str
        Postfix to add to the name of resulting cube.
    remove_hdf5 : bool
        Whether to remove the source hdf5 file after conversion.
    zip_result : bool
        Whether to zip the resulting SEG-Y file.
    pbar : bool
        Whether to show a progress bar.
    """
    path_segy = path_segy or (os.path.splitext(path_hdf5)[0] + postfix + '.sgy')
    if not path_spec:
        if hasattr(self, 'segy_path'):
            path_spec = self.segy_path
        else:
            path_spec = os.path.splitext(self.path)[0] + '.sgy'

    # By default, if path_hdf5 is not provided, `temp.hdf5` next to self.path will be used
    if path_hdf5 is None:
        path_hdf5 = os.path.join(os.path.dirname(self.path), 'temp.hdf5')

    with h5py.File(path_hdf5, 'r') as src:
        cube_hdf5 = src['cube']

        from .base import SeismicGeometry  # pylint: disable=import-outside-toplevel
        geometry = SeismicGeometry(path_spec)
        segy = geometry.segyfile

        spec = segyio.spec()
        spec.sorting = None if segy.sorting is None else int(segy.sorting)
        spec.format = None if segy.format is None else int(segy.format)
        spec.samples = range(self.depth)

        idx = np.stack(geometry.dataframe.index)
        ilines, xlines = self.load_meta_item('ilines'), self.load_meta_item('xlines')

        i_enc = {num: k for k, num in enumerate(ilines)}
        x_enc = {num: k for k, num in enumerate(xlines)}

        spec.ilines = ilines
        spec.xlines = xlines

        with segyio.create(path_segy, spec) as dst_file:
            # Copy all textual headers, including possible extended
            for i in range(1 + segy.ext_headers):
                dst_file.text[i] = segy.text[i]
            dst_file.bin = segy.bin

            for c, (i, x) in enumerate(tqdm(idx, disable=(not pbar))):
                locs = tuple([i_enc[i], x_enc[x], slice(None)])
                dst_file.header[c] = segy.header[c]
                dst_file.trace[c] = cube_hdf5[locs]
            dst_file.bin = segy.bin
            dst_file.bin[segyio.BinField.Traces] = len(idx)

    if remove_hdf5:
        os.remove(path_hdf5)

    if zip_result:
        dir_name = os.path.dirname(os.path.abspath(path_segy))
        file_name = os.path.basename(path_segy)
        shutil.make_archive(os.path.splitext(path_segy)[0], 'zip', dir_name, file_name)
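# A short usage sketch for make_sgy, assuming `geometry` is an instance of the class that
# owns this method; both file names below are hypothetical.
geometry.make_sgy(path_hdf5='predictions.hdf5', path_spec='original_cube.sgy',
                  postfix='_prediction', zip_result=True, pbar=True)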
def __init__(self, dataPath, yamlConfig, normalize=True):
    data_path = dataPath

    # List of features to use
    features = yamlConfig['Inputs']
    self.features_list = features

    # List of labels to use
    labels = yamlConfig['Labels']
    self.labels_list = labels

    columns_arr = np.array([])
    features_labels_df = pd.DataFrame()

    # Check/handle directory of files vs a single file
    if os.path.isdir(data_path):
        print("Directory of data files found!")
        first = True
        for file in os.listdir(data_path):
            if file.endswith(".h5") or file.endswith(".h5df"):
                try:
                    print("Loading " + str(file))
                    self.h5File = h5py.File(os.path.join(data_path, file), 'r', libver='latest', swmr=True)
                    if first:
                        # Slice the h5 data, otherwise it is only a reference to the open file
                        columns_arr = np.array(self.h5File['jetFeatureNames'][:]).astype(str)
                        first = False
                    this_file = pd.DataFrame(self.h5File["jets"][:], columns=columns_arr)
                    features_labels_df = pd.concat([features_labels_df, this_file], axis=0)  # concat along axis 0
                    self.h5File.close()
                except Exception as e:
                    print("Error! Failed to load jet file " + file)
                    print(e)
    elif os.path.isfile(data_path):
        print("Single data file found!")
        self.h5File = h5py.File(dataPath, 'r', libver='latest', swmr=True)

        # Convert to dataframe; slice the h5 data, otherwise it is only a reference to the open file
        columns_arr = np.array(self.h5File['jetFeatureNames'][:]).astype(str)
        features_labels_df = pd.DataFrame(self.h5File["jets"][:], columns=columns_arr)
    else:
        print("Error! Path specified is a special file (socket, FIFO, device file) or isn't valid")

    features_labels_df = features_labels_df.drop_duplicates()

    features_df = features_labels_df[features]
    labels_df = features_labels_df[labels]

    # Convert to numpy arrays (np.float is deprecated; use the builtin float)
    self.features_val = features_df.values.astype(float)
    self.labels_val = labels_df.values.astype(float)

    if 'j_index' in features:
        self.features_val = self.features_val[:, :-1]  # drop the j_index feature
    if 'j_index' in labels:
        self.labels_val = self.labels_val[:, :-1]  # drop the j_index label
        labels = labels[:-1]

    if normalize:
        # Normalize inputs
        if yamlConfig['NormalizeInputs'] and yamlConfig['InputType'] != 'Conv1D' \
                and yamlConfig['InputType'] != 'Conv2D':
            scaler = preprocessing.StandardScaler().fit(self.features_val)
            self.features_val = scaler.transform(self.features_val)
            print("Scaled Features Data W/ StandardScaler")

        # Normalize inputs (w/ MinMax for squared hinge)
        if yamlConfig['NormalizeInputs'] and yamlConfig['InputType'] != 'Conv1D' \
                and yamlConfig['InputType'] != 'Conv2D' \
                and yamlConfig['KerasLoss'] == 'squared_hinge':
            scaler = preprocessing.MinMaxScaler().fit(self.features_val)
            self.features_val = scaler.transform(self.features_val)
            print("Scaled Features Data W/ MinMaxScaler")

        # Normalize conv inputs
        if yamlConfig['NormalizeInputs'] and yamlConfig['InputType'] == 'Conv1D':
            reshape_X_train_val = self.features_val.reshape(
                self.features_val.shape[0] * self.features_val.shape[1],
                self.features_val.shape[2])
            scaler = preprocessing.StandardScaler().fit(reshape_X_train_val)
            for p in range(self.features_val.shape[1]):
                self.features_val[:, p, :] = scaler.transform(self.features_val[:, p, :])
            print("Reshaped data for conv and Scaled Features Data W/ StandardScaler")
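# A sketch of the yamlConfig keys read by the constructor above; the feature and label
# names are hypothetical placeholders, not taken from any real configuration file.
yaml_config = {
    'Inputs': ['j_pt', 'j_eta', 'j_mass', 'j_index'],
    'Labels': ['j_g', 'j_q', 'j_w', 'j_index'],
    'NormalizeInputs': True,
    'InputType': 'Dense',
    'KerasLoss': 'categorical_crossentropy',
}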
def __init__(self, path):
    self.f = h5py.File(path, 'r')
    self.keyname = list(self.f.keys())
def __init__(self, path):
    self.f = h5py.File(path, 'r')
    self.keyname = list(self.f.keys())
    self.size = min([len(self.f[key]) for key in self.keyname])
def __init__(self, file_path, max_query_len, max_doc_len, idfs):
    self.fp = h5py.File(file_path, 'r')
    self.max_query_len = max_query_len
    self.max_doc_len = max_doc_len
    self.idfs = idfs
def main():
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    json_file_parser = ArgumentParser()
    json_file_parser.add_argument("--config_file", type=str, default=None)
    json_file_parser.add_argument("--tpu_num_cores", type=int, default=None)
    json_parser_args = json_file_parser.parse_args()

    parser = HfArgumentParser([TrainingArguments, ExtraArgs])
    if json_parser_args.config_file is None:
        training_args, extra_args = parser.parse_args_into_dataclasses()
    else:
        training_args, extra_args = parser.parse_json_file(json_parser_args.config_file)

    with h5pickle.File("data/train.hdf5", "r", libver="latest", swmr=True, skip_cache=False) as f:
        train_dataset = f["train"]
        val_dataset = f["val"]

        if extra_args.max_n_train is not None:
            train_dataset = train_dataset[:extra_args.max_n_train]
        if extra_args.max_n_val is not None:
            val_dataset = val_dataset[:extra_args.max_n_val]

        model = get_model(extra_args)
        tokenizer = GPT2Tokenizer(
            "data/german_tokenizer_cc/vocab.json",
            "data/german_tokenizer_cc/merges.txt",
        )
        tokenizer.pad_token = tokenizer.eos_token

        name = generate_slug(2)

        if json_parser_args.tpu_num_cores is not None:
            training_args.tpu_num_cores = json_parser_args.tpu_num_cores
        training_args.remove_unused_columns = False

        steps_per_epoch = int(
            len(train_dataset)
            / training_args.per_device_train_batch_size
            / training_args.gradient_accumulation_steps
            / training_args.tpu_num_cores)
        training_args.steps_per_epoch = steps_per_epoch
        training_args.eval_steps = steps_per_epoch
        training_args.save_steps = (
            steps_per_epoch * training_args.num_train_epochs
        )  # only save once at the end to save space
        training_args.run_name = name
        training_args.output_dir = os.path.join("checkpoints", name)

        trainer = GPT2Trainer(
            model,
            training_args,
            extra_args=extra_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=tokenizer,
            callbacks=[GPT2WandbCallback],
        )
        trainer.remove_callback(WandbCallback)

        trainer.train()
        print("Done!")
        _valid_subjects.append([0, 14, 15, 16])
        _valid_subjects.append([0, 14, 15, 16])
        _valid_subjects.append([0, 14, 15, 16])
        # test subjects
        _test_subjects.append([5, 6, 11, 12, 13])
        _test_subjects.append([3, 4, 7, 9])
        _test_subjects.append([1, 2, 8, 10])
    else:
        _train_subjects.append([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16])
        _valid_subjects.append([0])  # Note that this is a hack and should not be used to get results for papers
        _test_subjects.append([0])
else:
    file = h5pickle.File(_hyperparams.hdf5_file, mode="r")
    keys = [int(subject[1:]) for subject in list(file.keys())]
    file.close()
    if _hyperparams.k_fold_validation:
        # Wrap in list() so the leave-one-out slices can be concatenated with `+`
        all_subjects = list(range(len(keys)))
        for leave_one_out_idx in all_subjects:
            _train_subjects.append(all_subjects[:leave_one_out_idx] + all_subjects[leave_one_out_idx + 1:])
            _valid_subjects.append([leave_one_out_idx])  # Note that this is a hack and should not be used to get results for papers
            _test_subjects.append([leave_one_out_idx])
    else:
        _train_subjects.append(keys[1:])
        _valid_subjects.append([keys[0]])
        _test_subjects.append([keys[0]])