Example #1
    def process(self, mode='r', projections='ixh', shape=None, **kwargs):
        """ Detect available projections in the cube and store handlers to them in attributes. """
        if mode == 'a':
            mode = 'r+' if os.path.exists(self.path) else 'w-'
        self.mode = mode

        if self.mode in ['r', 'r+']:
            self.file = h5py.File(self.path, mode=mode)

        elif self.mode == 'w-':
            # TODO Create new HDF5 file with required projections
            pass

        # Check available projections
        self.available_axis = [
            axis for axis, name in self.AXIS_TO_NAME.items()
            if name in self.file
        ]
        self.available_names = [
            self.AXIS_TO_NAME[axis] for axis in self.available_axis
        ]

        # Save cube handlers to instance
        self.axis_to_cube = {}
        for axis in self.available_axis:
            name = self.AXIS_TO_NAME[axis]
            cube = self.file[name]

            self.axis_to_cube[axis] = cube
            setattr(self, name, cube)

        # Parse attributes from meta / set defaults
        self.add_attributes(**kwargs)
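The method above relies on a class-level AXIS_TO_NAME mapping that is not part of this snippet. A plausible definition, inferred only from the dataset names created in Example #14 ('cube', 'cube_x', 'cube_h'), could look as follows; the exact names and axis order are an assumption, not taken from the source.

# Assumed axis-to-dataset-name mapping; names borrowed from Example #14, order is a guess.
AXIS_TO_NAME = {
    0: 'cube',    # iline-oriented projection
    1: 'cube_x',  # crossline-oriented projection
    2: 'cube_h',  # depth-oriented projection
}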
Example #2
    def train_dataloader(self):
        _train_transforms = None

        if self.hparams.augment:
            _train_transforms = transforms.Compose([
                transforms.RandomResizedCrop(size=(224, 224),
                                             scale=(0.85, 1.0)),
                transforms.RandomGrayscale(p=0.08),
                # lambda x: x if np.random.random_sample() > 0.08 else x.filter(ImageFilter.GaussianBlur(radius=1)),
                # lambda x: x if np.random.random_sample() > 0.08 else x.filter(ImageFilter.GaussianBlur(radius=3)),
                self.gaussianBlur1,
                self.gaussianBlur3,
                transforms.Resize((224, 224), Image.BICUBIC),
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
            ])
        _data_train = RTGENEH5Dataset(h5_file=h5pickle.File(
            self.hparams.hdf5_file, mode="r"),
                                      subject_list=self._train_subjects,
                                      transform=_train_transforms)
        return DataLoader(_data_train,
                          batch_size=self.hparams.batch_size,
                          shuffle=True,
                          num_workers=self.hparams.num_io_workers,
                          pin_memory=False)
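Plain h5py handles cannot be pickled, which is why the dataset above is backed by h5pickle.File: with num_workers > 0, DataLoader ships the dataset object to worker processes via pickle. A minimal sketch of the difference, using a hypothetical file name and dataset key:

import pickle

import h5py
import h5pickle

# Create a tiny file so the sketch is self-contained (hypothetical name).
with h5py.File('samples.hdf5', 'w') as f:
    f['images'] = [1, 2, 3]

with h5py.File('samples.hdf5', 'r') as f:
    try:
        pickle.dumps(f['images'])                # plain h5py objects are not picklable
    except TypeError as err:
        print('h5py refuses:', err)

f = h5pickle.File('samples.hdf5', 'r', skip_cache=True)
clone = pickle.loads(pickle.dumps(f['images']))  # survives the round trip
print(clone[()])                                 # reads through the re-opened handle
f.close()
clone.file.close()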
Example #3
    def __init__(self, h5file, datasetImage='/train/image', 
                 datasetProf='/train/prof', transform=None):
        """
        Args:
            h5file (string): Path to the h5 file
            datasetImage (string): dataset for images
            datasetProf (string): dataset for professors            
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.isLoaded = False
        self.alphabet = "0123456789abcdefghijklmnopqrstuvwxyz"
        try:
            h5 = h5pickle.File(h5file, 'r', skip_cache=False)
            self.images = h5[datasetImage]
            self.prof = h5[datasetProf]
 
            self.length, self.width, self.height, self.channels = self.images.shape
            self.prof_lenght = self.prof.shape[1]
            self.isLoaded = True
        except OSError as e:
            print(" ** Error using H5 file", e)
            return
        except ValueError as e:
            print(" ** Value error", e)
            return
        
        #self.lenght = min(self.length, 512)
        self.reject = int(np.max(self.prof))
            
        # not implemented yet
        self.transform = transform
Example #4
 def test_repickling(self):
     f = h5pickle.File(self.file, 'r', skip_cache=True)
     dataset = f['a']
     dataset_pickled = pickle.dumps(dataset,
                                    protocol=pickle.HIGHEST_PROTOCOL)
     dataset_unpickled = pickle.loads(dataset_pickled)
     dataset_repickled = pickle.dumps(dataset_unpickled,
                                      protocol=pickle.HIGHEST_PROTOCOL)
Example #5
 def process(self, **kwargs):
     """ Put info from `.hdf5` groups to attributes.
     No passing through data whatsoever.
     """
     _ = kwargs
     # self.file_hdf5 = SafeIO(self.path, opener=h5pickle.File, mode='r')
     self.file_hdf5 = h5pickle.File(self.path, mode='r')
     self.add_attributes()
Example #6
 def test_dataloader(self):
     _data_test = RTGENEH5Dataset(h5_file=h5pickle.File(
         self.hparams.hdf5_file, mode="r"),
                                  subject_list=self._test_subjects)
     return DataLoader(_data_test,
                       batch_size=self.hparams.batch_size,
                       shuffle=True,
                       num_workers=self.hparams.num_io_workers,
                       pin_memory=False)
Example #7
    def test_readonly_skip_cache(self):
        f = h5pickle.File(self.file, 'r', skip_cache=True)
        self.assertEqual(f['a'][()], 1, 'can read from file')

        g = pickle.loads(pickle.dumps(f['a']))

        self.assertEqual(g[()], 1,
                         'reading from dataset should give correct result')

        f.close()
        g.file.close()
Example #8
    def test_create_writable_file(self):
        f = h5pickle.File(self.file, 'w', skip_cache=True)

        got_oserror = False
        try:
            g = pickle.loads(pickle.dumps(f))
            # We expect an error here, since we cannot open on multiple processes
        except OSError:
            got_oserror = True

        self.assertTrue(got_oserror, 'Forbidden to open write-only file twice')

        f.close()
        os.remove(self.file)
Example #9
    def test_3(self):
        f = h5pickle.File(self.file, 'r', skip_cache=True)
        protocol = 3

        h = pickle.loads(pickle.dumps(f, protocol=protocol))
        self.assertEqual(
            f['a'][()], 1,
            'reading from file dataset should give correct result')

        g = pickle.loads(pickle.dumps(f['a'], protocol=protocol))
        self.assertEqual(g[()], 1,
                         'reading from dataset should give correct result')

        f.close()
        h.close()
        g.file.close()
Example #10
    def test_readonly(self):
        f = h5pickle.File(self.file, 'a', skip_cache=False)

        g = pickle.loads(pickle.dumps(f))

        # Due to the caching mechanism f and g should be the same
        self.assertEqual(id(f), id(g),
                         'pickling and unpickling should use cache')

        self.assertEqual(g['a'][()], 1,
                         'reading from dataset should give correct result')

        # skip_cache should be propagated (but they're the same so kind of meaningless test)
        self.assertFalse(g.skip_cache, 'skip_cache should be propagated')

        f.close()
Example #11
    def test_readonly_skip_cache(self):
        f = h5pickle.File(self.file, 'a', skip_cache=True)
        self.assertEqual(f['a'][()], 1, 'can read from file')

        g = pickle.loads(pickle.dumps(f))

        self.assertEqual(g['a'][()], 1,
                         'reading from dataset should give correct result')
        # Since cache is skipped I want 2 different handles
        self.assertNotEqual(
            id(f), id(g),
            'pickling and unpickling should create new file handle')
        self.assertTrue(g.skip_cache, 'skip_cache should be propagated')

        f.close()
        g.close()
Example #12
    def __init__(self, h5_path, folds, do_transform=False, pathology=False, test=False):
        self.h5_path = h5_path
        self.folds = folds
        self.test = test

        self.f = h5py.File(self.h5_path, 'r')
        self.fold_names = []
        for fold in self.folds:
            for i in range(self.f[str(fold)].shape[0]):
                self.fold_names.append((i,str(fold)))

        self.pathology = pathology
        self.do_transform = do_transform
        self.transform = Compose([
            RandomAffine(degrees=(-20,20), translate=(0.1, 0.1))
        ])
Example #13
    def __init__(self, h5_path, folds, do_transform=False, classes=4):
        self.folds = folds
        self.f = h5py.File(h5_path, 'r')
        self.fold_names = []
        self.classes = [[] for i in range(classes)]

        for fold in folds:
            for i in range(self.f[str(fold)].shape[0]):
                self.fold_names.append((i,str(fold)))
                label = self.f["{0}_label".format(fold)][i,...]
                self.classes[np.argmax(label)].append((i, str(fold)))

        self.do_transform = do_transform
        self.transform = Compose([
            RandomResizedCrop(512),
            RandomHorizontalFlip()
        ])
Example #14
    def create_file_from_iterable(cls,
                                  src,
                                  shape,
                                  window,
                                  stride,
                                  dst=None,
                                  agg=None,
                                  projection='ixh',
                                  threshold=None):
        """ Aggregate multiple chunks into file with 3D cube.

        Parameters
        ----------
        src : iterable
            Each item is a tuple (position, array), where position is the 3D coordinate of the upper left corner of the array.
        shape : tuple
            Shape of the resulting array.
        window : tuple
            Chunk shape.
        stride : tuple
            Stride for chunks. Values in overlapped regions will be aggregated.
        dst : str or None, optional
            Path to the resulting .hdf5 file. If None, the function returns an array with predictions.
        agg : 'mean', 'min', 'max' or None, optional
            How to aggregate values in overlapped regions. None means that a new chunk overwrites the
            previous value in the cube.
        projection : str, optional
            Projections to create in the hdf5 file, by default 'ixh'.
        threshold : float or None, optional
            If not None, threshold used to binarize values into {0, 1}. Default is None.
        """
        shape = np.array(shape)
        window = np.array(window)
        stride = np.array(stride)

        if dst is None:
            dst = np.zeros(shape)
        else:
            file_hdf5 = h5py.File(dst, 'a')
            dst = file_hdf5.create_dataset('cube', shape)
            cube_hdf5_x = file_hdf5.create_dataset('cube_x', shape[[1, 2, 0]])
            cube_hdf5_h = file_hdf5.create_dataset('cube_h', shape[[2, 0, 1]])

        lower_bounds = [
            make_axis_grid((0, shape[i]), stride[i], shape[i], window[i])
            for i in range(3)
        ]
        lower_bounds = np.stack(np.meshgrid(*lower_bounds),
                                axis=-1).reshape(-1, 3)
        upper_bounds = lower_bounds + window
        grid = np.stack([lower_bounds, upper_bounds], axis=-1)

        for position, chunk in src:
            slices = tuple([
                slice(position[i], position[i] + chunk.shape[i])
                for i in range(3)
            ])
            _chunk = dst[slices]
            if agg in ('max', 'min'):
                agg_func = np.maximum if agg == 'max' else np.minimum
                chunk = agg_func(chunk, _chunk)
            elif agg == 'mean':
                grid_mask = np.logical_and(
                    grid[..., 1] >= np.expand_dims(position, axis=0),
                    grid[..., 0] < np.expand_dims(position + window,
                                                  axis=0)).all(axis=1)
                agg_map = np.zeros_like(chunk)
                for chunk_slc in grid[grid_mask]:
                    _slices = [
                        slice(
                            max(chunk_slc[i, 0], position[i]) - position[i],
                            min(chunk_slc[i, 1], position[i] + window[i]) -
                            position[i]) for i in range(3)
                    ]
                    agg_map[tuple(_slices)] += 1
                chunk /= agg_map
                chunk = _chunk + chunk
            dst[slices] = chunk
        if isinstance(dst, np.ndarray):
            if threshold is not None:
                dst = (dst > threshold).astype(int)
        else:
            for i in range(0, dst.shape[0], window[0]):
                slide = dst[i:i + window[0]]
                if threshold is not None:
                    slide = (slide > threshold).astype(int)
                    dst[i:i + window[0]] = slide
                cube_hdf5_x[:, :, i:i + window[0]] = slide.transpose((1, 2, 0))
                cube_hdf5_h[:, i:i + window[0]] = slide.transpose((2, 0, 1))
        return dst
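A usage sketch for the classmethod above. The class name GeometryHDF5 and the chunk generator are illustrative assumptions; only shape, window, stride, dst, agg and threshold follow the documented signature.

import numpy as np

def iter_chunks(shape, window, stride):
    """Yield (position, chunk) pairs as expected by `src`, keeping chunks inside `shape`."""
    for i in range(0, shape[0] - window[0] + 1, stride[0]):
        for x in range(0, shape[1] - window[1] + 1, stride[1]):
            for h in range(0, shape[2] - window[2] + 1, stride[2]):
                yield (i, x, h), np.random.rand(*window).astype(np.float32)

shape, window, stride = (64, 64, 64), (32, 32, 32), (16, 16, 16)

# dst=None keeps the result in memory and returns a numpy array;
# agg='mean' averages predictions in overlapped regions, threshold binarizes the output.
cube = GeometryHDF5.create_file_from_iterable(
    src=iter_chunks(shape, window, stride),
    shape=shape, window=window, stride=stride,
    dst=None, agg='mean', threshold=0.5)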
Example #15
    def make_sgy(self,
                 path_hdf5=None,
                 path_spec=None,
                 postfix='',
                 remove_hdf5=False,
                 zip_result=True,
                 path_segy=None,
                 pbar=False):
        """ Convert POST-STACK HDF5 cube to SEG-Y format with supplied spec.

        Parameters
        ----------
        path_hdf5 : str
            Path to load hdf5 file from.
        path_spec : str
            Path to the SEG-Y file to take the geometry spec from.
        path_segy : str
            Path to store the converted cube. By default, the new cube is stored right next to the original.
        postfix : str
            Postfix to add to the name of the resulting cube.
        remove_hdf5 : bool
            Whether to remove the source HDF5 file after conversion.
        zip_result : bool
            Whether to archive the resulting SEG-Y file into a zip.
        pbar : bool
            Whether to show a progress bar.
        """
        path_segy = path_segy or (os.path.splitext(path_hdf5)[0] + postfix +
                                  '.sgy')
        if not path_spec:
            if hasattr(self, 'segy_path'):
                path_spec = self.segy_path
            else:
                path_spec = os.path.splitext(self.path)[0] + '.sgy'

        # By default, if path_hdf5 is not provided, `temp.hdf5` next to self.path will be used
        if path_hdf5 is None:
            path_hdf5 = os.path.join(os.path.dirname(self.path), 'temp.hdf5')

        with h5py.File(path_hdf5, 'r') as src:
            cube_hdf5 = src['cube']

            from .base import SeismicGeometry  #pylint: disable=import-outside-toplevel
            geometry = SeismicGeometry(path_spec)
            segy = geometry.segyfile

            spec = segyio.spec()
            spec.sorting = None if segy.sorting is None else int(segy.sorting)
            spec.format = None if segy.format is None else int(segy.format)
            spec.samples = range(self.depth)

            idx = np.stack(geometry.dataframe.index)
            ilines, xlines = self.load_meta_item(
                'ilines'), self.load_meta_item('xlines')

            i_enc = {num: k for k, num in enumerate(ilines)}
            x_enc = {num: k for k, num in enumerate(xlines)}

            spec.ilines = ilines
            spec.xlines = xlines

            with segyio.create(path_segy, spec) as dst_file:
                # Copy all textual headers, including possible extended
                for i in range(1 + segy.ext_headers):
                    dst_file.text[i] = segy.text[i]
                dst_file.bin = segy.bin

                for c, (i, x) in enumerate(tqdm(idx, disable=(not pbar))):
                    locs = tuple([i_enc[i], x_enc[x], slice(None)])
                    dst_file.header[c] = segy.header[c]
                    dst_file.trace[c] = cube_hdf5[locs]
                dst_file.bin = segy.bin
                dst_file.bin[segyio.BinField.Traces] = len(idx)

        if remove_hdf5:
            os.remove(path_hdf5)

        if zip_result:
            dir_name = os.path.dirname(os.path.abspath(path_segy))
            file_name = os.path.basename(path_segy)
            shutil.make_archive(
                os.path.splitext(path_segy)[0], 'zip', dir_name, file_name)
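A hedged call sketch for make_sgy: `geometry` stands for an instance of whatever class owns the method, and all paths below are placeholders, not taken from the source.

# Hypothetical instance and paths; keyword arguments follow the signature above.
geometry.make_sgy(path_hdf5='predictions.hdf5',   # cube to convert
                  path_spec='field_cube.sgy',     # SEG-Y that donates the geometry spec
                  path_segy='predictions.sgy',    # where to store the converted cube
                  remove_hdf5=False,
                  zip_result=True,
                  pbar=True)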
Example #16
    def __init__(self, dataPath, yamlConfig, normalize=True):
        data_path = dataPath

        # List of features to use
        features = yamlConfig['Inputs']
        self.features_list = features
        # List of labels to use
        labels = yamlConfig['Labels']
        self.labels_list = labels

        columns_arr = np.array([])
        features_labels_df = pd.DataFrame()
        #Check/Handle directory of files vs 1 file
        if os.path.isdir(data_path):
            print("Directory of data files found!")
            first = True
            for file in os.listdir(data_path):
                if file.endswith(".h5") or file.endswith(".h5df"):
                    try:
                        print("Loading " + str(file))
                        self.h5File = h5py.File(os.path.join(data_path,file), 'r', libver='latest', swmr=True)
                        if first:
                            columns_arr = np.array(self.h5File['jetFeatureNames'][:]).astype(str)  # slicing h5 data because otherwise it's a reference to the actual file?
                            first = False
                        this_file = pd.DataFrame(self.h5File["jets"][:], columns=columns_arr)
                        features_labels_df = pd.concat([features_labels_df,this_file],axis=0) #concat along axis 0 if doesn't work?
                        self.h5File.close()
                    except Exception as e:
                        print("Error! Failed to load jet file " + file)
                        print(e)
        elif os.path.isfile(data_path):
            print("Single data file found!")
            self.h5File = h5py.File(dataPath, 'r', libver='latest', swmr=True)
            # Convert to dataframe
            columns_arr = np.array(self.h5File['jetFeatureNames'][:]).astype(str)  # slicing h5 data because otherwise it's a reference to the actual file?
            features_labels_df = pd.DataFrame(self.h5File["jets"][:], columns=columns_arr)
        else:
            print("Error! path specified is a special file (socket, FIFO, device file), or isn't valid")

        features_labels_df = features_labels_df.drop_duplicates()
        features_df = features_labels_df[features]
        labels_df = features_labels_df[labels]
        # Convert to numpy array
        self.features_val = features_df.values.astype(float)
        self.labels_val = labels_df.values.astype(float)

        if 'j_index' in features:
            self.features_val = self.features_val[:, :-1]  # drop the j_index feature
        if 'j_index' in labels:
            self.labels_val = self.labels_val[:, :-1]  # drop the j_index label
            labels = labels[:-1]

        if normalize:
            # Normalize inputs
            if yamlConfig['NormalizeInputs'] and yamlConfig['InputType'] != 'Conv1D' \
                    and yamlConfig['InputType'] != 'Conv2D':
                scaler = preprocessing.StandardScaler().fit(self.features_val)
                self.features_val = scaler.transform(self.features_val)
                print("Scaled Features Data W/ StandardScaler")

            # Normalize inputs (w/ MinMax for squared hinge)
            if yamlConfig['NormalizeInputs'] and yamlConfig['InputType'] != 'Conv1D' \
                    and yamlConfig['InputType'] != 'Conv2D' \
                    and  yamlConfig['KerasLoss'] == 'squared_hinge':
                scaler = preprocessing.MinMaxScaler().fit(self.features_val)
                self.features_val = scaler.transform(self.features_val)
                print("Scaled Features Data W/ MinMaxScaler")

            # Normalize conv inputs
            if yamlConfig['NormalizeInputs'] and yamlConfig['InputType'] == 'Conv1D':
                reshape_X_train_val = self.features_val.reshape(self.features_val.shape[0] * self.features_val.shape[1],
                                                          self.features_val.shape[2])
                scaler = preprocessing.StandardScaler().fit(reshape_X_train_val)
                for p in range(self.features_val.shape[1]):
                    self.features_val[:, p, :] = scaler.transform(self.features_val[:, p, :])
                print("Reshaped data for conv and Scaled Features Data W/ StandardScaler")
Example #17
 def __init__(self, path):
     self.f = h5py.File(path, 'r')
     self.keyname = list(self.f.keys())
Example #18
 def __init__(self, path):
     self.f = h5py.File(path, 'r')
     self.keyname = list(self.f.keys())
     self.size = min([len(self.f[key]) for key in self.keyname])
Example #19
 def __init__(self, file_path, max_query_len, max_doc_len, idfs):
     self.fp = h5py.File(file_path, 'r')
     self.max_query_len = max_query_len
     self.max_doc_len = max_doc_len
     self.idfs = idfs
Example #20
def main():
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    json_file_parser = ArgumentParser()
    json_file_parser.add_argument("--config_file", type=str, default=None)
    json_file_parser.add_argument("--tpu_num_cores", type=int, default=None)
    json_parser_args = json_file_parser.parse_args()

    parser = HfArgumentParser([TrainingArguments, ExtraArgs])

    if json_parser_args.config_file is None:
        training_args, extra_args = parser.parse_args_into_dataclasses()
    else:
        training_args, extra_args = parser.parse_json_file(
            json_parser_args.config_file)

    with h5pickle.File("data/train.hdf5",
                       "r",
                       libver="latest",
                       swmr=True,
                       skip_cache=False) as f:
        train_dataset = f["train"]
        val_dataset = f["val"]

        if extra_args.max_n_train is not None:
            train_dataset = train_dataset[:extra_args.max_n_train]

        if extra_args.max_n_val is not None:
            val_dataset = val_dataset[:extra_args.max_n_val]

        model = get_model(extra_args)

        tokenizer = GPT2Tokenizer(
            "data/german_tokenizer_cc/vocab.json",
            "data/german_tokenizer_cc/merges.txt",
        )
        tokenizer.pad_token = tokenizer.eos_token

        name = generate_slug(2)

        if json_parser_args.tpu_num_cores is not None:
            training_args.tpu_num_cores = json_parser_args.tpu_num_cores

        training_args.remove_unused_columns = False
        steps_per_epoch = int(
            len(train_dataset) / training_args.per_device_train_batch_size /
            training_args.gradient_accumulation_steps /
            training_args.tpu_num_cores)
        training_args.steps_per_epoch = steps_per_epoch
        training_args.eval_steps = steps_per_epoch
        training_args.save_steps = (
            steps_per_epoch * training_args.num_train_epochs
        )  # only save once at the end to save space
        training_args.run_name = name
        training_args.output_dir = os.path.join("checkpoints", name)

        trainer = GPT2Trainer(
            model,
            training_args,
            extra_args=extra_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=tokenizer,
            callbacks=[GPT2WandbCallback],
        )
        trainer.remove_callback(WandbCallback)

        trainer.train()
        print("Done!")
Example #21
         _valid_subjects.append([0, 14, 15, 16])
         _valid_subjects.append([0, 14, 15, 16])
         _valid_subjects.append([0, 14, 15, 16])
         # test subjects
         _test_subjects.append([5, 6, 11, 12, 13])
         _test_subjects.append([3, 4, 7, 9])
         _test_subjects.append([1, 2, 8, 10])
     else:
         _train_subjects.append(
             [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16])
         _valid_subjects.append(
             [0]
         )  # Note that this is a hack and should not be used to get results for papers
         _test_subjects.append([0])
 else:
     file = h5pickle.File(_hyperparams.hdf5_file, mode="r")
     keys = [int(subject[1:]) for subject in list(file.keys())]
     file.close()
     if _hyperparams.k_fold_validation:
         all_subjects = list(range(len(keys)))
         for leave_one_out_idx in all_subjects:
             _train_subjects.append(all_subjects[:leave_one_out_idx] +
                                    all_subjects[leave_one_out_idx + 1:])
             _valid_subjects.append(
                 [leave_one_out_idx]
             )  # Note that this is a hack and should not be used to get results for papers
             _test_subjects.append([leave_one_out_idx])
     else:
         _train_subjects.append(keys[1:])
         _valid_subjects.append([keys[0]])
         _test_subjects.append([keys[0]])