Beispiel #1
0
def load_hdf5(data_fp):
    """Load the columns listed in an HDF5 file and wrap them as a dataset.

    The file opened via ``download_h5(data_fp)`` is expected to hold, under
    ``HDF5_COLUMNS_KEY``, an array of UTF-8 encoded column names. Each named
    column is read fully into memory while the file is still open.

    Args:
        data_fp: Location of the HDF5 file, passed straight to
            ``download_h5``.

    Returns:
        Whatever ``from_numpy_dataset`` builds from the
        ``{column name: column values}`` mapping.
    """
    with download_h5(data_fp) as h5_file:
        # Names are stored as bytes; decode all of them before reading data.
        encoded_names = h5_file[HDF5_COLUMNS_KEY][()].tolist()
        names = [raw.decode("utf-8") for raw in encoded_names]

        # ``[()]`` reads the whole entry, so the values remain usable after
        # the file is closed.
        arrays_by_name = {name: h5_file[name][()] for name in names}

    return from_numpy_dataset(arrays_by_name)
Beispiel #2
0
def load_hdf5(data_fp, clean_cols: bool = False):
    """Load the columns listed in an HDF5 file and wrap them as a dataset.

    The file opened via ``download_h5(data_fp)`` is expected to hold, under
    ``HDF5_COLUMNS_KEY``, an array of UTF-8 encoded column names. Each named
    column is read fully into memory while the file is still open.

    Args:
        data_fp: Location of the HDF5 file, passed straight to
            ``download_h5``.
        clean_cols: When true, everything from the last underscore onwards is
            dropped from each column name before it is used as a key
            (``'Survived_a2fv4'`` becomes ``'Survived'``). Names without an
            underscore are kept unchanged.

    Returns:
        Whatever ``from_numpy_dataset`` builds from the
        ``{column name: column values}`` mapping.
    """
    with download_h5(data_fp) as h5_file:
        encoded_names = h5_file[HDF5_COLUMNS_KEY][()].tolist()
        stored_names = [raw.decode("utf-8") for raw in encoded_names]

        arrays_by_name = {}
        for stored_name in stored_names:
            if clean_cols:
                # Column names from training hdf5 will be in the form
                # 'Survived_a2fv4'; keep only the part before the suffix.
                key = stored_name.rsplit("_", 1)[0]
            else:
                key = stored_name
            # Data is always looked up under the name as stored in the file.
            arrays_by_name[key] = h5_file[stored_name][()]

    return from_numpy_dataset(arrays_by_name)
Beispiel #3
0
    def get(self, proc_column, idx=None):
        """Return the values of ``proc_column`` for the requested rows.

        Args:
            proc_column: Key of the column in ``self.dataset`` and
                ``self.features``.
            idx: Index (or indices) applied to ``self.dataset[proc_column]``.
                Defaults to ``range(self.size)``, i.e. every row.

        Returns:
            ``self.dataset[proc_column][idx]`` when the column is held in
            memory: no HDF5 path is set, the feature has no
            ``PREPROCESSING`` / ``'in_memory'`` entry, or ``'in_memory'`` is
            truthy. Otherwise the selected dataset values are treated as row
            numbers into the ``proc_column + '_data'`` entry of the HDF5
            file, and the matching rows are returned in the order requested.
        """
        if idx is None:
            idx = range(self.size)

        sub_batch = self.dataset[proc_column][idx]

        # Without a backing HDF5 file the features are not consulted at all.
        if self.data_hdf5_fp is None:
            return sub_batch

        feature = self.features[proc_column]
        if (PREPROCESSING not in feature or
                'in_memory' not in feature[PREPROCESSING] or
                feature[PREPROCESSING]['in_memory']):
            return sub_batch

        # h5py fancy indexing needs increasing indices without repeats, so
        # read each distinct row once (np.unique returns them sorted) and use
        # the inverse mapping to restore the requested order, including any
        # repeated rows.
        row_ids = np.asarray(sub_batch, dtype=np.int64)
        unique_rows, inverse = np.unique(row_ids, return_inverse=True)

        with download_h5(self.data_hdf5_fp) as h5_file:
            # The three-axis selection mirrors the stored layout; presumably
            # one image per row -- confirm against the writer.
            im_data = h5_file[proc_column + '_data'][unique_rows, :, :]

        return im_data[inverse]