Example #1
def subsample_data(images,
                   labels,
                   pct,
                   chunk_size=None,
                   shuffle=False,
                   seed=None,
                   labels_id='labels'):
    hdf5_aux = []
    if pct < 1.:
        # Store the data into an auxiliary HDF5 file
        filename = 'hdf5_aux_{}'.format(time.time())
        da.to_hdf5(filename, {'data': images, labels_id: labels})

        # Read HDF5
        hdf5_aux1 = h5py.File(filename, 'r')

        images, labels, hdf5_aux2 = hdf52dask(hdf5_aux1,
                                              group=None,
                                              chunk_size=chunk_size,
                                              shuffle=shuffle,
                                              seed=seed,
                                              pct=pct,
                                              labels_id=labels_id)

        if hdf5_aux2:
            hdf5_aux.extend([hdf5_aux1, hdf5_aux2])
        else:
            hdf5_aux.extend([hdf5_aux1])

    return images, labels, hdf5_aux
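
The round trip used above (materialize dask arrays into an auxiliary HDF5 file, then reopen it lazily) can be reproduced without the project-specific hdf52dask helper. A minimal sketch, assuming only dask.array and h5py; names and shapes are illustrative:

import h5py
import dask.array as da

x = da.random.random((1000, 64), chunks=(100, 64))
da.to_hdf5('hdf5_aux_example.hdf5', {'/data': x})       # materialize the array on disk

f = h5py.File('hdf5_aux_example.hdf5', 'r')             # keep the handle open while the lazy array is in use
x_lazy = da.from_array(f['/data'], chunks=(100, 64))    # lazy view over the on-disk dataset
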
Example #2
def create_input_chunks(cs, partition, data_dir, file_format):
    """
        cs: chunk shape
        file_format: file format
        data_dir: to store the file
    """
    if file_format == "HDF5":
        file_manager = HDF5_manager()
    else:
        print("File format not supported yet. Aborting...")
        sys.exit(1)

    print(f"Creating input chunks at {data_dir}")

    create_empty_dir(data_dir)

    _slices = ((0,cs[0]), (0,cs[1]), (0,cs[2]))
    for i in range(partition[0]):
        for j in range(partition[1]):
            for k in range(partition[2]):
                print(f"Creating random array... shape: {cs}")
                arr = da.random.uniform(size=cs)
                print(f"Done, converting to float16...")
                arr = arr.astype(np.float16)
                out_filename = f'{i}_{j}_{k}.hdf5'
                print(f"Building {out_filename} with shape {cs}")
                outfilepath = os.path.join(data_dir, out_filename)
                print(f"Storing...")
                da.to_hdf5(outfilepath, '/data', arr, chunks=None, compression=None)
Example #3
    def run(self):
        os.makedirs(self.output().path)
        filename_pattern = os.path.join('./data/images', '*.jpg')

        dsk_images = dask_image.imread.imread(filename_pattern)

        da.to_hdf5(os.path.join('data/storage', 'stored.hdf5'), {'/x': dsk_images[0]})
Example #4
    def normalize(self, groupname1, groupname2):
        # ## normalize y ## #
        with h5py.File(self.OUTPATH, mode='r+') as f:
            for atom in self.MAINCHAIN:
                # load
                train_y = da.from_array(
                    f[f'/{atom}/{groupname1}/{self.RESPONSE_NAME}'],
                    chunks=("auto", 3))
                val_y = da.from_array(
                    f[f'/{atom}/{groupname2}/{self.RESPONSE_NAME}'],
                    chunks=("auto", 3))

                total_y = da.concatenate([train_y, val_y], axis=0)
                y_mean = da.mean(total_y.reshape(-1), axis=0).compute()
                y_std = da.std(total_y.reshape(-1), axis=0).compute()

                # normalize
                train_y = da.divide(da.subtract(train_y, y_mean), y_std)
                val_y = da.divide(da.subtract(val_y, y_mean), y_std)

                # save
                da.to_hdf5(self.OUTPATH,
                           f'/{atom}/{groupname1}/{self.RESPONSE_NAME}',
                           train_y)
                da.to_hdf5(self.OUTPATH,
                           f'/{atom}/{groupname2}/{self.RESPONSE_NAME}', val_y)

                f.create_dataset(name=f'/{atom}/normalization',
                                 data=np.array([y_mean, y_std]))

                print(f'[{atom}]\tmean: {y_mean:.3f}\tstd: {y_std:.3f}')
Example #5
def save_to_hdf5(arr,
                 file_path,
                 physik_cs=None,
                 key='/data',
                 compression=None):
    """ Save dask array to hdf5 dataset.

    Arguments: 
    ----------
        arr: dask array
        file_path
        physik_cs
        key
        compression: compression algorithm. If None then compression unabled.
    """

    print(f'Saving a dask array at {file_path}:')
    print(f'- physik_cs: {physik_cs}')
    print(f'- key: {key}')
    print(f'- compression: {compression}')

    da.to_hdf5(file_path, key, arr, chunks=physik_cs, compression=compression)
    print(f'Array successfully saved.\n')

    print(f'Inspecting created file...')
    with h5py.File(file_path, 'r') as f:
        inspect_h5py_file(f)
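
A hypothetical call to save_to_hdf5 (inspect_h5py_file is assumed to be defined elsewhere in the same module, as in the snippet above):

import numpy as np
import dask.array as da

arr = da.random.random((500, 500), chunks=(250, 250)).astype(np.float16)
save_to_hdf5(arr, 'out.hdf5', physik_cs=(125, 125), compression='gzip')
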
Example #6
    def __call__(self, times=None):
        """Run the filtering process on this experiment."""

        # run over the full range of valid time indices unless specified otherwise
        tgrid = self.fieldset.gridset.grids[0].time
        if times is None:
            times = tgrid.copy()

            if self.uneven_window:
                raise NotImplementedError("uneven windows aren't supported")

        # restrict to period covered by window
        times = np.array(times)
        window_left = times - tgrid[0] >= self.window_size
        window_right = times <= tgrid[-1] - self.window_size
        times = times[window_left & window_right]

        da_out = {v: [] for v in self.sample_variables}

        # do the filtering at each timestep
        for idx, time in enumerate(times):
            # returns a dictionary of sample_variable -> dask array
            filtered = self.filter_step(idx, time)
            for v, a in filtered.items():
                da_out[v].append(a)

        # dump all to disk
        da.to_hdf5(self.name + ".h5",
                   {v: da.stack(a)
                    for v, a in da_out.items()})
Example #7
 def save_data(self, path):
     """Writes the data in the right format to disk"""
     if self.metadata["input_type"] == "hyperspy":
         self.data.save(path)
     else:
         if isinstance(self.data, da.core.Array):
             da.to_hdf5(path, f"/{DEFAULT_DATA_FILE}", self.data)
         else:
             with h5py.File(path, "w") as f:
                 f.create_dataset(f"{DEFAULT_DATA_FILE}", data=self.data)
Example #8
def save_arr(arr, storage_type, file_path, key='/data', axis=0, chunks_shape=None):
    """ Save array to hdf5 dataset or numpy file stack.
    """
    if storage_type == "hdf5":
        if chunks_shape:
            da.to_hdf5(file_path, key, arr, chunks=chunks_shape)
        else:
            da.to_hdf5(file_path, key, arr)
    elif storage_type == "numpy":
        da.to_npy_stack(os.path.join(file_path, "npy/"), arr, axis=axis)
    return
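
A short usage sketch for save_arr; paths and shapes are illustrative:

import dask.array as da

arr = da.ones((100, 100), chunks=(50, 50))
save_arr(arr, "hdf5", "out.hdf5", key="/data", chunks_shape=(50, 50))  # chunked HDF5 dataset
save_arr(arr, "numpy", ".", axis=0)                                    # stack of .npy files under ./npy/
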
Example #9
 def estimate_ld(hd5, filename, chromosome, threads, memory):
     print("Estimating LD for Chromosome", chromosome)
     dset = hd5['/%s' % chromosome][:]
     chunks = estimate_chunks(dset.shape, threads, memory)
     array = da.from_array(dset, chunks=chunks)
     del dset
     gc.collect()
     rho = da.corrcoef(da.ma.masked_invalid(array).T) ** 2
     filename = '%s_ld.hdf5' % filename
     da.to_hdf5(filename, '/%s' % chromosome, rho)
     return chromosome, filename
Example #10
def create_input_file(shape, dirname, file_manager):
    """ Creating the original array.
    """
    filename = f'{shape[0]}_{shape[1]}_{shape[2]}_original.hdf5'
    filepath = os.path.join(dirname, filename)

    if not os.path.isfile(filepath):
        arr = da.random.random(size=shape)
        arr = arr.astype(np.float16)
        da.to_hdf5(filepath, '/data', arr, chunks=None, compression=None)

    return filepath
Example #11
 def saveSurface(self, dataSet, comment, dataType, surface):
     chunk_size = surface.shape
     #create the group for the new data set
     self.grid.data.create_group('geometeries/surfaces/' + dataSet)
     #save the dataType metadata
     self.grid.data['geometeries/surfaces/' +
                    dataSet].attrs['dataType'] = dataType
     #save all other metadata
     self.grid.data['geometeries/surfaces/' +
                    dataSet].attrs['comment'] = comment
     #save the surface
     da.to_hdf5(self.grid.fname,
                'geometeries/surfaces/' + dataSet + '/surface',
                da.from_array(surface, chunks=chunk_size))
Example #12
    def shuffle(self, groupname):
        with h5py.File(self.OUTPATH, mode='r+') as f:
            for atom in self.MAINCHAIN:
                X = da.from_array(f[f'/{atom}/{groupname}/{self.EXPLANATORY_NAME}'])
                Y = da.from_array(f[f'/{atom}/{groupname}/{self.RESPONSE_NAME}'])

                random_order = np.random.permutation(X.shape[0])

                X = da.slicing.shuffle_slice(X, random_order)
                Y = da.slicing.shuffle_slice(Y, random_order)

                da.to_hdf5(self.OUTPATH, f'/{atom}/{groupname}/{self.EXPLANATORY_NAME}', X)
                da.to_hdf5(self.OUTPATH, f'/{atom}/{groupname}/{self.RESPONSE_NAME}', Y)

                print(f'{atom} shuffled.')
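
da.slicing.shuffle_slice is an internal dask helper; the same reordering can also be expressed with plain fancy indexing. A minimal sketch with illustrative shapes:

import numpy as np
import dask.array as da

X = da.random.random((1000, 8), chunks=(200, 8))
random_order = np.random.permutation(X.shape[0])
X_shuffled = X[random_order]   # reorder the rows lazily via fancy indexing
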
Example #13
File: main.py Project: Runkli/Study
def main():

    x0=int(sys.argv[1])-1
    y0=int(sys.argv[2])-1
    z0=int(sys.argv[3])-1
    x1=int(sys.argv[4])
    y1=int(sys.argv[5])
    z1=int(sys.argv[6]) 
    
    filepath = '/home/ilknull/Files/Code/HPC-Study-master/Dask/'
    
    if(int(sys.argv[7])==1):
#        print("new")
        r=int(sys.argv[4])
        c=int(sys.argv[5])
        h=int(sys.argv[6])
        
        
        a = da.ones((r,c,h),dtype=np.int16)
        
        da.to_hdf5('in.hdf5',{'/a': a})
        np.array((r,c,h),dtype=np.int16).tofile(filepath+'meta.bin')
        
 
    dims = np.fromfile(filepath+'meta.bin',dtype=np.int16,count=-1)
   
    r = dims[0]
    c = dims[1]
    h = dims[2]
    
    
    fpIn = h5py.File('in.hdf5',mode='r+')
    
    procArrays=[]
    ar = fpIn['/a']
#    print(ar.shape)
#    print('types: ',type(ar),type(ar[0]))
    for z in range(h):    
        slic = ar[z]
        slic = delayed(proc)(slic,z,r,c,x0,x1,y0,y1,z0,z1).compute()
        procArrays.append(slic)

    procDask = da.stack(procArrays)
    print(procDask)
    print(procDask.compute())  
    

    da.to_hdf5('out.hdf5', {'/arr': procDask})
Example #14
def write_h5(uri, path, data):
    import h5py
    import dask.array as da

    # NOTE
    # This failed to work, raising an "h5py objects cannot be pickled" error:
    #   "can't pickle local object when doing a to_hdf5 when using dask.distributed"
    #   https://github.com/dask/distributed/issues/927
    da.to_hdf5(uri, path, data)

    # logger.info(f"uri={uri}, path={path}, data={data.shape},{data.dtype}")
    # with h5py.File(uri, "w") as h5:
    #    dst = h5.create_dataset(path, shape=data.shape, dtype=data.dtype)
    #    store(data, dst)

    return uri
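
The commented-out alternative can be written out as follows. This is only a sketch; under the distributed scheduler the same pickling caveat may still apply:

import h5py
import dask.array as da

def write_h5_via_store(uri, path, data):
    # Create the target dataset ourselves, then let dask fill it block by block.
    with h5py.File(uri, "w") as h5:
        dst = h5.create_dataset(path, shape=data.shape, dtype=data.dtype)
        da.store(data, dst)
    return uri
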
Example #15
    def assignMprops(self):
        #assign static, figure out dynamic assignment later
        static = True
        if (static):
            for units in self.mdata["UNITS"].keys():
                #load up the current grid obj
                print(units)
                vp = da.from_array(self.gridObj.data["/points/vp"],
                                   chunks=(self.gridObj.pointx.chunksize))
                vs = da.from_array(self.gridObj.data["/points/vs"],
                                   chunks=(self.gridObj.pointx.chunksize))
                p = da.from_array(self.gridObj.data["/points/p"],
                                  chunks=(self.gridObj.pointx.chunksize))
                qp = da.from_array(self.gridObj.data["/points/qp"],
                                   chunks=(self.gridObj.pointx.chunksize))
                qs = da.from_array(self.gridObj.data["/points/qs"],
                                   chunks=(self.gridObj.pointx.chunksize))
                #where is there a valid unit that matches the current unit or one that hasn't been assigned?
                unitAssignments = da.where(
                    da.logical_and(self.gridObj.pointunit == int(units),
                                   vp == -666), True, False)
                #note that unitAssignments is NOT a list of indexes as you would normally expect; it behaves more like
                # a np.where mask, which is why it needs to be recomputed before each assignment
                vp[unitAssignments] = self.mdata["UNITS"][units]["VP"]
                unitAssignments = da.where(
                    da.logical_and(self.gridObj.pointunit == int(units),
                                   vs == -666), True, False)
                vs[unitAssignments] = self.mdata["UNITS"][units]["VS"]
                unitAssignments = da.where(
                    da.logical_and(self.gridObj.pointunit == int(units),
                                   p == -666), True, False)
                p[unitAssignments] = self.mdata["UNITS"][units]["P"]
                unitAssignments = da.where(
                    da.logical_and(self.gridObj.pointunit == int(units),
                                   qp == -666), True, False)
                qp[unitAssignments] = self.mdata["UNITS"][units]["QP"]
                unitAssignments = da.where(
                    da.logical_and(self.gridObj.pointunit == int(units),
                                   qs == -666), True, False)
                qs[unitAssignments] = self.mdata["UNITS"][units]["QS"]

                #save all of the changes to the arrays (for this cycle)
                da.to_hdf5(self.gridObj.fname, "/points/vp", vp)
                da.to_hdf5(self.gridObj.fname, "/points/vs", vs)
                da.to_hdf5(self.gridObj.fname, "/points/p", p)
                da.to_hdf5(self.gridObj.fname, "/points/qp", qp)
                da.to_hdf5(self.gridObj.fname, "/points/qs", qs)
Example #16
def create_input_chunks_distributed(cs, partition, data_dir, file_format):
    """ for HDF5 only for now
        cs: chunk shape
        file_format: file format
        data_dir: to store the file
    """
    if not file_format == "HDF5":
        print("File format not supported yet. Aborting...")
        sys.exit(1)

    for i in range(6):
        for filename in os.listdir('/disk' + str(i) + '/gtimothee'):
            if filename.endswith(".json") or filename.endswith(".hdf5"):
                os.remove(os.path.join('/disk' + str(i) + '/gtimothee', filename))
    print(f"Creating input chunks...")

    disk_index = 0
    repartition_dict = dict()

    for i in range(partition[0]):
        for j in range(partition[1]):
            for k in range(partition[2]):
                print(f"Creating random array... shape: {cs}")
                arr = da.random.uniform(size=cs)
                print(f"Done, converting to float16...")
                arr = arr.astype(np.float16)
                out_filename = f'{i}_{j}_{k}.hdf5'
                print(f"Building {out_filename} with shape {cs}")
                data_dirpath = os.path.join('/disk' + str(disk_index), 'gtimothee')
                outfilepath = os.path.join(data_dirpath, out_filename)
                print(f"Storing on {data_dirpath}...")
                da.to_hdf5(outfilepath, '/data', arr, chunks=None, compression=None)

                repartition_dict[str((i,j,k))] = outfilepath

                disk_index += 1
                if disk_index == 6:
                    disk_index = 0

    print(f"Writing repartition file...")
    json_file = os.path.join('/disk0', 'gtimothee', 'repartition_dict.json')
    if os.path.isfile(json_file):
        os.remove(json_file)

    with open(json_file, 'w+') as outfile:
        json.dump(repartition_dict, outfile)
Example #17
def save_as_hdf5(x, path, progress=False):
    if is_dask(x):
        if progress:
            with ProgressBar():
                da.to_hdf5(path,
                           'data',
                           x,
                           compression="gzip",
                           compression_opts=9)
        else:
            da.to_hdf5(path, 'data', x, compression="gzip", compression_opts=9)
    else:
        # use a context manager so the file handle is closed after writing
        with h5py.File(path, 'w') as f:
            f.create_dataset('data',
                             data=x,
                             compression="gzip",
                             compression_opts=9)
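
A hypothetical call, assuming is_dask simply checks isinstance(x, da.Array) and ProgressBar is dask.diagnostics.ProgressBar:

import dask.array as da

x = da.random.random((1000, 1000), chunks=(250, 250))
save_as_hdf5(x, 'compressed.h5', progress=True)   # gzip level 9, written with a progress bar
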
Example #18
def uncompress_to_hdf5():
    print('Writing to hdf5 file after loading raw data in RAM.')

    raw_arr = uncompress()

    # create dask array from data in RAM
    arr = da.from_array(raw_arr, chunks=(1400, 1400, 350))

    # write to an hdf5 file, first without compression
    out_filepath = 'data/out.hdf5'
    if os.path.isfile(out_filepath):
        os.remove(out_filepath)

    out_file_path = "outputs/load_raw_write_hdf5_uncompressed.html"
    with Profiler() as prof, ResourceProfiler() as rprof, CacheProfiler(
            metric=nbytes) as cprof:
        t = time.time()

        da.to_hdf5(out_filepath, 'data', arr, chunks=None)

        print(
            f'time to save the array to hdf5 without compression: {time.time() - t}'
        )
        visualize([prof, rprof, cprof], out_file_path)

    # write to the same hdf5 file again, this time with gzip compression
    out_filepath = 'data/out.hdf5'
    os.remove(out_filepath)

    out_file_path = "outputs/load_raw_write_hdf5_compressed.html"
    with Profiler() as prof, ResourceProfiler() as rprof, CacheProfiler(
            metric=nbytes) as cprof:
        t = time.time()

        da.to_hdf5(out_filepath, 'data', arr, chunks=None, compression="gzip")

        print(
            f'time to save the array to hdf5 with compression: {time.time() - t}'
        )
        visualize([prof, rprof, cprof], out_file_path)
Example #19
def test_to_hdf5():
    try:
        import h5py
    except ImportError:
        return
    x = da.ones((4, 4), chunks=(2, 2))
    y = da.ones(4, chunks=2, dtype='i4')

    with tmpfile('.hdf5') as fn:
        x.to_hdf5(fn, '/x')
        with h5py.File(fn) as f:
            d = f['/x']

            assert eq(d[:], x)
            assert d.chunks == (2, 2)

    with tmpfile('.hdf5') as fn:
        x.to_hdf5(fn, '/x', chunks=None)
        with h5py.File(fn) as f:
            d = f['/x']

            assert eq(d[:], x)
            assert d.chunks is None

    with tmpfile('.hdf5') as fn:
        x.to_hdf5(fn, '/x', chunks=(1, 1))
        with h5py.File(fn) as f:
            d = f['/x']

            assert eq(d[:], x)
            assert d.chunks == (1, 1)

    with tmpfile('.hdf5') as fn:
        da.to_hdf5(fn, {'/x': x, '/y': y})

        with h5py.File(fn) as f:
            assert eq(f['/x'][:], x)
            assert f['/x'].chunks == (2, 2)
            assert eq(f['/y'][:], y)
            assert f['/y'].chunks == (2,)
Example #20
def save_arr(arr,
             storage_type,
             file_path,
             key='/data',
             axis=0,
             chunks_shape=None,
             compression=None):
    """ Save dask array to hdf5 dataset or numpy file stack.
    """

    if storage_type == "hdf5":
        if chunks_shape:
            print(f'Using chunk shape {chunks_shape}')
            da.to_hdf5(file_path, key, arr, chunks=chunks_shape)
        else:
            if compression == "gzip":
                print('Using gzip compression')
                da.to_hdf5(file_path,
                           key,
                           arr,
                           chunks=None,
                           compression="gzip")
            else:
                print('Without compression')
                da.to_hdf5(file_path, key, arr, chunks=None)
    elif storage_type == "numpy":
        da.to_npy_stack(os.path.join(file_path, "npy/"), arr, axis=axis)
Example #22
def onthefly_to_hdf5():
    print('Writing to hdf5 file without loading raw data in RAM.')

    # write to an hdf5 file
    out_filepath = 'data/out.hdf5'
    if os.path.isfile(out_filepath):
        os.remove(out_filepath)

    out_file_path = "outputs/write_hdf5.html"
    with Profiler() as prof, ResourceProfiler() as rprof, CacheProfiler(
            metric=nbytes) as cprof:
        t = time.time()

        da.to_hdf5(out_filepath,
                   'data',
                   arr,
                   chunks=(1400, 1400, 350),
                   compression="gzip")

        print(
            f'time to save the array to hdf5 with compression: {time.time() - t}'
        )
        visualize([prof, rprof, cprof], out_file_path)
        #file_out_str = "test%d_numEvent1.hdf5"%label
        #file_out_str = "%s/%s_IMGall_RH%d_n%d_label%d.hdf5"%(eosDir,decay,int(scale[0]),neff,label)
        #file_out_str = "%s/%s_IMG_RH%d_n%dk.hdf5"%(eosDir,decay,int(scale[0]),neff//1000.)
        #file_out_str = "%s/%s_IMG_EBEEHBup_RH%d_n%dk.hdf5"%(eosDir,decay,int(scale[0]),neff//1000.)
        #file_out_str = "%s/%s_IMG_RH%d-%d_n%dk.hdf5"%(eosDir,decay,int(scale[0]),int(scale[1]),neff//1000.)
        print "  >> Writing to:", file_out_str
        if os.path.isfile(file_out_str):
            os.remove(file_out_str)
        #da.to_hdf5(file_out_str, {'/X_EB': X_EB, 'X_EEm': X_EEm, 'X_EEp': X_EEp, 'X_HBHE': X_HBHE, '/y': y}, compression='lzf')
        #da.to_hdf5(file_out_str, {'/X': X_EB, 'X_EEm': X_EEm, 'X_EEp': X_EEp, 'X_HBHE': X_HBHE, '/y': y}, compression='lzf')
        da.to_hdf5(file_out_str, {
                                  #'eventId': eventId,
                                  #'runId': runId,
                                  'X_ECAL': X_ECAL,
                                  #'X_ECAL_EEup': X_ECAL_EEup,
                                  'X_ECAL_stacked': X_ECAL_stacked,
                                  'X_EB': X_EB,
                                  'X_EEm': X_EEm,
                                  'X_EEp': X_EEp,
                                  'X_HBHE': X_HBHE,
                                  #'X_HBHE_EM': X_HBHE_EM,
                                  'X_HBHE_EB_up': X_HBHE_EB_up,
                                  'jetSeed_iphi': jetSeed_iphi,
                                  'jetSeed_ieta': jetSeed_ieta,
                                  'X_jets': X_jets,
                                  'jetPt': jetPt,
                                  'jetM': jetM,
                                  '/y': y
                                  }, compression='lzf')
        print "  >> Done.\n"
Example #24
    # subtract mean
    axes = tuple(np.arange(b1.ndim, dtype=int)[b1.ndim//2:])
    b1 -= b1.mean(axis=axes, keepdims=True)
    b2 -= b2.mean(axis=axes, keepdims=True)
    # numerator of corrcoef
    numerator = np.multiply(b1, b2).mean(axis=axes, keepdims=False)
    # denominator of corrcoef
    dof = np.prod( b1.shape[slice(axes[0], axes[-1]+1)] )
    b1_std = np.sqrt( (b1**2).mean(axis=axes, keepdims=False) / dof )
    b2_std = np.sqrt( (b2**2).mean(axis=axes, keepdims=False) / dof )
    denominator = np.multiply(b1_std, b2_std)
    # divide
    out = np.divide(numerator, denominator)
    return out


if __name__ == '__main__':
    f1 = h5py.File("test.h5", "r")
    f2 = h5py.File("test2.h5", "r")
    arr1 = da.from_array(f1["arr"])
    arr2 = da.from_array(f2["arr"])

    block_shape = (10, 10)

    with Profiler() as prof, ResourceProfiler(dt=0.25) as rprof,\
            ProgressBar():
        out = da.map_blocks(corrcoef, arr1, arr2, block_shape,
                chunks=(400, 400))
        da.to_hdf5("out.h5", "/arr", out)
    visualize([prof, rprof])
Example #25
for j, decay in enumerate(decays):

    #file_in_str = "%s/%s_IMGCROPS_n249k_RH.hdf5"%(eosDir,decay)
    #file_in_str = "%s/%s_IMGCROPS_n249k_pT.hdf5"%(eosDir,decay)
    file_in_str = "%s/%s_n250k_IMG_RHraw.hdf5" % (eosDir, decay)
    dset = h5py.File(file_in_str)
    X_in = da.from_array(dset['/X'], chunks=chunk_shape)
    y_in = da.from_array(dset['/y'], chunks=(chunk_size, ))
    assert X_in.shape[0] == y_in.shape[0]
    events = X_in.shape[0]
    #events = 10000
    assert events % chunk_size == 0
    print " >> Doing decay:", decay
    print " >> Input file:", file_in_str
    print " >> Total events:", events

    print " >> Processing..."
    X = da.concatenate([da.from_delayed(\
                        process_chunk(X_in[i:i+chunk_size]),\
                        shape=(chunk_size,170,360,2),\
                        dtype=np.float32)\
                        for i in range(0,events,chunk_size)])

    #file_out_str = "%s/%s_IMGCROPS_n249k_pT_RHv%d.hdf5"%(eosDir,decay,ver)
    file_out_str = "%s/%s_n250k_IMG_RHv%d.hdf5" % (eosDir, decay, ver)
    print " >> Writing to:", file_out_str
    da.to_hdf5(file_out_str, {'/X': X, '/y': y_in}, compression='lzf')

    print " >> Done.\n"
Example #26
    branches = ["m0"]
    m0 = da.concatenate([\
                da.from_delayed(\
                    load_single(tree,i,i+chunk_size, branches),\
                    shape=(chunk_size,),\
                    dtype=np.float32)\
                for i in range(0,neff,chunk_size)])
    print " >> Expected shape:", m0.shape

    # Class label
    label = j
    #label = 1
    print " >> Class label:", label
    y = da.from_array(\
            np.full(X.shape[0], label, dtype=np.float32),\
            chunks=(chunk_size,))

    file_out_str = "%s/%s_FC_n%dk_label%d.hdf5" % (eosDir, decay,
                                                   neff // 1000., label)
    #file_out_str = "test.hdf5"
    print " >> Writing to:", file_out_str
    da.to_hdf5(file_out_str, {
        '/X': X,
        '/y': y,
        'eventId': eventId,
        'm0': m0
    },
               compression='lzf')

    print " >> Done.\n"
Example #27
def write_main_dataset(h5_parent_group,
                       main_data,
                       main_data_name,
                       quantity,
                       units,
                       pos_dims,
                       spec_dims,
                       main_dset_attrs=None,
                       h5_pos_inds=None,
                       h5_pos_vals=None,
                       h5_spec_inds=None,
                       h5_spec_vals=None,
                       aux_spec_prefix='Spectroscopic_',
                       aux_pos_prefix='Position_',
                       verbose=False,
                       **kwargs):
    """
    Writes the provided data as a 'Main' dataset with all appropriate linking.
    By default, the instructions for generating the ancillary datasets should be specified using the pos_dims and
    spec_dims arguments as Dimension objects. Alternatively, if both the indices and values datasets are already
    available for either/or the positions / spectroscopic, they can be specified using the keyword arguments. In this
    case, fresh datasets will not be generated.

    Parameters
    ----------
    h5_parent_group : :class:`h5py.Group`
        Parent group under which the datasets will be created
    main_data : numpy.ndarray, dask.array.core.Array, list or tuple
        2D matrix formatted as [position, spectral] or a list / tuple with the shape for an empty dataset.
        If creating an empty dataset - the dtype must be specified via a kwarg.
    main_data_name : String / Unicode
        Name to give to the main dataset. This cannot contain the '-' character.
    quantity : String / Unicode
        Name of the physical quantity stored in the dataset. Example - 'Current'
    units : String / Unicode
        Name of units for the quantity stored in the dataset. Example - 'A' for amperes
    pos_dims : Dimension or array-like of Dimension objects
        Sequence of Dimension objects that provides all necessary instructions for constructing the indices and values
        datasets
        Object specifying the instructions necessary for building the Position indices and values datasets
    spec_dims : Dimension or array-like of Dimension objects
        Sequence of Dimension objects that provides all necessary instructions for constructing the indices and values
        datasets
        Object specifying the instructions necessary for building the Spectroscopic indices and values datasets
    main_dset_attrs : dictionary, Optional
        Dictionary of parameters that will be written to the main dataset. Do NOT include region references here.
    h5_pos_inds : h5py.Dataset, Optional
        Dataset that will be linked with the name "Position_Indices"
    h5_pos_vals : h5py.Dataset, Optional
        Dataset that will be linked with the name "Position_Values"
    h5_spec_inds : h5py.Dataset, Optional
        Dataset that will be linked with the name "Spectroscopic_Indices"
    h5_spec_vals : h5py.Dataset, Optional
        Dataset that will be linked with the name "Spectroscopic_Values"
    aux_spec_prefix : str or unicode, Optional
        Default prefix for Spectroscopic datasets. Default = "Spectroscopic"
    aux_pos_prefix : str or unicode, Optional
        Default prefix for Position datasets. Default = "Position"
    verbose : bool, Optional, default=False
        If set to true - prints debugging logs
    kwargs will be passed onto the creation of the dataset. Please pass chunking, compression, dtype, and other
        arguments this way

    Returns
    -------
    h5_main : USIDataset
        Reference to the main dataset

    """
    def __check_anc_before_creation(aux_prefix, dim_type='pos'):
        aux_prefix = validate_single_string_arg(aux_prefix,
                                                'aux_' + dim_type + '_prefix')
        if not aux_prefix.endswith('_'):
            aux_prefix += '_'
        if '-' in aux_prefix:
            warn(
                'aux_' + dim_type +
                ' should not contain the "-" character. Reformatted name from:{} to '
                '{}'.format(aux_prefix, aux_prefix.replace('-', '_')))
        aux_prefix = aux_prefix.replace('-', '_')
        for dset_name in [aux_prefix + 'Indices', aux_prefix + 'Values']:
            if dset_name in h5_parent_group.keys():
                raise KeyError('Dataset named: ' + dset_name +
                               ' already exists in group: '
                               '{}'.format(h5_parent_group.name))
        return aux_prefix

    if not isinstance(h5_parent_group, (h5py.Group, h5py.File)):
        raise TypeError(
            'h5_parent_group should be a h5py.File or h5py.Group object')
    if not is_editable_h5(h5_parent_group):
        raise ValueError('The provided file is not editable')
    if verbose:
        print('h5 group and file OK')

    quantity, units, main_data_name = validate_string_args(
        [quantity, units, main_data_name],
        ['quantity', 'units', 'main_data_name'])
    if verbose:
        print('quantity, units, main_data_name all OK')

    quantity = quantity.strip()
    units = units.strip()
    main_data_name = main_data_name.strip()
    if '-' in main_data_name:
        warn(
            'main_data_name should not contain the "-" character. Reformatted name from:{} to '
            '{}'.format(main_data_name, main_data_name.replace('-', '_')))
    main_data_name = main_data_name.replace('-', '_')

    if isinstance(main_data, (list, tuple)):
        if not contains_integers(main_data, min_val=1):
            raise ValueError(
                'main_data if specified as a shape should be a list / tuple of integers >= 1'
            )
        if len(main_data) != 2:
            raise ValueError(
                'main_data if specified as a shape should contain 2 numbers')
        if 'dtype' not in kwargs:
            raise ValueError(
                'dtype must be included as a kwarg when creating an empty dataset'
            )
        _ = validate_dtype(kwargs.get('dtype'))
        main_shape = main_data
        if verbose:
            print('Selected empty dataset creation. OK so far')
    elif isinstance(main_data, (np.ndarray, da.core.Array)):
        if main_data.ndim != 2:
            raise ValueError('main_data should be a 2D array')
        main_shape = main_data.shape
        if verbose:
            print('Provided numpy or Dask array for main_data OK so far')
    else:
        raise TypeError(
            'main_data should either be a numpy array or a tuple / list with the shape of the data'
        )

    if h5_pos_inds is not None and h5_pos_vals is not None:
        # The provided datasets override fresh building instructions.
        validate_anc_h5_dsets(h5_pos_inds,
                              h5_pos_vals,
                              main_shape,
                              is_spectroscopic=False)
        if verbose:
            print('Provided h5 position indices and values OK')
    else:
        aux_pos_prefix = __check_anc_before_creation(aux_pos_prefix,
                                                     dim_type='pos')
        pos_dims = validate_dimensions(pos_dims, dim_type='Position')
        validate_dims_against_main(main_shape,
                                   pos_dims,
                                   is_spectroscopic=False)
        if verbose:
            print('Passed all pre-tests for creating position datasets')
        h5_pos_inds, h5_pos_vals = write_ind_val_dsets(
            h5_parent_group,
            pos_dims,
            is_spectral=False,
            verbose=verbose,
            base_name=aux_pos_prefix)
        if verbose:
            print('Created position datasets!')

    if h5_spec_inds is not None and h5_spec_vals is not None:
        # The provided datasets override fresh building instructions.
        validate_anc_h5_dsets(h5_spec_inds,
                              h5_spec_vals,
                              main_shape,
                              is_spectroscopic=True)
        if verbose:
            print('Provided h5 spectroscopic datasets were OK')
    else:
        aux_spec_prefix = __check_anc_before_creation(aux_spec_prefix,
                                                      dim_type='spec')
        spec_dims = validate_dimensions(spec_dims, dim_type='Spectroscopic')
        validate_dims_against_main(main_shape,
                                   spec_dims,
                                   is_spectroscopic=True)
        if verbose:
            print('Passed all pre-tests for creating spectroscopic datasets')
        h5_spec_inds, h5_spec_vals = write_ind_val_dsets(
            h5_parent_group,
            spec_dims,
            is_spectral=True,
            verbose=verbose,
            base_name=aux_spec_prefix)
        if verbose:
            print('Created Spectroscopic datasets')

    if h5_parent_group.file.driver == 'mpio':
        if kwargs.pop('compression', None) is not None:
            warn(
                'This HDF5 file has been opened with the "mpio" communicator. '
                'mpi4py does not allow creation of compressed datasets. Compression kwarg has been removed'
            )

    if isinstance(main_data, np.ndarray):
        # Case 1 - simple small dataset
        h5_main = h5_parent_group.create_dataset(main_data_name,
                                                 data=main_data,
                                                 **kwargs)
        if verbose:
            print('Created main dataset with provided data')
    elif isinstance(main_data, da.core.Array):
        # Case 2 - Dask dataset
        # step 0 - get rid of any automated dtype specification:
        _ = kwargs.pop('dtype', None)
        # step 1 - create the empty dataset:
        h5_main = h5_parent_group.create_dataset(main_data_name,
                                                 shape=main_data.shape,
                                                 dtype=main_data.dtype,
                                                 **kwargs)
        if verbose:
            print('Created empty dataset: {} for writing Dask dataset: {}'.
                  format(h5_main, main_data))
            print(
                'Dask array will be written to HDF5 dataset: "{}" in file: "{}"'
                .format(h5_main.name, h5_main.file.filename))
        # Step 2 - now ask Dask to dump data to disk
        da.to_hdf5(h5_main.file.filename, {h5_main.name: main_data})
        # main_data.to_hdf5(h5_main.file.filename, h5_main.name)  # Does not work with python 2 for some reason
    else:
        # Case 3 - large empty dataset
        h5_main = h5_parent_group.create_dataset(main_data_name, main_data,
                                                 **kwargs)
        if verbose:
            print('Created empty dataset for Main')

    write_simple_attrs(h5_main, {'quantity': quantity, 'units': units})
    if verbose:
        print('Wrote quantity and units attributes to main dataset')

    if isinstance(main_dset_attrs, dict):
        write_simple_attrs(h5_main, main_dset_attrs)
        if verbose:
            print('Wrote provided attributes to main dataset')

    write_book_keeping_attrs(h5_main)

    # make it main
    link_as_main(h5_main, h5_pos_inds, h5_pos_vals, h5_spec_inds, h5_spec_vals)
    if verbose:
        print('Successfully linked datasets - dataset should be main now')

    from ..usi_data import USIDataset
    return USIDataset(h5_main)
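
A hedged usage sketch for write_main_dataset. The import paths and the Dimension signature vary across pyUSID versions, and every name below is illustrative only:

import numpy as np
import h5py
from pyUSID.io.hdf_utils import write_main_dataset   # module path may differ by version
from pyUSID.io.write_utils import Dimension          # newer releases expose Dimension via sidpy

pos_dims = [Dimension('X', 'um', np.arange(5)),
            Dimension('Y', 'um', np.arange(5))]
spec_dims = [Dimension('Bias', 'V', np.linspace(-1, 1, 128))]

raw = np.random.rand(25, 128)   # [position, spectral]
with h5py.File('usid_example.h5', 'w') as h5_f:
    h5_grp = h5_f.create_group('Measurement_000')
    h5_main = write_main_dataset(h5_grp, raw, 'Raw_Data', 'Current', 'nA',
                                 pos_dims, spec_dims)
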
        #file_out_str = "%s/%s_IMG_RH%d_n%dk_label%d.hdf5"%(eosDir,decay,int(scale),neff//1000.,label)
        #file_out_str = "%s/%s_IMGcropV4_RH%d_n%dkx2_wgt.hdf5"%(eosDir,decay,int(scale),neff//1000.)
        #file_out_str = "%s/%s_IMG/%s_IMG_RH%d_n%d_%d.hdf5"%(eosDir,decay,decay,int(scale),neff*2,n)
        file_out_str = "%s_IMG_RH%d_n%d_%d.hdf5" % (decay, int(scale),
                                                    neff * 2, n)
        #file_out_str = "test.hdf5"
        print " >> Writing to:", file_out_str
        #da.to_hdf5(file_out_str, {'/X': X, '/y': y, 'eventId': eventId, 'X_crop0': X_crop0, 'X_crop1': X_crop1}, compression='lzf')
        da.to_hdf5(
            file_out_str,
            {
                '/X': X,
                '/y': y,
                #'eventId': eventId,
                'X_crop0': X_crop0,
                'X_crop_stack0': X_crop_stack0,
                #'X_crop1': X_crop1
                #'X_p4': X_p4
                'y_mass': y_mass0,
                'y_pT': y_pT0,
                'y_DR': y_DR0,
                #'pho_pT0': pho_pT0,
                #'pho_E0': pho_E0,
                #'pho_eta0': pho_eta0
                #'wgt': wgt
            },
            compression='lzf')

        print " >> Done.\n"
Example #29
                for i in range(0,neff,chunk_size)])
    print " >> Expected shape:", X_HBHE.shape

    # Class label
    label = j
    #label = 1
    print " >> Class label:", label
    y = da.from_array(\
            np.full(X_EB.shape[0], label, dtype=np.float32),\
            chunks=(chunk_size,))

    #file_out_str = "%s/%s_IMG_RHraw_n%dk.hdf5"%(eosDir,decay,neff//1000.)
    #file_out_str = "%s/%s_IMG_RHv1_n%dk.hdf5"%(eosDir,decay,neff//1000.)
    #file_out_str = "%s/%s_IMG_RH%d_n%dk.hdf5"%(eosDir,decay,int(rescaler),neff//1000.)
    #file_out_str = "%s/%s_IMG_RH%d-%d_n%dk.hdf5"%(eosDir,decay,int(scale[0]),int(scale[1]),neff//1000.)
    file_out_str = "%s/TEST_%s_IMG_RH%d-%d_n%dk.hdf5" % (
        eosDir, decay, int(scale[0]), int(scale[1]), neff // 1000.)
    print " >> Writing to:", file_out_str
    #da.to_hdf5(file_out_str, {'/X': X, '/y': y}, chunks=(chunk_size,s,s,2), compression='lzf')
    #da.to_hdf5(file_out_str, {'/X_EB': X_EB, 'X_EEm': X_EEm, 'X_EEp': X_EEp, '/y': y}, compression='lzf')
    da.to_hdf5(file_out_str, {
        '/X_EB': X_EB,
        'X_EEm': X_EEm,
        'X_EEp': X_EEp,
        'X_HBHE': X_HBHE,
        '/y': y
    },
               compression='lzf')

    print " >> Done.\n"
Example #30
for seed in xrange(1, 6):
    idx_under = np.empty((0, ), dtype=int)
    random_state = check_random_state(None)
    for target_class in np.unique(y_label):
        if target_class == 7:
            n_samples = 10000
            index_class = random_state.choice(range(
                np.count_nonzero(y_label == target_class)),
                                              size=n_samples)
        else:
            index_class = slice(None)
        idx_under = np.concatenate(
            (idx_under, np.flatnonzero(y_label == target_class)[index_class]),
            axis=0)

    sub_X = safe_indexing(trainSet_X, idx_under)
    sub_y = safe_indexing(trainSet_y, idx_under)
    sub_label = safe_indexing(y_label, idx_under)

    print('sub counter')
    print(sorted(Counter(sub_label).items()))

    trainSet_X_seed = da.from_array(sub_X, chunks=128)
    trainSet_y_seed = da.from_array(sub_y, chunks=128)
    da.to_hdf5(
        'train_LRCL_informed_window' + str(SLIDING_WINDOW_LENGTH) + '_seed' +
        str(seed) + '.h5', {
            'data': trainSet_X_seed,
            'label': trainSet_y_seed
        })
Example #31
    jetEventId = jetEventId_[runMask][:nJets]
    print " >> %s: %s"%('jetEventId', jetEventId.shape)
    jetRunId = jetRunId_[runMask][:nJets]
    print " >> %s: %s"%('jetRunId', jetRunId.shape)
    X_ECAL_stacked = X_ECAL_stacked_[runMask][:nJets]
    print " >> %s: %s"%('X_ECAL_stacked', X_ECAL_stacked.shape)
    y_jets = y_jets_[runMask][:nJets]
    print " >> %s: %s"%('y_jets', y_jets.shape)

    #file_out_str = "test_jets.hdf5"
    file_out_str = "%s/%s/%s_n%d_label%d_run%d.hdf5"%(eosDir, decay, decay, nJets, label, i)
    #file_out_str = "%s/%s/%s_n%d_label%d_jet%d_run%d.hdf5"%(eosDir, decay, decay, nJets, label, ijet, i)
    print " >> Writing to:", file_out_str
    da.to_hdf5(file_out_str, {
                            #'runId': runId,
                            #'lumiId': lumiId,
                            #'eventId': eventId,
                            'X_ECAL_stacked': X_ECAL_stacked,
                            #'y': y,
                            'jetRunId': jetRunId,
                            'jetEventId': jetEventId,
                            #'jetSeed_iphi': jetSeed_iphi,
                            #'jetSeed_ieta': jetSeed_ieta,
                            #'jetM': jetM,
                            #'jetPt': jetPt,
                            #'X_jets': X_jets,
                            'y_jets': y_jets 
                            }, compression='lzf')

    print " >> Done.\n"
Example #32
import h5py
from glob import glob
import os

filenames = sorted(glob(os.path.join('data', 'weather-big', '*.hdf5')))
dsets = [h5py.File(filename, mode='r')['/t2m'] for filename in filenames]

import dask.array as da
arrays = [da.from_array(dset, chunks=(500, 500)) for dset in dsets]

x = da.stack(arrays, axis=0)

result = x[:, ::2, ::2]

da.to_hdf5(os.path.join('data', 'myfile.hdf5'), '/output', result)
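
To check the result, the downsampled dataset can be reopened lazily; a minimal continuation of the script above (chunk sizes are arbitrary):

f = h5py.File(os.path.join('data', 'myfile.hdf5'), mode='r')
out = da.from_array(f['/output'], chunks=(1, 250, 250))
print(out.shape, out.dtype)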