Example #1
def convert(
        input,
        output,
        chunk_size=16 * 1024 * 1024,
        genome=None,
        overwrite=False
):
    input_path, input_ext = splitext(input)
    output_path, output_ext = splitext(output)

    print('converting: %s to %s' % (input, output))

    if input_ext in ('.h5', '.loom'):
        if output_ext == '.zarr':
            # Convert 10x (HDF5) to Zarr
            source = h5py.File(input, mode='r')
            print(zarr.tree(source))

            store = zarr.DirectoryStore(output)
            dest = zarr.group(store=store, overwrite=overwrite)

            # The following fails if without_attrs=False (the default),
            # possibly related to https://github.com/h5py/h5py/issues/973.
            zarr.copy_all(source, dest, log=sys.stdout, without_attrs=True)
            print(zarr.tree(dest))
        elif output_ext == '.h5ad':
            if not genome:
                keys = list(h5py.File(input, mode='r').keys())
                if len(keys) == 1:
                    genome = keys[0]
                else:
                    raise Exception(
                        'Set --genome flag when converting from 10x HDF5 (.h5) to Anndata HDF5 (.h5ad); top-level groups in file %s: %s'
                        % (input, ','.join(keys))
                    )
            adata = read_10x_h5(input, genome=genome)

            # TODO: respect overwrite flag
            adata.write(output)

    elif input_ext == '.h5ad':
        adata = read_h5ad(input, backed='r')
        (r, c) = adata.shape
        # Use floor division: Python 3's / would make these floats and break
        # both the %d formatting and the zarr chunks argument below.
        chunks = (getsize(input) - 1) // chunk_size + 1
        chunk_size = (r - 1) // chunks + 1
        if output_ext == '.zarr':
            print('converting %s (%dx%d) to %s in %d chunks (%d rows each)' % (input, r, c, output, chunks, chunk_size))

            # TODO: respect overwrite flag

            adata.write_zarr(
                make_store(output),
                chunks=(chunk_size, c)
            )
        else:
            raise Exception('Unrecognized output extension: %s' % output_ext)
    else:
        raise Exception('Unrecognized input extension: %s' % input_ext)
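The function above depends on names defined elsewhere in its module: splitext and getsize come from os.path, read_10x_h5 from scanpy, and read_h5ad from anndata, while make_store is a project-specific helper. A plausible preamble, with the make_store body only an assumption, might be:

import sys
from os.path import getsize, splitext

import h5py
import zarr
from anndata import read_h5ad
from scanpy import read_10x_h5


def make_store(path):
    # Hypothetical helper: back the .zarr output with a DirectoryStore.
    return zarr.DirectoryStore(path)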
Example #2
def save_multiple_images(
    array: da.Array, output_file: Path, write_mode: str = "x"
) -> None:
    """
    Calculate and store a Dask array in an HDF5 file without exceeding available memory.

    Use the Dask distributed scheduler to compute a Dask array and store the
    resulting values to a data set 'data' in the root group of an HDF5 file.  The
    distributed scheduler is capable of managing worker memory better than the
    default scheduler.  In the latter case, the workers can sometimes demand more
    than the available amount of memory.  Using the distributed scheduler avoids this
    problem.

    The distributed scheduler cannot write directly to HDF5 files because h5py.File
    objects are not serialisable.  To work around this issue, the data are first
    stored to a Zarr DirectoryStore, then copied to the final HDF5 file and the Zarr
    store deleted.

    Multithreading is used, as the calculation is assumed to be I/O bound.

    Args:
        array:  A Dask array to be calculated and stored.
        output_file:  Path to the output HDF5 file.
        write_mode:  HDF5 file opening mode.  See :class:`h5py.File`.
    """
    # Set more generous timeouts than the default 30s.
    with dask.config.set(
        {
            "distributed.comm.timeouts.connect": "60s",
            "distributed.comm.timeouts.tcp": "60s",
            "distributed.deploy.lost-worker-timeout": "60s",
            "distributed.scheduler.idle-timeout": "600s",
            "distributed.scheduler.locks.lease-timeout": "60s",
        }
    ):
        intermediate = str(output_file.with_suffix(".zarr"))

        # Overwrite any pre-existing Zarr storage.  Don't compute immediately but
        # return the stored array so we can compute it with a progress bar.
        to_zarr_kwargs = {"overwrite": True, "compute": False, "return_stored": True}
        # Prepare to save the calculated images to the intermediate Zarr store.
        array = array.to_zarr(intermediate, component="data", **to_zarr_kwargs)
        # Compute the Array and store the values, using a progress bar.
        progress(array.persist())

    print("\nTransferring the images to the output file.")
    store = zarr.DirectoryStore(intermediate)
    with h5py.File(output_file, write_mode) as f:
        zarr.copy_all(zarr.open(store), f, **Bitshuffle())

    # Delete the Zarr store.
    store.clear()
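Bitshuffle is not defined in this snippet. Judging by the **Bitshuffle() expansion, it is presumably something like hdf5plugin's Bitshuffle filter, whose instances act as a mapping of create_dataset keyword arguments that zarr.copy_all forwards to h5py. A stand-in under that assumption:

import hdf5plugin


def Bitshuffle():
    # Assumed stand-in: hdf5plugin.Bitshuffle() unpacks to the compression
    # keyword arguments (bitshuffle + LZ4 by default) that h5py's
    # create_dataset accepts.
    return hdf5plugin.Bitshuffle()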
Example #3
def write():
    with h5py.File(h5_file, mode='r') as source:
        store = zarr.MongoDBStore(database='geosim', collection='Приобка')
        dest = zarr.open(store, mode='w')
        # Set the default compressor before copy_all creates the destination arrays.
        zarr.storage.default_compressor = Zstd(level=1)
        zarr.copy_all(source, dest, log=sys.stdout)
        print(dest['DSET0'])
        store.close()
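write() also leans on module-level state: h5_file plus the h5py, zarr, sys and Zstd imports. A hedged preamble (the path is hypothetical, and zarr.MongoDBStore additionally requires pymongo and a reachable MongoDB server):

import sys

import h5py
import zarr
from numcodecs import Zstd

h5_file = 'source.h5'  # hypothetical path to the HDF5 file being copied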
Example #4
def features_to_zarr(phase):
    FIELDNAMES = [
        'image_id', 'image_w', 'image_h', 'num_boxes', 'boxes', 'features'
    ]

    if phase == 'trainval':
        infiles = [
            'raw/trainval_36/trainval_resnet101_faster_rcnn_genome_36.tsv',
        ]
    elif phase == 'test':
        infiles = [
            'raw/test2015_36/test2015_resnet101_faster_rcnn_genome_36.tsv',
        ]
    else:
        raise SystemExit('Unrecognised phase')

    # Read the tsv and load data in a dictionary
    in_data = {}
    for infile in infiles:
        print(infile)
        with open(infile, "r") as tsv_in_file:
            reader = csv.DictReader(tsv_in_file,
                                    delimiter='\t',
                                    fieldnames=FIELDNAMES)
            for item in reader:
                item['image_id'] = str(item['image_id'])
                item['image_h'] = int(item['image_h'])
                item['image_w'] = int(item['image_w'])
                item['num_boxes'] = int(item['num_boxes'])
                for field in ['boxes', 'features']:
                    # base64.decodestring was removed in Python 3.9;
                    # decodebytes is the replacement.
                    encoded_str = base64.decodebytes(
                        item[field].encode('utf-8'))
                    item[field] = np.frombuffer(encoded_str,
                                                dtype=np.float32).reshape(
                                                    (item['num_boxes'], -1))
                in_data[item['image_id']] = item

    # convert dict to pandas dataframe
    train = pd.DataFrame.from_dict(in_data)
    train = train.transpose()

    # create image sizes csv
    print('Writing image sizes csv...')
    # Pair each image's width and height into a single (w, h) column.
    dw = train['image_w'].to_dict()
    dh = train['image_h'].to_dict()
    dwh = {k: np.array([dw[k], dh[k]]) for k in dw}
    image_sizes = pd.DataFrame(dwh)
    image_sizes.to_csv(phase + '_image_size.csv')

    # select bounding box coordinates and fill hdf5
    h = h5py.File(phase + 'box.hdf5', mode='w')
    t = train['boxes']
    d = t.to_dict()
    print('Creating bounding box file...')
    for k, v in tqdm(d.items()):
        h.create_dataset(str(k), data=v)
    h.close()

    # convert to zarr
    print('Writing zarr file...')
    i_feat = h5py.File(phase + 'box.hdf5', 'r', libver='latest')
    dest = zarr.open_group(phase + '_boxes.zarr', mode='w')
    zarr.copy_all(i_feat, dest)
    i_feat.close()
    dest.close()
    os.remove(phase + 'box.hdf5')

    # select features and fill hdf5
    h = h5py.File(phase + '.hdf5', mode='w')
    t = train['features']
    d = t.to_dict()
    print('Creating image features file...')
    for k, v in tqdm(d.items()):
        h.create_dataset(str(k), data=v)
    h.close()

    # convert to zarr
    print('Writing zarr file...')
    i_feat = h5py.File(phase + '.hdf5', 'r', libver='latest')
    dest = zarr.open_group(phase + '.zarr', mode='w')
    zarr.copy_all(i_feat, dest)
    i_feat.close()
    dest.close()
    os.remove(phase + '.hdf5')
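A quick sanity check that the copy round-tripped, reopening one of the stores read-only (hypothetical, not part of the original):

import zarr

# Open the copied features group read-only and count the arrays.
g = zarr.open_group('trainval.zarr', mode='r')
print(len(list(g.keys())), 'feature arrays copied')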
Example #5
def convert_hdf5(zarr_src, h5_dst):
    source = zarr.open(zarr_src, "r")

    with h5py.File(h5_dst, mode="w") as dest:
        zarr.copy_all(source, dest, log=sys.stdout, if_exists="replace")
Example #6
def zarr_to_h5(zarr_path, h5_path):
    source = zarr.open(zarr_path, "r")

    with h5py.File(h5_path, mode="w") as dest:
        zarr.copy_all(source, dest, log=sys.stdout, if_exists="replace")
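A hypothetical invocation, reusing the Zarr store produced in Example #7 below:

zarr_to_h5('data/10x.zarr', 'data/10x_roundtrip.h5')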
Example #7
import h5py
import sys
import zarr

# Convert 10x (HDF5) to Zarr
source = h5py.File("/Downloads/1M_neurons_filtered_gene_bc_matrices_h5.h5", mode='r')
print(zarr.tree(source))

store = zarr.DirectoryStore('data/10x.zarr')
dest = zarr.group(store=store, overwrite=True)
# The following fails if without_attrs=False (the default),
# possibly related to https://github.com/h5py/h5py/issues/973.
zarr.copy_all(source, dest, log=sys.stdout, without_attrs=True)
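To confirm the copy, the new store can be reopened read-only (a hedged check, not in the original snippet):

# Reopen the destination group and print its hierarchy.
result = zarr.open_group('data/10x.zarr', mode='r')
print(result.tree())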


Example #8
hdf50 = Path(f'{int_data_dir}/{phases[0]}_u{u_approach}_inp{input_size}_processed.h5') 
hdf51 = Path(f'{int_data_dir}/{phases[1]}_u{u_approach}_inp{input_size}_processed.h5') 
zarr0 = Path(f'{proc_data_dir}/{phases[0]}_u{u_approach}_inp{input_size}_processed.zarr')
zarr1 = Path(f'{proc_data_dir}/{phases[1]}_u{u_approach}_inp{input_size}_processed.zarr')

if not (hdf50.is_file() or hdf51.is_file()):
    img_paths = {phase: raw_data_dir + dframe[phase].iloc[:,0] for phase in phases}
    proc_images(img_paths, labels_array, data_dir, u_approach, input_size, phases=phases, tforms=tforms) 

if not (zarr0.is_dir() or zarr1.is_dir()):
    # hdf5paths/zarrpaths are assumed to be phase-keyed dicts of the paths
    # built above, defined elsewhere in the notebook.
    source = h5py.File(hdf5paths['val'], mode='r')
    dest = zarr.open_group(zarrpaths['val'], mode='w')
    zarr.copy_all(source, dest, log="zarr1.output")

    source = h5py.File(hdf5paths['train'], mode='r')
    dest = zarr.open_group(zarrpaths['train'], mode='w')
    zarr.copy_all(source, dest, log="zarr0.output")


Example #9
def read():
    store = zarr.MongoDBStore(database='geosim', collection='Приобка')
    source = zarr.open(store, mode='r')
    # Open the destination in append mode so existing content is kept;
    # if_exists='replace' overwrites clashing datasets.
    with h5py.File(h5_dst, mode='a') as dest:
        zarr.copy_all(source, dest, log=sys.stdout, if_exists='replace')
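As with write() in Example #3, read() assumes module-level imports and an h5_dst path; a hypothetical preamble:

import sys

import h5py
import zarr

h5_dst = 'dest.h5'  # hypothetical destination HDF5 file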
def sequence_to_disk(i, output_file):
    # arrays, write_mode and Bitshuffle are taken from the enclosing scope,
    # as in Example #2.
    with h5py.File(output_file, write_mode) as f:
        return zarr.copy_all(arrays[i], f, **Bitshuffle())