Example #1
def convert(
        input,
        output,
        chunk_size=16 * 1024 * 1024,
        genome=None,
        overwrite=False
):
    input_path, input_ext = splitext(input)
    output_path, output_ext = splitext(output)

    print('converting: %s to %s' % (input, output))

    if input_ext in ('.h5', '.loom'):
        if output_ext == '.zarr':
            # Convert 10x (HDF5) to Zarr
            source = h5py.File(input, mode='r')
            print(zarr.tree(source))

            store = zarr.DirectoryStore(output)
            dest = zarr.group(store=store, overwrite=overwrite)

            # The following fails if without_attrs=False (the default),
            # possibly related to https://github.com/h5py/h5py/issues/973.
            zarr.copy_all(source, dest, log=sys.stdout, without_attrs=True)
            print(zarr.tree(dest))
        elif output_ext == '.h5ad':
            if not genome:
                keys = list(h5py.File(input, mode='r').keys())
                if len(keys) == 1:
                    genome = keys[0]
                else:
                    raise Exception(
                        'Set --genome flag when converting from 10x HDF5 (.h5) to Anndata HDF5 (.h5ad); top-level groups in file %s: %s'
                        % (input, ','.join(keys))
                    )
            adata = read_10x_h5(input, genome=genome)

            # TODO: respect overwrite flag
            adata.write(output)

    elif input_ext == '.h5ad':
        adata = read_h5ad(input, backed='r')
        (r, c) = adata.shape
        # Use floor division: Python 3's / would make these floats and break
        # both the %d formatting and the zarr chunks argument below.
        chunks = (getsize(input) - 1) // chunk_size + 1
        chunk_size = (r - 1) // chunks + 1
        if output_ext == '.zarr':
            print('converting %s (%dx%d) to %s in %d chunks (%d rows each)' % (input, r, c, output, chunks, chunk_size))

            # TODO: respect overwrite flag

            adata.write_zarr(
                make_store(output),
                chunks=(chunk_size, c)
            )
        else:
            raise Exception('Unrecognized output extension: %s' % output_ext)
    else:
        raise Exception('Unrecognized input extension: %s' % input_ext)
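The function above depends on names defined elsewhere in its module: splitext and getsize come from os.path, read_10x_h5 from scanpy, and read_h5ad from anndata, while make_store is a project-specific helper. A plausible preamble, with the make_store body only an assumption, might be:

import sys
from os.path import getsize, splitext

import h5py
import zarr
from anndata import read_h5ad
from scanpy import read_10x_h5


def make_store(path):
    # Hypothetical helper: back the .zarr output with a DirectoryStore.
    return zarr.DirectoryStore(path)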
Example #2
def save_multiple_images(
    array: da.Array, output_file: Path, write_mode: str = "x"
) -> None:
    """
    Calculate and store a Dask array in an HDF5 file without exceeding available memory.

    Use the Dask distributed scheduler to compute a Dask array and store the
    resulting values to a data set 'data' in the root group of an HDF5 file.  The
    distributed scheduler is capable of managing worker memory better than the
    default scheduler.  In the latter case, the workers can sometimes demand more
    than the available amount of memory.  Using the distributed scheduler avoids this
    problem.

    The distributed scheduler cannot write directly to HDF5 files because h5py.File
    objects are not serialisable.  To work around this issue, the data are first
    stored to a Zarr DirectoryStore, then copied to the final HDF5 file and the Zarr
    store deleted.

    Multithreading is used, as the calculation is assumed to be I/O bound.

    Args:
        array:  A Dask array to be calculated and stored.
        output_file:  Path to the output HDF5 file.
        write_mode:  HDF5 file opening mode.  See :class:`h5py.File`.
    """
    # Set more generous timeouts than the default 30s.
    with dask.config.set(
        {
            "distributed.comm.timeouts.connect": "60s",
            "distributed.comm.timeouts.tcp": "60s",
            "distributed.deploy.lost-worker-timeout": "60s",
            "distributed.scheduler.idle-timeout": "600s",
            "distributed.scheduler.locks.lease-timeout": "60s",
        }
    ):
        intermediate = str(output_file.with_suffix(".zarr"))

        # Overwrite any pre-existing Zarr storage.  Don't compute immediately but
        # return the stored array so we can compute it with a progress bar.
        to_zarr_kwargs = {"overwrite": True, "compute": False, "return_stored": True}
        # Prepare to save the calculated images to the intermediate Zarr store.
        array = array.to_zarr(intermediate, component="data", **to_zarr_kwargs)
        # Compute the Array and store the values, using a progress bar.
        progress(array.persist())

    print("\nTransferring the images to the output file.")
    store = zarr.DirectoryStore(intermediate)
    with h5py.File(output_file, write_mode) as f:
        zarr.copy_all(zarr.open(store), f, **Bitshuffle())

    # Delete the Zarr store.
    store.clear()
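Bitshuffle is not defined in this snippet. Judging by the **Bitshuffle() expansion, it is presumably something like hdf5plugin's Bitshuffle filter, whose instances act as a mapping of create_dataset keyword arguments that zarr.copy_all forwards to h5py. A stand-in under that assumption:

import hdf5plugin


def Bitshuffle():
    # Assumed stand-in: hdf5plugin.Bitshuffle() unpacks to the compression
    # keyword arguments (bitshuffle + LZ4 by default) that h5py's
    # create_dataset accepts.
    return hdf5plugin.Bitshuffle()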
Example #3
def write():
    with h5py.File(h5_file, mode='r') as source:
        store = zarr.MongoDBStore(database='geosim', collection='Приобка')
        dest = zarr.open(store, mode='w')
        # Set the default compressor before copy_all creates the destination arrays.
        zarr.storage.default_compressor = Zstd(level=1)
        zarr.copy_all(source, dest, log=sys.stdout)
        print(dest['DSET0'])
        store.close()
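write() also leans on module-level state: h5_file plus the h5py, zarr, sys and Zstd imports. A hedged preamble (the path is hypothetical, and zarr.MongoDBStore additionally requires pymongo and a reachable MongoDB server):

import sys

import h5py
import zarr
from numcodecs import Zstd

h5_file = 'source.h5'  # hypothetical path to the HDF5 file being copied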
Example #4
def features_to_zarr(phase):
    FIELDNAMES = [
        'image_id', 'image_w', 'image_h', 'num_boxes', 'boxes', 'features'
    ]

    if phase == 'trainval':
        infiles = [
            'raw/trainval_36/trainval_resnet101_faster_rcnn_genome_36.tsv',
        ]
    elif phase == 'test':
        infiles = [
            'raw/test2015_36/test2015_resnet101_faster_rcnn_genome_36.tsv',
        ]
    else:
        raise SystemExit('Unrecognised phase')

    # Read the tsv and load data in a dictionary
    in_data = {}
    for infile in infiles:
        print(infile)
        with open(infile, "r") as tsv_in_file:
            reader = csv.DictReader(tsv_in_file,
                                    delimiter='\t',
                                    fieldnames=FIELDNAMES)
            for item in reader:
                item['image_id'] = str(item['image_id'])
                item['image_h'] = int(item['image_h'])
                item['image_w'] = int(item['image_w'])
                item['num_boxes'] = int(item['num_boxes'])
                for field in ['boxes', 'features']:
                    # base64.decodestring was removed in Python 3.9;
                    # decodebytes is the replacement.
                    encoded_str = base64.decodebytes(
                        item[field].encode('utf-8'))
                    item[field] = np.frombuffer(encoded_str,
                                                dtype=np.float32).reshape(
                                                    (item['num_boxes'], -1))
                in_data[item['image_id']] = item

    # convert dict to pandas dataframe
    train = pd.DataFrame.from_dict(in_data)
    train = train.transpose()

    # create image sizes csv
    print('Writing image sizes csv...')
    # Pair each image's width and height into a single (w, h) column.
    dw = train['image_w'].to_dict()
    dh = train['image_h'].to_dict()
    dwh = {k: np.array([dw[k], dh[k]]) for k in dw}
    image_sizes = pd.DataFrame(dwh)
    image_sizes.to_csv(phase + '_image_size.csv')

    # select bounding box coordinates and fill hdf5
    h = h5py.File(phase + 'box.hdf5', mode='w')
    t = train['boxes']
    d = t.to_dict()
    print('Creating bounding box file...')
    for k, v in tqdm(d.items()):
        h.create_dataset(str(k), data=v)
    h.close()

    # convert to zarr
    print('Writing zarr file...')
    i_feat = h5py.File(phase + 'box.hdf5', 'r', libver='latest')
    dest = zarr.open_group(phase + '_boxes.zarr', mode='w')
    zarr.copy_all(i_feat, dest)
    i_feat.close()
    dest.close()
    os.remove(phase + 'box.hdf5')

    # select features and fill hdf5
    h = h5py.File(phase + '.hdf5', mode='w')
    t = train['features']
    d = t.to_dict()
    print('Creating image features file...')
    for k, v in tqdm(d.items()):
        h.create_dataset(str(k), data=v)
    h.close()

    # convert to zarr
    print('Writing zarr file...')
    i_feat = h5py.File(phase + '.hdf5', 'r', libver='latest')
    dest = zarr.open_group(phase + '.zarr', mode='w')
    zarr.copy_all(i_feat, dest)
    i_feat.close()
    dest.close()
    os.remove(phase + '.hdf5')
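A quick sanity check that the copy round-tripped, reopening one of the stores read-only (hypothetical, not part of the original):

import zarr

# Open the copied features group read-only and count the arrays.
g = zarr.open_group('trainval.zarr', mode='r')
print(len(list(g.keys())), 'feature arrays copied')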
Example #5
def convert_hdf5(zarr_src, h5_dst):
    source = zarr.open(zarr_src, "r")

    with h5py.File(h5_dst, mode="w") as dest:
        zarr.copy_all(source, dest, log=sys.stdout, if_exists="replace")
Example #6
def zarr_to_h5(zarr_path, h5_path):
    source = zarr.open(zarr_path, "r")

    with h5py.File(h5_path, mode="w") as dest:
        zarr.copy_all(source, dest, log=sys.stdout, if_exists="replace")
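A hypothetical invocation, reusing the Zarr store produced in Example #7 below:

zarr_to_h5('data/10x.zarr', 'data/10x_roundtrip.h5')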
Example #7
import h5py
import sys
import zarr

# Convert 10x (HDF5) to Zarr
source = h5py.File("/Downloads/1M_neurons_filtered_gene_bc_matrices_h5.h5", mode='r')
print(zarr.tree(source))

store = zarr.DirectoryStore('data/10x.zarr')
dest = zarr.group(store=store, overwrite=True)
# The following fails if without_attrs=False (the default),
# possibly related to https://github.com/h5py/h5py/issues/973.
zarr.copy_all(source, dest, log=sys.stdout, without_attrs=True)
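To confirm the copy, the new store can be reopened read-only (a hedged check, not in the original snippet):

# Reopen the destination group and print its hierarchy.
result = zarr.open_group('data/10x.zarr', mode='r')
print(result.tree())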


Example #8
hdf50 = Path(f'{int_data_dir}/{phases[0]}_u{u_approach}_inp{input_size}_processed.h5') 
hdf51 = Path(f'{int_data_dir}/{phases[1]}_u{u_approach}_inp{input_size}_processed.h5') 
zarr0 = Path(f'{proc_data_dir}/{phases[0]}_u{u_approach}_inp{input_size}_processed.zarr')
zarr1 = Path(f'{proc_data_dir}/{phases[1]}_u{u_approach}_inp{input_size}_processed.zarr')

if not (hdf50.is_file() or hdf51.is_file()):
    img_paths = {phase: raw_data_dir + dframe[phase].iloc[:,0] for phase in phases}
    proc_images(img_paths, labels_array, data_dir, u_approach, input_size, phases=phases, tforms=tforms) 

if not (zarr0.is_dir() or zarr1.is_dir()):
    # hdf5paths/zarrpaths are assumed to be phase-keyed dicts of the paths
    # built above, defined elsewhere in the notebook.
    source = h5py.File(hdf5paths['val'], mode='r')
    dest = zarr.open_group(zarrpaths['val'], mode='w')
    zarr.copy_all(source, dest, log="zarr1.output")

    source = h5py.File(hdf5paths['train'], mode='r')
    dest = zarr.open_group(zarrpaths['train'], mode='w')
    zarr.copy_all(source, dest, log="zarr0.output")


Example #9
def read():
    store = zarr.MongoDBStore(database='geosim', collection='Приобка')
    source = zarr.open(store, mode='r')
    # Open the destination in append mode so existing content is kept;
    # if_exists='replace' overwrites clashing datasets.
    with h5py.File(h5_dst, mode='a') as dest:
        zarr.copy_all(source, dest, log=sys.stdout, if_exists='replace')
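As with write() in Example #3, read() assumes module-level imports and an h5_dst path; a hypothetical preamble:

import sys

import h5py
import zarr

h5_dst = 'dest.h5'  # hypothetical destination HDF5 file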
def sequence_to_disk(i, output_file):
    # arrays, write_mode and Bitshuffle are taken from the enclosing scope,
    # as in Example #2.
    with h5py.File(output_file, write_mode) as f:
        return zarr.copy_all(arrays[i], f, **Bitshuffle())