Example 1
    def __init__(self,
                 path: str,
                 mode: str = 'r',
                 storage_type: str = None) -> None:
        """ Initialize a Zarr file: if mode == 'w', create an empty one; otherwise, load it from path.

        path : `str`, path for the zarr object.
        mode : `str`, 'r' to read, 'w' to create, 'a' to append.
        storage_type : `str`, currently only 'ZipStore' and 'NestedDirectoryStore' are supported. If None, 'NestedDirectoryStore' is used by default.
        """
        self.store = self.root = None

        if storage_type is None:
            storage_type = 'NestedDirectoryStore'

        if mode == 'w':
            # Create a new zarr file
            check_and_remove_existing_path(path)
            if storage_type == 'ZipStore':
                self.store = zarr.ZipStore(path, mode='w')
            else:
                self.store = zarr.NestedDirectoryStore(path)
            self.root = zarr.group(self.store, overwrite=True)
        else:
            # Load an existing zarr file; a directory implies NestedDirectoryStore
            self.store = zarr.NestedDirectoryStore(path) if os.path.isdir(
                path) else zarr.ZipStore(path, mode='r')
            if mode == 'a' and isinstance(self.store, zarr.ZipStore):
                # ZipStore cannot be reliably updated in place;
                # convert to a directory store first
                self._to_directory()
            self.root = zarr.open(self.store, mode=mode)
Example 2
    def _to_zip(self):
        if not isinstance(self.store, zarr.ZipStore):
            # Copy the directory store into a new zip archive
            zip_path = self.store.path + '.zip'
            zip_store = zarr.ZipStore(zip_path, mode='w')
            zarr.copy_store(self.store, zip_store)
            zip_store.close()

            # Remove the original directory store
            shutil.rmtree(self.store.path)

            # Reopen the archive read-only
            self.store = zarr.ZipStore(zip_path, mode='r')
            self.root = zarr.open_group(self.store, mode='r')
Example 3
    def _to_directory(self):
        orig_path = self.store.path

        if not orig_path.endswith('.zip'):
            # Give the archive a .zip suffix so the destination
            # directory path can be derived from it below
            self.store.close()
            zip_path = orig_path + '.zip'
            check_and_remove_existing_path(zip_path)
            os.replace(orig_path, zip_path)
            self.store = zarr.ZipStore(zip_path, mode='r')
        else:
            zip_path = orig_path

        # Copy the zip contents into a directory store, then drop the archive
        dest_path = zip_path[:-4]
        check_and_remove_existing_path(dest_path)
        dir_store = zarr.NestedDirectoryStore(dest_path)
        zarr.copy_store(self.store, dir_store)
        self.store.close()
        os.remove(zip_path)

        self.store = dir_store
        self.root = zarr.open_group(self.store)

        logger.info(
            f"Converted ZipStore zarr file {orig_path} to NestedDirectoryStore {dest_path}."
        )
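Examples 2 and 3 together implement a round trip between a ZipStore and a NestedDirectoryStore. A minimal standalone sketch of the same copy_store pattern, with hypothetical paths:

import zarr

# Copy an existing directory store into a zip archive
src = zarr.NestedDirectoryStore('data.zarr')
dst = zarr.ZipStore('data.zarr.zip', mode='w')
zarr.copy_store(src, dst)
dst.close()

# ...and back into a directory store
src = zarr.ZipStore('data.zarr.zip', mode='r')
dst = zarr.NestedDirectoryStore('data_copy.zarr')
zarr.copy_store(src, dst)
src.close()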
Example 4
def init_zarr(sample_id, path, file_format, schema_version):
    """Initializes the zarr output.

    Args:
        sample_id (str): sample or cell id
        path (str): path to the zarr output
        file_format (str): zarr file format [DirectoryStore, ZipStore]
        schema_version (str): version string of this output to allow for parsing of future changes

    Returns:
        root (zarr.hierarchy.Group): initialized zarr group
    """

    store = None
    if file_format == "DirectoryStore":
        store = zarr.DirectoryStore(path)
    elif file_format == "ZipStore":
        store = zarr.ZipStore(path, mode='w')

    # create the root group
    root = zarr.group(store, overwrite=True)

    root.attrs['sample_id'] = sample_id
    root.attrs['optimus_output_schema_version'] = schema_version

    return root
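A hypothetical call of the initializer above (sample id, path, and schema version are made up for illustration):

root = init_zarr("cell_001", "cell_001.zarr.zip", "ZipStore", "1.0.0")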
Example 5
def _add_item(path: FilepathType, features: FeatureMap, input_dict: dict):
    parsed_input_dict = dict()
    for feat_name, feat in features.items():
        try:
            feat_val = input_dict[feat_name]
        except KeyError:
            raise RuntimeError(f"Feature missing ({feat_name})")

        if feat.shape is not None and np.array(feat_val).shape != feat.shape:
            raise ValueError(f"Shape mismatch for {feat_name} "
                             f"(expected {feat.shape}, "
                             f"got {np.array(feat_val).shape})")

        # Prepend a batch dimension and coerce to the declared dtype
        if isinstance(feat_val, np.ndarray):
            feat_val = np.expand_dims(feat_val, axis=0).astype(feat.dtype)
        else:
            feat_val = np.array([feat_val], dtype=feat.dtype)

        parsed_input_dict[feat_name] = feat_val

    path = Path(path).resolve()
    lock = FileLock(_lockfile(path))
    with lock, warnings.catch_warnings():
        warnings.simplefilter("ignore", UserWarning)

        store = zarr.ZipStore(path, mode="a")
        root = zarr.group(store=store)
        for k, v in parsed_input_dict.items():
            root[k].append(v)
        store.close()
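Examples 5, 6, 18, and 23 all follow the same pattern: serialize access to the archive with a FileLock and keep the ZipStore open only briefly. A minimal sketch of the pattern, assuming a hypothetical archive data.zip that already contains a 1-D array named 'values':

import numpy as np
import zarr
from filelock import FileLock

with FileLock("data.zip.lock"), zarr.ZipStore("data.zip", mode="a") as store:
    root = zarr.group(store=store)
    root["values"].append(np.array([1.0, 2.0]))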
Example 6
 def read_metadata(cls, path: FilepathType) -> Metadata:
     path = Path(path).resolve()
     lock = FileLock(_lockfile(path))
     with lock, zarr.ZipStore(path, mode="r") as store:
         root = zarr.group(store=store)
         metadata = root.attrs[_METADATA_KEY]
     return Metadata.deserialize(metadata)
Example 7
def check_simulation_result(
    datadir: Path,
    config: Dict[str, Any],
    run: Dict[str, Any],
    xp: Any,
) -> None:
    # Extract properties for simulation
    dataset, paramset = run["dataset"], run["paramset"]
    ds_config = config["datasets"][dataset]
    ps_config = config["paramsets"][paramset]
    dataset_dir = datadir / "dataset" / dataset
    result_dir = datadir / "result" / run["name"]

    # Load simulated data
    with zarr.ZipStore(str(dataset_dir / "genotypes.zarr.zip"), mode="r") as store:
        ds = xr.open_zarr(store, consolidated=False)
        df_covariate = load_covariates(dataset_dir)
        df_trait = load_traits(dataset_dir)
        contigs = ds["variant_contig"].values
        G = xp.asarray(ds["call_genotype"].sum(dim="ploidy").values)
        X = xp.asarray(df_covariate.values)
        Y = xp.asarray(df_trait.values)
        alphas = ps_config["alphas"]
        if alphas is not None:
            alphas = xp.asarray(alphas)

        # Define transformed traits
        res = regenie_transform(
            G.T,
            X,
            Y,
            contigs,
            variant_block_size=ps_config["variant_block_size"],
            sample_block_size=ps_config["sample_block_size"],
            normalize=True,
            add_intercept=False,
            alphas=alphas,
            orthogonalize=False,
            # Intentionally make mistakes related to these flags
            # in order to match Glow results
            _glow_adj_dof=True,
            _glow_adj_scaling=True,
            _glow_adj_alpha=True,
        )
        YBP = res["regenie_base_prediction"].data
        YMP = res["regenie_meta_prediction"].data

        # Check equality of stage 1 and 2 transformations
        check_stage_1_results(YBP, ds_config, ps_config, result_dir)
        check_stage_2_results(YMP, df_trait, result_dir)

        # Check equality of GWAS results
        X = da.from_array(X)
        Q = da.linalg.qr(X)[0]
        YR = Y - YMP
        YP = YR - Q @ (Q.T @ YR)
        stats = linear_regression(
            _dask_cupy_to_numpy(G.T), _dask_cupy_to_numpy(YP), _dask_cupy_to_numpy(Q)
        )
        check_stage_3_results(ds, stats, df_trait, result_dir)
Example 8
 def test_format_written(self):
     ts = msprime.simulate(10, random_seed=1)
     tszip.compress(ts, self.path)
     with zarr.ZipStore(str(self.path), mode="r") as store:
         root = zarr.group(store=store)
         self.assertEqual(root.attrs["format_name"], compression.FORMAT_NAME)
         self.assertEqual(root.attrs["format_version"], compression.FORMAT_VERSION)
Example 9
def init_zarr(sample_id, path, file_format):
    """Initializes the zarr output
    Args:
        sample_id (str): sample or cell id
        path (str): path to the zarr output
        fileformat (str): zarr file format [ DirectoryStore, ZipStore]
    Returns:
        zarr.hierarchy.Group
    """

    store = None
    if file_format == "DirectoryStore":
        store = zarr.DirectoryStore(path)
    elif file_format == "ZipStore":
        store = zarr.ZipStore(path, mode='w')

    # create the root group
    root = zarr.group(store, overwrite=True)

    # add some readme for the user
    root.attrs['README'] = (
        "The schema adopted in this zarr store may undergo "
        "changes in the future")
    root.attrs['sample_id'] = sample_id

    return root
Example 10
def load(savepath,
         lazy: bool = False,
         normalize_strings: bool = True,
         use_temp: bool = False):
    """[summary]

    Args:
        savepath ([type]): [description]
        lazy (bool, optional): [description]. Defaults to True.
        normalize_strings (bool, optional): [description]. Defaults to True.
        use_temp (bool, optional): Unpack zip to temp file - potentially speeds up loading and allows overwriting existing zarr file.
                                   Defaults to True.
    Returns:
        [type]: [description]
    """
    zarr_store = zarr.ZipStore(savepath, mode='r')
    if use_temp:
        dest = zarr.TempStore()
        zarr.copy_store(zarr_store, dest)
        zarr_store.close()
        zarr_store = dest
    dataset = xr.open_zarr(zarr_store)
    if not lazy:
        dataset.load()
        zarr_store.close()

    if normalize_strings:
        dataset = _normalize_strings(dataset)

    return dataset
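A hypothetical call, assuming recording.zarr.zip was written by xarray's to_zarr into a ZipStore:

# Load eagerly via a temp store so the original zip can be overwritten later
dataset = load("recording.zarr.zip", lazy=False, use_temp=True)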
Example 11
def decompress(path):
    """
    Returns a decompressed tskit tree sequence read from the specified path.
    """
    store = zarr.ZipStore(path, mode='r')
    root = zarr.group(store=store)
    return decompress_zarr(root)
Example 12
def init_zarr(sample_id, path, file_format):
    """Initializes the zarr output.

    Args:
        sample_id (str): sample or cell id
        path (str): path to the zarr output
        file_format (str): zarr file format [DirectoryStore, ZipStore]

    Returns:
        root (zarr.hierarchy.Group): initialized zarr group
    """

    store = None
    if file_format == "DirectoryStore":
        store = zarr.DirectoryStore(path)
    elif file_format == "ZipStore":
        store = zarr.ZipStore(path, mode='w')

    # create the root group
    root = zarr.group(store, overwrite=True)

    # add some readme for the user
    root.attrs['README'] = ("The schema adopted in this zarr store "
                            "may undergo changes in the future")
    root.attrs['sample_id'] = sample_id

    # now iterate through list of expected groups and create them
    for dataset in ZARR_GROUP:
        root.create_group(dataset, overwrite=True)

    return root
Example 13
def compress(ts, path, variants_only=False):
    """
    Compresses the specified tree sequence and writes it to the specified path.
    By default, fully lossless compression is used so that tree sequences are
    identical before and after compression. By specifying the ``variants_only``
    option, a lossy compression can be used, which discards any information
    that is not needed to represent the variants (which are stored losslessly).

    :param tskit.TreeSequence ts: The input tree sequence.
    :param str path: The string or :class:`pathlib.Path` instance describing
        the location of the compressed file.
    :param bool variants_only: If True, discard all information not necessary
        to represent the variants in the input file.
    """
    destination = str(path)
    # Write the file into a temporary directory on the same file system so that
    # we can write the output atomically.
    destdir = os.path.dirname(os.path.abspath(destination))
    with tempfile.TemporaryDirectory(dir=destdir,
                                     prefix=".tszip_work_") as tmpdir:
        filename = os.path.join(tmpdir, "tmp.trees.tgz")
        logging.debug(f"Writing to temporary file {filename}")
        with zarr.ZipStore(filename, mode="w") as store:
            root = zarr.group(store=store)
            compress_zarr(ts, root, variants_only=variants_only)
        os.replace(filename, destination)
    logging.info(f"Wrote {destination}")
Example 14
def create_zarr_store(ds,
                      rootdir,
                      ignore_vars=None,
                      storetype='directory',
                      consolidated=True):
    """
    OBSOLETE

    Write each variable from a xarray Dataset ds into a new zarr ZipStore
    under the root directory rootdir, excluding optional variables from
    ignore_vars.

    PARAMETERS:
    ===========

    ds: xarray.Dataset
        input dataset
    rootdir: str
        root path to the zarr stores
    ignore_vars: list
        variables to ignore
    storetype: str
        zarr store type (directory, zip)
    consolidated: bool
        whether to consolidate zarr metadata (default = True)

    RETURNS:
    ========

    None
    """

    if ignore_vars is None:
        ignore_vars = []

    for variable in ds.variables:
        if variable not in ignore_vars:
            print(f'writing {variable}')
            # create a bogus dataset to copy a single variable
            tmp = _xr.Dataset()
            tmp[variable] = ds[variable]
            # update output directory with variable name
            outputdir = rootdir.replace('<VARNAME>', variable)
            # create the output directory
            check = subprocess.check_call(f'mkdir -p {outputdir}', shell=True)
            exit_code(check)
            # create a zarr store in write mode
            store_exists = os.path.exists(f'{outputdir}/{variable}')
            if storetype == 'directory' and not store_exists:
                store = _zarr.DirectoryStore(f'{outputdir}/{variable}')
                # then copy to zarr
                tmp.to_zarr(store, consolidated=consolidated)
            elif storetype == 'zip':
                store = _zarr.ZipStore(f'{outputdir}/{variable}.zip', mode='w')
                # then copy to zarr
                tmp.to_zarr(store, mode='w', consolidated=consolidated)
            # and close store
            if storetype == 'zip':
                store.close()
            tmp.close()
    return None
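A hypothetical call of the obsolete helper above; note the literal <VARNAME> placeholder in rootdir, which the function substitutes with each variable name:

create_zarr_store(ds, rootdir='/data/zarr/<VARNAME>',
                  ignore_vars=['time_bnds'], storetype='zip')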
Example 15
def _0(obj: classo_problem) -> CLASSOProblemDirectoryFormat:
    # for output of regress
    ff = CLASSOProblemDirectoryFormat()
    zip_path = str(ff.path / 'problem.zip')
    store = zarr.ZipStore(zip_path, mode='w')
    root = zarr.open(store=store)
    to_zarr(obj, 'problem', root)
    store.close()
    return ff
Example 16
 def test_provenance(self):
     ts = msprime.simulate(10, random_seed=1)
     for variants_only in [True, False]:
         tszip.compress(ts, self.path, variants_only=variants_only)
         with zarr.ZipStore(str(self.path), mode='r') as store:
             root = zarr.group(store=store)
             self.assertEqual(
                 root.attrs["provenance"],
                 provenance.get_provenance_dict(
                     {"variants_only": variants_only}))
Example 17
 def from_array(cls,
                path: Path,
                array: np.ndarray,
                overwrite: bool = False) -> Raw:
     if path.is_file() and (not overwrite):
         raise RuntimeError("File already exists")
     else:
         # Open explicitly in write mode so overwrite=True truncates the file
         with zarr.ZipStore(path, compression=zipfile.ZIP_DEFLATED, mode="w") as store:
             zarr.save_array(store, array)
     return cls(path)
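A hypothetical usage of the classmethod above, assuming Raw wraps the zarr archive it is given:

import numpy as np
from pathlib import Path

raw = Raw.from_array(Path("sample.zip"),
                     np.arange(16).reshape(4, 4),
                     overwrite=True)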
Example 18
 def __len__(self):
     sizes = set()
     lock = FileLock(_lockfile(self.path))
     with lock, zarr.ZipStore(self.path, mode="r") as store:
         root = zarr.group(store=store)
         for feat_name in self.features.keys():
             sizes.add(len(root[feat_name]))
     if len(sizes) != 1:
         raise RuntimeError("Dataset corrupted!!!")
     return sizes.pop()
Example 19
@contextlib.contextmanager
def load_zarr(path):
    path = str(path)
    try:
        store = zarr.ZipStore(path, mode='r')
    except zipfile.BadZipFile as bzf:
        raise exceptions.FileFormatError("File is not in tgzip format") from bzf
    root = zarr.group(store=store)
    try:
        check_format(root)
        yield root
    finally:
        store.close()
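With the contextmanager decorator restored, the function is used as a context manager (the path is hypothetical):

with load_zarr("example.trees.tsz") as root:
    print(root.attrs["format_name"])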
Example 20
def _load_dataset_file(dataset_filepath):
    """
    Loads a single dataset file given by its path
    :param dataset_filepath: path where the file is located
    :return: starting_idx: [int] - List of indices where each game starts
             x: np.ndarray - Numpy array which contains the game positions
             y_value: np.ndarray - Numpy array which describes the winner for each board position
             y_policy: np.ndarray - Numpy array which describes the policy distribution for each board state
                                    (in case of a pgn dataset the move is one hot encoded)
    """
    return get_numpy_arrays(
        zarr.group(store=zarr.ZipStore(dataset_filepath, mode="r")))
Example 21
def open_group(path):
    if path.endswith("h5"):
        return h5py.File(path, "r")
    elif path.endswith(("zarr2", "zarr")):
        return zarr.open_group(path, "r")
    elif path.endswith("zip"):
        zz = zarr.ZipStore(path)
        return zarr.Group(zz)
    else:
        raise ValueError(
            "Bad filepath provided: {0}. Only hdf5/zarr supported.".format(
                path))
Example 22
def _load_dataset_file(dataset_filepath):
    """
    Loads a single dataset file give by its path
    :param dataset_filepath: path where the file is located
    :return:
    """
    store = zarr.ZipStore(dataset_filepath, mode='r')
    pgn_dataset = zarr.group(store=store)

    s_idcs, x, yv, yp = get_numpy_arrays(pgn_dataset)

    return s_idcs, x, yv, yp
Example 23
    def __getitem__(self, idx) -> DynamicNamedTuple:
        feat_names = list(self.features.keys())
        lock = FileLock(_lockfile(self.path))
        with lock, zarr.ZipStore(self.path, mode="r") as store:
            root = zarr.group(store=store)
            data_dict = {
                feat_name: root[feat_name][idx]
                for feat_name in feat_names
            }

        return (ItemViewMeta(feat_names)(**data_dict) if isinstance(idx, int)
                else ItemSetViewMeta(feat_names)(**data_dict))
Example 24
def compress(ts, path):
    """
    Compresses the specified tree sequence and writes it to the specified
    path.
    """
    logging.info("Compressing to {}".format(path))
    try:
        store = zarr.ZipStore(path, mode='w')
        root = zarr.group(store=store)
        compress_zarr(ts, root)
        store.close()
    except Exception:
        # Clean up the partially written file before re-raising
        os.unlink(path)
        raise
Example 25
def run(dataset: str, dataset_dir="data/dataset"):
    dataset_dir = Path(dataset_dir)
    plink_path = dataset_dir / dataset / "genotypes"
    zarr_path = dataset_dir / dataset / "genotypes.zarr.zip"
    ds = read_plink(path=plink_path, bim_sep="\t", fam_sep="\t")
    # Pre-compute string lengths until this is done:
    # https://github.com/pystatgen/sgkit-plink/issues/12
    ds = ds.compute()
    logger.info(f"Loaded dataset {dataset}:")
    logger.info("\n" + str(ds))
    store = zarr.ZipStore(zarr_path, mode="w")
    ds.to_zarr(store, mode="w")
    store.close()
    logger.info(f"Conversion to zarr at {zarr_path} successful")
Example 26
def check_simulation_result(
    datadir: Path, config: Dict[str, Any], run: Dict[str, Any]
) -> None:
    # Extract properties for simulation
    dataset, paramset = run["dataset"], run["paramset"]
    ds_config = config["datasets"][dataset]
    ps_config = config["paramsets"][paramset]
    dataset_dir = datadir / "dataset" / dataset
    result_dir = datadir / "result" / run["name"]

    # Load simulated data
    with zarr.ZipStore(str(dataset_dir / "genotypes.zarr.zip"), mode="r") as store:
        ds = xr.open_zarr(store)  # type: ignore[no-untyped-call]
        df_covariate = load_covariates(dataset_dir)
        df_trait = load_traits(dataset_dir)
        contigs = ds["variant_contig"].values
        G = ds["call_genotype"].sum(dim="ploidy").values
        X = df_covariate.values
        Y = df_trait.values

        # Define transformed traits
        res = regenie_transform(
            G.T,
            X,
            Y,
            contigs,
            variant_block_size=ps_config["variant_block_size"],
            sample_block_size=ps_config["sample_block_size"],
            normalize=True,
            add_intercept=False,
            alphas=ps_config["alphas"],
            orthogonalize=False,
            # Intentionally make mistakes related to these flags
            # in order to match Glow results
            _glow_adj_dof=True,
            _glow_adj_scaling=True,
            _glow_adj_alpha=True,
        )
        YBP = res["base_prediction"].data
        YMP = res["meta_prediction"].data

        # Check equality of stage 1 and 2 transformations
        check_stage_1_results(YBP, ds_config, ps_config, result_dir)
        check_stage_2_results(YMP, df_trait, result_dir)

        # Check equality of GWAS results
        YR = Y - YMP
        stats = linear_regression(G.T, X, YR)
        check_stage_3_results(ds, stats, df_trait, result_dir)
Example 27
def test_backwards_compat_zarr():
    import scanpy as sc
    import zarr

    pbmc_orig = sc.datasets.pbmc68k_reduced()
    # Old zarr writer couldn’t do sparse arrays
    pbmc_orig.raw._X = pbmc_orig.raw.X.toarray()
    del pbmc_orig.uns["neighbors"]

    # This was written out with anndata=0.6.22.post1
    zarrpth = HERE / "data/pbmc68k_reduced_legacy.zarr.zip"
    with zarr.ZipStore(zarrpth, mode="r") as z:
        pbmc_zarr = ad.read_zarr(z)

    assert_equal(pbmc_zarr, pbmc_orig)
Example 28
def compress_zarr_dataset(data,
                          file_path,
                          compression='lz4',
                          clevel=5,
                          start_idx=0,
                          end_idx=0):
    """
    Loads in a zarr data set and exports it with a given compression type and level
    :param data: Zarr data set which will be compressed
    :param file_path: File name path where the data will be exported (e.g. "./export/data.zip")
    :param compression: Compression type
    :param clevel: Compression level
    :param start_idx: Starting index of data to be exported.
    :param end_idx: If end_idx != 0 the data set will be exported to the specified index,
    excluding the sample at end_idx (e.g. end_idx = len(x) will export it fully)
    :return: True if a NaN value was detected
    """
    compressor = Blosc(cname=compression, clevel=clevel, shuffle=Blosc.SHUFFLE)

    # open a dataset file and create arrays
    store = zarr.ZipStore(file_path, mode="w")
    zarr_file = zarr.group(store=store, overwrite=True)

    nan_detected = False
    for key in data.keys():
        if end_idx == 0:
            x = data[key]
        else:
            x = data[key][start_idx:end_idx]

        if np.isnan(x).any():
            nan_detected = True

        # Chunk along the first axis in blocks of 128 samples
        array_shape = list(x.shape)
        array_shape[0] = 128
        # export array
        zarr_file.create_dataset(
            name=key,
            data=x,
            shape=x.shape,
            dtype=x.dtype,
            chunks=array_shape,
            synchronizer=zarr.ThreadSynchronizer(),
            compressor=compressor,
        )
    store.close()
    logging.info("dataset was exported to: %s", file_path)
    return nan_detected
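A hypothetical call with in-memory numpy arrays standing in for the zarr data set:

import numpy as np

data = {"x": np.random.rand(1024, 8).astype(np.float32),
        "y_policy": np.random.rand(1024, 64).astype(np.float32)}
had_nan = compress_zarr_dataset(data, "data.zip", compression="lz4", clevel=5)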
Example 29
def setup_input(samples, input_pattern, seqid, field):
    log('Setting up input array ...')
    input_paths = [input_pattern.format(sample=s) for s in samples]
    input_stores = [zarr.ZipStore(ip, mode='r') for ip in input_paths]
    input_roots = [zarr.group(store) for store in input_stores]
    input_arrays = [
        root[s][seqid][field] for root, s in zip(input_roots, samples)
    ]
    input_arrays = [da.from_array(a, chunks=a.chunks) for a in input_arrays]

    # here we add a dim to allow the hstack to work. must share the shape (X, 1, )
    input_arrays = [a[:, None] if a.ndim == 1 else a for a in input_arrays]

    input_array = da.hstack(input_arrays)
    log('Input array:', input_array)
    return input_array
Example 30
def save_results(conn, image, data, dataset, path):
    filename, file_extension = os.path.splitext(image.getName())
    # Save the probabilities file as an image
    print("Saving Probabilities as zarr file attached to the original Image")
    name = filename + "_Probabilities_zarr.zip"
    desc = "ilastik probabilities from Image:%s" % image.getId()
    # Re-organise array from tzyxc to zctyx order expected by OMERO
    # data = data.swapaxes(0, 1).swapaxes(3, 4).swapaxes(2, 3).swapaxes(1, 2)
    namespace = "ilastik.zarr.demo"
    fp = os.path.join(path, name)
    with zarr.ZipStore(fp, mode='w') as store:
        zarr.array(data, store=store, dtype='int16',
                   compressor=zarr.Blosc(cname='zstd'))
    ann = conn.createFileAnnfromLocalFile(fp, mimetype="application/zip",
                                          ns=namespace, desc=desc)
    image.linkAnnotation(ann)