Example #1
    def get_choice(self, x):
        self.show_menu = False
        self.s.close()
        self.m.remove_layer(self.p)
        self.p = None
        choice = x['new']
        if choice == 'Show flow':
            self.show_flow = True
        elif choice == 'Hide flow':
            self.show_flow = False
            self.m.remove_layer(self.io)
            self.io = None
        elif choice == 'Delineate watershed':
            self.show_flow = False
            self.m.remove_layer(self.io)
            self.io = None
            self.label.value = 'Delineating watershed, please wait...'
            delineate(*self.coord)
            self.label.value = 'Watershed delineated'
            ds_mask = xr.open_zarr('tmp/ds_mask/0').compute()
            mask = ds_mask['mask'].values
            polygon = get_polygon(mask,
                                  ds_mask.lat.values[0] + 0.5 / 1200,
                                  ds_mask.lon.values[0] - 0.5 / 1200)
            self.m.add_layer(polygon)
            self.label.value = 'Watershed displayed'
        elif choice == 'Set marker':
            if self.marker is not None:
                self.m.remove_layer(self.marker)
            self.marker = Marker(location=self.coord)
            self.m.add_layer(self.marker)
        elif choice == 'Close':
            pass
Example #2
def get_gpm_precipitation(da_mask_gpm):
    ds_gpm = xr.open_zarr(gcsfs.GCSMap('pangeo-data/gpm_imerg_early'))
    da_gpm = ds_gpm['precipitationCal']
    pix_deg_gpm = 0.1
    da_area_gpm = pixel_area(pix_deg_gpm)
    da_mask_gpm = da_area_gpm.reindex_like(da_mask_gpm, method='nearest', tolerance=0.001) * da_mask_gpm
    da_mask_gpm = da_mask_gpm / da_mask_gpm.sum(['lat', 'lon'])
    p_gpm = (da_gpm.reindex_like(da_mask_gpm, method='nearest', tolerance=0.01) * da_mask_gpm).sum(['lat', 'lon'])
    p_gpm = p_gpm.persist()
    return p_gpm
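The pattern above computes a basin-average rain rate: the mask is weighted by pixel area, normalized to sum to one, and used to reduce the gridded field to a time series. A minimal self-contained sketch of that weighting idea, with synthetic data standing in for the GPM grid and the project-specific pixel_area helper:

import numpy as np
import xarray as xr

lat = np.arange(10.0, 9.0, -0.1)  # toy 0.1-degree grid
lon = np.arange(0.0, 1.0, 0.1)
precip = xr.DataArray(np.random.rand(5, lat.size, lon.size),
                      coords=[np.arange(5), lat, lon],
                      dims=['time', 'lat', 'lon'])
mask = xr.DataArray((np.random.rand(lat.size, lon.size) > 0.5).astype(float),
                    coords=[lat, lon], dims=['lat', 'lon'])
weights = mask / mask.sum(['lat', 'lon'])            # weights sum to 1
basin_mean = (precip * weights).sum(['lat', 'lon'])  # one value per time step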
Example #3
def test_dask_distributed_zarr_integration_test(loop):
    chunks = {'dim1': 4, 'dim2': 3, 'dim3': 5}
    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop) as c:
            original = create_test_data().chunk(chunks)
            with create_tmp_file(allow_cleanup_failure=ON_WINDOWS,
                                 suffix='.zarr') as filename:
                original.to_zarr(filename)
                with xr.open_zarr(filename) as restored:
                    assert isinstance(restored.var1.data, da.Array)
                    computed = restored.compute()
                    assert_allclose(original, computed)
Example #4
def get_trmm_precipitation(da_mask_trmm):
    ds_trmm = xr.open_zarr(gcsfs.GCSMap('pangeo-data/trmm_3b42rt'))
    # TRMM data was stored with lon in 0/360 range, rearrange it in -180/180:
    long_0_360 = ds_trmm.lon.values
    ds_trmm.lon.values = np.where(long_0_360 < 180, long_0_360, long_0_360 - 360)
    da_trmm = ds_trmm['precipitation'].sortby('lon')
    pix_deg_trmm = 0.25
    da_area_trmm = pixel_area(pix_deg_trmm)
    da_mask_trmm = da_area_trmm.reindex_like(da_mask_trmm, method='nearest', tolerance=0.001) * da_mask_trmm
    da_mask_trmm = da_mask_trmm / da_mask_trmm.sum(['lat', 'lon'])
    p_trmm = (da_trmm.reindex_like(da_mask_trmm, method='nearest', tolerance=0.001) * da_mask_trmm).sum(['lat', 'lon'])
    p_trmm = p_trmm.persist()
    return p_trmm
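As a quick check of the longitude rearrangement used above (hypothetical values, not TRMM data):

import numpy as np

lon_0_360 = np.array([0.0, 90.0, 180.0, 270.0, 359.75])
lon_180 = np.where(lon_0_360 < 180, lon_0_360, lon_0_360 - 360)
# -> [0., 90., -180., -90., -0.25]; sortby('lon') then restores ascending order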
Example #5
def get_gpm_mask(labels, gcs_path):
    pix_deg_flow = 1 / 1200
    pix_deg_gpm = 0.1
    ratio = int(pix_deg_gpm / pix_deg_flow)
    da_mask_gpm = []
    for label in labels:
        ds = xr.open_zarr(gcsfs.GCSMap(f'{gcs_path}/{label}'))
        da1 = ds['mask'].compute()
        da2 = adjust_bbox(da1, {'lat': (pix_deg_gpm, -pix_deg_flow), 'lon': (pix_deg_gpm, pix_deg_flow)})
        da3 = aggregate_da(da2, {'lat': ratio, 'lon': ratio}) / (ratio * ratio)
        da3 = da3.rename({'lat_agg': 'lat', 'lon_agg': 'lon'})
        da3.lon.values = np.round(da3.lon.values, 2)
        da3.lat.values = np.round(da3.lat.values, 2)
        da_mask_gpm.append(da3)
    da_mask_gpm = xr.concat(da_mask_gpm, 'label').assign_coords(label=labels)
    return da_mask_gpm
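Here ratio = int(0.1 / (1 / 1200)) = 120, so each 0.1-degree GPM cell covers a 120 x 120 block of flow-direction pixels, and dividing the block sums by ratio * ratio turns the aggregated mask into a fractional coverage per coarse cell. aggregate_da is project-specific, but xarray's coarsen performs the analogous block reduction; a hedged sketch:

import numpy as np
import xarray as xr

ratio = 120  # 0.1 deg / (1/1200 deg)
fine_mask = xr.DataArray(np.ones((240, 240)), dims=['lat', 'lon'])
coverage = fine_mask.coarsen(lat=ratio, lon=ratio).sum() / (ratio * ratio)
# coverage: fraction of each 0.1-degree cell covered by the fine mask (here all 1.0)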
Example #6
    def _get_dataset_lazily(self, index: int, **zarr_kwargs) -> xr.Dataset:
        """
        Read the dataset for the level at given *index*.

        :param index: the level index
        :param zarr_kwargs: kwargs passed to xr.open_zarr()
        :return: the dataset for the level at *index*.
        """
        ext, level_path = self._level_paths[index]
        if ext == ".link":
            with open(level_path, "r") as fp:
                level_path = fp.read()
                # if file_path is a relative path, resolve it against the levels directory
                if not os.path.isabs(level_path):
                    base_dir = os.path.dirname(self._dir_path)
                    level_path = os.path.join(base_dir, level_path)
        with measure_time(
                tag=f"opened local dataset {level_path} for level {index}"):
            return assert_cube(xr.open_zarr(level_path, **zarr_kwargs),
                               name=level_path)
Example #7
    def test_vars2dim(self):
        result = self.invoke_cli(["vars2dim", TEST_ZARR_DIR])

        output_path = self.TEST_OUTPUT
        self.assertEqual(0, result.exit_code)
        self.assertTrue(os.path.isdir(output_path))

        ds = xr.open_zarr(output_path)
        self.assertIn("var", ds.dims)
        self.assertEqual(3, ds.dims["var"])
        self.assertIn("var", ds.coords)
        self.assertIn("data", ds.data_vars)
        var_names = ds["var"]
        self.assertEqual(("var", ), var_names.dims)
        self.assertTrue(hasattr(var_names, "encoding"))
        self.assertEqual("<U13", str(var_names.dtype))
        self.assertEqual(3, len(var_names))
        self.assertIn("precipitation", str(var_names[0]))
        self.assertIn("soil_moisture", str(var_names[1]))
        self.assertIn("temperature", str(var_names[2]))
Example #8
    def test_dataset_to_zarr(self):
        dataset = xarray.Dataset(
            {'foo': ('x', np.arange(0, 60, 10))},
            coords={'x': np.arange(6)},
            attrs={'meta': 'data'},
        )
        chunked = dataset.chunk({'x': 3})

        temp_dir = self.create_tempdir().full_path
        (test_util.EagerPipeline() | xbeam.DatasetToZarr(chunked, temp_dir))
        actual = xarray.open_zarr(temp_dir, consolidated=True)
        xarray.testing.assert_identical(actual, dataset)

        temp_dir = self.create_tempdir().full_path
        with self.assertRaisesRegex(
                ValueError,
                'template does not have any variables chunked with Dask',
        ):
            (test_util.EagerPipeline()
             | xbeam.DatasetToZarr(dataset, temp_dir))
Example #9
def test_vcf_to_zarr__large_vcf(shared_datadir, is_path, tmp_path):
    path = path_for_test(shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz",
                         is_path)
    output = tmp_path.joinpath("vcf.zarr").as_posix()

    vcf_to_zarr(path, output, chunk_length=5_000)
    ds = xr.open_zarr(output)  # type: ignore[no-untyped-call]

    assert ds["sample_id"].shape == (1, )
    assert ds["call_genotype"].shape == (19910, 1, 2)
    assert ds["call_genotype_mask"].shape == (19910, 1, 2)
    assert ds["call_genotype_phased"].shape == (19910, 1)
    assert ds["variant_allele"].shape == (19910, 4)
    assert ds["variant_contig"].shape == (19910, )
    assert ds["variant_id"].shape == (19910, )
    assert ds["variant_id_mask"].shape == (19910, )
    assert ds["variant_position"].shape == (19910, )

    assert ds["variant_allele"].dtype == "O"
    assert ds["variant_id"].dtype == "O"
Example #10
def test_vcf_to_zarr__mutable_mapping(shared_datadir, is_path):
    path = path_for_test(shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz",
                         is_path)
    output: MutableMapping[str, bytes] = {}

    vcf_to_zarr(path, output, chunk_length=5_000)
    ds = xr.open_zarr(output)  # type: ignore[no-untyped-call]

    assert ds["sample_id"].shape == (1, )
    assert ds["call_genotype"].shape == (19910, 1, 2)
    assert ds["call_genotype_mask"].shape == (19910, 1, 2)
    assert ds["call_genotype_phased"].shape == (19910, 1)
    assert ds["variant_allele"].shape == (19910, 4)
    assert ds["variant_contig"].shape == (19910, )
    assert ds["variant_id"].shape == (19910, )
    assert ds["variant_id_mask"].shape == (19910, )
    assert ds["variant_position"].shape == (19910, )

    assert ds["variant_allele"].dtype == "O"
    assert ds["variant_id"].dtype == "O"
Example #11
def read_multiple_obs(obs_files, x_data):
    """
    read and format multiple observation files. we read in the pretrain data to
    make sure we have the same indexing.
    :param obs_files: [list] list of filenames of observation files
    :param pre_train_file: [str] the file of pre_training data
    :return: [xr dataset] the observations in the same time
    """
    obs = [x_data.sortby(["seg_id_nat", "date"])]
    for filename in obs_files:
        ds = xr.open_zarr(filename)
        if "site_id" in ds.variables:
            del ds["site_id"]
        obs.append(ds)
    obs = xr.merge(obs, join="left")
    obs = obs[["temp_c", "discharge_cms"]]
    obs = obs.rename(
        {"temp_c": "seg_tave_water", "discharge_cms": "seg_outflow"}
    )
    return obs
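Note that xr.merge(..., join="left") aligns everything to the indexes of the first object in the list, here the sorted x data, so observations outside that index are dropped. A small illustration with made-up data:

import xarray as xr

a = xr.Dataset({'v': ('x', [1, 2, 3])}, coords={'x': [0, 1, 2]})
b = xr.Dataset({'w': ('x', [9, 9])}, coords={'x': [1, 5]})
merged = xr.merge([a, b], join='left')
# merged keeps only x = 0, 1, 2; b's value at x=5 is dropped, missing entries are NaN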
Example #12
def fmt_preds_obs(pred_data, obs_file, variable):
    """
    combine predictions and observations in one dataframe
    :param pred_data:[str] filepath to the predictions file
    :param obs_file:[str] filepath to the observations file
    :param variable: [str] either 'flow' or 'temp'
    """
    obs_var, seg_var = get_var_names(variable)
    pred_data = load_if_not_df(pred_data)
    # pred_data.loc[:, "seg_id_nat"] = pred_data["seg_id_nat"].astype(int)
    if {"date", "seg_id_nat"}.issubset(pred_data.columns):
        pred_data.set_index(["date", "seg_id_nat"], inplace=True)
    obs = xr.open_zarr(obs_file).to_dataframe()
    obs_cln = obs[[obs_var]]
    obs_cln.columns = ["obs"]
    preds = pred_data[[seg_var]]
    preds.columns = ["pred"]
    obs_cln_trim = trim_obs(obs_cln, preds)
    combined = preds.join(obs_cln_trim)
    return combined
Example #13
    def test_update_corrupt_cube(self):
        self.write_cube('2019-01-01', 3)

        cube = xr.open_zarr(self.CUBE_PATH)
        t, y, x = cube.precipitation.shape
        new_shape = y, t, x
        t, y, x = cube.precipitation.dims
        new_dims = y, t, x
        cube['precipitation'] = xr.DataArray(
            cube.precipitation.values.reshape(new_shape),
            dims=new_dims,
            coords=cube.precipitation.coords)
        cube.to_zarr(self.CUBE_PATH_2)

        with self.assertRaises(ValueError) as cm:
            insert_time_slice(self.CUBE_PATH_2, 2,
                              self.make_slice('2019-01-02T06:30'))
        self.assertEqual(
            "dimension 'time' of variable 'precipitation' must be first dimension",
            f"{cm.exception}")
Example #14
def test_vcf_to_zarr__multiple_max_alt_alleles(shared_datadir, is_path,
                                               tmp_path):
    paths = [
        path_for_test(shared_datadir, "CEUTrio.20.gatk3.4.g.vcf.bgz", is_path),
        path_for_test(shared_datadir, "CEUTrio.21.gatk3.4.g.vcf.bgz", is_path),
    ]
    output = tmp_path.joinpath("vcf_concat.zarr").as_posix()

    with pytest.warns(MaxAltAllelesExceededWarning):
        vcf_to_zarr(
            paths,
            output,
            target_part_size="40KB",
            chunk_length=5_000,
            max_alt_alleles=1,
        )
        ds = xr.open_zarr(output)

        # the maximum number of alt alleles actually seen is stored as an attribute
        assert ds.attrs["max_alt_alleles_seen"] == 7
Example #15
    def test_append_overlapping_append_newer(self):
        for consolidated in False, True:
            with self.subTest(consolidated=consolidated):
                dst_path = "my.zarr"
                self.add_path(dst_path)
                ds1, ds2 = new_append_test_datasets(
                    ["2001-01-01", "2001-01-02", "2001-01-03"],
                    ["2001-01-02", "2001-01-03", "2001-01-04", "2001-02-05"]
                )
                ds1.to_zarr(dst_path, consolidated=consolidated)
                w = DatasetWriter(dst_path, output_append=True,
                                  output_append_dim="t",
                                  output_append_mode=AppendMode.newer,
                                  output_consolidated=consolidated)
                w.write_dataset(ds2)
                ds3 = xr.open_zarr(dst_path, consolidated=consolidated)
                expected = np.array(["2001-01-01", "2001-01-02", "2001-01-03",
                                     "2001-01-04", "2001-02-05"],
                                    dtype="datetime64[ns]")
                np.testing.assert_equal(expected, ds3.t.data)
Example #16
def read_image(infile):
    """
  Read xarray zarr format image from disk

  Parameters
  ----------
  infile : str
      input zarr image filename

  Returns
  -------
  xarray.core.dataset.Dataset
      New xarray Dataset of image contents
  """
    import os
    from xarray import open_zarr

    infile = os.path.expanduser(infile)
    xds = open_zarr(infile)
    return xds
Example #17
    def __init__(
        self,
        IMC_name,
        ROI_number,
        path=None,
        use_panel_file=False,
        panel_file=None,
        panel_skip_rows=4,
    ):
        self.IMC_name = IMC_name
        self.ROI_number = ROI_number
        if path:
            self.filepath = path
        else:
            self.filepath = "/data/meds1_d/storage/raw/imc/"

        print("Loading IMC dataset")
        path_to_open = (
            os.path.join(self.filepath, self.IMC_name) + "/Q00" + str(self.ROI_number)
        )
        ds = xr.open_zarr(path_to_open)
        image = ds["Q00" + str(ROI_number)]
        image.load()
        channels = pd.DataFrame(image.meta[0]["q_channels"])
        self.data = image
        self.channels = channels
        self.channels["good"] = np.zeros(len(self.channels))

        if use_panel_file:
            panel = pd.read_excel(panel_file, skiprows=panel_skip_rows)
            for item in self.channels.metal:
                if "(" in item:
                    metal = item[:2]
                    number = item[3:6]
                    tomatch = number + metal
                    match = panel[panel["Metal"] == tomatch].Target.tolist()
                    if len(match) > 0:
                        self.channels.loc[self.channels.metal == item, "target"] = match
                        self.channels.loc[self.channels.metal == item, "good"] = 1
        else:
            self.channels["good"] = np.ones(len(self.channels))
Example #18
def test_vcf_to_zarr__fields(shared_datadir, tmp_path):
    path = path_for_test(shared_datadir, "sample.vcf.gz")
    output = tmp_path.joinpath("vcf.zarr").as_posix()

    vcf_to_zarr(
        path,
        output,
        chunk_length=5,
        chunk_width=2,
        fields=["INFO/DP", "INFO/AA", "INFO/DB", "FORMAT/DP"],
    )
    ds = xr.open_zarr(output)

    assert_array_equal(ds["variant_DP"], [-1, -1, 14, 11, 10, 13, 9, -1, -1])
    assert ds["variant_DP"].attrs["comment"] == "Total Depth"

    assert_array_equal(ds["variant_AA"],
                       ["", "", "", "", "T", "T", "G", "", ""])
    assert ds["variant_AA"].attrs["comment"] == "Ancestral Allele"

    assert_array_equal(
        ds["variant_DB"],
        [False, False, True, False, True, False, False, False, False])
    assert ds["variant_DB"].attrs["comment"] == "dbSNP membership, build 129"

    dp = np.array(
        [
            [-1, -1, -1],
            [-1, -1, -1],
            [1, 8, 5],
            [3, 5, 3],
            [6, 0, 4],
            [-1, 4, 2],
            [4, 2, 3],
            [-1, -1, -1],
            [-1, -1, -1],
        ],
        dtype="i4",
    )
    assert_array_equal(ds["call_DP"], dp)
    assert ds["call_DP"].attrs["comment"] == "Read Depth"
Example #19
    def read(self, path: str, **kwargs) -> xr.Dataset:
        path_or_store = path
        consolidated = False

        if isinstance(path, str):
            endpoint_url = None
            region_name = None
            root = None

            if 'endpoint_url' in kwargs:
                endpoint_url = kwargs.pop('endpoint_url')
                root = path
            if 'region_name' in kwargs:
                region_name = kwargs.pop('region_name')
            if path.startswith("http://") or path.startswith("https://"):
                import urllib3.util
                url = urllib3.util.parse_url(path_or_store)
                if url.port is not None:
                    endpoint_url = f'{url.scheme}://{url.host}:{url.port}'
                else:
                    endpoint_url = f'{url.scheme}://{url.host}'
                root = url.path
                if root.startswith('/'):
                    root = root[1:]

            if endpoint_url and root is not None:
                s3 = s3fs.S3FileSystem(anon=True,
                                       client_kwargs=dict(
                                           endpoint_url=endpoint_url,
                                           region_name=region_name))
                consolidated = s3.exists(f'{root}/.zmetadata')
                path_or_store = s3fs.S3Map(root=root, s3=s3, check=False)
                if 'max_cache_size' in kwargs:
                    max_cache_size = kwargs.pop('max_cache_size')
                    if max_cache_size > 0:
                        path_or_store = zarr.LRUStoreCache(
                            path_or_store, max_size=max_cache_size)
            else:
                consolidated = os.path.exists(
                    os.path.join(path_or_store, '.zmetadata'))
        return xr.open_zarr(path_or_store, consolidated=consolidated, **kwargs)
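For the local-path branch, the behavior can be reproduced in a few lines; this sketch writes a small store and reads it back, probing for .zmetadata the same way (demo.zarr is just an illustrative path):

import os
import numpy as np
import xarray as xr

ds = xr.Dataset({'a': ('x', np.arange(4))})
ds.to_zarr('demo.zarr', mode='w', consolidated=True)

# open consolidated only if the consolidated metadata file is present
consolidated = os.path.exists(os.path.join('demo.zarr', '.zmetadata'))
restored = xr.open_zarr('demo.zarr', consolidated=consolidated)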
Example #20
    def load(self, filename: Optional[str] = None):
        """Load pose tracks estimated using DeepPoseKit.

        Args:
            filename (str, optional): name of the file produced by DeepPoseKit (defaults to self.path)

        Returns:
            poses_ego (xarray.DataArray): poses in EGOcentric (centered around thorax, head aligned straight upwards), [frames, flies, bodypart, x/y]
            poses_allo (xarray.DataArray): poses in ALLOcentric (frame) coordinates, [frames, flies, bodypart, x/y]
            partnames (List[str]): list of names for each body part (is already part of the poses_ego/allo xrs)
            first_pose_frame, last_pose_frame (int): frame corresponding to first and last item in the poses_ego/allo arrays (could attach this info as xr dim)
        """
        if filename is None:
            filename = self.path

        with zarr.ZipStore(filename, mode='r') as zarr_store:
            ds = xr.open_zarr(zarr_store).load()  # the final `load()` pulls everything into memory before the ZipStore closes
        nb_flies = len(ds.flies)
        box_size = np.array(ds.attrs['box_size'])

        poses_allo = ds.poses + ds.box_centers - box_size/2

        first_pose_frame = int(np.argmin(np.isnan(ds.poses.data[:, 0, 0, 0])))
        last_pose_frame = int(np.argmin(~np.isnan(ds.poses.data[first_pose_frame:, 0, 0, 0])) + first_pose_frame)
        if last_pose_frame == first_pose_frame or last_pose_frame == 0:
            last_pose_frame = ds.poses.shape[0]

        # CUT to first/last frame with poses
        poses_ego = ds.poses[first_pose_frame:last_pose_frame, ...]
        poses_allo = poses_allo[first_pose_frame:last_pose_frame, ...]

        poses_ego = poses_ego - poses_ego.sel(poseparts='thorax')  # CENTER egocentric poses around thorax

        # ROTATE egocentric poses such that the angle between head and thorax is 0 degrees (straight upwards)
        head_thorax_angle = 270 + np.arctan2(poses_ego.sel(poseparts='head', coords='y'),
                                             poses_ego.sel(poseparts='head', coords='x')) * 180 / np.pi
        for cnt, (a, p_ego) in enumerate(zip(head_thorax_angle.data, poses_ego.data)):
            for fly in range(nb_flies):
                poses_ego.data[cnt, fly, ...] = rotate_pose(p_ego[fly], -a[fly])

        return poses_ego, poses_allo, ds.poseparts, first_pose_frame, last_pose_frame
Example #21
    def test_replace_time_slice(self):
        self.write_cube('2019-01-02', 10)

        replace_time_slice(self.CUBE_PATH, 5,
                           self.make_slice('2019-01-06T02:00'))
        replace_time_slice(self.CUBE_PATH, 9,
                           self.make_slice('2019-01-11T02:00'))
        replace_time_slice(self.CUBE_PATH, 0,
                           self.make_slice('2019-01-01T02:00'))

        cube = xr.open_zarr(self.CUBE_PATH)
        expected = np.array([
            '2019-01-01T14:00', '2019-01-03T12:00', '2019-01-04T12:00',
            '2019-01-05T12:00', '2019-01-06T12:00', '2019-01-06T14:00',
            '2019-01-08T12:00', '2019-01-09T12:00', '2019-01-10T12:00',
            '2019-01-11T14:00'
        ],
                            dtype=cube.time.dtype)
        self.assertEqual(10, cube.time.size)
        self.assertEqual(None, cube.time.chunks)
        np.testing.assert_equal(cube.time.values, expected)
Example #22
def perform_cmip6_query(conf, query_string):
    df_sub = conf.df.query(query_string)
    if (df_sub.zstore.values.size == 0):
        return df_sub

    mapper = conf.fs.get_mapper(df_sub.zstore.values[-1])
    ds = xr.open_zarr(mapper, consolidated=True)

    time_object = ds["time"].values[0]

    # Convert if necessary
    if time_object.year == 1:
        times = ds["time"].values
        times_plus_2000 = []
        for t in times:
            times_plus_2000.append(
                cftime.DatetimeNoLeap(t.year + 2000, t.month, t.day, t.hour))
        ds["time"].values = times_plus_2000
        ds = xr.decode_cf(ds)
    return ds
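A hedged usage sketch, assuming conf.df holds the Pangeo CMIP6 catalog (a pandas DataFrame with a zstore column) and conf.fs is an anonymous GCS filesystem; the configuration object itself is not shown in the example, so the Conf class here is purely illustrative:

import gcsfs
import pandas as pd

class Conf:
    pass

conf = Conf()
conf.df = pd.read_csv('https://storage.googleapis.com/cmip6/cmip6-zarr-consolidated-stores.csv')
conf.fs = gcsfs.GCSFileSystem(token='anon')

query = ("table_id == 'Omon' and variable_id == 'tos' and "
         "experiment_id == 'historical' and source_id == 'NorESM2-LM'")
ds = perform_cmip6_query(conf, query)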
Example #23
def get_sst():
    import fsspec
    import xarray as xr
    # climatology years
    cyr1, cyr2 = '1993-01-01', '2018-12-31'

    # sst
    file_location = 's3://mur-sst/zarr'
    ds = xr.open_zarr(fsspec.get_mapper(file_location, anon=True), consolidated=True)
    ds_sst = ds.drop_vars(['analysis_error', 'mask', 'sea_ice_fraction'])
    tem = ds_sst.analysed_sst.attrs
    tem['var_name'] = 'mur_sst'
    ds_sst.analysed_sst.attrs = tem
    ds_sst_clim = ds_sst.sel(time=slice(cyr1, cyr2))
    ds_sst_clim = ds_sst_clim.groupby('time.dayofyear').mean('time', keep_attrs=True, skipna=False)

    # put data into a dictionary
    data_dict = {'sst': ds_sst}
    clim_dict = {'sst_clim': ds_sst_clim}
  
    return data_dict,clim_dict
Example #24
    def test_upsample_with_multiple_methods(self):
        result = self.invoke_cli([
            'resample',
            '--variables',
            'temperature',
            '-F',
            '12H',
            '-T',
            '6H',
            # '-K', 'quadratic',
            # '-M', 'interpolate',
            '-M',
            'nearest',
            TEST_ZARR_DIR
        ])
        self.assertEqual(0, result.exit_code)
        self.assertTrue(os.path.isdir('out.zarr'))
        ds = xr.open_zarr('out.zarr')
        assert_cube(ds)
        # self.assertIn('temperature_interpolate', ds)
        self.assertIn('temperature_nearest', ds)
Example #25
def test_zarr_mapping_set_2d(dtype=int):
    arr = np.array([2.0], dtype=dtype)
    schema = xr.Dataset({"a": (["x"], arr)}).chunk()
    coords = {"time": [0, 1, 2], "space": list("xyz")}

    store = {}
    m = vcm.ZarrMapping.from_schema(store,
                                    schema,
                                    dims=["time", "space"],
                                    coords=coords)
    for time, space in product(coords["time"], coords["space"]):
        m[time, space] = schema

    ds = xr.open_zarr(store)
    assert list(ds.a.dims) == ["time", "space", "x"]

    # check all data
    for time, space in product(coords["time"], coords["space"]):
        a = ds.sel(space=space, time=time).drop(["time", "space"]).load()
        b = schema.load()
        xr.testing.assert_equal(a, b)
Example #26
def test_dask_distributed_zarr_integration_test(loop, consolidated, compute):
    if consolidated:
        zarr = pytest.importorskip('zarr', minversion="2.2.1.dev2")
        write_kwargs = dict(consolidated=True)
        read_kwargs = dict(consolidated=True)
    else:
        write_kwargs = read_kwargs = {}
    chunks = {'dim1': 4, 'dim2': 3, 'dim3': 5}
    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop) as c:
            original = create_test_data().chunk(chunks)
            with create_tmp_file(allow_cleanup_failure=ON_WINDOWS,
                                 suffix='.zarrc') as filename:
                maybe_futures = original.to_zarr(filename, compute=compute,
                                                 **write_kwargs)
                if not compute:
                    maybe_futures.compute()
                with xr.open_zarr(filename, **read_kwargs) as restored:
                    assert isinstance(restored.var1.data, da.Array)
                    computed = restored.compute()
                    assert_allclose(original, computed)
Example #27
    def __init__(self,
                 dataset_name,
                 img_res=(512, 512),
                 downsize_factor=(4, 4),
                 local_path=None):
        self.dataset_name = dataset_name
        self.img_res = img_res
        self.downsize_factor = downsize_factor
        uris = [
            f'/rigel/ocp/projects/shared_data/sst_superresolution/LLC4320/SST.{tstep:010d}.zarr'
            for tstep in range(0, 4088 + 1, 73)
        ][:2]
        dsets = [xr.open_zarr(uri, consolidated=True) for uri in uris]
        ds = xr.combine_nested(dsets, 'timestep')
        print(ds)
        sst_coarse = ds.SST.coarsen(x=self.downsize_factor[0],
                                    y=self.downsize_factor[1]).mean()
        print(sst_coarse)
        timestep = 0  # index of the snapshot to extract
        region = 306
        self.hr = ds.SST[timestep, region].load().values
        self.lr = sst_coarse[timestep, region].load().values
Example #28
def test_vcf_to_zarr__multiple(shared_datadir, is_path, tmp_path):
    paths = [
        path_for_test(shared_datadir, "CEUTrio.20.gatk3.4.g.vcf.bgz", is_path),
        path_for_test(shared_datadir, "CEUTrio.21.gatk3.4.g.vcf.bgz", is_path),
    ]
    output = tmp_path.joinpath("vcf_concat.zarr").as_posix()

    vcf_to_zarr(paths, output, target_part_size=None, chunk_length=5_000)
    ds = xr.open_zarr(output)  # type: ignore[no-untyped-call]

    assert ds["sample_id"].shape == (1, )
    assert ds["call_genotype"].shape == (19910, 1, 2)
    assert ds["call_genotype_mask"].shape == (19910, 1, 2)
    assert ds["call_genotype_phased"].shape == (19910, 1)
    assert ds["variant_allele"].shape == (19910, 4)
    assert ds["variant_contig"].shape == (19910, )
    assert ds["variant_id"].shape == (19910, )
    assert ds["variant_id_mask"].shape == (19910, )
    assert ds["variant_position"].shape == (19910, )

    assert ds.chunks["variants"] == (5000, 5000, 5000, 4910)
Example #29
def data_tile_meta(prefix, var, tile_pos):
    z, x, y = tile_pos
    tile_meta = mercantile.Tile(x=x, y=y, z=z)
    min_lon, min_lat, max_lon, max_lat = mercantile.bounds(tile_meta)

    data_tile = xr.open_zarr(store=get_store(prefix))[var]
    data_tile = data_tile.sel(longitude=slice(min_lon, max_lon),
                              latitude=slice(max_lat, min_lat))

    return {
        "variable": var,
        "z": z,
        "x": x,
        "y": y,
        "lats": [min_lat, max_lat],
        "lons": [min_lon, max_lon],
        f"{var}Max": float(data_tile.max().compute()),
        f"{var}Min": float(data_tile.min().compute()),
        "units": data_tile.units,
        "description": data_tile.long_name,
    }
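mercantile.bounds returns (west, south, east, north) in degrees for a Web Mercator tile, which is why the latitude slice above runs from max_lat to min_lat, matching a descending latitude coordinate in the store. For instance:

import mercantile

min_lon, min_lat, max_lon, max_lat = mercantile.bounds(mercantile.Tile(x=8, y=5, z=4))
# at zoom 4 the world is a 16 x 16 tile grid, so x=8 spans longitudes 0.0 to 22.5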
Example #30
def test_NetCDFtoZarrMultiVarSequentialRecipe(
    daily_xarray_dataset, netcdf_local_paths_by_variable, tmp_target, tmp_cache
):
    paths, items_per_file, fnames_by_variable, path_format = netcdf_local_paths_by_variable
    pattern = VariableSequencePattern(
        path_format, keys={"variable": ["foo", "bar"], "n": list(range(len(paths) // 2))}
    )
    r = recipe.NetCDFtoZarrMultiVarSequentialRecipe(
        input_pattern=pattern,
        sequence_dim="time",
        inputs_per_chunk=1,
        nitems_per_input=items_per_file,
        target=tmp_target,
        input_cache=tmp_cache,
    )
    _manually_execute_recipe(r)

    ds_target = xr.open_zarr(tmp_target.get_mapper(), consolidated=True).compute()
    print(ds_target)
    print(daily_xarray_dataset)
    assert ds_target.identical(daily_xarray_dataset)
Example #31
def delineate(lat, lon, sub_latlon=[], sub_nb=None, acc_delta=np.inf, progress=False):
    pix_deg = 1 / 1200
    tile_deg = 5
    lat = (lat // pix_deg) * pix_deg + pix_deg
    lon = (lon // pix_deg) * pix_deg
    for ll in sub_latlon:
        ll[0] = (ll[0] // pix_deg) * pix_deg + pix_deg
        ll[1] = (ll[1] // pix_deg) * pix_deg
    if sub_nb is not None:
        dir_tile, acc_tile, y, x = get_tile(lat, lon, 2, pix_deg, tile_deg)
        acc_delta = acc_tile[y, x] / (sub_nb + 1)
    getSubBass = True
    sample_i = 0
    samples = np.empty((1024, 2), dtype=np.float64)
    lengths = np.empty(1024, dtype=np.float64)
    areas = np.empty(1024, dtype=np.float64)
    labels = np.empty((1024, 3), dtype=np.int32)
    dirNeighbors = np.empty(1024, dtype=np.uint8)
    accNeighbors = np.empty(1024, dtype=np.float64)
    ws_latlon = np.empty(2, dtype=np.float64)
    # output mask ->
    mxw = 10000 # bytes
    myw = mxw * 8 # bits
    mm = np.empty((myw, mxw), dtype=np.uint8)
    mm_back = np.empty((myw, mxw), dtype=np.uint8)
    mx0_deg = 0
    my0_deg = 0
    # <- output mask

    simple_delineation = False
    if len(sub_latlon) == 0:
        _sub_latlon = np.empty((1, 2), dtype=np.float64)
        _sub_latlon[0, :] = [lat, lon]
        if not np.isfinite(acc_delta):
            simple_delineation = True
    else:
        _sub_latlon = np.empty((len(sub_latlon), 2), dtype=np.float64)
        _sub_latlon[:, :] = sub_latlon
    tile_size = int(round(tile_deg / pix_deg))
    if simple_delineation:
        sample_size = 1
        samples[0] = [lat, lon]
    else:
        if progress:
            print('Getting basin partition...')
        samples, labels, areas, lengths, sample_size, mx0_deg, my0_deg, ws_mask, ws_latlon, dirNeighbors, accNeighbors = cdelineate(lat, lon, getSubBass, sample_i, samples, labels, areas, lengths, pix_deg, tile_deg, acc_delta, _sub_latlon, mm, mm_back, mx0_deg, my0_deg, dirNeighbors, accNeighbors)
        if not is_empty_latlon(_sub_latlon):
            print("WARNING: not all subbasins have been processed. This means that they don't fall into different pixels, or that they are not located in the basin. Please check their lat/lon coordinates.")
            for i in range(_sub_latlon.shape[0]):
                if _sub_latlon[i, 0] > -900:
                    print(_sub_latlon[i, 0], _sub_latlon[i, 1])
    #print('Delineating sub-basins...')
    mask, latlon = [], []
    getSubBass = False
    lat_min = np.inf
    lat_max = -np.inf
    lon_min = np.inf
    lon_max = -np.inf
    new_labels = []
    shutil.rmtree('tmp/ws', ignore_errors=True)
    shutil.rmtree('tmp/ds_mask', ignore_errors=True)
    this_range = range(sample_size)
    if progress:
        this_range = tqdm(this_range)
    for sample_i in this_range:
        _, _, _, _, _, mx0_deg, my0_deg, ws_mask, ws_latlon, dirNeighbors, accNeighbors = cdelineate(lat, lon, getSubBass, sample_i, samples, labels, areas, lengths, pix_deg, tile_deg, acc_delta, _sub_latlon, mm, mm_back, mx0_deg, my0_deg, dirNeighbors, accNeighbors)
        clat = np.array([ws_latlon[0] - (i + 0.5) * pix_deg for i in range(ws_mask.shape[0])])
        clon = np.array([ws_latlon[1] + (i + 0.5) * pix_deg for i in range(ws_mask.shape[1])])
        da_mask = xr.DataArray(ws_mask, coords=[clat, clon], dims=['lat', 'lon'])
        if sample_i == 0:
            new_labels.append('0')
        else:
            i = labels[sample_i][0]
            new_labels.append(new_labels[i] + ',' + str(labels[sample_i][2]))
        ds_mask = da_mask.to_dataset(name='mask')
        ds_mask.to_zarr(store=f'tmp/ws/{sample_i}', mode='w')
        #lat_min = min(lat_min, clat[-1])
        #lat_max = max(lat_max, clat[0])
        #lon_min = min(lon_min, clon[0])
        #lon_max = max(lon_max, clon[-1])
    #vmin = {'lat': lat_min, 'lon': lon_min}
    #vmax = {'lat': lat_max, 'lon': lon_max}
    #new_lat = np.arange(lat_max, lat_min-tolerance, -pix_deg)
    #new_lon = np.arange(lon_min, lon_max+tolerance, pix_deg)
    #for sample_i in range(sample_size):
    #    label = new_labels[sample_size-1-sample_i]
    #    ds_mask = xr.open_zarr(f'tmp/ws/{sample_i}').compute()
    #    da_mask = ds_mask[str(sample_i)]
    #    ilat = (np.abs(new_lat - da_mask.lat.values[0])).argmin()
    #    ilon = (np.abs(new_lon - da_mask.lon.values[0])).argmin()
    #    nlat = 1 + int(round((lat_max - lat_min) / pix_deg))
    #    nlon = 1 + int(round((lon_max - lon_min) / pix_deg))
    #    mask = da.zeros((nlat, nlon), chunks=(1000,1000), dtype='uint8')
    #    #mask = zarr.zeros((nlat, nlon), chunks=(nlat, nlon), dtype='uint8')
    #    da_mask2 = xr.DataArray(mask, coords=[new_lat, new_lon], dims=['lat', 'lon'])
    #    da_mask2[ilat:ilat+da_mask.shape[0], ilon:ilon+da_mask.shape[1]] = da_mask.values
    #    if sample_i == 0:
    #        ds_mask = da_mask2.to_dataset(name=label)
    #    else:
    #        ds_mask[label] = da_mask2
    #ds_mask.to_zarr(store=f'tmp/ds_ws', mode='w')
    #shutil.rmtree('tmp/ds_ws', ignore_errors=True)
    for sample_i in range(sample_size):
        label = new_labels[sample_size-1-sample_i]
        ds_mask = xr.open_zarr(f'tmp/ws/{sample_i}').compute()
        da_mask = ds_mask['mask']
        olatlon = [samples[sample_i][0]-pix_deg/2, samples[sample_i][1]+pix_deg/2]
        da_mask.attrs = {'outlet': olatlon, 'area': areas[sample_size-1-sample_i], 'length': lengths[sample_size-1-sample_i], 'label': label}
        ds_mask = da_mask.to_dataset()
        ds_mask.to_zarr(store=f'tmp/ds_mask/{label}', mode='w')
        shutil.rmtree(f'tmp/ws/{sample_i}', ignore_errors=True)
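Usage mirrors Example #1: delineate writes one Zarr store per (sub-)basin under tmp/ds_mask/, keyed by label, with the root basin at label '0'. A hedged sketch (the coordinates are illustrative, and the compiled cdelineate helper plus the flow-direction tiles must be available):

# delineate the watershed draining through a point, then read back the root mask
delineate(45.5, 6.5, progress=True)
ds_mask = xr.open_zarr('tmp/ds_mask/0').compute()
mask = ds_mask['mask'].values
print(ds_mask['mask'].attrs['area'], ds_mask['mask'].attrs['label'])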