def test_s3_read_netcdf(s3):
    url = f's3://{test_bucket_name}/example_1.nc'
    s3options = dict(client_kwargs={"endpoint_url": endpoint_uri})
    source = intake.open_netcdf(url, storage_options=s3options)
    ds = source.read()
    assert ds['rh'].isel(lat=0, lon=0, time=0).values.dtype == 'float32'
    assert ds['rh'].isel(lat=0, lon=0, time=0).values == 0.5
def test_http_read_netcdf_simplecache(data_server):
    url = f'simplecache::{data_server}/example_1.nc'
    source = intake.open_netcdf(url, chunks={}, xarray_kwargs={"engine": "netcdf4"})
    ds = source.to_dask()
    assert isinstance(ds, xr.core.dataset.Dataset)
    assert isinstance(ds.temp.data, dask.array.core.Array)
def test_http_read_netcdf_dask(data_server):
    url = f'{data_server}/next_example_1.nc'
    source = intake.open_netcdf(url, chunks={}, xarray_kwargs=dict(engine='h5netcdf'))
    ds = source.to_dask()
    # assert isinstance(ds._file_obj, xr.backends.h5netcdf_.H5NetCDFStore)
    assert isinstance(ds, xr.core.dataset.Dataset)
    assert isinstance(ds.temp.data, dask.array.core.Array)
def test_open_netcdf_s3_simplecache():
    bucket = 's3://its-live-data.jpl.nasa.gov'
    key = 'icesat2/alt06/rel003/ATL06_20181230162257_00340206_003_01.h5'
    url = f'simplecache::{bucket}/{key}'
    source = intake.open_netcdf(
        url,
        xarray_kwargs=dict(group='gt1l/land_ice_segments', engine='h5netcdf'),
        storage_options=dict(s3={'anon': True}),
    )
    ds = source.to_dask()
    assert isinstance(ds._file_obj, xr.backends.h5netcdf_.H5NetCDFStore)
    assert isinstance(ds, xr.core.dataset.Dataset)
def test_open_netcdf_gs():
    bucket = 'gs://ldeo-glaciology'
    key = 'bedmachine/BedMachineAntarctica_2019-11-05_v01.nc'
    url = f'{bucket}/{key}'
    source = intake.open_netcdf(
        url,
        chunks=3000,
        xarray_kwargs=dict(engine='h5netcdf'),
    )
    ds = source.to_dask()
    assert isinstance(ds._file_obj, xr.backends.h5netcdf_.H5NetCDFStore)
    assert isinstance(ds, xr.core.dataset.Dataset)
def write_catalog(self):
    # if the catalog already exists, don't do this
    if os.path.exists(self.catalog_name):
        return
    else:
        lines = 'sources:\n'
        for filename in self.filenames:
            if 'csv' in filename:
                file_intake = intake.open_csv(filename)
                data = file_intake.read()
                metadata = {
                    'variables': list(data.columns.values),
                    'geospatial_lon_min': float(data['longitude'].min()),
                    'geospatial_lat_min': float(data['latitude'].min()),
                    'geospatial_lon_max': float(data['longitude'].max()),
                    'geospatial_lat_max': float(data['latitude'].max()),
                    'time_coverage_start': data['time'].min(),
                    'time_coverage_end': data['time'].max(),
                }
                file_intake.metadata = metadata
                # 'time variables info': 'test', 'space variables info': 'test'}
            elif 'nc' in filename:
                file_intake = intake.open_netcdf(filename)
                data = file_intake.read()
                metadata = {
                    'coords': list(data.coords.keys()),
                    'variables': list(data.data_vars.keys()),
                }
                file_intake.metadata = metadata
            file_intake.name = filename.split('/')[-1]
            lines += file_intake.yaml().strip('sources:')
        f = open(self.catalog_name, "w")
        f.write(lines)
        f.close()
def generate_catalog(file_path_name, dataset_sub_name, parent_page, tags):
    """
    FILE_PATH_NAME: If there is more than one file, the glob pattern for the
        NetCDF files; otherwise the name of the single NetCDF file,
        e.g. 'air.mon.mean.nc'.
    DATASET_SUB_NAME: Name of the directory containing the NetCDF data files,
        e.g. 'GHCN_CAMS'. If there is a subdirectory such as monthly or daily,
        it should also be included, separated by "_".
    PARENT_PAGE: Name of the parent directory in the dataset type hierarchy,
        e.g. 'Temperature'.
    TAGS: A dataset may need to be catalogued into multiple child catalogs,
        e.g. "Atmosphere", "Temperature". Please keep the format consistent.
    """
    file_path_name = file_path_name.strip('""')
    path, fileName = os.path.split(file_path_name)
    print("1: " + file_path_name)
    print("2: " + dataset_sub_name)
    print("3: " + parent_page)
    print("4: " + tags)

    nfiles = len(glob.glob(file_path_name))

    # Set is_combine based on the number of files
    if nfiles > 1:
        is_combine = True
        print("More than one file")
    else:
        is_combine = False
        print("One file")

    temp = dataset_sub_name
    #print("file path name is " + file_path_name)
    #print("dataset_sub_name is " + dataset_sub_name)
    #print("parent page is " + parent_page)

    if is_combine:
        # Read with xarray
        source = xr.open_mfdataset(file_path_name, combine='nested', concat_dim='time')
        src = source
        # Use intake with xarray kwargs
        source = intake.open_netcdf(
            file_path_name,
            concat_dim='time',
            xarray_kwargs={'combine': 'nested', 'decode_times': True},
        )
    else:
        source = intake.open_netcdf(file_path_name)
        src = xr.open_dataset(file_path_name)

    source.discover()

    #print('subname' + dataset_sub_name)
    dataset_sub_name = open(dataset_sub_name.strip('""') + '.yaml', 'w')
    dataset_sub_name.write(source.yaml())
    dataset_sub_name.close()
    print(str(dataset_sub_name.name) + " was cataloged")

    #############################################
    # CATALOG_DIR: GitHub repository containing the master catalog
    # NOTE: It will be more accurate later
    catalog_dir = "https://raw.githubusercontent.com/kpegion/COLA-DATASETS-CATALOG/gh-pages/intake-catalogs/"

    print(type(path))
    print(path)
    open_catalog = catalog_dir + temp + ".yaml"
    #print("Here is: {0}".format(open_catalog))

    try:
        title = src.attrs['title']
    except KeyError:
        title = dataset_sub_name
    try:
        url = src.attrs['References']
    except KeyError:
        url = ""
    # Here url serves as the location
    url = path

    html_repr = xr.core.formatting_html.dataset_repr(src).replace('\\n', '\n')
    _header = src_header(title, parent_page, open_catalog, url, tags, open_catalog)
    tags = tags.split(',')
    _footer = src_footer()
    html_src = _header + html_repr + _footer

    page_name = fileName.replace('*', '').replace('..', '.')
    html_page = page_name + ".html"
    with open(html_page, "w") as file:
        file.write(html_src)
    print(html_page + " was created\n")
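# A minimal usage sketch for generate_catalog above (not part of the original
# source). The file path, dataset sub-name, and tags are hypothetical
# placeholders; the function assumes os, glob, xarray (xr), intake,
# src_header, and src_footer are available in the calling module.
generate_catalog(
    '/data/gridded/temp/GHCN_CAMS/air.mon.mean.nc',  # hypothetical file path
    'GHCN_CAMS_monthly',                             # hypothetical dataset sub-name
    'Temperature',                                   # parent page in the hierarchy
    'Atmosphere,Temperature',                        # comma-separated tags
)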
import xarray as xr
import intake

path = '/shared/scratch/nbehboud/gridded/temp/GHCN_CAMS/'
mean_temp = 'air.mon.mean.nc'

# Use intake to open the NetCDF file and describe it
source = intake.open_netcdf(path + mean_temp)
source.discover()

# Write the source description out as a catalog file
mean_outf = open('ghcn_cams.yaml', 'w')
mean_outf.write(source.yaml())
mean_outf.close()
def write_catalog(self):
    """Write catalog file."""
    # if the catalog already exists, don't do this
    if os.path.exists(self.catalog_name):
        return
    else:
        f = open(self.catalog_name, "w")

        if self.axds_type == "platform2":
            lines = "sources:\n"
            for dataset_id, dataset in self.search_results.items():
                if self.filetype == "csv":
                    urlpath = dataset["source"]["files"]["data.csv.gz"]["url"]
                    file_intake = intake.open_csv(
                        urlpath, csv_kwargs=dict(parse_dates=["time"])
                    )
                elif self.filetype == "netcdf":
                    key = [
                        key
                        for key in dataset["source"]["files"].keys()
                        if ".nc" in key
                    ][0]
                    urlpath = dataset["source"]["files"][key]["url"]
                    file_intake = intake.open_netcdf(
                        urlpath
                    )  # , xarray_kwargs=dict(parse_dates=['time']))

                # to get all metadata
                # source = intake.open_textfiles(meta_url, decoder=json.loads)
                # source.metadata = source.read()[0]
                meta_url = dataset["source"]["files"]["meta.json"]["url"]
                meta_url = meta_url.replace(" ", "%20")
                attributes = pd.read_json(meta_url)["attributes"]
                file_intake.description = attributes["summary"]
                metadata = {
                    "urlpath": urlpath,
                    "meta_url": meta_url,
                    "platform_category": attributes["platform_category"],
                    "geospatial_lon_min": attributes["geospatial_lon_min"],
                    "geospatial_lat_min": attributes["geospatial_lat_min"],
                    "geospatial_lon_max": attributes["geospatial_lon_max"],
                    "geospatial_lat_max": attributes["geospatial_lat_max"],
                    "source_id": attributes["packrat_source_id"],
                    "packrat_uuid": attributes["packrat_uuid"],
                    "time_coverage_start": attributes["time_coverage_start"],
                    "time_coverage_end": attributes["time_coverage_end"],
                }
                file_intake.metadata = metadata
                file_intake.name = attributes["packrat_uuid"]
                lines += file_intake.yaml().strip("sources:")

        elif self.axds_type == "layer_group":
            lines = """
plugins:
  source:
    - module: intake_xarray
sources:
"""
            # catalog entries are by module uuid and unique to opendap urls
            # dataset_ids are module uuids
            for dataset_id, dataset in self.search_results.items():
                # layer_groups associated with module
                layer_groups = dataset["data"]["layer_group_info"]

                # get search results for layer_groups
                urlpaths = []
                for layer_group_uuid in layer_groups.keys():
                    url_layer_group = self.url_builder(
                        self.url_docs_base, dataset_id=layer_group_uuid
                    )
                    search_results_lg = requests.get(
                        url_layer_group, headers=self.search_headers
                    ).json()[0]
                    if "OPENDAP" in search_results_lg["data"]["access_methods"]:
                        url = search_results_lg["source"]["layers"][0][
                            "thredds_opendap_url"
                        ]
                        if ".html" in url:
                            url = url.replace(".html", "")
                        urlpaths.append(url)
                    else:
                        urlpaths.append("")
                        logger.warning(
                            f"no opendap url for module: module uuid {dataset_id}, layer_group uuid {layer_group_uuid}"
                        )
                        continue

                # there may be different urls for different layer_groups
                # in which case associate the layer_group uuid with the dataset
                # since the module uuid wouldn't be unique
                # if there were no urlpaths for any of the layer_groups,
                # urlpaths is like ['', '', '', '', '', '', '', '']
                if len(set(urlpaths)) > 1:
                    logger.warning(
                        f"there are multiple urls for module: module uuid {dataset_id}. urls: {set(urlpaths)}"
                    )
                    for urlpath, layer_group_uuid in zip(
                        urlpaths, layer_groups.keys()
                    ):
                        lines += self.write_catalog_layer_group_entry(
                            dataset, layer_group_uuid, urlpath, layer_groups
                        )
                # check for when no urlpaths, don't save entry
                # if not opendap accessible
                elif set(urlpaths) == {""}:
                    logger.warning(
                        f"no opendap url for module: module uuid {dataset_id} for any of its layer_groups. Do not include entry in catalog."
                    )
                    continue
                else:
                    # use module uuid
                    urlpath = list(set(urlpaths))[0]
                    lines += self.write_catalog_layer_group_entry(
                        dataset, dataset_id, urlpath, layer_groups
                    )

        f.write(lines)
        f.close()
def test_http_read_netcdf(data_server):
    url = f'{data_server}/example_1.nc'
    source = intake.open_netcdf(url)
    ds = source.read()
    assert ds['rh'].isel(lat=0, lon=0, time=0).values.dtype == 'float32'
    assert ds['rh'].isel(lat=0, lon=0, time=0).values == 0.5
def test_http_open_netcdf(data_server):
    url = f'{data_server}/example_1.nc'
    source = intake.open_netcdf(url)
    ds = source.to_dask()
    assert isinstance(ds, xr.core.dataset.Dataset)
    assert isinstance(ds.temp.data, numpy.ndarray)
import intake
from intake_xarray.netcdf import NetCDFSource

print(list(intake.registry))

cat_source: NetCDFSource = intake.open_netcdf(
    '/Users/tpmaxwel/Dropbox/Tom/Data/MERRA/MERRA2/6hr/*.nc4',
    concat_dim="time",
)
cat_source.discover()

with open("./catalog_local.yaml", 'w') as f:
    f.write(cat_source.yaml())
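# A sketch (not part of the original snippet) of reading the generated catalog
# back with intake. The entry name is looked up from the catalog rather than
# assumed, since source.yaml() names the entry after the source.
import intake

cat = intake.open_catalog("./catalog_local.yaml")
entry_name = list(cat)[0]        # first (and only) entry written above
ds = cat[entry_name].to_dask()   # lazily open the concatenated dataset
print(ds)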
import intake

files_path = "/Users/tpmaxwel/Dropbox/Tom/Data/MERRA/DAILY/2005/JAN/*.nc"
datasource = intake.open_netcdf(files_path)
def write_catalog(self):
    """Write catalog file."""
    # if the catalog already exists, don't do this
    if os.path.exists(self.catalog_name):
        return
    else:
        lines = "sources:\n"
        for filename in self.filenames:
            if "csv" in filename:
                file_intake = intake.open_csv(filename)
                data = file_intake.read()
                # # Remove skiprows entry and input header entry that we want
                # file_intake._csv_kwargs.pop("skiprows")
                # file_intake._csv_kwargs.update({"header": [0, 1]})
                metadata = {
                    "variables": list(data.columns.values),
                    "geospatial_lon_min": float(data["longitude"].min()),
                    "geospatial_lat_min": float(data["latitude"].min()),
                    "geospatial_lon_max": float(data["longitude"].max()),
                    "geospatial_lat_max": float(data["latitude"].max()),
                    "time_coverage_start": data["time"].min(),
                    "time_coverage_end": data["time"].max(),
                }
                file_intake.metadata = metadata
            elif "nc" in filename:
                file_intake = intake.open_netcdf(filename)
                data = file_intake.read()
                coords = list(data.coords.keys())
                if "T" in data.cf.get_valid_keys():
                    time_coverage_start = str(data.cf["T"].min().values)
                    time_coverage_end = str(data.cf["T"].max().values)
                else:
                    time_coverage_start = ""
                    time_coverage_end = ""
                if "longitude" in data.cf.get_valid_keys():
                    geospatial_lon_min = float(data.cf["longitude"].min())
                    geospatial_lon_max = float(data.cf["longitude"].max())
                else:
                    geospatial_lon_min = ""
                    geospatial_lon_max = ""
                if "latitude" in data.cf.get_valid_keys():
                    geospatial_lat_min = float(data.cf["latitude"].min())
                    geospatial_lat_max = float(data.cf["latitude"].max())
                else:
                    geospatial_lat_min = ""
                    geospatial_lat_max = ""
                metadata = {
                    "coords": coords,
                    "variables": list(data.data_vars.keys()),
                    "time_variable": data.cf["T"].name,
                    "lon_variable": data.cf["longitude"].name,
                    "lat_variable": data.cf["latitude"].name,
                    "geospatial_lon_min": geospatial_lon_min,
                    "geospatial_lon_max": geospatial_lon_max,
                    "geospatial_lat_min": geospatial_lat_min,
                    "geospatial_lat_max": geospatial_lat_max,
                    "time_coverage_start": time_coverage_start,
                    "time_coverage_end": time_coverage_end,
                }
                file_intake.metadata = metadata
            file_intake.name = filename.split("/")[-1]
            lines += file_intake.yaml().strip("sources:")
        f = open(self.catalog_name, "w")
        f.write(lines)
        f.close()
def getDataSource(self, **kwargs) -> DataSource:
    # Concatenate the member files along the requested dimension (default: time)
    cdim = kwargs.get("concat_dim", "time")
    datasource = intake.open_netcdf(self.files, concat_dim=cdim)
    datasource.discover()
    return datasource
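# Hypothetical usage of the accessor above (names are placeholders, not from
# the original source): 'provider' stands for an instance of the enclosing
# class whose self.files is a NetCDF glob pattern.
source = provider.getDataSource(concat_dim="time")
ds = source.to_dask()   # dask-backed xarray Dataset
print(ds.dims)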