def write_catalog(self):
    # if the catalog already exists, don't do this
    if os.path.exists(self.catalog_name):
        return
    else:
        lines = 'sources:\n'
        for filename in self.filenames:
            if 'csv' in filename:
                file_intake = intake.open_csv(filename)
                data = file_intake.read()
                metadata = {
                    'variables': list(data.columns.values),
                    'geospatial_lon_min': float(data['longitude'].min()),
                    'geospatial_lat_min': float(data['latitude'].min()),
                    'geospatial_lon_max': float(data['longitude'].max()),
                    'geospatial_lat_max': float(data['latitude'].max()),
                    'time_coverage_start': data['time'].min(),
                    'time_coverage_end': data['time'].max(),
                }
                file_intake.metadata = metadata
                # 'time variables info': 'test', 'space variables info': 'test'}
            elif 'nc' in filename:
                file_intake = intake.open_netcdf(filename)
                data = file_intake.read()
                metadata = {
                    'coords': list(data.coords.keys()),
                    'variables': list(data.data_vars.keys()),
                }
                file_intake.metadata = metadata
            file_intake.name = filename.split('/')[-1]
            lines += file_intake.yaml().strip('sources:')
        f = open(self.catalog_name, "w")
        f.write(lines)
        f.close()
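# Usage sketch (not from the original source; the catalog path is an assumption):
# once write_catalog() has run, the file can be opened like any other Intake catalog.
import intake

cat = intake.open_catalog("catalog.yml")  # assumed value of self.catalog_name
print(list(cat))           # entry names are the original file names
entry = cat[list(cat)[0]]
print(entry.metadata)      # e.g. geospatial and time coverage for csv entries
data = entry.read()        # DataFrame for csv entries, xarray Dataset for netCDF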
import intake
import numpy as np
import pandas as pd
import hvplot.pandas  # noqa: registers the .hvplot accessor on pandas objects
import holoviews as hv
from holoviews.streams import Selection1D, Params
import panel as pn
import geoviews as gv
import geoviews.tile_sources as gts
import cartopy.crs as ccrs
import pyproj

hv.extension('bokeh')

# the {species} pattern in the path becomes a column in the resulting DataFrame
df = intake.open_csv('./data/bird_migration/{species}.csv').read()


def fill_day(v):
    # pad the track with copies of itself shifted one "year" earlier and later so
    # that interpolation wraps smoothly across the year boundary
    next_year = v.assign(day=v.day + v.day.max())
    last_year = v.assign(day=v.day - v.day.max())
    surrounding_years = pd.concat([last_year, v, next_year])
    filled = surrounding_years.assign(lat=surrounding_years.lat.interpolate(),
                                      lon=surrounding_years.lon.interpolate())
    # keep only the days of the original year
    this_year = filled[filled.day.isin(v.day)]
    return this_year


g = pyproj.Geod(ellps='WGS84')
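# Sketch (not part of the original snippet, and it assumes the DataFrame has
# 'species', 'lon', 'lat', and 'day' columns as in the intake bird_migration
# example): interpolate each species' track with fill_day, then use the WGS84
# Geod to compute the geodesic distance travelled between consecutive days.
filled = df.groupby('species', group_keys=False).apply(fill_day)


def daily_distance_km(track):
    # pyproj.Geod.inv returns forward azimuth, back azimuth, and distance in metres
    _, _, dist_m = g.inv(track.lon.values[:-1], track.lat.values[:-1],
                         track.lon.values[1:], track.lat.values[1:])
    return pd.Series(dist_m / 1000, index=track.index[1:])


distances = filled.groupby('species', group_keys=False).apply(daily_distance_km)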
def write_catalog(self):
    """Write catalog file."""

    # if the catalog already exists, don't do this
    if os.path.exists(self.catalog_name):
        return
    else:
        f = open(self.catalog_name, "w")

        if self.axds_type == "platform2":
            lines = "sources:\n"
            for dataset_id, dataset in self.search_results.items():
                if self.filetype == "csv":
                    urlpath = dataset["source"]["files"]["data.csv.gz"]["url"]
                    file_intake = intake.open_csv(
                        urlpath, csv_kwargs=dict(parse_dates=["time"])
                    )
                elif self.filetype == "netcdf":
                    key = [
                        key
                        for key in dataset["source"]["files"].keys()
                        if ".nc" in key
                    ][0]
                    urlpath = dataset["source"]["files"][key]["url"]
                    file_intake = intake.open_netcdf(
                        urlpath
                    )  # , xarray_kwargs=dict(parse_dates=['time']))

                # to get all metadata
                # source = intake.open_textfiles(meta_url, decoder=json.loads)
                # source.metadata = source.read()[0]
                meta_url = dataset["source"]["files"]["meta.json"]["url"]
                meta_url = meta_url.replace(" ", "%20")
                attributes = pd.read_json(meta_url)["attributes"]
                file_intake.description = attributes["summary"]
                metadata = {
                    "urlpath": urlpath,
                    "meta_url": meta_url,
                    "platform_category": attributes["platform_category"],
                    "geospatial_lon_min": attributes["geospatial_lon_min"],
                    "geospatial_lat_min": attributes["geospatial_lat_min"],
                    "geospatial_lon_max": attributes["geospatial_lon_max"],
                    "geospatial_lat_max": attributes["geospatial_lat_max"],
                    "source_id": attributes["packrat_source_id"],
                    "packrat_uuid": attributes["packrat_uuid"],
                    "time_coverage_start": attributes["time_coverage_start"],
                    "time_coverage_end": attributes["time_coverage_end"],
                }
                file_intake.metadata = metadata
                file_intake.name = attributes["packrat_uuid"]
                lines += file_intake.yaml().strip("sources:")

        elif self.axds_type == "layer_group":
            lines = """
plugins:
  source:
    - module: intake_xarray
sources:
"""
            # catalog entries are by module uuid and unique to opendap urls
            # dataset_ids are module uuids
            for dataset_id, dataset in self.search_results.items():
                # layer_groups associated with module
                layer_groups = dataset["data"]["layer_group_info"]

                # get search results for layer_groups
                urlpaths = []
                for layer_group_uuid in layer_groups.keys():
                    url_layer_group = self.url_builder(
                        self.url_docs_base, dataset_id=layer_group_uuid
                    )
                    search_results_lg = requests.get(
                        url_layer_group, headers=self.search_headers
                    ).json()[0]
                    if "OPENDAP" in search_results_lg["data"]["access_methods"]:
                        url = search_results_lg["source"]["layers"][0][
                            "thredds_opendap_url"
                        ]
                        if ".html" in url:
                            url = url.replace(".html", "")
                        urlpaths.append(url)
                    else:
                        urlpaths.append("")
                        logger.warning(
                            f"no opendap url for module: module uuid {dataset_id}, layer_group uuid {layer_group_uuid}"
                        )
                        continue

                # there may be different urls for different layer_groups
                # in which case associate the layer_group uuid with the dataset
                # since the module uuid wouldn't be unique
                # if there were no urlpaths for any of the layer_groups,
                # urlpaths is like ['', '', '', '', '', '', '', '']
                if len(set(urlpaths)) > 1:
                    logger.warning(
                        f"there are multiple urls for module: module uuid {dataset_id}. urls: {set(urlpaths)}"
                    )
                    for urlpath, layer_group_uuid in zip(
                        urlpaths, layer_groups.keys()
                    ):
                        lines += self.write_catalog_layer_group_entry(
                            dataset, layer_group_uuid, urlpath, layer_groups
                        )
                # check for when no urlpaths, don't save entry
                # if not opendap accessible
                elif set(urlpaths) == {""}:
                    logger.warning(
                        f"no opendap url for module: module uuid {dataset_id} for any of its layer_groups. Do not include entry in catalog."
                    )
                    continue
                else:
                    urlpath = list(set(urlpaths))[0]
                    # use module uuid
                    lines += self.write_catalog_layer_group_entry(
                        dataset, dataset_id, urlpath, layer_groups
                    )

        f.write(lines)
        f.close()
import intake

# list the entries in intake's builtin catalog
print(list(intake.cat))

# read a csv file
source = intake.open_csv(
    'https://timeseries.weebly.com/uploads/2/1/0/8/21086414/sea_ice.csv')

# discover basic information about the data source
print(source.discover())

# display the number of partitions
print(source.npartitions)

# display the kind of container the source produces
print(source.container)

# read the first partition
df = source.read_partition(0)
print(type(df))
print(df.shape)

# display the YAML catalog entry generated from the source
print(source.yaml())

cat = intake.open_catalog('https://raw.githubusercontent.com/intake/'
                          'intake-examples/master/tutorial/sea.yaml')

# a catalog is itself a source, so it can be rendered back to YAML too
print(cat.yaml())
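# A small follow-on sketch (the file name is assumed): persist the generated YAML to
# a local file and re-open it as a catalog, mirroring the remote sea.yaml example.
with open('sea_ice_catalog.yaml', 'w') as f:
    f.write(source.yaml())

local_cat = intake.open_catalog('sea_ice_catalog.yaml')
print(list(local_cat))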
def write_catalog(self):
    """Write catalog file."""

    # if the catalog already exists, don't do this
    if os.path.exists(self.catalog_name):
        return
    else:
        lines = "sources:\n"
        for filename in self.filenames:
            if "csv" in filename:
                file_intake = intake.open_csv(filename)
                data = file_intake.read()
                # # Remove skiprows entry and input header entry that we want
                # file_intake._csv_kwargs.pop("skiprows")
                # file_intake._csv_kwargs.update({"header": [0, 1]})
                metadata = {
                    "variables": list(data.columns.values),
                    "geospatial_lon_min": float(data["longitude"].min()),
                    "geospatial_lat_min": float(data["latitude"].min()),
                    "geospatial_lon_max": float(data["longitude"].max()),
                    "geospatial_lat_max": float(data["latitude"].max()),
                    "time_coverage_start": data["time"].min(),
                    "time_coverage_end": data["time"].max(),
                }
                file_intake.metadata = metadata
            elif "nc" in filename:
                file_intake = intake.open_netcdf(filename)
                data = file_intake.read()
                coords = list(data.coords.keys())

                # only look up cf variable names when the corresponding coordinate
                # exists; otherwise store empty strings
                if "T" in data.cf.get_valid_keys():
                    time_variable = data.cf["T"].name
                    time_coverage_start = str(data.cf["T"].min().values)
                    time_coverage_end = str(data.cf["T"].max().values)
                else:
                    time_variable = ""
                    time_coverage_start = ""
                    time_coverage_end = ""
                if "longitude" in data.cf.get_valid_keys():
                    lon_variable = data.cf["longitude"].name
                    geospatial_lon_min = float(data.cf["longitude"].min())
                    geospatial_lon_max = float(data.cf["longitude"].max())
                else:
                    lon_variable = ""
                    geospatial_lon_min = ""
                    geospatial_lon_max = ""
                if "latitude" in data.cf.get_valid_keys():
                    lat_variable = data.cf["latitude"].name
                    geospatial_lat_min = float(data.cf["latitude"].min())
                    geospatial_lat_max = float(data.cf["latitude"].max())
                else:
                    lat_variable = ""
                    geospatial_lat_min = ""
                    geospatial_lat_max = ""
                metadata = {
                    "coords": coords,
                    "variables": list(data.data_vars.keys()),
                    "time_variable": time_variable,
                    "lon_variable": lon_variable,
                    "lat_variable": lat_variable,
                    "geospatial_lon_min": geospatial_lon_min,
                    "geospatial_lon_max": geospatial_lon_max,
                    "geospatial_lat_min": geospatial_lat_min,
                    "geospatial_lat_max": geospatial_lat_max,
                    "time_coverage_start": time_coverage_start,
                    "time_coverage_end": time_coverage_end,
                }
                file_intake.metadata = metadata
            file_intake.name = filename.split("/")[-1]
            lines += file_intake.yaml().strip("sources:")
        f = open(self.catalog_name, "w")
        f.write(lines)
        f.close()
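# Usage sketch (assumptions: the catalog file name, and that entries carry the
# geospatial_* metadata written above): open the generated catalog and keep only
# the sources whose bounding box overlaps a region of interest.
import intake

cat = intake.open_catalog("catalog.yml")  # assumed value of self.catalog_name
lon_range, lat_range = (-100, -80), (20, 35)  # example region

in_region = []
for name in list(cat):
    m = cat[name].metadata
    # skip entries without spatial metadata (stored as "" above)
    if m.get("geospatial_lon_min", "") == "":
        continue
    overlaps = (m["geospatial_lon_min"] <= lon_range[1]
                and m["geospatial_lon_max"] >= lon_range[0]
                and m["geospatial_lat_min"] <= lat_range[1]
                and m["geospatial_lat_max"] >= lat_range[0])
    if overlaps:
        in_region.append(name)
print(in_region)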