Example #1
    def write_catalog(self):

        # if the catalog already exists, don't do this
        if os.path.exists(self.catalog_name):
            return

        else:
            lines = 'sources:\n'

            for filename in self.filenames:

                if 'csv' in filename:
                    file_intake = intake.open_csv(filename)
                    data = file_intake.read()
                    metadata = {
                        'variables': list(data.columns.values),
                        'geospatial_lon_min': float(data['longitude'].min()),
                        'geospatial_lat_min': float(data['latitude'].min()),
                        'geospatial_lon_max': float(data['longitude'].max()),
                        'geospatial_lat_max': float(data['latitude'].max()),
                        'time_coverage_start': data['time'].min(),
                        'time_coverage_end': data['time'].max()
                    }
                    file_intake.metadata = metadata
                elif 'nc' in filename:
                    file_intake = intake.open_netcdf(filename)
                    data = file_intake.read()
                    metadata = {
                        'coords': list(data.coords.keys()),
                        'variables': list(data.data_vars.keys()),
                    }
                    file_intake.metadata = metadata

                file_intake.name = filename.split('/')[-1]
                # yaml() emits its own "sources:" header; strip it so each
                # entry nests under the single header written above
                lines += file_intake.yaml().strip('sources:')

            with open(self.catalog_name, "w") as f:
                f.write(lines)
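
A minimal sketch of consuming the catalog this method writes, assuming self.catalog_name ended up as 'catalog.yaml'; the entry names are the bare file names assigned via file_intake.name:

import intake

cat = intake.open_catalog('catalog.yaml')
print(list(cat))               # entry names, one per input file
df = cat[list(cat)[0]].read()  # load the first entry back into memory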
Example #2
import intake
import numpy as np
import pandas as pd
import hvplot.pandas

import holoviews as hv
from holoviews.streams import Selection1D, Params
import panel as pn

import geoviews as gv
import geoviews.tile_sources as gts
import cartopy.crs as ccrs
import pyproj

hv.extension('bokeh')

df = intake.open_csv('./data/bird_migration/{species}.csv').read()


def fill_day(v):
    next_year = v.assign(day=v.day + v.day.max())
    last_year = v.assign(day=v.day - v.day.max())
    surrounding_years = pd.concat([last_year, v, next_year])
    filled = surrounding_years.assign(lat=surrounding_years.lat.interpolate(),
                                      lon=surrounding_years.lon.interpolate())
    this_year = filled[filled.day.isin(v.day)]
    return this_year


g = pyproj.Geod(ellps='WGS84')
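
fill_day pads a one-year track with shifted copies of itself so interpolation can wrap across year boundaries. A hedged demo on a made-up DataFrame (the real data has day/lat/lon columns per species), plus the standard Geod.inv call for azimuths and geodesic distance:

toy = pd.DataFrame({'day': [1, 2, 3, 4],
                    'lat': [10.0, np.nan, 30.0, 40.0],
                    'lon': [100.0, np.nan, 120.0, 130.0]})
print(fill_day(toy))  # the NaN row is filled in by interpolation

az12, az21, dist_m = g.inv(100.0, 10.0, 120.0, 30.0)  # lon1, lat1, lon2, lat2
print(dist_m / 1000, 'km')  # geodesic distance on the WGS84 ellipsoid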

Example #3
    def write_catalog(self):
        """Write catalog file."""

        # if the catalog already exists, don't do this
        if os.path.exists(self.catalog_name):
            return

        else:

            if self.axds_type == "platform2":
                lines = "sources:\n"

                for dataset_id, dataset in self.search_results.items():
                    if self.filetype == "csv":
                        urlpath = dataset["source"]["files"]["data.csv.gz"][
                            "url"]
                        file_intake = intake.open_csv(
                            urlpath, csv_kwargs=dict(parse_dates=["time"]))
                    elif self.filetype == "netcdf":
                        key = [
                            key for key in dataset["source"]["files"].keys()
                            if ".nc" in key
                        ][0]
                        urlpath = dataset["source"]["files"][key]["url"]
                        file_intake = intake.open_netcdf(
                            urlpath
                        )  # , xarray_kwargs=dict(parse_dates=['time']))
                    # to get all metadata
                    # source = intake.open_textfiles(meta_url, decoder=json.loads)
                    # source.metadata = source.read()[0]
                    meta_url = dataset["source"]["files"]["meta.json"]["url"]
                    meta_url = meta_url.replace(" ", "%20")
                    attributes = pd.read_json(meta_url)["attributes"]
                    file_intake.description = attributes["summary"]
                    metadata = {
                        "urlpath": urlpath,
                        "meta_url": meta_url,
                        "platform_category": attributes["platform_category"],
                        "geospatial_lon_min": attributes["geospatial_lon_min"],
                        "geospatial_lat_min": attributes["geospatial_lat_min"],
                        "geospatial_lon_max": attributes["geospatial_lon_max"],
                        "geospatial_lat_max": attributes["geospatial_lat_max"],
                        "source_id": attributes["packrat_source_id"],
                        "packrat_uuid": attributes["packrat_uuid"],
                        "time_coverage_start":
                        attributes["time_coverage_start"],
                        "time_coverage_end": attributes["time_coverage_end"],
                    }
                    file_intake.metadata = metadata
                    file_intake.name = attributes["packrat_uuid"]
                    lines += file_intake.yaml().strip("sources:")

            elif self.axds_type == "layer_group":
                lines = """
plugins:
  source:
    - module: intake_xarray
sources:
"""
                # catalog entries are by module uuid and unique to opendap urls
                # dataset_ids are module uuids
                for dataset_id, dataset in self.search_results.items():

                    # layer_groups associated with module
                    layer_groups = dataset["data"]["layer_group_info"]

                    # get search results for layer_groups
                    urlpaths = []
                    for layer_group_uuid in layer_groups.keys():
                        url_layer_group = self.url_builder(
                            self.url_docs_base, dataset_id=layer_group_uuid)
                        search_results_lg = requests.get(
                            url_layer_group,
                            headers=self.search_headers).json()[0]

                        if "OPENDAP" in search_results_lg["data"][
                                "access_methods"]:
                            url = search_results_lg["source"]["layers"][0][
                                "thredds_opendap_url"]
                            if ".html" in url:
                                url = url.replace(".html", "")
                            urlpaths.append(url)
                        else:
                            urlpaths.append("")
                            logger.warning(
                                f"no opendap url for module: module uuid {dataset_id}, layer_group uuid {layer_group_uuid}"
                            )
                            continue

                    # there may be different urls for different layer_groups
                    # in which case associate the layer_group uuid with the dataset
                    # since the module uuid wouldn't be unique
                    # if there were no urlpaths for any of the layer_groups,
                    # urlpaths is like ['', '', '', '', '', '', '', '']
                    if len(set(urlpaths)) > 1:
                        logger.warning(
                            f"there are multiple urls for module: module uuid {dataset_id}. urls: {set(urlpaths)}"
                        )
                        for urlpath, layer_group_uuid in zip(
                                urlpaths, layer_groups.keys()):
                            lines += self.write_catalog_layer_group_entry(
                                dataset, layer_group_uuid, urlpath,
                                layer_groups)

                    # check for when no urlpaths, don't save entry
                    # if not opendap accessible
                    elif set(urlpaths) == {""}:
                        logger.warning(
                            f"no opendap url for module: module uuid {dataset_id} for any of its layer_groups. Do not include entry in catalog."
                        )
                        continue

                    else:
                        urlpath = list(set(urlpaths))[0]
                        # use module uuid
                        lines += self.write_catalog_layer_group_entry(
                            dataset, dataset_id, urlpath, layer_groups)

            with open(self.catalog_name, "w") as f:
                f.write(lines)
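
The layer_group branch registers the intake_xarray plugin, whose opendap driver resolves the THREDDS URLs collected above. A rough sketch of opening one such URL directly (the URL below is a placeholder, not from the source):

import intake

url = 'https://server.example/thredds/dodsC/some_dataset'  # placeholder URL
source = intake.open_opendap(url)  # driver provided by intake-xarray
ds = source.read()                 # returns an xarray.Dataset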
Example #4
import intake

# list items in catalog
print(list(intake.cat))

# read a csv file
source = intake.open_csv(
    'https://timeseries.weebly.com/uploads/2/1/0/8/21086414/sea_ice.csv')

# discover basic information about the data source
print(source.discover())

# display the number of partitions
print(source.npartitions)

# display the kind of container the source produces
print(source.container)

# read the first partition
df = source.read_partition(0)
print(type(df))
print(df.shape)

# print the YAML that would form a catalog entry for the source
print(source.yaml())

cat = intake.open_catalog('https://raw.githubusercontent.com/intake/'
                          'intake-examples/master/tutorial/sea.yaml')

# a catalog is itself a source, so it too can be rendered as YAML
print(cat.yaml())
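
Beyond read_partition, any intake source also supports eager and lazy reads; both calls below are standard intake source methods:

df_all = source.read()      # all partitions concatenated into one DataFrame
print(df_all.shape)

ddf = source.to_dask()      # lazy dask.dataframe, computed on demand
print(ddf.npartitions)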
Example #5
    def write_catalog(self):
        """Write catalog file."""

        # if the catalog already exists, don't do this
        if os.path.exists(self.catalog_name):
            return

        else:
            lines = "sources:\n"

            for filename in self.filenames:

                if "csv" in filename:
                    file_intake = intake.open_csv(filename)
                    data = file_intake.read()
                    # Remove the skiprows entry and insert the header entry we want:
                    # file_intake._csv_kwargs.pop("skiprows")
                    # file_intake._csv_kwargs.update({"header": [0, 1]})
                    metadata = {
                        "variables": list(data.columns.values),
                        "geospatial_lon_min": float(data["longitude"].min()),
                        "geospatial_lat_min": float(data["latitude"].min()),
                        "geospatial_lon_max": float(data["longitude"].max()),
                        "geospatial_lat_max": float(data["latitude"].max()),
                        "time_coverage_start": data["time"].min(),
                        "time_coverage_end": data["time"].max(),
                    }
                    file_intake.metadata = metadata

                elif "nc" in filename:
                    file_intake = intake.open_netcdf(filename)
                    data = file_intake.read()
                    coords = list(data.coords.keys())
                    # guard each cf-xarray lookup so files lacking CF metadata
                    # produce empty fields instead of raising KeyError
                    if "T" in data.cf.get_valid_keys():
                        time_variable = data.cf["T"].name
                        time_coverage_start = str(data.cf["T"].min().values)
                        time_coverage_end = str(data.cf["T"].max().values)
                    else:
                        time_variable = ""
                        time_coverage_start = ""
                        time_coverage_end = ""
                    if "longitude" in data.cf.get_valid_keys():
                        lon_variable = data.cf["longitude"].name
                        geospatial_lon_min = float(data.cf["longitude"].min())
                        geospatial_lon_max = float(data.cf["longitude"].max())
                    else:
                        lon_variable = ""
                        geospatial_lon_min = ""
                        geospatial_lon_max = ""
                    if "latitude" in data.cf.get_valid_keys():
                        lat_variable = data.cf["latitude"].name
                        geospatial_lat_min = float(data.cf["latitude"].min())
                        geospatial_lat_max = float(data.cf["latitude"].max())
                    else:
                        lat_variable = ""
                        geospatial_lat_min = ""
                        geospatial_lat_max = ""
                    metadata = {
                        "coords": coords,
                        "variables": list(data.data_vars.keys()),
                        "time_variable": time_variable,
                        "lon_variable": lon_variable,
                        "lat_variable": lat_variable,
                        "geospatial_lon_min": geospatial_lon_min,
                        "geospatial_lon_max": geospatial_lon_max,
                        "geospatial_lat_min": geospatial_lat_min,
                        "geospatial_lat_max": geospatial_lat_max,
                        "time_coverage_start": time_coverage_start,
                        "time_coverage_end": time_coverage_end,
                    }
                    file_intake.metadata = metadata

                file_intake.name = filename.split("/")[-1]
                lines += file_intake.yaml().strip("sources:")

            with open(self.catalog_name, "w") as f:
                f.write(lines)
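
Example #5 leans on the cf_xarray accessor, which the snippet never imports itself; get_valid_keys() is the older spelling of what newer cf_xarray releases expose as keys(). A minimal, self-contained sketch of the accessor calls the example relies on:

import numpy as np
import pandas as pd
import xarray as xr
import cf_xarray  # noqa: F401  importing registers the .cf accessor

ds = xr.Dataset({'temp': ('time', np.arange(3.0))},
                coords={'time': pd.date_range('2021-01-01', periods=3)})
print(ds.cf['T'].name)               # -> 'time', resolved by its CF role
print(str(ds.cf['T'].min().values))  # the same call the example uses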