def write_catalog(self):
    # if the catalog already exists, don't do this
    if os.path.exists(self.catalog_name):
        return
    else:
        lines = 'sources:\n'
        for filename in self.filenames:
            if 'csv' in filename:
                file_intake = intake.open_csv(filename)
                data = file_intake.read()
                metadata = {
                    'variables': list(data.columns.values),
                    'geospatial_lon_min': float(data['longitude'].min()),
                    'geospatial_lat_min': float(data['latitude'].min()),
                    'geospatial_lon_max': float(data['longitude'].max()),
                    'geospatial_lat_max': float(data['latitude'].max()),
                    'time_coverage_start': data['time'].min(),
                    'time_coverage_end': data['time'].max(),
                }
                file_intake.metadata = metadata
                # 'time variables info': 'test', 'space variables info': 'test'}
            elif 'nc' in filename:
                file_intake = intake.open_netcdf(filename)
                data = file_intake.read()
                metadata = {
                    'coords': list(data.coords.keys()),
                    'variables': list(data.data_vars.keys()),
                }
                file_intake.metadata = metadata
            file_intake.name = filename.split('/')[-1]
            lines += file_intake.yaml().strip('sources:')
        f = open(self.catalog_name, "w")
        f.write(lines)
        f.close()
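# Usage sketch (not from the original source; the catalog path is an assumption):
# once write_catalog() has run, the file can be opened like any other Intake catalog.
import intake

cat = intake.open_catalog("catalog.yml")  # assumed value of self.catalog_name
print(list(cat))           # entry names are the original file names
entry = cat[list(cat)[0]]
print(entry.metadata)      # e.g. geospatial and time coverage for csv entries
data = entry.read()        # DataFrame for csv entries, xarray Dataset for netCDF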
import intake
import numpy as np
import pandas as pd
import hvplot.pandas  # noqa: registers the .hvplot accessor on pandas objects
import holoviews as hv
from holoviews.streams import Selection1D, Params
import panel as pn
import geoviews as gv
import geoviews.tile_sources as gts
import cartopy.crs as ccrs
import pyproj

hv.extension('bokeh')

# the {species} pattern in the path becomes a column in the resulting DataFrame
df = intake.open_csv('./data/bird_migration/{species}.csv').read()


def fill_day(v):
    # pad the track with copies of itself shifted one "year" earlier and later so
    # that interpolation wraps smoothly across the year boundary
    next_year = v.assign(day=v.day + v.day.max())
    last_year = v.assign(day=v.day - v.day.max())
    surrounding_years = pd.concat([last_year, v, next_year])
    filled = surrounding_years.assign(lat=surrounding_years.lat.interpolate(),
                                      lon=surrounding_years.lon.interpolate())
    # keep only the days of the original year
    this_year = filled[filled.day.isin(v.day)]
    return this_year


g = pyproj.Geod(ellps='WGS84')
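# Sketch (not part of the original snippet, and it assumes the DataFrame has
# 'species', 'lon', 'lat', and 'day' columns as in the intake bird_migration
# example): interpolate each species' track with fill_day, then use the WGS84
# Geod to compute the geodesic distance travelled between consecutive days.
filled = df.groupby('species', group_keys=False).apply(fill_day)


def daily_distance_km(track):
    # pyproj.Geod.inv returns forward azimuth, back azimuth, and distance in metres
    _, _, dist_m = g.inv(track.lon.values[:-1], track.lat.values[:-1],
                         track.lon.values[1:], track.lat.values[1:])
    return pd.Series(dist_m / 1000, index=track.index[1:])


distances = filled.groupby('species', group_keys=False).apply(daily_distance_km)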
def write_catalog(self):
    """Write catalog file."""

    # if the catalog already exists, don't do this
    if os.path.exists(self.catalog_name):
        return
    else:
        f = open(self.catalog_name, "w")

        if self.axds_type == "platform2":
            lines = "sources:\n"
            for dataset_id, dataset in self.search_results.items():
                if self.filetype == "csv":
                    urlpath = dataset["source"]["files"]["data.csv.gz"]["url"]
                    file_intake = intake.open_csv(
                        urlpath, csv_kwargs=dict(parse_dates=["time"])
                    )
                elif self.filetype == "netcdf":
                    key = [
                        key
                        for key in dataset["source"]["files"].keys()
                        if ".nc" in key
                    ][0]
                    urlpath = dataset["source"]["files"][key]["url"]
                    file_intake = intake.open_netcdf(
                        urlpath
                    )  # , xarray_kwargs=dict(parse_dates=['time']))

                # to get all metadata
                # source = intake.open_textfiles(meta_url, decoder=json.loads)
                # source.metadata = source.read()[0]
                meta_url = dataset["source"]["files"]["meta.json"]["url"]
                meta_url = meta_url.replace(" ", "%20")
                attributes = pd.read_json(meta_url)["attributes"]
                file_intake.description = attributes["summary"]
                metadata = {
                    "urlpath": urlpath,
                    "meta_url": meta_url,
                    "platform_category": attributes["platform_category"],
                    "geospatial_lon_min": attributes["geospatial_lon_min"],
                    "geospatial_lat_min": attributes["geospatial_lat_min"],
                    "geospatial_lon_max": attributes["geospatial_lon_max"],
                    "geospatial_lat_max": attributes["geospatial_lat_max"],
                    "source_id": attributes["packrat_source_id"],
                    "packrat_uuid": attributes["packrat_uuid"],
                    "time_coverage_start": attributes["time_coverage_start"],
                    "time_coverage_end": attributes["time_coverage_end"],
                }
                file_intake.metadata = metadata
                file_intake.name = attributes["packrat_uuid"]
                lines += file_intake.yaml().strip("sources:")

        elif self.axds_type == "layer_group":
            lines = """
plugins:
  source:
    - module: intake_xarray
sources:
"""
            # catalog entries are by module uuid and unique to opendap urls
            # dataset_ids are module uuids
            for dataset_id, dataset in self.search_results.items():
                # layer_groups associated with module
                layer_groups = dataset["data"]["layer_group_info"]

                # get search results for layer_groups
                urlpaths = []
                for layer_group_uuid in layer_groups.keys():
                    url_layer_group = self.url_builder(
                        self.url_docs_base, dataset_id=layer_group_uuid
                    )
                    search_results_lg = requests.get(
                        url_layer_group, headers=self.search_headers
                    ).json()[0]
                    if "OPENDAP" in search_results_lg["data"]["access_methods"]:
                        url = search_results_lg["source"]["layers"][0][
                            "thredds_opendap_url"
                        ]
                        if ".html" in url:
                            url = url.replace(".html", "")
                        urlpaths.append(url)
                    else:
                        urlpaths.append("")
                        logger.warning(
                            f"no opendap url for module: module uuid {dataset_id}, layer_group uuid {layer_group_uuid}"
                        )
                        continue

                # there may be different urls for different layer_groups
                # in which case associate the layer_group uuid with the dataset
                # since the module uuid wouldn't be unique
                # if there were no urlpaths for any of the layer_groups,
                # urlpaths is like ['', '', '', '', '', '', '', '']
                if len(set(urlpaths)) > 1:
                    logger.warning(
                        f"there are multiple urls for module: module uuid {dataset_id}. urls: {set(urlpaths)}"
                    )
                    for urlpath, layer_group_uuid in zip(
                        urlpaths, layer_groups.keys()
                    ):
                        lines += self.write_catalog_layer_group_entry(
                            dataset, layer_group_uuid, urlpath, layer_groups
                        )
                # check for when no urlpaths, don't save entry
                # if not opendap accessible
                elif set(urlpaths) == {""}:
                    logger.warning(
                        f"no opendap url for module: module uuid {dataset_id} for any of its layer_groups. Do not include entry in catalog."
                    )
                    continue
                else:
                    urlpath = list(set(urlpaths))[0]
                    # use module uuid
                    lines += self.write_catalog_layer_group_entry(
                        dataset, dataset_id, urlpath, layer_groups
                    )

        f.write(lines)
        f.close()
import intake

# list the entries in intake's builtin catalog
print(list(intake.cat))

# read a csv file
source = intake.open_csv(
    'https://timeseries.weebly.com/uploads/2/1/0/8/21086414/sea_ice.csv')

# discover basic information about the data source
print(source.discover())

# display the number of partitions
print(source.npartitions)

# display the kind of container the source produces
print(source.container)

# read the first partition
df = source.read_partition(0)
print(type(df))
print(df.shape)

# display the YAML catalog entry generated from the source
print(source.yaml())

cat = intake.open_catalog('https://raw.githubusercontent.com/intake/'
                          'intake-examples/master/tutorial/sea.yaml')

# a catalog is itself a source, so it can be rendered back to YAML too
print(cat.yaml())
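# A small follow-on sketch (the file name is assumed): persist the generated YAML to
# a local file and re-open it as a catalog, mirroring the remote sea.yaml example.
with open('sea_ice_catalog.yaml', 'w') as f:
    f.write(source.yaml())

local_cat = intake.open_catalog('sea_ice_catalog.yaml')
print(list(local_cat))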
def write_catalog(self):
    """Write catalog file."""

    # if the catalog already exists, don't do this
    if os.path.exists(self.catalog_name):
        return
    else:
        lines = "sources:\n"
        for filename in self.filenames:
            if "csv" in filename:
                file_intake = intake.open_csv(filename)
                data = file_intake.read()
                # # Remove skiprows entry and input header entry that we want
                # file_intake._csv_kwargs.pop("skiprows")
                # file_intake._csv_kwargs.update({"header": [0, 1]})
                metadata = {
                    "variables": list(data.columns.values),
                    "geospatial_lon_min": float(data["longitude"].min()),
                    "geospatial_lat_min": float(data["latitude"].min()),
                    "geospatial_lon_max": float(data["longitude"].max()),
                    "geospatial_lat_max": float(data["latitude"].max()),
                    "time_coverage_start": data["time"].min(),
                    "time_coverage_end": data["time"].max(),
                }
                file_intake.metadata = metadata
            elif "nc" in filename:
                file_intake = intake.open_netcdf(filename)
                data = file_intake.read()
                coords = list(data.coords.keys())

                # only look up cf variable names when the corresponding coordinate
                # exists; otherwise store empty strings
                if "T" in data.cf.get_valid_keys():
                    time_variable = data.cf["T"].name
                    time_coverage_start = str(data.cf["T"].min().values)
                    time_coverage_end = str(data.cf["T"].max().values)
                else:
                    time_variable = ""
                    time_coverage_start = ""
                    time_coverage_end = ""
                if "longitude" in data.cf.get_valid_keys():
                    lon_variable = data.cf["longitude"].name
                    geospatial_lon_min = float(data.cf["longitude"].min())
                    geospatial_lon_max = float(data.cf["longitude"].max())
                else:
                    lon_variable = ""
                    geospatial_lon_min = ""
                    geospatial_lon_max = ""
                if "latitude" in data.cf.get_valid_keys():
                    lat_variable = data.cf["latitude"].name
                    geospatial_lat_min = float(data.cf["latitude"].min())
                    geospatial_lat_max = float(data.cf["latitude"].max())
                else:
                    lat_variable = ""
                    geospatial_lat_min = ""
                    geospatial_lat_max = ""
                metadata = {
                    "coords": coords,
                    "variables": list(data.data_vars.keys()),
                    "time_variable": time_variable,
                    "lon_variable": lon_variable,
                    "lat_variable": lat_variable,
                    "geospatial_lon_min": geospatial_lon_min,
                    "geospatial_lon_max": geospatial_lon_max,
                    "geospatial_lat_min": geospatial_lat_min,
                    "geospatial_lat_max": geospatial_lat_max,
                    "time_coverage_start": time_coverage_start,
                    "time_coverage_end": time_coverage_end,
                }
                file_intake.metadata = metadata
            file_intake.name = filename.split("/")[-1]
            lines += file_intake.yaml().strip("sources:")
        f = open(self.catalog_name, "w")
        f.write(lines)
        f.close()
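# Usage sketch (assumptions: the catalog file name, and that entries carry the
# geospatial_* metadata written above): open the generated catalog and keep only
# the sources whose bounding box overlaps a region of interest.
import intake

cat = intake.open_catalog("catalog.yml")  # assumed value of self.catalog_name
lon_range, lat_range = (-100, -80), (20, 35)  # example region

in_region = []
for name in list(cat):
    m = cat[name].metadata
    # skip entries without spatial metadata (stored as "" above)
    if m.get("geospatial_lon_min", "") == "":
        continue
    overlaps = (m["geospatial_lon_min"] <= lon_range[1]
                and m["geospatial_lon_max"] >= lon_range[0]
                and m["geospatial_lat_min"] <= lat_range[1]
                and m["geospatial_lat_max"] >= lat_range[0])
    if overlaps:
        in_region.append(name)
print(in_region)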