Example #1
def update_stats():
    data = read_datapackage("data")
    filters = data['filters']
    filename = os.path.join('data', filters._metadata['path'])
    images = data['images']

    # Apply stats
    images.fillna('', inplace=True)
    # print(images.head)
    filters.dropna(subset=['Code', 'Column'], inplace=True)
    # print(filters.head)

    filters['Count'] = filters.apply(
        lambda row: (
            (row['Code'] == '.*'
             and len(images.loc[images[row['Column']].str.len() > 0]))
            or len(images.loc[
                images[row['Column']].str.match('^' + row['Code'] + '$', case=True)
                | images[row['Column']].str.contains(' ' + row['Code'] + ' ')
                | images[row['Column']].str.match('^' + row['Code'] + '[ |,]', case=True)
                | images[row['Column']].str.endswith(' ' + row['Code'])])),
        axis=1)

    print(filters.head(n=5))

    print("Writing to %s" % filename)
    filters.to_csv(filename, index=False)
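# Illustrative only: the counting logic above, reduced to a small helper and
# run on made-up data (the "Tags" column and the codes below are assumptions,
# not part of the original package).
import pandas as pd

images_demo = pd.DataFrame({"Tags": ["cat dog", "dog", "catfish", ""]})
filters_demo = pd.DataFrame({"Code": ["cat", ".*"], "Column": ["Tags", "Tags"]})

def count_matches(row):
    column = images_demo[row["Column"]]
    if row["Code"] == ".*":
        # wildcard: count all non-empty entries
        return len(column[column.str.len() > 0])
    # exact match, inner word, leading word (space or comma), or trailing word
    return len(column[
        column.str.match("^" + row["Code"] + "$")
        | column.str.contains(" " + row["Code"] + " ")
        | column.str.match("^" + row["Code"] + "[ |,]")
        | column.str.endswith(" " + row["Code"])])

filters_demo["Count"] = filters_demo.apply(count_matches, axis=1)
print(filters_demo)  # "cat" matches "cat dog" only; ".*" counts the 3 non-empty rows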
def test_remote_package():
    url = ("https://github.com/rgieseke/pandas-datapackage-reader/"
           "raw/master/tests/test-package/datapackage.json")
    dp = read_datapackage(url)
    assert isinstance(dp, dict)
    assert "moredata" in dp.keys()
    assert isinstance(dp["moredata"], pd.DataFrame)
    assert "data" in dp.keys()
Example #3
    def read(self, filepath,
             **kwargs) -> Tuple[Dict[str, pd.DataFrame], Dict[str, Any]]:
        inputs = read_datapackage(filepath)
        default_resource = inputs.pop("default_values").set_index(
            "name").to_dict()
        default_values = default_resource["default_value"]
        inputs = self._check_index(inputs)
        return inputs, default_values
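# Sketch of what the "default_values" handling above produces, using made-up
# resource contents (the column names "name" and "default_value" come from the
# snippet; the rows are assumptions).
import pandas as pd

default_values_resource = pd.DataFrame(
    {"name": ["discount_rate", "lifetime"], "default_value": [0.05, 25]})
default_resource = default_values_resource.set_index("name").to_dict()
print(default_resource["default_value"])
# -> {'discount_rate': 0.05, 'lifetime': 25}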
Example #4
def load_datapackage(url: str, package_name: str) -> Dict[str, pd.DataFrame]:
    """
    Downloads a data package from an online source and caches it as CSV files
    in filip.data under <package_name>.

    Args:
        url (str): Valid url to where the data package is hosted
        package_name (str): name of the cached file.

    Returns:
        Dict of dataframes
    """
    # validate arguments
    validate_http_url(url=url)

    # create directory for data if not exists
    path = Path(__file__).parent.parent.absolute().joinpath('data')
    path.mkdir(parents=True, exist_ok=True)
    package_path = path.joinpath(package_name)

    if os.path.isdir(package_path):
        # read data from filip.data if exists
        logger.info("Found existing data package in 'filip.data'")

        data = {}
        for file in os.listdir(package_path):
            file_name = file[:-4]  # strip the ".csv" extension
            # read each file into one DataFrame; na_filter=False prevents
            # literal strings like "NaN" from being turned into missing values
            frame = pd.read_csv(package_path.joinpath(file),
                                index_col=0,
                                header=0,
                                na_filter=False)
            data[file_name] = frame

    else:
        # download external data and store data
        logger.info("Could not find data package in 'filip.data'. Will "
                    "try to download from %s", url)
        try:
            data = read_datapackage(url)
            # rename keys
            data = {k.replace('-', '_'): v for k, v in data.items()}
            os.mkdir(package_path)

            # store data in filip.data
            for k, v in data.items():
                v: DataFrame = v
                v.loc[:, :] = v[:].applymap(str)
                table_filepath = str(
                    package_path.joinpath(f"{k.replace('-', '_')}.csv"))
                v.to_csv(table_filepath)

        except:
            logger.error("Failed to load data package!")
            raise
    return data
Example #5
    def _get_package(self):

        if self.sql:
            engine = create_engine("sqlite:///{}".format(self.datapackage))
            package = Package(storage="sql", engine=engine)
        else:
            package = read_datapackage(self.datapackage)  # typing: datapackage.Package

        return package
def test_datetimes():
    # Default test date/time '2017-01-01 01:23:45'
    df = read_datapackage(os.path.join(path, "test-package"), "datetimes")
    assert df["date"].iloc[0] == date(2017, 1, 1)
    assert df["datetime"].iloc[0] == datetime(2017, 1, 1, 1, 23, 45)
    assert df["time"].iloc[0] == time(1, 23, 45)
    assert df.reset_index()["year"].iloc[0] == 2017
    assert df["yearmonth"].iloc[0] == pd.Period("2017-01")
    assert df["yearmonth"].iloc[0] == pd.Period("2017-01")
    assert df["dayfirstdate"].iloc[0] == date(2017, 12, 13)
from pandas_datapackage_reader import read_datapackage
from datetime import datetime
import json

from flask import Flask, Response, request
from flask_cors import CORS, cross_origin
app = Flask(__name__)
app.url_map.strict_slashes = False
cors = CORS(app)

data = read_datapackage("")
resources = list(data.keys())


def get_paginated_json(df, per_page, page):
    return df[page:page + per_page].to_json(orient='records',
                                            date_format='iso')
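# A minimal, self-contained illustration of the slicing used above (toy data;
# not part of the original app): "page" is already the zero-based row offset,
# computed in api() below as (page_number - 1) * per_page.
import pandas as pd

demo = pd.DataFrame({"value": range(10)})
per_page, page = 3, (2 - 1) * 3  # second page of three rows -> offset 3
print(demo[page:page + per_page].to_json(orient="records", date_format="iso"))
# -> '[{"value":3},{"value":4},{"value":5}]'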


@app.route('/v1/<string:dataset>')
@app.route('/v1', defaults={'dataset': ''})
@cross_origin()
def api(dataset):
    try:
        per_page = int(request.args.get('per_page', 1000))
    except ValueError:
        per_page = 1000

    try:
        page = (int(request.args.get('page', 1)) - 1) * per_page
    except ValueError:
        page = 0  # fall back to the first page
def test_missing_integer_values():
    df_wo_index = read_datapackage(os.path.join(path, "test-package"),
                                   "datawithoutindex")
    assert pd.isnull(df_wo_index.iloc[1].intvalue)
    assert df_wo_index["intvalue"].dtype == pd.Int64Dtype()
def test_missing_integer_values_with_index():
    df = read_datapackage(os.path.join(path, "test-package"), "datawithindex")
    assert pd.isnull(df.loc[2].intvalue)
    assert df["intvalue"].dtype == pd.Int64Dtype()
def test_metadata():
    df = read_datapackage(os.path.join(path, "test-package"), "data")
    assert df._metadata["format"] == "csv"
def test_ignore_missing_values():
    df = read_datapackage(os.path.join(path, "test-package"), "data")
    assert df.loc["c"].value == "NA"
Example #12
from flask_api import FlaskAPI
from flask import (
    request, flash,
    render_template,
    send_from_directory,
)
from pandas_datapackage_reader import read_datapackage

try:
    from .util import *
except ImportError:
    from util import *

app = FlaskAPI(__name__)


# Create API endpoints

data = read_datapackage("data")

@app.route('/api/<resource>')
def api_dict(resource):
    return get_paginated(request.args, data[resource])

@app.route('/api/<resource>.json')
def api_json(resource):
    return get_paginated(request.args, data[resource], True)

@app.route('/api/<resource>/all.json')
def api_all_json(resource):
    return data[resource].to_json(orient='records')


# Static views
Example #13
from pathlib import Path

import pandas as pd
from pandas_datapackage_reader import read_datapackage

from pymagicc.utils import apply_string_substitutions

DATA_HIERARCHY_SEPARATOR = "|"
"""str: String used to define different levels in our data hierarchies.

For example, "Emissions|CO2|Energy|Coal".

We copy this straight from pyam_ to maintain easy compatibility.
"""

path = Path(__file__).parent

_dtrm = read_datapackage(path, "magicc_dattype_regionmode_regions")

_region_cols = _dtrm.columns.to_series().apply(
    lambda x: x.startswith("region"))

DATTYPE_REGIONMODE_REGIONS = _dtrm.loc[:, ~_region_cols].copy()
""":obj:`pandas.DataFrame` Mapping between regions, whether a file is SCEN7 or
not, and the expected values of the THISFILE_DATTYPE and THISFILE_REGIONMODE
flags in MAGICC.
"""

DATTYPE_REGIONMODE_REGIONS["regions"] = [[
    r for r in raw if not pd.isnull(r)
] for raw in _dtrm.loc[:, _region_cols].values.tolist()]

MAGICC7_EMISSIONS_UNITS = read_datapackage(path, "magicc_emisssions_units")
""":obj:`pandas.DataFrame` Definitions of emissions variables and their expected units in MAGICC7.
"""
        data = pd.read_table(f, skiprows=7, delim_whitespace=True, nrows=15)
        data = data.set_index("Region").T.loc['1997':'2018']
        data.index = [int(i) for i in data.index]
        data = data.Global
        data = data / 10**(12 - factor)  # to Tg
        # Parse gas name from e.g. 'GFED4.1s_SO2.txt',
        # or 'GFED4.1s_Toluene_lump.txt'
        idx = path.suffixes[0].split("_", 1)[1]
        data.name = idx
        out[idx] = data

# From http://www.globalfiredata.org/ar6historic.html with NH3 removed
# as it is already contained in the data separately.
nmvoc = ["C2H6", "CH3OH", "C2H5OH", "C3H8", "C2H2", "C2H4", "C3H6", "C5H8", "C10H16", "C7H8", "C6H6", "C8H10", "Toluene_lump", "Higher_Alkenes", "Higher_Alkanes", "CH2O", "C2H4O", "C3H6O", "C2H6S", "HCN", "HCOOH", "CH3COOH", "MEK", "CH3COCHO", "HOCH2CHO"
]

df = pd.DataFrame(out)
df.index.name = "Year"
df["NMVOC"] = df[nmvoc].sum(axis=1).round(3)

gbbe = read_datapackage(root / "datapackage.json", "global-biomass-burning-emissions")

# Test for sufficient equality of NMVOC sum with GBBE data
# where it is already combined.
assert_almost_equal(
    gbbe.NMVOC.loc[1997:2015].round(0),
    df["NMVOC"].loc[1997:2015].round(0)
)

df.to_csv(root / "data/gfed4s.csv")
Example #15
def test_ignore_custom_missing_values():
    df = read_datapackage(os.path.join(path, "test-package"), "data")
    assert pd.isna(df.loc["c"].value)
def test_load_multiple_resources():
    dp = read_datapackage(os.path.join(path, "test-package"),
                          ["data", "moredata"])
    assert "data" in dp.keys()
    assert "moredata" in dp.keys()
def test_load_single_resource():
    df = pd.read_csv(os.path.join(path, "test-package/moredata.csv"))
    moredata = read_datapackage(os.path.join(path, "test-package"), "moredata")
    assert df.equals(moredata)
def test_local_package():
    dp = read_datapackage(os.path.join(path, "test-package"))
    assert isinstance(dp, dict)
    assert "moredata" in dp.keys()
    assert "data" in dp.keys()
def test_unsupported_format():
    dp = read_datapackage(os.path.join(path, "test-package"))
    assert "json-only" not in dp.keys()
def test_geojson():
    df = read_datapackage(os.path.join(path, "test-package"), "admin1-us")
    assert df._metadata["format"] == "geojson"
    assert "geometry" in df.columns
Example #21
def prepare(dp: frictionless.package.Package, name: str):
    """

    Prepare data in EnerMaps format.

    Parameters
    ----------
    dp : frictionless.package.Package
        Valid datapackage
    name : str
        Name of the dataset (used for constructing the FID)

    Returns
    -------
    DataFrame
        Data in EnerMaps format.
    GeoDataFrame
        Spatial data in EnerMaps format.

    """
    data = read_datapackage(dp)
    data["fid"] = name + "_" + data[ID].astype(str)

    spatial = gpd.GeoDataFrame(
        data["fid"],
        columns=["fid"],
        geometry=gpd.points_from_xy(data.longitude, data.latitude),
        crs="EPSG:4326",
    )

    # Other fields to json
    def np_encoder(object):
        """Source: https://stackoverflow.com/a/65151218."""
        if isinstance(object, np.generic):
            return object.item()

    other_cols = [
        x for x in data.columns if x not in VALUE_VARS + SPATIAL_VARS + ID_VARS
    ]

    # Int64 to int
    int_cols = [c for c in other_cols if data[c].dtype == "int64"]
    if int_cols:
        # assign back via direct column indexing so the cast is not lost on a copy
        data[int_cols] = data[int_cols].astype(int)
    data = data.replace({np.nan: None})
    data["fields"] = data[other_cols].to_dict(orient="records")
    data["fields"] = data["fields"].apply(
        lambda x: json.dumps(x, default=np_encoder))

    # Unpivoting
    data = data.melt(id_vars=ID_VARS, value_vars=VALUE_VARS)

    # Remove nan
    data = data.dropna()

    # Conversion
    enermaps_data = pd.DataFrame(columns=[
        "start_at",
        "fields",
        "variable",
        "value",
        "ds_id",
        "fid",
        "dt",
        "z",
        "israster",
        "unit",
    ])
    enermaps_data["fid"] = data["fid"]
    enermaps_data["value"] = data["value"]
    enermaps_data["variable"] = data["variable"]
    enermaps_data["fields"] = data["fields"]
    enermaps_data["unit"] = UNIT
    enermaps_data["israster"] = ISRASTER

    return enermaps_data, spatial
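# Toy illustration (made-up columns) of the "fields" packing used in prepare()
# above: every column outside VALUE_VARS, SPATIAL_VARS and ID_VARS is folded
# into a single JSON string per row.
import json
import pandas as pd

toy = pd.DataFrame({"fid": ["ds_1", "ds_2"],
                    "sector": ["heat", "power"],
                    "year": [2020, 2021]})
other_cols_demo = ["sector", "year"]
toy["fields"] = toy[other_cols_demo].to_dict(orient="records")
toy["fields"] = toy["fields"].apply(json.dumps)
print(toy[["fid", "fields"]])
# each row's "fields" now holds e.g. '{"sector": "heat", "year": 2020}'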
Example #22
def test_ignore_default_missing_values():
    df = read_datapackage(os.path.join(path, "test-package"),
                          "datawithoutindex")
    assert pd.isna(df.iloc[1, 1])
import os
import pandas as pd

from countrygroups import EUROPEAN_UNION as eu
from shutil import copyfile
from pandas_datapackage_reader import read_datapackage
from pathlib import Path

root = Path(__file__).parents[1]
data_dir = root / "data"
ndcs_path = root / "cache/ndcs"
indcs_path = root / "cache/indcs"
latest_pdfs_path = root / "pdfs"

# NDCs
ndcs = read_datapackage(ndcs_path)

# Enable categorical sorting for language.
ndcs['Language'] = pd.Categorical(
    ndcs['Language'], ["English", "Arabic", "Spanish", "French", "Russian"])
# Set Preference for kind of document.
ndcs['FileType'] = pd.Categorical(ndcs['FileType'],
                                  ["Translation", "NDC", "Addendum"])

ndcs = ndcs.set_index("Code")

# Remove individual EU countries, except France, which submitted an NDC for
# "pays et territoires d'outre-mer" (overseas countries and territories).
eu.remove("FRA")
ndcs = ndcs[~(ndcs.Title == "EU First NDC")]
ndcs = ndcs.drop(eu)  # Some don't have the above title.
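# Illustration (not from the original script): with the categorical orders set
# above, sorting puts English/Translation entries first, which is presumably
# why the categories were defined in that order.
import pandas as pd

demo = pd.DataFrame({
    "Language": pd.Categorical(
        ["French", "English"],
        ["English", "Arabic", "Spanish", "French", "Russian"]),
    "FileType": pd.Categorical(
        ["NDC", "Translation"],
        ["Translation", "NDC", "Addendum"]),
})
print(demo.sort_values(["Language", "FileType"]))
# the row with Language == "English" comes first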
def test_not_existing_remote_package():
    with pytest.raises(requests.exceptions.HTTPError):
        dp = read_datapackage("http://www.example.com")
Example #25
from pandas_datapackage_reader import read_datapackage
from util import root

from pandas.testing import assert_series_equal

dp = read_datapackage(root)

ffi = dp["fossil-fuel-cement"].drop("Source", axis=1).unstack("Category")
ffi.columns = ffi.columns.droplevel()

assert_series_equal(
    dp["global-carbon-budget"]["Fossil-Fuel-And-Industry"],
    ffi["Total"].astype(int) / 1000,  # convert to GtC
    check_exact=False,
    check_less_precise=True,
    check_names=False)

assert_series_equal(dp["global-carbon-budget"]["Land-Use-Change-Emissions"],
                    dp["land-use-change"]["Land-Use-Change"],
                    check_exact=False,
                    check_less_precise=True,
                    check_names=False)

assert_series_equal(dp["global-carbon-budget"]["Ocean-Sink"],
                    dp["ocean-sink"]["Ocean-Sink"])

print("Comparison in 'checks.py' ok.")
def test_github_url():
    url = "https://github.com/datasets/country-codes"
    dp = read_datapackage(url)
    assert isinstance(dp, pd.DataFrame)
Example #27
from pandas_datapackage_reader import read_datapackage
import geopandas
from utils import data_dir

df = read_datapackage("https://github.com/lhm/verwaltungsgebiete",
                      resource_name="vg250-districts")
# Regions that have sea territories have two entries. For now,
# select only land area in order to get unique records.
df = df[df.GF == 4]

output_path = data_dir / "districts.gpkg"
df.to_file(output_path, layer="districts", driver="GPKG")
def test_github_url_with_trailing_slash():
    url = "https://github.com/datasets/country-codes/"
    dp = read_datapackage(url)
    assert isinstance(dp, pd.DataFrame)
Example #29
from pathlib import Path
from pandas_datapackage_reader import read_datapackage

root = Path(__file__).parents[1]

df = read_datapackage(".")

# Python module header
py_out = '''"""
shortcountrynames
-----------------

from ._version import get_versions
__version__ = get_versions()['version']
del get_versions

"""


names = {}

'''

# JS module header
js_out = '''// Short Country Names

exports.names = {}
var names = exports.names
'''

for code, row in df.iterrows():
def test_pathlib_posixpath():
    from pathlib import Path

    path = Path(__file__).parents[0]
    dp = read_datapackage(path / "test-package")
    assert "data" in dp.keys()