def update_stats():
    data = read_datapackage("data")
    filters = data['filters']
    filename = os.path.join('data', filters._metadata['path'])
    images = data['images']

    # Apply stats
    images.fillna('', inplace=True)
    # print(images.head)
    filters.dropna(subset=['Code', 'Column'], inplace=True)
    # print(filters.head)

    # For each filter, count the images whose column matches the filter code:
    # exactly, as a whole word, or at the start/end of a list of codes.
    # A code of '.*' counts all images with a non-empty value in that column.
    filters['Count'] = filters.apply(
        lambda row: (
            (row['Code'] == '.*'
             and len(images.loc[images[row['Column']].str.len() > 0]))
            or len(images.loc[
                images[row['Column']].str.match('^' + row['Code'] + '$', case=True)
                | images[row['Column']].str.contains(' ' + row['Code'] + ' ')
                | images[row['Column']].str.match('^' + row['Code'] + '[ |,]', case=True)
                | images[row['Column']].str.endswith(' ' + row['Code'])
            ])
        ),
        axis=1)

    print(filters.head(n=5))
    print("Writing to %s" % filename)
    filters.to_csv(filename, index=False)

def test_remote_package():
    url = ("https://github.com/rgieseke/pandas-datapackage-reader/"
           "raw/master/tests/test-package/datapackage.json")
    dp = read_datapackage(url)
    assert isinstance(dp, dict)
    assert "moredata" in dp.keys()
    assert isinstance(dp["moredata"], pd.DataFrame)
    assert "data" in dp.keys()

def read(self, filepath, **kwargs) -> Tuple[Dict[str, pd.DataFrame], Dict[str, Any]]:
    inputs = read_datapackage(filepath)
    default_resource = inputs.pop("default_values").set_index("name").to_dict()
    default_values = default_resource["default_value"]
    inputs = self._check_index(inputs)
    return inputs, default_values

def load_datapackage(url: str, package_name: str) -> Dict[str, pd.DataFrame]:
    """
    Downloads a data package from an online source and caches it as CSV
    files in 'filip.data' under the given package name.

    Args:
        url (str): Valid url to where the data package is hosted
        package_name (str): name of the cached package directory

    Returns:
        Dict of dataframes
    """
    # validate arguments
    validate_http_url(url=url)

    # create directory for data if it does not exist
    path = Path(__file__).parent.parent.absolute().joinpath('data')
    path.mkdir(parents=True, exist_ok=True)
    package_path = path.joinpath(package_name)

    if os.path.isdir(package_path):
        # read data from filip.data if it exists
        logger.info("Found existing data package in 'filip.data'")
        data = {}
        for file in os.listdir(package_path):
            # strip the '.csv' extension for the resource key
            file_name = file[:-4]
            # read in each file as one dataframe; na_filter=False prevents
            # empty strings from being converted to NaN
            frame = pd.read_csv(package_path.joinpath(file),
                                index_col=0,
                                header=0,
                                na_filter=False)
            data[file_name] = frame
    else:
        # download external data and store it
        logger.info("Could not find data package in 'filip.data'. Will "
                    "try to download from %s", url)
        try:
            data = read_datapackage(url)
            # rename keys
            data = {k.replace('-', '_'): v for k, v in data.items()}
            os.mkdir(package_path)
            # store data in filip.data
            for k, v in data.items():
                v: DataFrame = v
                v.loc[:, :] = v[:].applymap(str)
                table_filepath = package_path.joinpath(
                    f"{k.replace('-', '_')}.csv")
                v.to_csv(table_filepath)
        except Exception:
            logger.error("Failed to load data package!")
            raise
    return data

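# A possible way to call load_datapackage above (a minimal sketch; the URL
# and package name are placeholders, not taken from the original source):
#
#     data = load_datapackage(
#         url="https://example.org/my-package/datapackage.json",
#         package_name="my_package")
#     df = data["some_resource"]
#
# The first call downloads the package and caches each resource as a CSV file
# under 'filip.data/<package_name>'; subsequent calls read the cached CSVs.
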
def _get_package(self):
    if self.sql:
        engine = create_engine("sqlite:///{}".format(self.datapackage))
        package = Package(storage="sql", engine=engine)
    else:
        package = read_datapackage(self.datapackage)  # typing: datapackage.Package
    return package

def test_datetimes():
    # Default test date/time '2017-01-01 01:23:45'
    df = read_datapackage(os.path.join(path, "test-package"), "datetimes")
    assert df["date"].iloc[0] == date(2017, 1, 1)
    assert df["datetime"].iloc[0] == datetime(2017, 1, 1, 1, 23, 45)
    assert df["time"].iloc[0] == time(1, 23, 45)
    assert df.reset_index()["year"].iloc[0] == 2017
    assert df["yearmonth"].iloc[0] == pd.Period("2017-01")
    assert df["dayfirstdate"].iloc[0] == date(2017, 12, 13)

from pandas_datapackage_reader import read_datapackage

from datetime import datetime
import json

from flask import Flask, Response, request
from flask_cors import CORS, cross_origin

app = Flask(__name__)
app.url_map.strict_slashes = False
cors = CORS(app)

data = read_datapackage("")
resources = list(data.keys())


def get_paginated_json(df, per_page, page):
    return df[page:page + per_page].to_json(orient='records',
                                            date_format='iso')


@app.route('/v1/<string:dataset>')
@app.route('/v1', defaults={'dataset': ''})
@cross_origin()
def api(dataset):
    try:
        per_page = int(request.args.get('per_page', 1000))
    except:
        per_page = 1000
    try:
        page = (int(request.args.get('page', 1)) - 1) * per_page
    except:

def test_missing_integer_values():
    df_wo_index = read_datapackage(os.path.join(path, "test-package"),
                                   "datawithoutindex")
    assert pd.isnull(df_wo_index.iloc[1].intvalue)
    assert df_wo_index["intvalue"].dtype == pd.Int64Dtype()

def test_missing_integer_values_with_index():
    df = read_datapackage(os.path.join(path, "test-package"), "datawithindex")
    assert pd.isnull(df.loc[2].intvalue)
    assert df["intvalue"].dtype == pd.Int64Dtype()

def test_metadata():
    df = read_datapackage(os.path.join(path, "test-package"), "data")
    assert df._metadata["format"] == "csv"

def test_ignore_missing_values():
    df = read_datapackage(os.path.join(path, "test-package"), "data")
    assert df.loc["c"].value == "NA"

    request,
    flash,
    render_template,
    send_from_directory,
)

try:
    from .util import *
except ImportError:
    from util import *

app = FlaskAPI(__name__)

# Create API endpoints

data = read_datapackage("data")


@app.route('/api/<resource>')
def api_dict(resource):
    return get_paginated(request.args, data[resource])


@app.route('/api/<resource>.json')
def api_json(resource):
    return get_paginated(request.args, data[resource], True)


@app.route('/api/<resource>/all.json')
def api_all_json(resource):
    return data[resource].to_json(orient='records')


# Static views

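# Example calls against the endpoints defined above (illustrative only;
# 'countries' is a placeholder resource name, not from the original source):
#
#     GET /api/countries           -> paginated records via get_paginated()
#     GET /api/countries.json      -> paginated records, JSON variant
#     GET /api/countries/all.json  -> the complete resource as JSON records
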
from pathlib import Path

import pandas as pd
from pandas_datapackage_reader import read_datapackage

from pymagicc.utils import apply_string_substitutions

DATA_HIERARCHY_SEPARATOR = "|"
"""str: String used to define different levels in our data hierarchies.

For example, "Emissions|CO2|Energy|Coal".

We copy this straight from pyam_ to maintain easy compatibility.
"""

path = Path(__file__).parent

_dtrm = read_datapackage(path, "magicc_dattype_regionmode_regions")

_region_cols = _dtrm.columns.to_series().apply(lambda x: x.startswith("region"))

DATTYPE_REGIONMODE_REGIONS = _dtrm.loc[:, ~_region_cols].copy()
""":obj:`pandas.DataFrame` Mapping between regions and whether a file is
SCEN7 or not and the expected values of THISFILE_DATTYPE and
THISFILE_REGIONMODE flags in MAGICC.
"""

DATTYPE_REGIONMODE_REGIONS["regions"] = [
    [r for r in raw if not pd.isnull(r)]
    for raw in _dtrm.loc[:, _region_cols].values.tolist()
]

MAGICC7_EMISSIONS_UNITS = read_datapackage(path, "magicc_emisssions_units")
""":obj:`pandas.DataFrame` Definitions of emissions variables and their
expected units in MAGICC7.
"""

data = pd.read_table(f, skiprows=7, delim_whitespace=True, nrows=15)
data = data.set_index("Region").T.loc['1997':'2018']
data.index = [int(i) for i in data.index]
data = data.Global
data = data / 10**(12 - factor)  # to Tg

# Parse gas name from e.g. 'GFED4.1s_SO2.txt',
# or 'GFED4.1s_Toluene_lump.txt'
idx = path.suffixes[0].split("_", 1)[1]
data.name = idx
out[idx] = data

# From http://www.globalfiredata.org/ar6historic.html with NH3 removed
# as it is already contained in the data separately.
nmvoc = ["C2H6", "CH3OH", "C2H5OH", "C3H8", "C2H2", "C2H4", "C3H6", "C5H8",
         "C10H16", "C7H8", "C6H6", "C8H10", "Toluene_lump", "Higher_Alkenes",
         "Higher_Alkanes", "CH2O", "C2H4O", "C3H6O", "C2H6S", "HCN", "HCOOH",
         "CH3COOH", "MEK", "CH3COCHO", "HOCH2CHO"]

df = pd.DataFrame(out)
df.index.name = "Year"
df["NMVOC"] = df[nmvoc].sum(axis=1).round(3)

gbbe = read_datapackage(root / "datapackage.json",
                        "global-biomass-burning-emissions")

# Test for sufficient equality of NMVOC sum with GBBE data
# where it is already combined.
assert_almost_equal(
    gbbe.NMVOC.loc[1997:2015].round(0),
    df["NMVOC"].loc[1997:2015].round(0)
)

df.to_csv(root / "data/gfed4s.csv")

def test_ignore_custom_missing_values():
    df = read_datapackage(os.path.join(path, "test-package"), "data")
    assert pd.isna(df.loc["c"].value)

def test_load_multiple_resources():
    dp = read_datapackage(os.path.join(path, "test-package"),
                          ["data", "moredata"])
    assert "data" in dp.keys()
    assert "moredata" in dp.keys()

def test_load_single_resource():
    df = pd.read_csv(os.path.join(path, "test-package/moredata.csv"))
    moredata = read_datapackage(os.path.join(path, "test-package"), "moredata")
    assert df.equals(moredata)

def test_local_package():
    dp = read_datapackage(os.path.join(path, "test-package"))
    assert isinstance(dp, dict)
    assert "moredata" in dp.keys()
    assert "data" in dp.keys()

def test_unsupported_format():
    dp = read_datapackage(os.path.join(path, "test-package"))
    assert "json-only" not in dp.keys()

def test_geojson():
    df = read_datapackage(os.path.join(path, "test-package"), "admin1-us")
    assert df._metadata["format"] == "geojson"
    assert "geometry" in df.columns

def prepare(dp: frictionless.package.Package, name: str):
    """
    Prepare data in EnerMaps format.

    Parameters
    ----------
    dp : frictionless.package.Package
        Valid datapackage
    name : str
        Name of the dataset (used for constructing the FID)

    Returns
    -------
    DataFrame
        Data in EnerMaps format.
    GeoDataFrame
        Spatial data in EnerMaps format.
    """
    data = read_datapackage(dp)
    data["fid"] = name + "_" + data[ID].astype(str)

    spatial = gpd.GeoDataFrame(
        data["fid"],
        columns=["fid"],
        geometry=gpd.points_from_xy(data.longitude, data.latitude),
        crs="EPSG:4326",
    )

    # Other fields to json
    def np_encoder(object):
        """Source: https://stackoverflow.com/a/65151218."""
        if isinstance(object, np.generic):
            return object.item()

    other_cols = [
        x for x in data.columns if x not in VALUE_VARS + SPATIAL_VARS + ID_VARS
    ]

    # Int64 to int (direct assignment avoids a chained-indexing copy)
    int_cols = [col for col in other_cols if data[col].dtype == "int64"]
    data[int_cols] = data[int_cols].astype(int)

    data = data.replace({np.nan: None})
    data["fields"] = data[other_cols].to_dict(orient="records")
    data["fields"] = data["fields"].apply(
        lambda x: json.dumps(x, default=np_encoder))

    # Unpivoting
    data = data.melt(id_vars=ID_VARS, value_vars=VALUE_VARS)

    # Remove nan
    data = data.dropna()

    # Conversion
    enermaps_data = pd.DataFrame(columns=[
        "start_at",
        "fields",
        "variable",
        "value",
        "ds_id",
        "fid",
        "dt",
        "z",
        "israster",
        "unit",
    ])
    enermaps_data["fid"] = data["fid"]
    enermaps_data["value"] = data["value"]
    enermaps_data["variable"] = data["variable"]
    enermaps_data["fields"] = data["fields"]
    enermaps_data["unit"] = UNIT
    enermaps_data["israster"] = ISRASTER

    return enermaps_data, spatial

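# A hedged usage sketch for prepare() above (the datapackage path and dataset
# name are assumptions, not taken from the original source):
#
#     import frictionless
#
#     dp = frictionless.Package("datapackage.json")
#     enermaps_data, spatial = prepare(dp, name="example-dataset")
#     # enermaps_data holds the unpivoted value columns; spatial holds the
#     # point geometries built from the longitude/latitude columns.
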
def test_ignore_default_missing_values():
    df = read_datapackage(os.path.join(path, "test-package"),
                          "datawithoutindex")
    assert pd.isna(df.iloc[1, 1])

import os

import pandas as pd
from countrygroups import EUROPEAN_UNION as eu
from shutil import copyfile
from pandas_datapackage_reader import read_datapackage
from pathlib import Path

root = Path(__file__).parents[1]
data_dir = root / "data"
ndcs_path = root / "cache/ndcs"
indcs_path = root / "cache/indcs"
latest_pdfs_path = root / "pdfs"

# NDCs
ndcs = read_datapackage(ndcs_path)

# Enable categorical sorting for language.
ndcs['Language'] = pd.Categorical(
    ndcs['Language'],
    ["English", "Arabic", "Spanish", "French", "Russian"])

# Set preference for kind of document.
ndcs['FileType'] = pd.Categorical(
    ndcs['FileType'],
    ["Translation", "NDC", "Addendum"])

ndcs = ndcs.set_index("Code")

# Remove individual EU countries, except France which submitted an NDC for
# "pays et territoires d'outre-mer".
eu.remove("FRA")
ndcs = ndcs[~(ndcs.Title == "EU First NDC")]
ndcs = ndcs.drop(eu)
# Some don't have the above title.

def test_not_existing_remote_package():
    with pytest.raises(requests.exceptions.HTTPError):
        dp = read_datapackage("http://www.example.com")

from pandas_datapackage_reader import read_datapackage
from util import root
from pandas.testing import assert_series_equal

dp = read_datapackage(root)

ffi = dp["fossil-fuel-cement"].drop("Source", axis=1).unstack("Category")
ffi.columns = ffi.columns.droplevel()

assert_series_equal(
    dp["global-carbon-budget"]["Fossil-Fuel-And-Industry"],
    ffi["Total"].astype(int) / 1000,  # convert to GtC
    check_exact=False,
    check_less_precise=True,
    check_names=False)

assert_series_equal(
    dp["global-carbon-budget"]["Land-Use-Change-Emissions"],
    dp["land-use-change"]["Land-Use-Change"],
    check_exact=False,
    check_less_precise=True,
    check_names=False)

assert_series_equal(
    dp["global-carbon-budget"]["Ocean-Sink"],
    dp["ocean-sink"]["Ocean-Sink"])

print("Comparison in 'checks.py' ok.")

def test_github_url():
    url = "https://github.com/datasets/country-codes"
    dp = read_datapackage(url)
    assert isinstance(dp, pd.DataFrame)

from pandas_datapackage_reader import read_datapackage
import geopandas

from utils import data_dir

df = read_datapackage("https://github.com/lhm/verwaltungsgebiete",
                      resource_name="vg250-districts")

# Regions that have sea territories have two entries. For now,
# select only land area in order to get unique records.
df = df[df.GF == 4]

output_path = data_dir / "districts.gpkg"
df.to_file(output_path, layer="districts", driver="GPKG")

def test_github_url_with_trailing_slash():
    url = "https://github.com/datasets/country-codes/"
    dp = read_datapackage(url)
    assert isinstance(dp, pd.DataFrame)

from pathlib import Path

from pandas_datapackage_reader import read_datapackage

root = Path(__file__).parents[1]

df = read_datapackage(".")

# Python module header
py_out = '''"""
shortcountrynames
-----------------

"""

from ._version import get_versions
__version__ = get_versions()['version']
del get_versions

names = {}
'''

# JS module header
js_out = '''// Short Country Names

exports.names = {}
var names = exports.names

'''

for code, row in df.iterrows():

def test_pathlib_posixpath():
    from pathlib import Path
    path = Path(__file__).parents[0]
    dp = read_datapackage(path / "test-package")
    assert "data" in dp.keys()