def create_resource(path): """ """ from datapackage import Resource resource = Resource({'path': path}) resource.infer() resource.descriptor['schema']['primaryKey'] = 'name' resource.descriptor[ 'description'] = 'Contains the hubs (nodes) for the energy system representation' resource.descriptor[ 'title'] = 'Energy system hubs for DE and its electrical neighbours' resource.descriptor['sources'] = [{ 'title': 'NUTS Shapefiles', 'path': 'http://ec.europa.eu/eurostat/cache/GISCO/geodatafiles/NUTS_2013_10M_SH.zip', 'files': [ 'NUTS_2013_10M_SH/data/NUTS_RG_10M_2013.shp', 'NUTS_2013_10M_SH/data/NUTS_RG_10M_2013.dbf' ] }] resource.commit() resource.descriptor if resource.valid: resource.save('resources/' + resource.name + '.json')
def create_resource(path):
    from datapackage import Resource
    resource = Resource({'path': path})
    resource.infer()
    resource.descriptor['schema']['primaryKey'] = 'name'
    resource.descriptor['description'] = (
        'Installed transmission capacities from the e-highway 2050 scenario')
    resource.descriptor['title'] = 'Installed transmission capacities'
    resource.descriptor['sources'] = [{
        'title': 'E-Highway 2050 transmission capacities',
        'path': 'http://www.e-highway2050.eu/fileadmin/documents/' +
                'Results/e-Highway_database_per_country-08022016.xlsx'
    }]
    resource.descriptor['schema']['foreignKeys'] = [{
        "fields": "from_bus",
        "reference": {"resource": "bus", "fields": "name"}
    }, {
        "fields": "to_bus",
        "reference": {"resource": "bus", "fields": "name"}
    }]
    resource.commit()

    if resource.valid:
        resource.save('resources/' + resource.name + '.json')

class iterable_loader(DataStreamProcessor):

    def __init__(self, iterable, name=None):
        super(iterable_loader, self).__init__()
        self.iterable = iterable
        self.name = name

    def handle_iterable(self):
        mode = None
        for x in self.iterable:
            if mode is None:
                assert isinstance(x, (dict, list))
                mode = dict if isinstance(x, dict) else list
            assert isinstance(x, mode)
            if mode == dict:
                yield x
            else:
                yield dict(zip(
                    ('col{}'.format(i) for i in range(len(x))), x
                ))

    def process_datapackage(self, dp: Package):
        name = self.name
        if name is None:
            name = 'res_{}'.format(len(dp.resources) + 1)
        self.res = Resource(
            dict(name=name, path='{}.csv'.format(name)),
            storage=iterable_storage(self.handle_iterable())
        )
        self.res.infer()
        dp.descriptor.setdefault('resources', []).append(self.res.descriptor)
        return dp

    def process_resources(self, resources):
        yield from super(iterable_loader, self).process_resources(resources)
        yield self.res.iter(keyed=True)

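In the dataflows library, a processor like this is what backs passing a plain iterable straight into a pipeline. A minimal usage sketch, assuming the `dataflows` package with its `Flow` and `dump_to_path` helpers and made-up example rows:

from dataflows import Flow, dump_to_path

# A plain list of dicts is wrapped by an iterable loader such as the class
# above: each dict becomes a keyed row of an inferred tabular resource.
Flow(
    [{'city': 'london', 'population': 8900000},
     {'city': 'paris', 'population': 2100000}],
    dump_to_path('out')  # writes datapackage.json plus the CSV resource
).process()
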
def save_datasets_as_data_packages(self, folder_path):
    """ Save each dataset from a data.json source as a _datapackage_ """
    for dataset in self.datasets:
        package = Package()
        # TODO check this, I'm learning datapackages
        resource = Resource({'data': dataset})
        resource.infer()  # adds "name": "inline"

        # FIXME the identifier may contain characters that are invalid in
        # paths (e.g. "/"), and different resources could end up with
        # duplicate paths. Consider using base64 or hashes instead.
        idf = slugify(dataset['identifier'])

        resource_path = os.path.join(folder_path,
                                     f'resource_data_json_{idf}.json')
        if not resource.valid:
            raise Exception('Invalid resource')

        resource.save(resource_path)

        package.add_resource(descriptor=resource.descriptor)
        package_path = os.path.join(folder_path, f'pkg_data_json_{idf}.zip')
        package.save(target=package_path)

def save_as_data_packages(row):
    """ Save a dataset from data.json as a data package.
    We will use these files as a queue to process later. """
    # TODO check if ckanext-datapackager is useful for importing
    # or exporting resources:
    # https://github.com/frictionlessdata/ckanext-datapackager
    package = Package()

    # TODO check this, I'm learning datapackages.
    resource = Resource({'data': row})
    resource.infer()  # adds "name": "inline"
    if not resource.valid:
        raise Exception('Invalid resource')

    encoded_identifier = encode_identifier(identifier=row['identifier'])

    # resource_path = os.path.join(path, f'{prefix}_{encoded_identifier}.json')
    # resource.save(resource_path)

    package.add_resource(descriptor=resource.descriptor)
    folder = config.get_data_packages_folder_path()
    filename = f'data-json-{encoded_identifier}.json'
    package_path = os.path.join(folder, filename)

    # do not overwrite the package if it already exists
    if not os.path.isfile(package_path):
        package.save(target=package_path)

def save_datasets_as_data_packages(self, folder_path):
    """ Save each dataset from a data.json source as a _datapackage_ """
    for dataset in self.package_list:
        package = Package()
        # TODO check this, I'm learning datapackages
        resource = Resource({'data': dataset})
        resource.infer()  # adds "name": "inline"

        # encode the dataset id with base64 to use it in file names
        identifier = dataset['id']
        bytes_identifier = identifier.encode('utf-8')
        encoded = base64.b64encode(bytes_identifier)
        encoded_identifier = str(encoded, "utf-8")

        resource_path = os.path.join(
            folder_path, f'resource_ckan_api_{encoded_identifier}.json')
        if not resource.valid:
            raise Exception('Invalid resource')

        resource.save(resource_path)

        package.add_resource(descriptor=resource.descriptor)
        package_path = os.path.join(
            folder_path, f'pkg_ckan_api_{encoded_identifier}.zip')
        package.save(target=package_path)

def create_resource(path): """ """ from datapackage import Resource resource = Resource({'path': path}) resource.infer() resource.descriptor['schema']['primaryKey'] = 'name' resource.descriptor[ 'description'] = 'Excess slacks for each electricity hub in the energy system representation' resource.descriptor[ 'title'] = 'Excess slacks for DE and its electrical neighbours' resource.descriptor['schema']['foreignKeys'] = [{ "fields": "bus", "reference": { "resource": "bus", "fields": "name" } }] resource.commit() resource.descriptor if resource.valid: resource.save('resources/' + resource.name + '.json')
def create_resource(path): """ """ mapper = {} from datapackage import Resource resource = Resource({'path': path}) resource.infer() resource.descriptor['schema']['primaryKey'] = 'name' resource.descriptor[ 'description'] = 'Installed capacities, costs and technical parameters for components' resource.descriptor['title'] = '{} components'.format( resource.name.title()) resource.descriptor['sources'] = [{ 'title': 'E-Highway 2050 installed capacities', 'path': 'http://www.e-highway2050.eu/fileadmin/documents/Results/e-Highway2050_2050_Country_and_cluster_installed_capacities_31-03-2015.xlsx' }] resource.descriptor['schema']['foreignKeys'] = [{ "fields": "bus", "reference": { "resource": "bus", "fields": "name" } }] if 'demand' in resource.name: resource.descriptor['schema']['foreignKeys'].append({ "fields": "profile", "reference": { "resource": "demand-profiles" } }) elif 'volatile-generator' in resource.name: resource.descriptor['schema']['foreignKeys'].append({ "fields": "profile", "reference": { "resource": "generator-profiles" } }) resource.commit() if resource.valid: resource.save('resources/' + resource.name + '.json') else: print('Resource is not valid, writing resource anyway...') resource.save('resources/' + resource.name + '.json')
def create_resource(path):
    from datapackage import Resource
    resource = Resource({'path': path})
    resource.infer()
    resource.descriptor['schema']['primaryKey'] = 'timeindex'
    resource.descriptor['description'] = (
        'Profiles for Run of River (ROR) components. The profile is assumed' +
        ' to be constant during the year.')
    resource.descriptor['title'] = 'ROR profiles'
    resource.descriptor['sources'] = [{'title': 'Assumption'}]
    resource.commit()

    if resource.valid:
        resource.save('resources/' + resource.name + '.json')

def infer_resources(directory="data/elements"):
    """ Looks at all files in `directory`, creates a datapackage.Resource
    object for each of them and stores it in the `resources` directory.

    Parameters
    ----------
    directory: string
        Path to the directory from which the resources are inferred
    """
    if not os.path.exists("resources"):
        os.makedirs("resources")

    # create meta data resources
    for f in os.listdir(directory):
        r = Resource({"path": os.path.join(directory, f)})
        r.infer()
        r.save(os.path.join("resources", f.replace(".csv", ".json")))

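A minimal usage sketch, assuming the working directory contains CSV element files under the default `data/elements` folder:

# Infer a JSON resource descriptor for every CSV file found in
# data/elements and write it to the local `resources` directory.
infer_resources(directory="data/elements")
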
def create_resource(path):
    from datapackage import Resource
    resource = Resource({'path': path})
    resource.infer()
    resource.descriptor['schema']['primaryKey'] = 'timeindex'
    resource.descriptor['description'] = 'Demand profiles per country'
    resource.descriptor['title'] = 'Demand profiles'
    resource.descriptor['sources'] = [{
        'title': 'OPSD timeseries',
        'path': 'https://data.open-power-system-data.org/time_series/2017-07-09/' +
                'time_series_60min_singleindex.csv'
    }]
    resource.commit()

    if resource.valid:
        resource.save('resources/' + resource.name + '.json')

def create_resource(path):
    from datapackage import Resource
    resource = Resource({'path': path})
    resource.infer()
    resource.descriptor['schema']['primaryKey'] = 'timeindex'
    resource.descriptor['description'] = (
        'PV profiles (capacity factors) from renewables ninja for each country')
    resource.descriptor['title'] = 'PV profiles'
    resource.descriptor['sources'] = [{
        'title': 'Renewables Ninja PV Capacity Factors',
        'path': 'https://www.renewables.ninja/static/downloads/ninja_europe_pv_v1.1.zip'
    }]
    resource.commit()

    if resource.valid:
        resource.save('resources/' + resource.name + '.json')

def save_datasets_as_data_packages(self, folder_path, identifier_field):
    """ Save each dataset from a data.json source as a _datapackage_ """
    for dataset in self.datasets:
        package = Package()
        # TODO check this, I'm learning datapackages
        resource = Resource({'data': dataset})
        resource.infer()  # adds "name": "inline"

        idf = slugify(dataset[identifier_field])

        resource_path = os.path.join(folder_path,
                                     f'resource_data_json_{idf}.json')
        if not resource.valid:
            raise Exception('Invalid resource')

        resource.save(resource_path)

        package.add_resource(descriptor=resource.descriptor)
        package_path = os.path.join(folder_path, f'pkg_data_json_{idf}.zip')
        package.save(target=package_path)

def create_resource(path, title):
    from datapackage import Resource
    resource = Resource({'path': path})
    resource.infer()
    resource.descriptor['schema']['primaryKey'] = 'name'
    resource.descriptor['description'] = (
        'Installed capacities, costs and technical parameters for components')
    resource.descriptor['title'] = title
    resource.descriptor['sources'] = [{
        'title': 'Restore 2050 hydro inflow timeseries',
        'path': 'https://zenodo.org/record/804244/files/Hydro_Inflow.zip'
    }, {
        'title': 'E-Highway 2050 installed capacities',
        'path': 'http://www.e-highway2050.eu/fileadmin/documents/Results/e-Highway2050_2050_Country_and_cluster_installed_capacities_31-03-2015.xlsx'
    }, {
        'title': 'DIW Berlin - Current and Prospective Costs of Electricity Generation until 2050',
        'path': 'https://www.diw.de/documents/publikationen/73/diw_01.c.424566.de/diw_datadoc_2013-068.pdf'
    }]
    resource.descriptor['schema']['foreignKeys'] = [{
        "fields": "bus",
        "reference": {"resource": "bus", "fields": "name"}
    }]
    resource.commit()

    if resource.valid:
        resource.save('resources/' + resource.name + '.json')

class load(DataStreamProcessor):

    def __init__(self, load_source, name=None, resources=None,
                 validate=False, strip=True, **options):
        super(load, self).__init__()
        self.load_source = load_source
        self.options = options
        self.name = name
        self.resources = resources
        self.load_dp = None
        self.validate = validate
        self.strip = strip
        self.force_strings = options.get('force_strings') is True

    def process_datapackage(self, dp: Package):
        if isinstance(self.load_source, tuple):
            datapackage_descriptor, _ = self.load_source
            dp.descriptor.setdefault('resources', [])
            self.resource_matcher = ResourceMatcher(self.resources,
                                                    datapackage_descriptor)
            for resource_descriptor in datapackage_descriptor['resources']:
                if self.resource_matcher.match(resource_descriptor['name']):
                    dp.add_resource(resource_descriptor)
        else:  # load_source is a string:
            if self.load_source.startswith('env://'):
                env_var = self.load_source[6:]
                self.load_source = os.environ.get(env_var)
                if self.load_source is None:
                    raise ValueError(
                        f"Couldn't find value for env var '{env_var}'")
            if os.path.basename(self.load_source) == 'datapackage.json':
                self.load_dp = Package(self.load_source)
                self.resource_matcher = ResourceMatcher(self.resources,
                                                        self.load_dp)
                dp.descriptor.setdefault('resources', [])
                for resource in self.load_dp.resources:
                    if self.resource_matcher.match(resource.name):
                        dp.add_resource(resource.descriptor)
            else:
                if os.path.exists(self.load_source):
                    base_path = os.path.dirname(self.load_source) or '.'
                    self.load_source = os.path.basename(self.load_source)
                else:
                    base_path = None
                descriptor = dict(path=self.load_source,
                                  profile='tabular-data-resource')
                descriptor['format'] = self.options.get('format')
                if 'encoding' in self.options:
                    descriptor['encoding'] = self.options['encoding']
                if descriptor['format'] == 'xml' \
                        or self.load_source.endswith('.xml'):
                    self.options.setdefault('custom_parsers',
                                            {})['xml'] = XMLParser
                self.options.setdefault('ignore_blank_headers', True)
                self.options.setdefault('headers', 1)
                self.res = Resource(descriptor,
                                    base_path=base_path,
                                    **self.options)
                self.res.infer(confidence=1, limit=1000)
                if self.name is not None:
                    self.res.descriptor['name'] = self.name
                if self.force_strings:
                    for f in self.res.descriptor['schema']['fields']:
                        f['type'] = 'string'
                self.res.commit()
                self.res.descriptor['path'] = '{name}.{format}'.format(
                    **self.res.descriptor)
                dp.add_resource(self.res.descriptor)
        return dp

    def stripper(self, iterator):
        for r in iterator:
            yield dict(
                (k, v.strip()) if isinstance(v, str) else (k, v)
                for k, v in r.items()
            )

    def process_resources(self, resources):
        yield from super(load, self).process_resources(resources)
        if isinstance(self.load_source, tuple):
            datapackage_descriptor, resources = self.load_source
            yield from (
                resource
                for resource, descriptor in zip(
                    resources, datapackage_descriptor['resources'])
                if self.resource_matcher.match(descriptor['name'])
            )
        elif self.load_dp is not None:
            yield from (
                resource.iter(keyed=True)
                for resource in self.load_dp.resources
                if self.resource_matcher.match(resource.name)
            )
        else:
            it = self.res.iter(keyed=True, cast=False)
            if self.validate:
                it = schema_validator(self.res, it)
            if self.strip:
                it = self.stripper(it)
            yield it

description_mapper = {
    # ... (descriptions for the preceding columns are omitted in this snippet)
    'total_flow': 'inflow to the power plant in mio m3',
    'flo_river_ror': 'next downstream res_nr',
    'status': 'operational status of the plant',
    'company': None,
    'turbtype': 'optional: turbine type',
    'geodbid': 'specified id for geo referencing',
    'river': 'river in which the plant is located',
    'river_km': 'km from stream source',
    'level_meter': 'assigned level meter for flow curve'
}

# create resource
r = Resource({'path': 'data/runofriver.csv'})

# get basic metadata from data
r.infer()

# add description for fields based on mapper
for i in range(len(r.descriptor['schema']['fields'])):
    r.descriptor['schema']['fields'][i]['description'] = \
        description_mapper[r.descriptor['schema']['fields'][i]['name']]

# commit (apply) changes to resource
r.commit()

# save the resource
r.save('dataresource.json')

# create a package
p = Package()

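The snippet stops right after the package is created. A possible continuation, shown purely as a sketch (the `datapackage.json` target name is an assumption, not taken from the snippet), would attach the inferred resource to the package and write the package descriptor:

# Sketch of a continuation: register the inferred resource with the package
# and save the package descriptor (the file name is an assumption).
p.add_resource(r.descriptor)
p.commit()
p.save('datapackage.json')
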
def temporal_clustering(datapackage, n, path="/tmp", how="daily"):
    """ Creates a new datapackage by aggregating sequences inside the
    `sequence` folder of the specified datapackage by clustering `n`
    timesteps.

    Parameters
    ----------
    datapackage: string
        Path to the meta data file datapackage.json
    n: integer
        Number of clusters
    path: string
        Path to the directory where the aggregated datapackage is stored
    how: string
        How to cluster: 'daily' or 'hourly'
    """
    if how == "weekly":
        raise NotImplementedError("Weekly clustering is not implemented!")

    p = Package(datapackage)

    cwd = os.getcwd()

    copied_package_name = (p.descriptor["name"] + "__temporal_cluster__" +
                           how + "_" + str(n))

    copy_path = os.path.join(path, p.descriptor["name"], copied_package_name)

    copied_root = copy_datapackage(datapackage, os.path.abspath(copy_path),
                                   subset="data")

    sequence_resources = [
        r for r in p.resources
        if re.match(r"^data/sequences/.*$", r.descriptor["path"])
    ]

    dfs = {
        r.name: pd.DataFrame(r.read(keyed="True"))
                  .set_index("timeindex")
                  .astype(float)
        for r in sequence_resources
    }
    sequences = pd.concat(dfs.values(), axis=1)

    if how == "daily":
        hoursPerPeriod = 24
    elif how == "hourly":
        hoursPerPeriod = 1
    elif how == "weekly":
        hoursPerPeriod = 24 * 7

    aggregation = tsam.TimeSeriesAggregation(
        sequences,
        noTypicalPeriods=n,
        rescaleClusterPeriods=False,
        hoursPerPeriod=hoursPerPeriod,
        clusterMethod="hierarchical",
    )

    cluster_weights = {
        aggregation.clusterCenterIndices[n]: w
        for n, w in aggregation.clusterPeriodNoOccur.items()
    }

    if how == "daily":
        temporal = pd.Series(
            {
                d: cluster_weights[d.dayofyear]
                for d in sequences.index
                if d.dayofyear in aggregation.clusterCenterIndices
            },
            name="weighting",
        )
        temporal.index.name = "timeindex"
    elif how == "hourly":
        temporal = pd.Series(
            {
                h: cluster_weights[sequences.index.get_loc(h)]
                for h in sequences.index
                if sequences.index.get_loc(h) in aggregation.clusterCenterIndices
            },
            name="weighting",
        )
        temporal.index.name = "timeindex"

    # write resources to copied package (should not interfere with meta data)
    # as columns are not removed and sorted when written
    os.chdir(copied_root)

    for r in sequence_resources:
        write_sequences(r.name + ".csv", dfs[r.name].loc[temporal.index],
                        replace=True)

    # write temporal information from clustering
    temporal.to_csv(
        "data/temporal.csv",
        header=True,
        sep=";",
        date_format="%Y-%m-%dT%H:%M:%SZ",
    )
    # add meta data for new temporal information
    r = Resource({"path": "data/temporal.csv"})
    r.infer()
    # TODO: Add meta-data description
    r.descriptor["description"] = (
        "Temporal selection based on hierarchical clustering...")

    # update meta data of the copied package
    cp = Package("datapackage.json")
    cp.descriptor["name"] = copied_package_name
    cp.descriptor["resources"].append(r.descriptor)
    cp.commit()
    cp.save("datapackage.json")

    # set back to the 'old' working directory
    os.chdir(cwd)

    return copied_root

def temporal_skip(datapackage, n, path="/tmp", name=None, *args):
    """ Creates a new datapackage by aggregating sequences inside the
    `sequence` folder of the specified datapackage by skipping `n`
    timesteps.

    Parameters
    ----------
    datapackage: string
        Path to the meta data file datapackage.json
    n: integer
        Number of timesteps to skip
    path: string
        Path to the directory where the aggregated datapackage is stored
    name: string
        Name of the new, aggregated datapackage. If not specified, a name
        will be generated.
    """
    p = Package(datapackage)

    cwd = os.getcwd()

    if name is None:
        copied_package_name = (p.descriptor["name"] + "__temporal_skip__" +
                               str(n))
    else:
        copied_package_name = name

    copy_path = os.path.join(path, copied_package_name)

    copied_root = copy_datapackage(datapackage, os.path.abspath(copy_path),
                                   subset="data")

    sequence_resources = [
        r for r in p.resources
        if re.match(r"^data/sequences/.*$", r.descriptor["path"])
    ]

    dfs = {
        r.name: pd.DataFrame(r.read(keyed="True"))
                  .set_index("timeindex")
                  .astype(float)
        for r in sequence_resources
    }
    sequences = pd.concat(dfs.values(), axis=1)

    skip_sequences = sequences.loc[::n]

    temporal = pd.Series(data=n, index=skip_sequences.index, name="weighting")
    temporal.index.name = "timeindex"

    os.chdir(copied_root)

    for r in sequence_resources:
        write_sequences(r.name + ".csv", dfs[r.name].loc[temporal.index],
                        replace=True)

    # write temporal information from clustering
    temporal.to_csv(
        "data/temporal.csv",
        header=True,
        sep=";",
        date_format="%Y-%m-%dT%H:%M:%SZ",
    )
    # add meta data for new temporal information
    r = Resource({"path": "data/temporal.csv"})
    r.infer()
    r.descriptor["description"] = (
        "Temporal selection based on skipped timesteps. Skipped n={}".format(n))

    # update meta data of the copied package
    cp = Package("datapackage.json")
    cp.descriptor["name"] = copied_package_name
    cp.descriptor["resources"].append(r.descriptor)
    cp.commit()
    cp.save("datapackage.json")

    # set back to the 'old' working directory
    os.chdir(cwd)

    return copied_root

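Both aggregation helpers return the root directory of the copied datapackage. A minimal usage sketch with hypothetical arguments (the paths, cluster count, skip factor and package name below are assumptions for illustration):

# Hypothetical calls: aggregate an existing datapackage either by clustering
# its sequences into 10 typical days or by keeping every 4th timestep.
clustered_root = temporal_clustering("datapackage.json", n=10,
                                     path="/tmp", how="daily")
skipped_root = temporal_skip("datapackage.json", n=4, path="/tmp",
                             name="my-skipped-package")
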
def infer_metadata(
    package_name="default-name",
    keep_resources=False,
    foreign_keys={
        "bus": [
            "volatile",
            "dispatchable",
            "storage",
            "load",
            "reservoir",
            "shortage",
            "excess",
        ],
        "profile": ["load", "volatile", "ror"],
        "from_to_bus": ["connection", "line", "conversion"],
        "chp": ["backpressure", "extraction", "chp"],
    },
    path=None,
):
    """ Add basic meta data for a datapackage

    Parameters
    ----------
    package_name: string
        Name of the data package
    keep_resources: boolean
        Flag indicating whether the resource meta data json-files should be
        kept after the main datapackage.json is created. The resource meta
        data will be stored in the `resources` directory.
    foreign_keys: dict
        Dictionary with foreign key specifications. Keys of the dictionary
        are: 'bus', 'profile', 'from_to_bus'. Values are lists of strings
        with the names of the resources.
    path: string
        Absolute path to the root folder of the datapackage
    """
    current_path = os.getcwd()
    if path:
        print("Setting current work directory to {}".format(path))
        os.chdir(path)

    p = Package()
    p.descriptor["name"] = package_name
    p.descriptor["profile"] = "tabular-data-package"
    p.commit()

    if not os.path.exists("resources"):
        os.makedirs("resources")

    # create meta data resources for elements
    if not os.path.exists("data/elements"):
        print("No data path found in directory {}. Skipping...".format(
            os.getcwd()))
    else:
        for f in os.listdir("data/elements"):
            r = Resource({"path": os.path.join("data/elements", f)})
            r.infer()
            r.descriptor["schema"]["primaryKey"] = "name"

            if r.name in foreign_keys.get("bus", []):
                r.descriptor["schema"]["foreignKeys"] = [{
                    "fields": "bus",
                    "reference": {"resource": "bus", "fields": "name"},
                }]

                if r.name in foreign_keys.get("profile", []):
                    r.descriptor["schema"]["foreignKeys"].append({
                        "fields": "profile",
                        "reference": {"resource": r.name + "_profile"},
                    })

            elif r.name in foreign_keys.get("from_to_bus", []):
                r.descriptor["schema"]["foreignKeys"] = [
                    {
                        "fields": "from_bus",
                        "reference": {"resource": "bus", "fields": "name"},
                    },
                    {
                        "fields": "to_bus",
                        "reference": {"resource": "bus", "fields": "name"},
                    },
                ]

            elif r.name in foreign_keys.get("chp", []):
                r.descriptor["schema"]["foreignKeys"] = [
                    {
                        "fields": "fuel_bus",
                        "reference": {"resource": "bus", "fields": "name"},
                    },
                    {
                        "fields": "electricity_bus",
                        "reference": {"resource": "bus", "fields": "name"},
                    },
                    {
                        "fields": "heat_bus",
                        "reference": {"resource": "bus", "fields": "name"},
                    },
                ]

            r.commit()
            r.save(os.path.join("resources", f.replace(".csv", ".json")))
            p.add_resource(r.descriptor)

    # create meta data resources for sequences
    if not os.path.exists("data/sequences"):
        print("No data path found in directory {}. Skipping...".format(
            os.getcwd()))
    else:
        for f in os.listdir("data/sequences"):
            r = Resource({"path": os.path.join("data/sequences", f)})
            r.infer()
            r.commit()
            r.save(os.path.join("resources", f.replace(".csv", ".json")))
            p.add_resource(r.descriptor)

    p.commit()
    p.save("datapackage.json")

    if not keep_resources:
        shutil.rmtree("resources")

    os.chdir(current_path)

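A minimal usage sketch, with a hypothetical package name and path; the foreign-key mapping shown is just a subset of the defaults above:

# Hypothetical call: build datapackage.json for a package rooted at
# /path/to/my-datapackage, keeping the per-resource JSON files around.
infer_metadata(
    package_name="my-energy-system",
    keep_resources=True,
    foreign_keys={"bus": ["volatile", "load"], "profile": ["load", "volatile"]},
    path="/path/to/my-datapackage",
)
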
from datapackage import Resource

# Create
resource = Resource({'path': 'data/data.csv'})
resource.tabular  # true
resource.headers  # ['city', 'location']
print(resource.read(keyed=True))
# [
#   {city: 'london', location: '51.50,-0.11'},
#   {city: 'paris', location: '48.85,2.30'},
#   {city: 'rome', location: 'N/A'},
# ]

# Infer
resource.infer()
print(resource.descriptor)
# { path: 'data.csv',
#   profile: 'tabular-data-resource',
#   encoding: 'utf-8',
#   name: 'data',
#   format: 'csv',
#   mediatype: 'text/csv',
#   schema: { fields: [ [Object], [Object] ], missingValues: [ '' ] } }
# resource.read(keyed=True)
# Fails with a data validation error

# Tweak
resource.descriptor['schema']['missingValues'] = 'N/A'
resource.commit()
resource.valid  # False
print(resource.errors)
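
The descriptor is invalid at this point because the Table Schema spec requires `missingValues` to be an array of strings, not a bare string. A likely fix, sketched here rather than taken from the snippet above, is:

# Provide missingValues as a list of strings so the descriptor validates
# and 'N/A' is treated as a missing value when reading the data.
resource.descriptor['schema']['missingValues'] = ['', 'N/A']
resource.commit()
resource.valid  # True
resource.read(keyed=True)  # 'N/A' in the rome row now comes back as None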