class iterable_loader(DataStreamProcessor):

    def __init__(self, iterable, name=None):
        super(iterable_loader, self).__init__()
        self.iterable = iterable
        self.name = name

    def handle_iterable(self):
        mode = None
        for x in self.iterable:
            if mode is None:
                assert isinstance(x, (dict, list))
                mode = dict if isinstance(x, dict) else list
            assert isinstance(x, mode)
            if mode == dict:
                yield x
            else:
                yield dict(zip(('col{}'.format(i) for i in range(len(x))), x))

    def process_datapackage(self, dp: Package):
        name = self.name
        if name is None:
            name = 'res_{}'.format(len(dp.resources) + 1)
        self.res = Resource(dict(
            name=name,
            path='{}.csv'.format(name)
        ), storage=iterable_storage(self.handle_iterable()))
        self.res.infer()
        dp.descriptor.setdefault('resources', []).append(self.res.descriptor)
        return dp

    def process_resources(self, resources):
        yield from super(iterable_loader, self).process_resources(resources)
        yield self.res.iter(keyed=True)
def save_as_data_packages(row):
    """ Save a dataset from data.json as a data package.
        We will use these files as a queue to process later. """
    # TODO check if ckanext-datapackager is useful for import
    # or export resources:
    # https://github.com/frictionlessdata/ckanext-datapackager
    package = Package()

    # TODO check this, I'm learning datapackages.
    resource = Resource({'data': row})
    resource.infer()  # adds "name": "inline"
    if not resource.valid:
        raise Exception('Invalid resource')

    encoded_identifier = encode_identifier(identifier=row['identifier'])

    # resource_path = os.path.join(path, f'{prefix}_{encoded_identifier}.json')
    # resource.save(resource_path)

    package.add_resource(descriptor=resource.descriptor)
    folder = config.get_data_packages_folder_path()
    filename = f'data-json-{encoded_identifier}.json'
    package_path = os.path.join(folder, filename)

    # do not rewrite if it already exists
    if not os.path.isfile(package_path):
        package.save(target=package_path)
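# Illustrative call (a hedged sketch, not part of the original module): the
# row below is a made-up data.json record; `encode_identifier` and `config`
# are assumed to be provided by the surrounding module, as above.
save_as_data_packages({
    "identifier": "example-dataset-001",
    "title": "Example dataset",
})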
def get_resource(self, idx=None, path=None, name=None, source_only=True):
    if idx is None:
        if path:
            all_paths = [
                r.descriptor.get("path") for r in self.datapackage.resources
            ]
            if path in all_paths:
                idx = all_paths.index(path)
            else:
                logger.error(f"path = {path} is not in resources.")
        elif name:
            all_names = [
                r.descriptor.get("name") for r in self.datapackage.resources
            ]
            if name in all_names:
                idx = all_names.index(name)
            else:
                logger.error(f"name = {name} is not in resources.")
        else:
            raise Exception(
                "Please specify at least one of the keywords: idx, path, name."
            )

    if self.is_local:
        logger.debug(
            f"Using local dataset for {self.id}, sync it if you need the updated version."
        )
        r = self.datapackage.resources[idx]
        logger.debug(f"using base_path: {str(self.base_path)}")
        logger.debug(f"using descriptor: {r.descriptor}")
        resource = Resource(r.descriptor, base_path=str(self.base_path))
        logger.debug(f"base_path of r_1: {resource._Resource__base_path}")
    elif (not self.is_local) and (self.source == "git"):
        logger.debug("Using remote data")
        self.remote_path = f"{self.metadata_uri[:-16]}"
        r = self.datapackage.resources[idx]
        resource = Resource(
            {
                **(r.descriptor),
                **{"path": self.remote_path + r.descriptor.get("path", "")},
            }
        )
    elif (not self.is_local) and (self.source == "s3"):
        logger.debug("Using remote data")
        logger.debug(
            "Direct resource from S3 is not supported yet. "
            "Please sync the dataset to local using the command line first.\n"
            "TODO: Sync S3 to local after confirmation from here."
        )
        resource = self.datapackage.resources[idx]
    else:
        logger.error("Resource is not supported. Currently supporting S3 and git.")
        resource = self.datapackage.resources[idx]

    if source_only:
        return resource.source
    else:
        return resource
def process_datapackage(self, dp: Package):
    name = self.name
    if name is None:
        name = 'res_{}'.format(len(dp.resources) + 1)
    self.res = Resource(dict(
        name=name,
        path='{}.csv'.format(name)
    ), storage=iterable_storage(self.handle_iterable()))
    self.res.infer()
    dp.descriptor.setdefault('resources', []).append(self.res.descriptor)
    return dp
def process_datapackage(self, dp: Package):
    if isinstance(self.load_source, tuple):
        datapackage_descriptor, _ = self.load_source
        dp.descriptor.setdefault('resources', [])
        self.resource_matcher = ResourceMatcher(self.resources,
                                                datapackage_descriptor)
        for resource_descriptor in datapackage_descriptor['resources']:
            if self.resource_matcher.match(resource_descriptor['name']):
                dp.add_resource(resource_descriptor)
    else:  # load_source is string:
        if self.load_source.startswith('env://'):
            env_var = self.load_source[6:]
            self.load_source = os.environ.get(env_var)
            if self.load_source is None:
                raise ValueError(
                    f"Couldn't find value for env var '{env_var}'")
        if os.path.basename(self.load_source) == 'datapackage.json':
            self.load_dp = Package(self.load_source)
            self.resource_matcher = ResourceMatcher(self.resources,
                                                    self.load_dp)
            dp.descriptor.setdefault('resources', [])
            for resource in self.load_dp.resources:
                if self.resource_matcher.match(resource.name):
                    dp.add_resource(resource.descriptor)
        else:
            if os.path.exists(self.load_source):
                base_path = os.path.dirname(self.load_source) or '.'
                self.load_source = os.path.basename(self.load_source)
            else:
                base_path = None
            descriptor = dict(path=self.load_source,
                              profile='tabular-data-resource')
            descriptor['format'] = self.options.get('format')
            if 'encoding' in self.options:
                descriptor['encoding'] = self.options['encoding']
            if descriptor['format'] == 'xml' or self.load_source.endswith('.xml'):
                self.options.setdefault('custom_parsers', {})['xml'] = XMLParser
            self.options.setdefault('ignore_blank_headers', True)
            self.options.setdefault('headers', 1)
            self.res = Resource(descriptor,
                                base_path=base_path,
                                **self.options)
            self.res.infer(confidence=1, limit=1000)
            if self.name is not None:
                self.res.descriptor['name'] = self.name
            if self.force_strings:
                for f in self.res.descriptor['schema']['fields']:
                    f['type'] = 'string'
            self.res.commit()
            self.res.descriptor['path'] = '{name}.{format}'.format(
                **self.res.descriptor)
            dp.add_resource(self.res.descriptor)
    return dp
def test_raw_data(self, simpsons_dataset, simpsons_datapackage,
                  simpsons_descriptor_path):
    for r in simpsons_datapackage.resources:
        resource = Resource(
            r.descriptor,
            base_path=path.dirname(simpsons_descriptor_path))
        once = simpsons_dataset.raw_data[r.descriptor['name']]
        twice = simpsons_dataset.raw_data[r.descriptor['name']]
        assert_that(once, equal_to(resource.raw_read()))  # Not a generator
        for _ in once:
            pass  # Consume iterable
        assert_that(once, equal_to(twice))
def package_from_resources(resource_path, output_path, clean=True):
    """ Collects resource descriptors and merges them into a datapackage.json

    Parameters
    ----------
    resource_path: string
        Path to directory with resources (in .json format)
    output_path: string
        Root path of the datapackage where the newly created datapackage.json
        is stored
    clean: boolean
        If true, resources will be deleted
    """
    p = Package()
    p.descriptor["profile"] = "tabular-data-package"
    p.commit()

    for f in os.listdir(resource_path):
        path = os.path.join(resource_path, f)
        r = Resource(path)
        p.add_resource(r.descriptor)
        p.commit()
        os.remove(path)

    if clean:
        os.rmdir(resource_path)

    p.save(os.path.join(output_path, "datapackage.json"))
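# Illustrative usage (a sketch; the directory names are assumptions): merge
# all resource descriptors under "resources/" into a datapackage.json in the
# current directory, keeping the individual descriptor files.
package_from_resources(resource_path="resources", output_path=".", clean=False)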
def _make_package(source, publisher, config):
    os.chdir(source)
    files = [f for f in os.listdir('data') if f.endswith('.csv')]
    package = Package({'publisher': publisher})
    for f in files:
        path = f"data/{f}"
        name = f.replace('.csv', '')
        schema = f"https://raw.githubusercontent.com/digital-land/alpha-data/master/schema/{name}-schema.json"
        resource = Resource({'path': path, 'schema': schema})
        package.add_resource(resource.descriptor)

    package.commit()
    package.infer()

    errors = False
    for r in package.resources:
        try:
            r.read(keyed=True)
            r.check_relations()
        except (CastError, RelationError) as e:
            print('Error in', os.path.join(source, r.descriptor['path']))
            print(e, e.errors)
            errors = True

    if not errors:
        package.save('datapackage.zip')
        print('saved datapackage.zip to', source)

        s3 = boto3.client(
            's3',
            aws_access_key_id=config['AWS_ACCESS_KEY_ID'],
            aws_secret_access_key=config['AWS_SECRET_ACCESS_KEY'])
        bucket = 'developer-contributions-datapackages'
        key = f'{publisher}/{uuid.uuid4()}/datapackage.zip'
        s3.upload_file(f'{source}/datapackage.zip',
                       bucket,
                       key,
                       ExtraArgs={'ACL': 'public-read'})

        config = s3._client_config
        config.signature_version = botocore.UNSIGNED
        datapackage_url = boto3.resource(
            's3', config=config).meta.client.generate_presigned_url(
                'get_object',
                ExpiresIn=0,
                Params={'Bucket': bucket, 'Key': key})
        return datapackage_url
def save_datasets_as_data_packages(self, folder_path):
    """ save each dataset from a data.json source as _datapackage_ """
    for dataset in self.package_list:
        package = Package()

        # TODO check this, I'm learning datapackages
        resource = Resource({'data': dataset})
        resource.infer()  # adds "name": "inline"

        identifier = dataset['id']
        bytes_identifier = identifier.encode('utf-8')
        encoded = base64.b64encode(bytes_identifier)
        encoded_identifier = str(encoded, "utf-8")

        resource_path = os.path.join(
            folder_path, f'resource_ckan_api_{encoded_identifier}.json')
        if not resource.valid:
            raise Exception('Invalid resource')

        resource.save(resource_path)

        package.add_resource(descriptor=resource.descriptor)
        package_path = os.path.join(
            folder_path, f'pkg_ckan_api_{encoded_identifier}.zip')
        package.save(target=package_path)
def save_datasets_as_data_packages(self, folder_path):
    """ save each dataset from a data.json source as _datapackage_ """
    for dataset in self.datasets:
        package = Package()

        # TODO check this, I'm learning datapackages
        resource = Resource({'data': dataset})
        resource.infer()  # adds "name": "inline"

        # FIXME: the identifier may contain characters that are invalid in
        # paths (e.g. "/"), and different resources could end up with
        # duplicate paths; use base64 or hashes instead.
        idf = slugify(dataset['identifier'])

        resource_path = os.path.join(folder_path, f'resource_data_json_{idf}.json')
        if not resource.valid:
            raise Exception('Invalid resource')

        resource.save(resource_path)

        package.add_resource(descriptor=resource.descriptor)
        package_path = os.path.join(folder_path, f'pkg_data_json_{idf}.zip')
        package.save(target=package_path)
def update_package_descriptor():
    """
    """
    p = Package("datapackage.json")

    for f in os.listdir("resources"):
        path = os.path.join("resources", f)
        r = Resource(path)
        p.add_resource(r.descriptor)
        p.commit()
        os.remove(path)

    os.rmdir("resources")

    p.save("datapackage.json")
def infer_resources(directory="data/elements"):
    """ Looks at all files in `directory` and creates a datapackage.Resource
    object for each, which is then stored in the `resources` directory.

    Parameters
    ----------
    directory: string
        Path to directory from where resources are inferred
    """
    if not os.path.exists("resources"):
        os.makedirs("resources")

    # create meta data resources
    for f in os.listdir(directory):
        r = Resource({"path": os.path.join(directory, f)})
        r.infer()
        r.save(os.path.join("resources", f.replace(".csv", ".json")))
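# Illustrative usage (a sketch; the directory name is an assumption): infer a
# resource descriptor for every file in data/elements and store it under
# resources/.
infer_resources(directory="data/elements")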
def save_datasets_as_data_packages(self, folder_path, identifier_field):
    """ save each dataset from a data.json source as _datapackage_ """
    for dataset in self.datasets:
        package = Package()

        # TODO check this, I'm learning datapackages
        resource = Resource({'data': dataset})
        resource.infer()  # adds "name": "inline"

        idf = slugify(dataset[identifier_field])

        resource_path = os.path.join(folder_path, f'resource_data_json_{idf}.json')
        if not resource.valid:
            raise Exception('Invalid resource')

        resource.save(resource_path)

        package.add_resource(descriptor=resource.descriptor)
        package_path = os.path.join(folder_path, f'pkg_data_json_{idf}.zip')
        package.save(target=package_path)
def infer_metadata(
    package_name="default-name",
    keep_resources=False,
    foreign_keys={
        "bus": [
            "volatile",
            "dispatchable",
            "storage",
            "load",
            "reservoir",
            "shortage",
            "excess",
        ],
        "profile": ["load", "volatile", "ror"],
        "from_to_bus": ["connection", "line", "conversion"],
        "chp": ["backpressure", "extraction", "chp"],
    },
    path=None,
):
    """ Add basic meta data for a datapackage

    Parameters
    ----------
    package_name: string
        Name of the data package
    keep_resources: boolean
        Flag indicating whether the resource meta data json-files should be
        kept after the main datapackage.json is created. The resource meta
        data will be stored in the `resources` directory.
    foreign_keys: dict
        Dictionary with foreign key specification. Keys for the dictionary
        are: 'bus', 'profile', 'from_to_bus'. Values are lists of strings
        with the names of the resources.
    path: string
        Absolute path to the root folder of the datapackage
    """
    current_path = os.getcwd()
    if path:
        print("Setting current work directory to {}".format(path))
        os.chdir(path)

    p = Package()
    p.descriptor["name"] = package_name
    p.descriptor["profile"] = "tabular-data-package"
    p.commit()

    if not os.path.exists("resources"):
        os.makedirs("resources")

    # create meta data resources for elements
    if not os.path.exists("data/elements"):
        print("No data path found in directory {}. Skipping...".format(
            os.getcwd()))
    else:
        for f in os.listdir("data/elements"):
            r = Resource({"path": os.path.join("data/elements", f)})
            r.infer()
            r.descriptor["schema"]["primaryKey"] = "name"

            if r.name in foreign_keys.get("bus", []):
                r.descriptor["schema"]["foreignKeys"] = [{
                    "fields": "bus",
                    "reference": {
                        "resource": "bus",
                        "fields": "name"
                    },
                }]

                if r.name in foreign_keys.get("profile", []):
                    r.descriptor["schema"]["foreignKeys"].append({
                        "fields": "profile",
                        "reference": {
                            "resource": r.name + "_profile"
                        },
                    })

            elif r.name in foreign_keys.get("from_to_bus", []):
                r.descriptor["schema"]["foreignKeys"] = [
                    {
                        "fields": "from_bus",
                        "reference": {
                            "resource": "bus",
                            "fields": "name"
                        },
                    },
                    {
                        "fields": "to_bus",
                        "reference": {
                            "resource": "bus",
                            "fields": "name"
                        },
                    },
                ]

            elif r.name in foreign_keys.get("chp", []):
                r.descriptor["schema"]["foreignKeys"] = [
                    {
                        "fields": "fuel_bus",
                        "reference": {
                            "resource": "bus",
                            "fields": "name"
                        },
                    },
                    {
                        "fields": "electricity_bus",
                        "reference": {
                            "resource": "bus",
                            "fields": "name"
                        },
                    },
                    {
                        "fields": "heat_bus",
                        "reference": {
                            "resource": "bus",
                            "fields": "name"
                        },
                    },
                ]

            r.commit()
            r.save(os.path.join("resources", f.replace(".csv", ".json")))
            p.add_resource(r.descriptor)

    # create meta data resources for sequences
    if not os.path.exists("data/sequences"):
        print("No data path found in directory {}. Skipping...".format(
            os.getcwd()))
    else:
        for f in os.listdir("data/sequences"):
            r = Resource({"path": os.path.join("data/sequences", f)})
            r.infer()
            r.commit()
            r.save(os.path.join("resources", f.replace(".csv", ".json")))
            p.add_resource(r.descriptor)

    p.commit()
    p.save("datapackage.json")

    if not keep_resources:
        shutil.rmtree("resources")

    os.chdir(current_path)
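# Illustrative usage (a sketch; the package name and path are assumptions):
# build a datapackage.json for a package rooted at /tmp/my-package and keep
# the intermediate resource descriptors for inspection.
infer_metadata(
    package_name="my-energy-system",
    keep_resources=True,
    path="/tmp/my-package",
)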
def create_resource(path):
    from datapackage import Resource
    resource = Resource({'path': path})
    resource.infer()
    resource.descriptor['schema']['primaryKey'] = 'timeindex'
    resource.descriptor['description'] = (
        'Profiles for Run of River (ROR) components. The profile is assumed'
        ' to be constant during the year.')
    resource.descriptor['title'] = 'ROR profiles'
    resource.descriptor['sources'] = [{'title': 'Assumption'}]
    resource.commit()

    if resource.valid:
        resource.save('resources/' + resource.name + '.json')
def create_resource(path):
    """
    """
    mapper = {}

    from datapackage import Resource
    resource = Resource({'path': path})
    resource.infer()
    resource.descriptor['schema']['primaryKey'] = 'name'
    resource.descriptor['description'] = (
        'Installed capacities, costs and technical parameters for components')
    resource.descriptor['title'] = '{} components'.format(resource.name.title())
    resource.descriptor['sources'] = [{
        'title': 'E-Highway 2050 installed capacities',
        'path': 'http://www.e-highway2050.eu/fileadmin/documents/Results/e-Highway2050_2050_Country_and_cluster_installed_capacities_31-03-2015.xlsx'
    }]
    resource.descriptor['schema']['foreignKeys'] = [{
        "fields": "bus",
        "reference": {
            "resource": "bus",
            "fields": "name"
        }
    }]

    if 'demand' in resource.name:
        resource.descriptor['schema']['foreignKeys'].append({
            "fields": "profile",
            "reference": {
                "resource": "demand-profiles"
            }
        })
    elif 'volatile-generator' in resource.name:
        resource.descriptor['schema']['foreignKeys'].append({
            "fields": "profile",
            "reference": {
                "resource": "generator-profiles"
            }
        })

    resource.commit()

    if resource.valid:
        resource.save('resources/' + resource.name + '.json')
    else:
        print('Resource is not valid, writing resource anyway...')
        resource.save('resources/' + resource.name + '.json')
herb = herb.get("herb") return herb else: logger.error(f"Could not find herb {id}") return if __name__ == "__main__": from datapackage import Resource fl = Flora(flora="/Users/leima/dataherb/flora/flora.json") hb = fl.herb("git-data-science-job") print(f"herb base_path: {hb.base_path}") rs = hb.resources[0] rs_1 = Resource(rs.descriptor, base_path=str(hb.base_path)) print(f"{rs.tabular}") # rs_2.read() rs.read() print(hb.get_resource(path="dataset/stackoverflow_job_listing.csv")) logger.debug("End of Game")
    'Pinst': 'installed capacity in MW',
    'efactor': 'energy that can be gained from the water kwh/m3',
    'head': 'difference in altitude in m',
    'total_flow': 'inflow to the power plant in mio m3',
    'flo_river_ror': 'next downstream res_nr',
    'status': 'operational status of the plant',
    'company': None,
    'turbtype': 'optional: turbine type',
    'geodbid': 'specified id for geo referencing',
    'river': 'river in which the plant is located',
    'river_km': 'km from stream source',
    'level_meter': 'assigned level meter for flow curve'
}

# create resource
r = Resource({'path': 'data/runofriver.csv'})

# get basic metadata from data
r.infer()

# add description for fields based on mapper
for i in range(len(r.descriptor['schema']['fields'])):
    r.descriptor['schema']['fields'][i]['description'] = \
        description_mapper[r.descriptor['schema']['fields'][i]['name']]

# commit (apply) changes to resource
r.commit()

# save the resource
r.save('dataresource.json')
def create_resource(path, title):
    from datapackage import Resource
    resource = Resource({'path': path})
    resource.infer()
    resource.descriptor['schema']['primaryKey'] = 'name'
    resource.descriptor['description'] = (
        'Installed capacities, costs and technical parameters for components')
    resource.descriptor['title'] = title
    resource.descriptor['sources'] = [{
        'title': 'Restore 2050 hydro inflow timeseries',
        'path': 'https://zenodo.org/record/804244/files/Hydro_Inflow.zip'
    }, {
        'title': 'E-Highway 2050 installed capacities',
        'path': 'http://www.e-highway2050.eu/fileadmin/documents/Results/e-Highway2050_2050_Country_and_cluster_installed_capacities_31-03-2015.xlsx'
    }, {
        'title': 'DIW Berlin - Current and Prospective Costs of Electricity Generation until 2050',
        'path': 'https://www.diw.de/documents/publikationen/73/diw_01.c.424566.de/diw_datadoc_2013-068.pdf'
    }]
    resource.descriptor['schema']['foreignKeys'] = [{
        "fields": "bus",
        "reference": {
            "resource": "bus",
            "fields": "name"
        }
    }]
    resource.commit()

    if resource.valid:
        resource.save('resources/' + resource.name + '.json')
def create_resource(path):
    from datapackage import Resource
    resource = Resource({'path': path})
    resource.infer()
    resource.descriptor['schema']['primaryKey'] = 'timeindex'
    resource.descriptor['description'] = 'Demand profiles per country'
    resource.descriptor['title'] = 'Demand profiles'
    resource.descriptor['sources'] = [{
        'title': 'OPSD timeseries',
        'path': 'https://data.open-power-system-data.org/time_series/2017-07-09/' +
                'time_series_60min_singleindex.csv'
    }]
    resource.commit()

    if resource.valid:
        resource.save('resources/' + resource.name + '.json')
def temporal_clustering(datapackage, n, path="/tmp", how="daily"):
    """ Creates a new datapackage by aggregating sequences inside the
    `sequence` folder of the specified datapackage by clustering `n` timesteps

    Parameters
    ----------
    datapackage: string
        String of meta data file datapackage.json
    n: integer
        Number of clusters
    path: string
        Path to directory where the aggregated datapackage is stored
    how: string
        How to cluster: 'daily' or 'hourly'
    """
    if how == "weekly":
        raise NotImplementedError("Weekly clustering is not implemented!")

    p = Package(datapackage)

    cwd = os.getcwd()

    copied_package_name = (p.descriptor["name"] + "__temporal_cluster__" +
                           how + "_" + str(n))

    copy_path = os.path.join(path, p.descriptor["name"], copied_package_name)

    copied_root = copy_datapackage(datapackage,
                                   os.path.abspath(copy_path),
                                   subset="data")

    sequence_resources = [
        r for r in p.resources
        if re.match(r"^data/sequences/.*$", r.descriptor["path"])
    ]

    dfs = {
        r.name: pd.DataFrame(r.read(keyed=True)).set_index("timeindex").astype(float)
        for r in sequence_resources
    }
    sequences = pd.concat(dfs.values(), axis=1)

    if how == "daily":
        hoursPerPeriod = 24
    elif how == "hourly":
        hoursPerPeriod = 1
    elif how == "weekly":
        hoursPerPeriod = 24 * 7

    aggregation = tsam.TimeSeriesAggregation(
        sequences,
        noTypicalPeriods=n,
        rescaleClusterPeriods=False,
        hoursPerPeriod=hoursPerPeriod,
        clusterMethod="hierarchical",
    )

    cluster_weights = {
        aggregation.clusterCenterIndices[n]: w
        for n, w in aggregation.clusterPeriodNoOccur.items()
    }

    if how == "daily":
        temporal = pd.Series(
            {
                d: cluster_weights[d.dayofyear]
                for d in sequences.index
                if d.dayofyear in aggregation.clusterCenterIndices
            },
            name="weighting",
        )
        temporal.index.name = "timeindex"
    elif how == "hourly":
        temporal = pd.Series(
            {
                h: cluster_weights[sequences.index.get_loc(h)]
                for h in sequences.index
                if sequences.index.get_loc(h) in aggregation.clusterCenterIndices
            },
            name="weighting",
        )
        temporal.index.name = "timeindex"

    # write resources to copied package (should not interfere with meta data)
    # as columns are not removed and sorted when written.
    os.chdir(copied_root)

    for r in sequence_resources:
        write_sequences(r.name + ".csv",
                        dfs[r.name].loc[temporal.index],
                        replace=True)

    # write temporal information from clustering
    temporal.to_csv(
        "data/temporal.csv",
        header=True,
        sep=";",
        date_format="%Y-%m-%dT%H:%M:%SZ",
    )

    # add meta data for new temporal information
    r = Resource({"path": "data/temporal.csv"})
    r.infer()
    # TODO: Add meta-data description
    r.descriptor["description"] = (
        "Temporal selection based on hierarchical clustering...")

    # Update meta-data of copied package
    cp = Package("datapackage.json")
    cp.descriptor["name"] = copied_package_name
    cp.descriptor["resources"].append(r.descriptor)
    cp.commit()
    cp.save("datapackage.json")

    # set back to 'old' work directory
    os.chdir(cwd)

    return copied_root
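# Illustrative usage (a sketch; the arguments are assumptions): aggregate the
# sequences of an existing datapackage into 10 typical days and work with the
# copied package root that is returned.
clustered_root = temporal_clustering("datapackage.json", n=10, path="/tmp",
                                     how="daily")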
# -*- coding: utf-8 -*-
"""
"""
import os

from datapackage import Package, Resource

p = Package('datapackage.json')

p.descriptor['profile'] = 'tabular-data-package'

for f in os.listdir('resources'):
    path = os.path.join('resources', f)
    r = Resource(path)
    p.add_resource(r.descriptor)
    p.commit()
    os.remove(path)

os.rmdir('resources')

p.save('datapackage.json')
from datapackage import Resource

# Create
resource = Resource({'path': 'data/data.csv'})
resource.tabular  # true
resource.headers  # ['city', 'location']
print(resource.read(keyed=True))
# [
#   {city: 'london', location: '51.50,-0.11'},
#   {city: 'paris', location: '48.85,2.30'},
#   {city: 'rome', location: 'N/A'},
# ]

# Infer
resource.infer()
print(resource.descriptor)
# { path: 'data.csv',
#   profile: 'tabular-data-resource',
#   encoding: 'utf-8',
#   name: 'data',
#   format: 'csv',
#   mediatype: 'text/csv',
#   schema: { fields: [ [Object], [Object] ], missingValues: [ '' ] } }
# resource.read(keyed=True)
# Fails with a data validation error

# Tweak
resource.descriptor['schema']['missingValues'] = 'N/A'
resource.commit()
resource.valid  # False
print(resource.errors)
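# Follow-up sketch (not part of the snippet above): `missingValues` must be a
# list of strings, so assigning a list instead of a bare string restores a
# valid descriptor and lets the 'N/A' cell be read back as None.
resource.descriptor['schema']['missingValues'] = ['', 'N/A']
resource.commit()
resource.valid  # True
print(resource.read(keyed=True))
# [
#   {city: 'london', location: '51.50,-0.11'},
#   {city: 'paris', location: '48.85,2.30'},
#   {city: 'rome', location: None},
# ]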
def create_resource(path):
    """
    """
    from datapackage import Resource
    resource = Resource({'path': path})
    resource.infer()
    resource.descriptor['schema']['primaryKey'] = 'name'
    resource.descriptor['description'] = (
        'Contains the hubs (nodes) for the energy system representation')
    resource.descriptor['title'] = (
        'Energy system hubs for DE and its electrical neighbours')
    resource.descriptor['sources'] = [{
        'title': 'NUTS Shapefiles',
        'path': 'http://ec.europa.eu/eurostat/cache/GISCO/geodatafiles/NUTS_2013_10M_SH.zip',
        'files': [
            'NUTS_2013_10M_SH/data/NUTS_RG_10M_2013.shp',
            'NUTS_2013_10M_SH/data/NUTS_RG_10M_2013.dbf'
        ]
    }]
    resource.commit()

    resource.descriptor

    if resource.valid:
        resource.save('resources/' + resource.name + '.json')
def create_resource(path):
    from datapackage import Resource
    resource = Resource({'path': path})
    resource.infer()
    resource.descriptor['schema']['primaryKey'] = 'name'
    resource.descriptor['description'] = (
        'Installed transmission capacities from the e-highway 2050 scenario')
    resource.descriptor['title'] = 'Installed transmission capacities'
    resource.descriptor['sources'] = [{
        'title': 'E-Highway 2050 transmission capacities',
        'path': 'http://www.e-highway2050.eu/fileadmin/documents/' +
                'Results/e-Highway_database_per_country-08022016.xlsx'
    }]
    resource.descriptor['schema']['foreignKeys'] = [{
        "fields": "from_bus",
        "reference": {
            "resource": "bus",
            "fields": "name"
        }
    }, {
        "fields": "to_bus",
        "reference": {
            "resource": "bus",
            "fields": "name"
        }
    }]
    resource.commit()

    if resource.valid:
        resource.save('resources/' + resource.name + '.json')
def create_resource(path):
    from datapackage import Resource
    resource = Resource({'path': path})
    resource.infer()
    resource.descriptor['schema']['primaryKey'] = 'timeindex'
    resource.descriptor['description'] = (
        'PV profiles (capacity factors) from renewables ninja for each country')
    resource.descriptor['title'] = 'PV profiles'
    resource.descriptor['sources'] = [{
        'title': 'Renewables Ninja PV Capacity Factors',
        'path': 'https://www.renewables.ninja/static/downloads/ninja_europe_pv_v1.1.zip'
    }]
    resource.commit()

    if resource.valid:
        resource.save('resources/' + resource.name + '.json')
def temporal_skip(datapackage, n, path="/tmp", name=None, *args):
    """ Creates a new datapackage by aggregating sequences inside the
    `sequence` folder of the specified datapackage by skipping `n` timesteps

    Parameters
    ----------
    datapackage: string
        String of meta data file datapackage.json
    n: integer
        Number of timesteps to skip
    path: string
        Path to directory where the aggregated datapackage is stored
    name: string
        Name of the new, aggregated datapackage. If not specified a name
        will be given
    """
    p = Package(datapackage)

    cwd = os.getcwd()

    if name is None:
        copied_package_name = (p.descriptor["name"] + "__temporal_skip__" +
                               str(n))
    else:
        copied_package_name = name

    copy_path = os.path.join(path, copied_package_name)

    copied_root = copy_datapackage(datapackage,
                                   os.path.abspath(copy_path),
                                   subset="data")

    sequence_resources = [
        r for r in p.resources
        if re.match(r"^data/sequences/.*$", r.descriptor["path"])
    ]

    dfs = {
        r.name: pd.DataFrame(r.read(keyed=True)).set_index("timeindex").astype(float)
        for r in sequence_resources
    }
    sequences = pd.concat(dfs.values(), axis=1)

    skip_sequences = sequences.loc[::n]

    temporal = pd.Series(data=n, index=skip_sequences.index, name="weighting")
    temporal.index.name = "timeindex"

    os.chdir(copied_root)

    for r in sequence_resources:
        write_sequences(r.name + ".csv",
                        dfs[r.name].loc[temporal.index],
                        replace=True)

    # write temporal information from clustering
    temporal.to_csv(
        "data/temporal.csv",
        header=True,
        sep=";",
        date_format="%Y-%m-%dT%H:%M:%SZ",
    )

    # add meta data for new temporal information
    r = Resource({"path": "data/temporal.csv"})
    r.infer()
    r.descriptor["description"] = (
        "Temporal selection based on skipped timesteps. Skipped n={}".format(n))

    # Update meta-data of copied package
    cp = Package("datapackage.json")
    cp.descriptor["name"] = copied_package_name
    cp.descriptor["resources"].append(r.descriptor)
    cp.commit()
    cp.save("datapackage.json")

    # set back to 'old' work directory
    os.chdir(cwd)

    return copied_root
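# Illustrative usage (a sketch; the arguments are assumptions): keep every
# 4th timestep of the sequences in the given datapackage and store the
# reduced copy under /tmp.
skipped_root = temporal_skip("datapackage.json", n=4, path="/tmp",
                             name="my-package__temporal_skip__4")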
def create_resource(path):
    """
    """
    from datapackage import Resource
    resource = Resource({'path': path})
    resource.infer()
    resource.descriptor['schema']['primaryKey'] = 'name'
    resource.descriptor['description'] = (
        'Excess slacks for each electricity hub in the energy system representation')
    resource.descriptor['title'] = (
        'Excess slacks for DE and its electrical neighbours')
    resource.descriptor['schema']['foreignKeys'] = [{
        "fields": "bus",
        "reference": {
            "resource": "bus",
            "fields": "name"
        }
    }]
    resource.commit()

    resource.descriptor

    if resource.valid:
        resource.save('resources/' + resource.name + '.json')
class load(DataStreamProcessor):

    def __init__(self, load_source, name=None, resources=None,
                 validate=False, strip=True, **options):
        super(load, self).__init__()
        self.load_source = load_source
        self.options = options
        self.name = name
        self.resources = resources
        self.load_dp = None
        self.validate = validate
        self.strip = strip
        self.force_strings = options.get('force_strings') is True

    def process_datapackage(self, dp: Package):
        if isinstance(self.load_source, tuple):
            datapackage_descriptor, _ = self.load_source
            dp.descriptor.setdefault('resources', [])
            self.resource_matcher = ResourceMatcher(self.resources,
                                                    datapackage_descriptor)
            for resource_descriptor in datapackage_descriptor['resources']:
                if self.resource_matcher.match(resource_descriptor['name']):
                    dp.add_resource(resource_descriptor)
        else:  # load_source is string:
            if self.load_source.startswith('env://'):
                env_var = self.load_source[6:]
                self.load_source = os.environ.get(env_var)
                if self.load_source is None:
                    raise ValueError(
                        f"Couldn't find value for env var '{env_var}'")
            if os.path.basename(self.load_source) == 'datapackage.json':
                self.load_dp = Package(self.load_source)
                self.resource_matcher = ResourceMatcher(self.resources,
                                                        self.load_dp)
                dp.descriptor.setdefault('resources', [])
                for resource in self.load_dp.resources:
                    if self.resource_matcher.match(resource.name):
                        dp.add_resource(resource.descriptor)
            else:
                if os.path.exists(self.load_source):
                    base_path = os.path.dirname(self.load_source) or '.'
                    self.load_source = os.path.basename(self.load_source)
                else:
                    base_path = None
                descriptor = dict(path=self.load_source,
                                  profile='tabular-data-resource')
                descriptor['format'] = self.options.get('format')
                if 'encoding' in self.options:
                    descriptor['encoding'] = self.options['encoding']
                if descriptor['format'] == 'xml' or self.load_source.endswith('.xml'):
                    self.options.setdefault('custom_parsers', {})['xml'] = XMLParser
                self.options.setdefault('ignore_blank_headers', True)
                self.options.setdefault('headers', 1)
                self.res = Resource(descriptor,
                                    base_path=base_path,
                                    **self.options)
                self.res.infer(confidence=1, limit=1000)
                if self.name is not None:
                    self.res.descriptor['name'] = self.name
                if self.force_strings:
                    for f in self.res.descriptor['schema']['fields']:
                        f['type'] = 'string'
                self.res.commit()
                self.res.descriptor['path'] = '{name}.{format}'.format(
                    **self.res.descriptor)
                dp.add_resource(self.res.descriptor)
        return dp

    def stripper(self, iterator):
        for r in iterator:
            yield dict((k, v.strip()) if isinstance(v, str) else (k, v)
                       for k, v in r.items())

    def process_resources(self, resources):
        yield from super(load, self).process_resources(resources)
        if isinstance(self.load_source, tuple):
            datapackage_descriptor, resources = self.load_source
            yield from (resource
                        for resource, descriptor in zip(
                            resources, datapackage_descriptor['resources'])
                        if self.resource_matcher.match(descriptor['name']))
        elif self.load_dp is not None:
            yield from (resource.iter(keyed=True)
                        for resource in self.load_dp.resources
                        if self.resource_matcher.match(resource.name))
        else:
            it = self.res.iter(keyed=True, cast=False)
            if self.validate:
                it = schema_validator(self.res, it)
            if self.strip:
                it = self.stripper(it)
            yield it
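# Illustrative usage (a sketch, assuming the surrounding dataflows package is
# installed and that "data.csv" is a placeholder path): run the load
# processor inside a Flow and collect the loaded rows.
from dataflows import Flow

results, dp, stats = Flow(load('data.csv')).results()
print(results[0][:3])  # first three rows of the loaded resource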