import json

from datapackage import Package


def datapackage_creator(location, title, name, source_title, source_path):
    package = Package()
    package.descriptor['title'] = title
    package.descriptor['name'] = name
    package.descriptor['sources'] = [{}]
    package.descriptor['sources'][0]['title'] = source_title
    package.descriptor['sources'][0]['path'] = source_path
    # The Data Package spec key is 'licenses' (the original used 'licences').
    package.descriptor['licenses'] = [{}]
    package.descriptor['licenses'][0]['name'] = 'odc-pddl'
    package.descriptor['licenses'][0]['title'] = 'Open Data Commons Public Domain Dedication and Licence (PDDL)'
    package.descriptor['licenses'][0]['path'] = 'http://opendatacommons.org/licenses/pddl/'
    package.commit()
    package.infer(location + '/data/*.csv')
    package_json = package.descriptor
    del package_json['profile']
    # Make resource paths relative to the package root.
    for resource in package_json['resources']:
        resource['path'] = resource['path'][len(location) + 1:]
    if package.valid:
        with open(location + '/datapackage.json', 'w') as data_file:
            json.dump(package_json, data_file, indent=4, sort_keys=True)
        return True
    else:
        print('DATAPACKAGE IS NOT VALID')
        return False
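A minimal usage sketch for datapackage_creator above; the location, titles and source URL are hypothetical and assume a project root that contains CSV files under data/.

ok = datapackage_creator(
    location='/home/user/my-dataset',          # hypothetical root containing data/*.csv
    title='My Example Dataset',
    name='my-example-dataset',
    source_title='Example Source',
    source_path='https://example.org/source',  # hypothetical source URL
)
print(ok)  # True once datapackage.json has been written, False if validation failed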
import os

from datapackage import Package, Resource


def package_from_resources(resource_path, output_path, clean=True):
    """ Collects resource descriptors and merges them in a datapackage.json

    Parameters
    ----------
    resource_path: string
        Path to directory with resources (in .json format)
    output_path: string
        Root path of datapackage where the newly created datapackage.json is
        stored
    clean: boolean
        If true, resources will be deleted
    """
    p = Package()
    p.descriptor["profile"] = "tabular-data-package"
    p.commit()

    for f in os.listdir(resource_path):
        path = os.path.join(resource_path, f)
        r = Resource(path)
        p.add_resource(r.descriptor)
        p.commit()
        os.remove(path)

    if clean:
        os.rmdir(resource_path)

    p.save(os.path.join(output_path, "datapackage.json"))
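A usage sketch, assuming a resources/ directory of per-resource JSON descriptors next to the data; the paths are hypothetical.

package_from_resources(
    resource_path="resources",   # directory of per-resource *.json descriptors
    output_path=".",             # datapackage.json is written here
    clean=True,                  # remove the now-empty resources/ directory afterwards
)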
def test_changing_resources_in_descriptor_changes_datapackage():
    descriptor = {
        'resources': [
            {'data': '万事开头难'}
        ]
    }
    package = Package(descriptor)
    package.descriptor['resources'][0]['name'] = 'saying'
    package.commit()
    assert package.descriptor['resources'][0]['name'] == 'saying'
import os
import uuid

import boto3
import botocore
from datapackage import Package, Resource
# CastError and RelationError come from tableschema, which datapackage builds on.
from tableschema.exceptions import CastError, RelationError


def _make_package(source, publisher, config):
    os.chdir(source)
    files = [f for f in os.listdir('data') if f.endswith('.csv')]
    package = Package({'publisher': publisher})
    for f in files:
        path = f"data/{f}"
        name = f.replace('.csv', '')
        schema = f"https://raw.githubusercontent.com/digital-land/alpha-data/master/schema/{name}-schema.json"
        resource = Resource({'path': path, 'schema': schema})
        package.add_resource(resource.descriptor)
    package.commit()
    package.infer()

    # Validate every resource before packaging and uploading.
    errors = False
    for r in package.resources:
        try:
            r.read(keyed=True)
            r.check_relations()
        except (CastError, RelationError) as e:
            print('Error in', os.path.join(source, r.descriptor['path']))
            print(e, e.errors)
            errors = True

    if not errors:
        package.save('datapackage.zip')
        print('saved datapackage.json to', source)
        s3 = boto3.client(
            's3',
            aws_access_key_id=config['AWS_ACCESS_KEY_ID'],
            aws_secret_access_key=config['AWS_SECRET_ACCESS_KEY'])
        bucket = 'developer-contributions-datapackages'
        key = f'{publisher}/{uuid.uuid4()}/datapackage.zip'
        s3.upload_file(f'{source}/datapackage.zip', bucket, key,
                       ExtraArgs={'ACL': 'public-read'})
        config = s3._client_config
        config.signature_version = botocore.UNSIGNED
        datapackage_url = boto3.resource(
            's3', config=config).meta.client.generate_presigned_url(
                'get_object',
                ExpiresIn=0,
                Params={'Bucket': bucket, 'Key': key})
        return datapackage_url
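A hypothetical call of _make_package; it assumes source points at a folder containing a data/ directory of CSVs and that config carries AWS credentials for the S3 upload. The function returns None if any resource fails validation.

config = {
    "AWS_ACCESS_KEY_ID": "...",        # placeholder credentials
    "AWS_SECRET_ACCESS_KEY": "...",
}
url = _make_package("/path/to/source", "some-publisher", config)
print(url)  # pre-signed URL of the uploaded datapackage.zip, or None on validation errors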
def test_can_add_resource_to_descriptor_in_place():
    resource = {
        'data': '万事开头难',
    }
    package = Package()
    resources = package.descriptor.get('resources', [])
    resources.append(resource)
    package.descriptor['resources'] = resources
    package.commit()
    assert len(package.resources) == 1
    assert package.resources[0].source == '万事开头难'
def process_datapackage(self, dp: Package):
    super().process_datapackage(dp)
    descriptor = dp.descriptor
    source: DataStream
    for source in self.sources:
        res1 = descriptor.pop('resources', [])
        res2 = source.dp.descriptor['resources']
        descriptor.update(source.dp.descriptor)
        descriptor['resources'] = res1 + res2
    dp.commit()
    return dp
def test_can_remove_resource_from_descriptor_in_place():
    descriptor = {
        'resources': [
            {'data': '万事开头难'},
            {'data': 'All beginnings are hard'}
        ]
    }
    package = Package(descriptor)
    del package.descriptor['resources'][1]
    package.commit()
    assert len(package.resources) == 1
    assert package.resources[0].source == '万事开头难'
def convert_hdx_dataset(self, dataset_id, path):
    dataset = Dataset.read_from_hdx(dataset_id)
    package = Package({
        'id': dataset['id'],
        'name': dataset['name'],
        'title': dataset['title'],
        'description': dataset['notes'],
    })
    for hdx_resource in dataset.get_resources():
        name = hdx_resource['name'].lower().replace(' ', '_')
        package.add_resource({
            'name': name,
            'path': hdx_resource['url'],
            'format': hdx_resource['format'].lower(),
            'title': hdx_resource['description'],
        })
    try:
        package.infer()
    except tabulator.exceptions.FormatError:
        pass
    for frictionless_resource in package.descriptor['resources']:
        self.convert_hxl_url(frictionless_resource)
    package.commit()
    package.save(path)
def update_package_descriptor():
    """Appends the descriptors found in the `resources` directory to an
    existing datapackage.json and removes the per-resource files afterwards.
    """
    p = Package("datapackage.json")

    for f in os.listdir("resources"):
        path = os.path.join("resources", f)
        r = Resource(path)
        p.add_resource(r.descriptor)
        p.commit()
        os.remove(path)

    os.rmdir("resources")
    p.save("datapackage.json")
import json

from datapackage import Package


def datapackage_creator(location, title, name, source_title, source_path):
    package = Package()
    package.descriptor['title'] = title
    package.descriptor['name'] = name
    package.descriptor['sources'] = [{}]
    package.descriptor['sources'][0]['title'] = source_title
    package.descriptor['sources'][0]['path'] = source_path
    # The Data Package spec key is 'licenses' (the original used 'licences').
    package.descriptor['licenses'] = [{}]
    package.descriptor['licenses'][0]['name'] = 'odc-pddl'
    package.descriptor['licenses'][0]['title'] = 'Open Data Commons Public Domain Dedication and Licence (PDDL)'
    package.descriptor['licenses'][0]['path'] = 'http://opendatacommons.org/licenses/pddl/'
    package.commit()
    package.infer(location + '/data/*.csv')
    package_json = package.descriptor
    del package_json['profile']
    with open(location + '/datapackage.json', 'w') as data_file:
        json.dump(package_json, data_file, indent=4, sort_keys=True)
import os
import re

import pandas as pd
import tsam.timeseriesaggregation as tsam
from datapackage import Package, Resource

# `copy_datapackage` and `write_sequences` are helpers defined alongside this
# function in the original module.


def temporal_clustering(datapackage, n, path="/tmp", how="daily"):
    """ Creates a new datapackage by aggregating sequences inside the
    `sequence` folder of the specified datapackage by clustering `n` timesteps

    Parameters
    ----------
    datapackage: string
        String of meta data file datapackage.json
    n: integer
        Number of clusters
    path: string
        Path to directory where the aggregated datapackage is stored
    how: string
        How to cluster 'daily' or 'hourly'
    """
    if how == "weekly":
        raise NotImplementedError("Weekly clustering is not implemented!")

    p = Package(datapackage)

    cwd = os.getcwd()

    copied_package_name = (
        p.descriptor["name"] + "__temporal_cluster__" + how + "_" + str(n)
    )

    copy_path = os.path.join(path, p.descriptor["name"], copied_package_name)

    copied_root = copy_datapackage(
        datapackage, os.path.abspath(copy_path), subset="data"
    )

    sequence_resources = [
        r
        for r in p.resources
        if re.match(r"^data/sequences/.*$", r.descriptor["path"])
    ]

    dfs = {
        r.name: pd.DataFrame(r.read(keyed="True"))
        .set_index("timeindex")
        .astype(float)
        for r in sequence_resources
    }
    sequences = pd.concat(dfs.values(), axis=1)

    if how == "daily":
        hoursPerPeriod = 24
    elif how == "hourly":
        hoursPerPeriod = 1
    elif how == "weekly":
        hoursPerPeriod = 24 * 7

    aggregation = tsam.TimeSeriesAggregation(
        sequences,
        noTypicalPeriods=n,
        rescaleClusterPeriods=False,
        hoursPerPeriod=hoursPerPeriod,
        clusterMethod="hierarchical",
    )

    cluster_weights = {
        aggregation.clusterCenterIndices[n]: w
        for n, w in aggregation.clusterPeriodNoOccur.items()
    }

    if how == "daily":
        temporal = pd.Series(
            {
                d: cluster_weights[d.dayofyear]
                for d in sequences.index
                if d.dayofyear in aggregation.clusterCenterIndices
            },
            name="weighting",
        )
        temporal.index.name = "timeindex"
    elif how == "hourly":
        temporal = pd.Series(
            {
                h: cluster_weights[sequences.index.get_loc(h)]
                for h in sequences.index
                if sequences.index.get_loc(h) in aggregation.clusterCenterIndices
            },
            name="weighting",
        )
        temporal.index.name = "timeindex"

    # write resources to copied package (should not interfere with meta data)
    # as columns are not removed and sorted when written.
    os.chdir(copied_root)
    for r in sequence_resources:
        write_sequences(
            r.name + ".csv", dfs[r.name].loc[temporal.index], replace=True
        )

    # write temporal information from clustering
    temporal.to_csv(
        "data/temporal.csv",
        header=True,
        sep=";",
        date_format="%Y-%m-%dT%H:%M:%SZ",
    )

    # add meta data for new temporal information
    r = Resource({"path": "data/temporal.csv"})
    r.infer()

    # TODO: Add meta-data description
    r.descriptor["description"] = (
        "Temporal selection based on hierarchical clustering..."
    )

    # Update meta-data of copied package
    cp = Package("datapackage.json")
    cp.descriptor["name"] = copied_package_name
    cp.descriptor["resources"].append(r.descriptor)
    cp.commit()
    cp.save("datapackage.json")

    # set back to 'old' work directory
    os.chdir(cwd)

    return copied_root
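A usage sketch of temporal_clustering: aggregate the sequences of an existing package into five typical days. The datapackage.json path and output directory are hypothetical.

copied_root = temporal_clustering(
    "datapackage.json",   # meta data file of the package to aggregate
    n=5,                  # number of typical periods (clusters)
    path="/tmp",          # where the aggregated copy is written
    how="daily",
)
print(copied_root)        # root directory of the new, aggregated datapackage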
fields[i]["description"] = desc_list[i] display(merops_pkg.descriptor["resources"][0]["schema"]["fields"]) # Add additional metadata to package. merops_pkg.descriptor["keywords"] = ["peptide", "protein", "peptidase", "proteinase", "protease", "bioinformatics", "protein informatics", "MEROPS", "cleavage", "proteolysis"] merops_pkg.descriptor["title"] = "Human peptidase families" merops_pkg.descriptor["contributors"] = {"title": "JRMA Maasch", "role": "author"} merops_pkg.descriptor["licenses"] = [{"name": "CC0-1.0", "title": "CC0 1.0", "path": "https://creativecommons.org/publicdomain/zero/1.0/"}] merops_pkg.descriptor["description"] = "A dataset of human peptidase families as scraped from the MEROPS Peptidase Database in June 2020 (https://www.ebi.ac.uk/merops/index.shtml)." # Display updated package. display(merops_pkg.descriptor) # Save data package. merops_pkg.commit() merops_pkg.save("merops_data_pkg.zip")
def generate_package(path_to_package):
    """Creates a datapackage in folder ``path_to_package``

    [{'fields': 'REGION', 'reference': {'resource': 'REGION', 'fields': 'VALUE'}}]
    """
    datapath = os.path.join(path_to_package)
    package = Package(base_path=datapath)

    package.infer("data/*.csv")

    package.descriptor["licenses"] = [{
        "name": "CC-BY-4.0",
        "path": "https://creativecommons.org/licenses/by/4.0/",
        "title": "Creative Commons Attribution 4.0",
    }]

    package.descriptor["title"] = "The OSeMOSYS Simplicity Example Model"

    package.descriptor["name"] = "osemosys_model_simplicity"

    package.descriptor["contributors"] = [{
        "title": "Will Usher",
        "email": "*****@*****.**",
        "path": "http://www.kth.se/wusher",
        "role": "author",
    }]

    package.commit()

    config = read_packaged_file("config.yaml", "otoole.preprocess")

    new_resources = []
    for resource in package.resources:

        descriptor = resource.descriptor

        name = resource.name
        if config[name]["type"] == "param":

            indices = config[name]["indices"]
            logger.debug("Indices of %s are %s", name, indices)

            foreign_keys = []
            for index in indices:
                key = {
                    "fields": index,
                    "reference": {"resource": index, "fields": "VALUE"},
                }
                foreign_keys.append(key)

            descriptor["schema"]["foreignKeys"] = foreign_keys
            descriptor["schema"]["primaryKey"] = indices
            descriptor["schema"]["missingValues"] = [""]

        new_resources.append(descriptor)

    package.descriptor["resources"] = new_resources
    package.commit()

    filepath = os.path.join(path_to_package, "datapackage.json")
    package.save(filepath)
class DataStreamProcessor:

    def __init__(self):
        self.stats = {}
        self.source = None
        self.datapackage = None
        self.position = None

    def __call__(self, source=None, position=None):
        if source is None:
            source = DataStream()
        self.source = source
        self.position = position
        return self

    def process_resource(self, resource: ResourceWrapper):
        for row in resource:
            yield self.process_row(row)

    def process_resources(self, resources):
        for res in resources:
            yield self.process_resource(res)

    def process_row(self, row):
        return row

    def process_datapackage(self, dp: Package):
        return dp

    def get_res(self, current_dp, name):
        ret = self.datapackage.get_resource(name)
        if ret is None:
            ret = current_dp.get_resource(name)
        assert ret is not None
        return ret

    def get_iterator(self, datastream):
        current_dp = datastream.dp
        res_iter_ = datastream.res_iter

        def func():
            res_iter = (ResourceWrapper(self.get_res(current_dp, rw.res.name), rw.it)
                        for rw in res_iter_)
            res_iter = self.process_resources(res_iter)
            res_iter = (it if isinstance(it, ResourceWrapper) else ResourceWrapper(res, it)
                        for res, it in itertools.zip_longest(
                            self.datapackage.resources, res_iter))
            return res_iter

        return func

    def _process(self):
        datastream = self.source._process()
        try:
            self.datapackage = Package(
                descriptor=copy.deepcopy(datastream.dp.descriptor))
            self.datapackage = self.process_datapackage(self.datapackage)
            self.datapackage.commit()

            return DataStream(self.datapackage,
                              LazyIterator(self.get_iterator(datastream)),
                              datastream.stats + [self.stats])
        except Exception as exception:
            self.raise_exception(exception)

    def raise_exception(self, cause):
        if not isinstance(cause, exceptions.ProcessorError):
            error = exceptions.ProcessorError(
                cause,
                processor_name=self.__class__.__name__,
                processor_object=self,
                processor_position=self.position)
            raise error from cause
        raise cause

    def safe_process(self, on_error=None):
        results = []
        try:
            ds = self._process()
            for res in ds.res_iter:
                if on_error is not None:
                    results.append(
                        list(schema_validator(res.res, res, on_error=on_error)))
                else:
                    collections.deque(res, maxlen=0)
        except UniqueKeyError as e:
            self.raise_exception(e)
        except CastError as e:
            for err in e.errors:
                logging.error('%s', err)
        except Exception as exception:
            self.raise_exception(exception)
        return ds, results

    def process(self):
        ds, _ = self.safe_process()
        return ds.dp, ds.merge_stats()

    def results(self, on_error=None):
        if on_error is None:
            on_error = raise_exception
        ds, results = self.safe_process(on_error=on_error)
        return results, ds.dp, ds.merge_stats()
}  # closes the last of the licence/publisher/maintainer/contributor dicts defined earlier in the script

source = {
    'name': 'Rothamsted electronic archive (e-RA)',
    'web': 'http://www.era.rothamsted.ac.uk/Broadbalk'
}

package.descriptor['licenses'] = [licence]
package.descriptor['publishers'] = [publisher]
package.descriptor['maintainers'] = [maintainer]
package.descriptor['contributors'] = [contributor]
package.descriptor['sources'] = [source]

spatialCoverage = {
    '@type': 'Place',
    'geo': {
        '@type': 'GeoCoordinates',
        'latitude': '51.809450',
        'longitude': '-0.372898'
    }
}

package.descriptor['spatialCoverage'] = spatialCoverage
package.descriptor['latitude'] = '51.809450'
package.descriptor['longitude'] = '-0.372898'
package.descriptor['altitude'] = '130'
package.descriptor['startYear'] = '1968'
package.descriptor['endYear'] = '2018'

package.commit()
package.valid  # check validity (result is not used)
print('done')
package.save('broadbalkWheatData.zip')
class DataStreamProcessor:

    def __init__(self):
        self.stats = {}
        self.source = None
        self.datapackage = None

    def __call__(self, source=None):
        if source is None:
            source = DataStream()
        self.source = source
        return self

    def process_resource(self, resource: ResourceWrapper):
        for row in resource:
            yield self.process_row(row)

    def process_resources(self, resources):
        for res in resources:
            yield self.process_resource(res)

    def process_row(self, row):
        return row

    def process_datapackage(self, dp: Package):
        return dp

    def get_res(self, current_dp, name):
        ret = self.datapackage.get_resource(name)
        if ret is None:
            ret = current_dp.get_resource(name)
        assert ret is not None
        return ret

    def get_iterator(self, datastream):
        current_dp = datastream.dp
        res_iter_ = datastream.res_iter

        def func():
            res_iter = (ResourceWrapper(self.get_res(current_dp, rw.res.name), rw.it)
                        for rw in res_iter_)
            res_iter = self.process_resources(res_iter)
            res_iter = (it if isinstance(it, ResourceWrapper) else ResourceWrapper(res, it)
                        for res, it in itertools.zip_longest(
                            self.datapackage.resources, res_iter))
            return res_iter

        return func

    def _process(self):
        datastream = self.source._process()

        self.datapackage = Package(
            descriptor=copy.deepcopy(datastream.dp.descriptor))
        self.datapackage = self.process_datapackage(self.datapackage)
        self.datapackage.commit()

        return DataStream(self.datapackage,
                          LazyIterator(self.get_iterator(datastream)),
                          datastream.stats + [self.stats])

    def process(self):
        ds = self._process()
        try:
            for res in ds.res_iter:
                collections.deque(res, maxlen=0)
        except CastError as e:
            for err in e.errors:
                logging.error('%s', err)
        return ds.dp, ds.merge_stats()

    def results(self, on_error=None):
        ds = self._process()
        results = [
            list(schema_validator(res.res, res, on_error=on_error))
            for res in ds.res_iter
        ]
        return results, ds.dp, ds.merge_stats()
import os
import re

import pandas as pd
from datapackage import Package, Resource

# `copy_datapackage` and `write_sequences` are helpers defined alongside this
# function in the original module.


def temporal_skip(datapackage, n, path="/tmp", name=None, *args):
    """ Creates a new datapackage by aggregating sequences inside the
    `sequence` folder of the specified datapackage by skipping `n` timesteps

    Parameters
    ----------
    datapackage: string
        String of meta data file datapackage.json
    n: integer
        Number of timesteps to skip
    path: string
        Path to directory where the aggregated datapackage is stored
    name: string
        Name of the new, aggregated datapackage. If not specified a name
        will be given
    """
    p = Package(datapackage)

    cwd = os.getcwd()

    if name is None:
        copied_package_name = p.descriptor["name"] + "__temporal_skip__" + str(n)
    else:
        copied_package_name = name

    copy_path = os.path.join(path, copied_package_name)

    copied_root = copy_datapackage(
        datapackage, os.path.abspath(copy_path), subset="data"
    )

    sequence_resources = [
        r
        for r in p.resources
        if re.match(r"^data/sequences/.*$", r.descriptor["path"])
    ]

    dfs = {
        r.name: pd.DataFrame(r.read(keyed="True"))
        .set_index("timeindex")
        .astype(float)
        for r in sequence_resources
    }
    sequences = pd.concat(dfs.values(), axis=1)

    skip_sequences = sequences.loc[::n]

    temporal = pd.Series(data=n, index=skip_sequences.index, name="weighting")
    temporal.index.name = "timeindex"

    os.chdir(copied_root)

    for r in sequence_resources:
        write_sequences(
            r.name + ".csv", dfs[r.name].loc[temporal.index], replace=True
        )

    # write temporal information from clustering
    temporal.to_csv(
        "data/temporal.csv",
        header=True,
        sep=";",
        date_format="%Y-%m-%dT%H:%M:%SZ",
    )

    # add meta data for new temporal information
    r = Resource({"path": "data/temporal.csv"})
    r.infer()

    r.descriptor["description"] = (
        "Temporal selection based on skipped timesteps. Skipped n={}".format(n)
    )

    # Update meta-data of copied package
    cp = Package("datapackage.json")
    cp.descriptor["name"] = copied_package_name
    cp.descriptor["resources"].append(r.descriptor)
    cp.commit()
    cp.save("datapackage.json")

    # set back to 'old' work directory
    os.chdir(cwd)

    return copied_root
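A usage sketch of temporal_skip: keep every fourth timestep of the sequences and write the reduced copy to /tmp. The paths and the package name are hypothetical.

copied_root = temporal_skip(
    "datapackage.json",        # meta data file of the package to reduce
    4,                         # keep every 4th timestep
    path="/tmp",
    name="my-skipped-package",
)
print(copied_root)             # root directory of the new, reduced datapackage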
import os
import shutil

from datapackage import Package, Resource


def infer_metadata(
    package_name="default-name",
    keep_resources=False,
    foreign_keys={
        "bus": [
            "volatile",
            "dispatchable",
            "storage",
            "load",
            "reservoir",
            "shortage",
            "excess",
        ],
        "profile": ["load", "volatile", "ror"],
        "from_to_bus": ["connection", "line", "conversion"],
        "chp": ["backpressure", "extraction", "chp"],
    },
    path=None,
):
    """ Add basic meta data for a datapackage

    Parameters
    ----------
    package_name: string
        Name of the data package
    keep_resources: boolean
        Flag indicating if the resources meta data json-files should be kept
        after the main datapackage.json is created. The resource meta data
        will be stored in the `resources` directory.
    foreign_keys: dict
        Dictionary with foreign key specification. Keys for dictionary are:
        'bus', 'profile', 'from_to_bus', 'chp'. Values are lists of strings
        with the names of the resources
    path: string
        Absolute path to root-folder of the datapackage
    """
    current_path = os.getcwd()
    if path:
        print("Setting current work directory to {}".format(path))
        os.chdir(path)

    p = Package()
    p.descriptor["name"] = package_name
    p.descriptor["profile"] = "tabular-data-package"
    p.commit()

    if not os.path.exists("resources"):
        os.makedirs("resources")

    # create meta data resources for elements
    if not os.path.exists("data/elements"):
        print("No data path found in directory {}. Skipping...".format(
            os.getcwd()))
    else:
        for f in os.listdir("data/elements"):
            r = Resource({"path": os.path.join("data/elements", f)})
            r.infer()
            r.descriptor["schema"]["primaryKey"] = "name"
            if r.name in foreign_keys.get("bus", []):
                r.descriptor["schema"]["foreignKeys"] = [{
                    "fields": "bus",
                    "reference": {"resource": "bus", "fields": "name"},
                }]

                if r.name in foreign_keys.get("profile", []):
                    r.descriptor["schema"]["foreignKeys"].append({
                        "fields": "profile",
                        "reference": {"resource": r.name + "_profile"},
                    })

            elif r.name in foreign_keys.get("from_to_bus", []):
                r.descriptor["schema"]["foreignKeys"] = [
                    {
                        "fields": "from_bus",
                        "reference": {"resource": "bus", "fields": "name"},
                    },
                    {
                        "fields": "to_bus",
                        "reference": {"resource": "bus", "fields": "name"},
                    },
                ]
            elif r.name in foreign_keys.get("chp", []):
                r.descriptor["schema"]["foreignKeys"] = [
                    {
                        "fields": "fuel_bus",
                        "reference": {"resource": "bus", "fields": "name"},
                    },
                    {
                        "fields": "electricity_bus",
                        "reference": {"resource": "bus", "fields": "name"},
                    },
                    {
                        "fields": "heat_bus",
                        "reference": {"resource": "bus", "fields": "name"},
                    },
                ]
            r.commit()
            r.save(os.path.join("resources", f.replace(".csv", ".json")))
            p.add_resource(r.descriptor)

    # create meta data resources for sequences
    if not os.path.exists("data/sequences"):
        print("No data path found in directory {}. Skipping...".format(
            os.getcwd()))
    else:
        for f in os.listdir("data/sequences"):
            r = Resource({"path": os.path.join("data/sequences", f)})
            r.infer()
            r.commit()
            r.save(os.path.join("resources", f.replace(".csv", ".json")))
            p.add_resource(r.descriptor)

    p.commit()
    p.save("datapackage.json")

    if not keep_resources:
        shutil.rmtree("resources")

    os.chdir(current_path)
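A usage sketch of infer_metadata; the path is hypothetical and is expected to contain data/elements and data/sequences folders of CSV files.

infer_metadata(
    package_name="my-energy-system",
    keep_resources=False,
    path="/path/to/datapackage-root",   # hypothetical root with data/elements and data/sequences
)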
# write each tracked kpi to its own csv file
for kpid in kpis.keys():
    filename = 'data/kpis/' + kpis[kpid][-1] + '.csv'
    with open(filename, "w", newline='') as f:
        writer = csv.writer(f)
        writer.writerows(data[kpid])

##
# Package KPIS (Subsets)
##
os.chdir(cwd + '/data/kpis')

kpiPackage = Package()
kpiPackage.infer('*.csv')
kpiPackage.descriptor['name'] = 'montreal-kpis'
kpiPackage.descriptor['license'] = 'https://creativecommons.org/publicdomain/zero/1.0/'
kpiPackage.commit()
kpiPackage.save(cwd + '/data/kpis/datapackage.json')
kpiPackage.save(cwd + '/data/kpis/datapackage.zip')

##
# Package Indicators (Master)
##
os.chdir(cwd + '/data/indicators')

indPackage = Package()
indPackage.basePath = cwd + '/data/indicators'
indPackage.infer(cwd + '/data/indicators/*.csv')
indPackage.descriptor['name'] = 'montreal-indicators'
indPackage.descriptor['license'] = 'https://creativecommons.org/publicdomain/zero/1.0/'
indPackage.commit()
indPackage.name = "montreal-city-indicators"
indPackage.save(cwd + '/data/indicators/datapackage.json')
def build(config: Dict) -> Package:
    """Builds a datapackage.Package object from a config dictionary.

    The configuration dictionary should contain the following keys:
    "metadata", "files".

    Information about the corresponding study can be placed in metadata.
    Example:

        {
            'metadata': {
                'name': 'ddionrails-study',
                'id': 'doi'
            }
        }

    The desired files to be included in the Tabular Data Package can be
    placed in 'files'. Example:

        {
            'files': [
                'concepts.csv'
            ]
        }

    See: examples/example-config.yml

    The resulting Tabular Data Package is written to disk as
    'datapackage.json' in the directory the command line tool is run.

    Args:
        config: The configuration of the Datapackage to be created.
    """
    if "metadata" not in config or "files" not in config:
        raise ValueError("Config must contain 'metadata' and 'files'")

    # Read the descriptor base dictionary from disk
    # and update it with values from the config file
    descriptor = read_yaml(DATAPACKAGE_BASE_FILE)
    descriptor["name"] = config["metadata"].get("name")
    descriptor["id"] = config["metadata"].get("id")
    descriptor["title"] = config["metadata"].get("title")

    # Remove empty keys from the dictionary
    descriptor = {key: value for key, value in descriptor.items() if value}

    # Create a Datapackage object from the descriptor dictionary
    package = Package(descriptor=descriptor)

    wanted_files = [file.split(".")[0] for file in config["files"]]
    for file in wanted_files:
        # If a filename ends with "_strict"
        # create the basic Tabular Data Resource first
        # then add the "stricter" rules from the "_strict" file
        if "_strict" in file:
            basic_file = file.replace("_strict", "")
            resource = read_tabular_data_resource(basic_file)
            strict_resource = read_tabular_data_resource(file)
            merge(resource, strict_resource)
        else:
            resource = read_tabular_data_resource(file)
        package.add_resource(resource)

    package.commit()
    if not package.valid:
        for error in package.errors:
            LOGGER.error(error)
    return package
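A usage sketch mirroring the config example from the docstring above; in the original project the command line tool writes the result to disk, shown here explicitly via the datapackage API.

config = {
    "metadata": {"name": "ddionrails-study", "id": "doi"},
    "files": ["concepts.csv"],
}
package = build(config)
package.save("datapackage.json")  # persist the descriptor next to the data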
from datapackage import Package

# Init
package = Package()

# Infer
package.infer('**/*.csv')
print(package.descriptor)

# Tweak
package.descriptor['resources'][1]['schema']['fields'][1]['type'] = 'year'
package.commit()
print(package.valid)  # true

# Read
print(package.get_resource('population').read(keyed=True))
# [ { city: 'london', year: 2017, population: 8780000 },
#   { city: 'paris', year: 2017, population: 2240000 },
#   { city: 'rome', year: 2017, population: 2860000 } ]

# Save
package.save('tmp/datapackage.zip')

# Load
package = Package('tmp/datapackage.zip', base_path='tmp')
print(package.descriptor)
import datetime as dt
from typing import List, Optional, Tuple

from datapackage import Package


def data_package_from_dataset(row: dict) -> Tuple[str, Package]:
    """ Make a data package definition from a dataset row. """
    assert "dataset" == row["Type"], row
    uid: str = row["U ID"]

    # Initialise the data package from the CSV data.
    package = Package()
    csv_path = f"raw-csv/{uid}.csv"
    package.infer(csv_path)

    # Set a more readable name (derive_name is a helper defined alongside
    # this function in the original project).
    package.descriptor["name"] = derive_name(row)

    # Update standard descriptor fields from the row metadata.
    package.descriptor["title"] = row["Name"]
    package.descriptor["description"] = row["Description"]

    # Sources require a title: fall back to using the link for it, otherwise skip it.
    source_title: Optional[str] = (
        row["data_provided_by"]
        if row["data_provided_by"]
        else row["source_link"]
        if row["source_link"]
        else None
    )
    if source_title:
        package.descriptor["sources"] = [
            {
                "title": source_title,
                **({"path": row["source_link"]} if row["source_link"] else {}),
            }
        ]

    package.descriptor["contributors"] = [
        {
            "title": row["Owner"],
            # XXX: Ugly but compact.
            **({"email": row["Contact Email"]} if row["Contact Email"] else {}),
        }
    ]

    keywords: List[str] = row["Keywords"].split(",")
    package.descriptor["keywords"] = keywords

    # Example value: "09/22/2014 05:34:00 PM +0000"
    socrata_datetime_format = "%m/%d/%Y %H:%M:%S %p %z"
    created: dt.datetime = dt.datetime.strptime(
        row["Creation Date"], socrata_datetime_format
    )
    package.descriptor["created"] = created.isoformat()

    # XXX: Update non-standard descriptor fields from the row metadata.
    # (Prefix these with "x_" to flag non-standard status.)
    if row["License"]:
        # TODO: Use licenses field instead
        package.descriptor["x_license_name"] = row["License"]
    if row["Category"]:
        package.descriptor["x_category"] = row["Category"]

    success = package.commit()
    assert success, package.descriptor

    # Check descriptor, and return.
    descriptor: dict = package.descriptor
    assert "tabular-data-package" == descriptor["profile"], descriptor
    assert 1 == len(descriptor["resources"]), descriptor["resources"]
    [resource] = descriptor["resources"]
    assert csv_path == resource["path"], resource
    # assert False, descriptor
    assert package.valid, package.errors
    return (uid, package)
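A sketch of the expected input: a dict carrying the catalogue columns the function above reads. All values are hypothetical, and the referenced raw-csv/<uid>.csv file must already exist for package.infer to work.

row = {
    "Type": "dataset",
    "U ID": "abcd-1234",                 # raw-csv/abcd-1234.csv must already exist
    "Name": "Street Trees",
    "Description": "Locations of street trees.",
    "data_provided_by": "City Parks Department",
    "source_link": "https://data.example.org/d/abcd-1234",
    "Owner": "Open Data Team",
    "Contact Email": "",
    "Keywords": "trees,parks,environment",
    "Creation Date": "09/22/2014 05:34:00 PM +0000",
    "License": "",
    "Category": "Environment",
}
uid, package = data_package_from_dataset(row)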
from datapackage import Package

p = Package()
p.infer('*.csv')
p.descriptor['title'] = 'Openmod-Example'
p.descriptor['spatial'] = 'Random'
# 'sources' must be a list of source objects per the Data Package spec.
p.descriptor['sources'] = [{'title': 'Created by hand'}]
p.commit()
p.save('datapackage.json')