def _load_raw_data(self, resource_name):
    """Extract raw data from resource"""
    # Instantiating the resource again as a simple `Resource` ensures that
    # ``data`` will be returned as bytes.
    upcast_resource = datapackage.Resource(
        self.__resources[resource_name].descriptor,
        default_base_path=self.__base_path)
    return upcast_resource.data
def test_descriptor_are_available(self):
    resource_dict = {
        'name': 'foo',
        'url': 'http://someplace.com/foo.json',
        'path': 'foo.json',
        'data': {'foo': 'bar'},
    }
    resource = datapackage.Resource(resource_dict)
    assert resource.descriptor == resource_dict
def test_create_new_datapackage(self):
    """Checks if it's possible to create a new datapackage"""
    joker = datapackage.Resource(datapackage_uri='http://gotham.us/',
                                 name="joker",
                                 url="http://gotham.us/villains.csv")
    villains = datapackage.DataPackage(name="villains",
                                       license="ODC-PDDL-1.0",
                                       resources=[joker])
    assert villains.name == "villains"
    assert len(villains.resources) == 1
    assert villains.resources[0].name == "joker"
def get_tabular_data_resource(table_name, pkg_dir, partitions=False):
    """
    Create a Tabular Data Resource descriptor for a PUDL table.

    Based on the information in the database, and some additional metadata,
    this function will generate a valid Tabular Data Resource descriptor,
    according to the Frictionless Data specification, which can be found
    here: https://frictionlessdata.io/specs/tabular-data-resource/

    Args:
        table_name (string): table name for which you want to generate a
            Tabular Data Resource descriptor.
        pkg_dir (path-like): The location of the directory for this package.
            The data package directory will be a subdirectory in the
            `datapackage_dir` directory, with the name of the package as the
            name of the subdirectory.
        partitions (dict): A dictionary with PUDL table names as the keys and
            lists of partition variables as the values, or False if the
            package is not partitioned.

    Returns:
        dict: A JSON-serializable Tabular Data Resource descriptor containing
        key information about the selected table.
    """
    # every time we want to generate the cems table, we want it compressed
    if 'epacems' in table_name:
        abs_path = pathlib.Path(pkg_dir, 'data', f'{table_name}.csv.gz')
    else:
        abs_path = pathlib.Path(pkg_dir, 'data', f'{table_name}.csv')

    # pull the skeleton of the descriptor from the megadata file
    descriptor = pull_resource_from_megadata(table_name)
    descriptor['path'] = str(abs_path.relative_to(abs_path.parent.parent))
    descriptor['bytes'] = abs_path.stat().st_size
    descriptor['hash'] = hash_csv(abs_path)
    descriptor['created'] = (
        datetime.datetime.utcnow().replace(microsecond=0).isoformat() + 'Z')

    if partitions:
        for part in partitions.keys():
            if part in table_name:
                descriptor['group'] = part

    resource = datapackage.Resource(descriptor)
    if resource.valid:
        logger.debug(f"{table_name} is a valid resource")
    else:
        raise AssertionError(f"""
            Invalid tabular data resource: {resource.name}

            Errors: {resource.errors}
            """)
    return descriptor
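# A hedged usage sketch (not from the original source): the table name and
# package directory below are hypothetical, and <pkg_dir>/data/<table>.csv
# must already exist on disk for the size and hash calls to succeed.
descriptor = get_tabular_data_resource("fuel_ferc1", "/path/to/pudl-pkg")
# On top of the megadata skeleton, the descriptor now carries:
#   descriptor["path"]     -> "data/fuel_ferc1.csv"
#   descriptor["bytes"]    -> output file size in bytes
#   descriptor["hash"]     -> the hash_csv() digest of the output file
#   descriptor["created"]  -> a UTC timestamp like "2020-01-01T00:00:00Z"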
def save_dpkg(dataset_path, data, meta, events, participants):
    dp = dpkg.DataPackage(name=dataset_path.name, licenses=['odc-by'])
    dp['version'] = '1.0.0'
    dp.add_contributor("Jessica B. Hamrick", "*****@*****.**")
    dp.add_contributor("Thomas L. Griffiths", "*****@*****.**")
    dp.add_contributor("Peter W. Battaglia", "*****@*****.**")
    dp.add_contributor("Joshua B. Tenenbaum", "*****@*****.**")

    # add experiment data, and save it as csv
    r1 = dpkg.Resource(
        name="experiment.csv", fmt="csv",
        pth="./experiment.csv", data=data)
    r1['mediaformat'] = 'text/csv'
    dp.add_resource(r1)

    # add metadata, and save it inline as json
    r2 = dpkg.Resource(name="metadata", fmt="json", data=meta)
    r2['mediaformat'] = 'application/json'
    dp.add_resource(r2)

    # add event data, and save it as csv
    r3 = dpkg.Resource(
        name="events.csv", fmt="csv", pth="./events.csv", data=events)
    r3['mediaformat'] = 'text/csv'
    dp.add_resource(r3)

    # add participant info, and save it as csv
    r4 = dpkg.Resource(
        name="participants.csv", fmt="csv",
        pth="./participants.csv", data=participants)
    r4['mediaformat'] = 'text/csv'
    dp.add_resource(r4)

    # save the datapackage
    dp.save(dataset_path.dirname())
    logger.info("Saved to '%s'", dataset_path.relpath())
def get_tabular_data_resource_2(table_name, pkg_dir, testing=False):
    """
    Create a Tabular Data Resource descriptor for a PUDL table.

    Based on the information in the database, and some additional metadata
    stored elsewhere (Where?!?!), this function will generate a valid Tabular
    Data Resource descriptor, according to the Frictionless Data
    specification, which can be found here:
    https://frictionlessdata.io/specs/tabular-data-resource/
    """
    # Where the CSV file holding the data is, relative to datapackage.json
    # This is the value that has to be embedded in the data package.
    csv_relpath = os.path.join('data', f'{table_name}.csv')
    # We need to access the file to calculate hash and size too:
    csv_abspath = os.path.join(os.path.abspath(pkg_dir), csv_relpath)

    # pull the skeleton of the descriptor from the megadata file
    descriptor = pull_resource_from_megadata(
        table_name,
        mega_pkg_dir=os.path.join(SETTINGS['meta_dir'], 'pudl-test'))
    descriptor['path'] = csv_relpath
    descriptor['bytes'] = os.path.getsize(csv_abspath)
    descriptor['hash'] = pudl.output.export.hash_csv(csv_abspath)
    descriptor['created'] = (
        datetime.datetime.utcnow().replace(microsecond=0).isoformat() + 'Z')

    resource = datapackage.Resource(descriptor)
    if resource.valid:
        logger.debug(f"{table_name} is a valid resource")
    else:
        raise AssertionError(f"""
            Invalid tabular data resource: {resource.name}

            Errors: {resource.errors}
            """)
    return descriptor
def test_create_resource_missing_required_field(self):
    datapackage.Resource()
def setup(self):
    self.dpkg = datapackage.DataPackage("tests/test.dpkg")
    kwargs = self.dpkg['resources'][0]
    kwargs['datapackage_uri'] = compat.str(self.dpkg.base)
    self.resource = datapackage.Resource(**kwargs)
def get_tabular_data_resource(resource_name, datapkg_dir, datapkg_settings,
                              partitions=False):
    """
    Create a Tabular Data Resource descriptor for a PUDL table.

    Based on the information in the database, and some additional metadata,
    this function will generate a valid Tabular Data Resource descriptor,
    according to the Frictionless Data specification, which can be found
    here: https://frictionlessdata.io/specs/tabular-data-resource/

    Args:
        resource_name (string): name of the tabular data resource for which
            you want to generate a Tabular Data Resource descriptor. This is
            the resource name, rather than the database table name, because
            we partition large tables into resource groups consisting of many
            files.
        datapkg_dir (path-like): The location of the directory for this
            package. The data package directory will be a subdirectory in the
            `datapkg_dir` directory, with the name of the package as the name
            of the subdirectory.
        datapkg_settings (dict): Python dictionary representing the ETL
            parameters read in from the settings file, pertaining to the
            tabular datapackage this resource is part of.
        partitions (dict): A dictionary with PUDL database table names as the
            keys (e.g. hourly_emissions_epacems), and lists of partition
            variables (e.g. ["epacems_years", "epacems_states"]) as the
            values.

    Returns:
        dict: A Python dictionary representing a tabular data resource
        descriptor that complies with the Frictionless Data specification.
    """
    # every time we want to generate the cems table, we want it compressed
    abs_path = pathlib.Path(datapkg_dir, "data", f"{resource_name}.csv")
    if "hourly_emissions_epacems" in resource_name:
        abs_path = pathlib.Path(abs_path.parent, abs_path.name + ".gz")

    # pull the skeleton of the descriptor from the megadata file
    descriptor = pull_resource_from_megadata(resource_name)
    descriptor["path"] = str(abs_path.relative_to(abs_path.parent.parent))
    descriptor["bytes"] = abs_path.stat().st_size
    descriptor["hash"] = hash_csv(abs_path)
    descriptor["created"] = (
        datetime.datetime.utcnow().replace(microsecond=0).isoformat() + "Z")

    unpartitioned_tables = get_unpartitioned_tables(
        [resource_name], datapkg_settings)
    data_sources = data_sources_from_tables(unpartitioned_tables)
    descriptor["sources"] = [pc.data_source_info[src] for src in data_sources]
    descriptor["coverage"] = {
        "temporal": temporal_coverage(resource_name, datapkg_settings),
        "spatial": spatial_coverage(resource_name),
    }

    if partitions:
        for part in partitions.keys():
            if part in resource_name:
                descriptor["group"] = part

    resource = datapackage.Resource(descriptor)
    if resource.valid:
        logger.debug(f"{resource_name} is a valid resource")
    else:
        logger.info(resource)
        raise ValueError(f"""
            Invalid tabular data resource descriptor: {resource.name}

            Errors: {resource.errors}
            """)
    return descriptor
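# Shape of the partitions argument, per the docstring above. The mapping is
# taken from the docstring's own example; the resource name is hypothetical.
partitions = {
    "hourly_emissions_epacems": ["epacems_years", "epacems_states"],
}
# Any resource whose name contains a key -- e.g. a hypothetical
# "hourly_emissions_epacems_2019" -- gets descriptor["group"] set to
# "hourly_emissions_epacems".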
def main(arguments):  # noqa: C901
    """The main function."""
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('-d', '--download',
                        help="Download fresh data directly from MSHA.",
                        default=False, action='store_true')
    parser.add_argument(
        '-r', '--row_limit',
        help="Maximum number of rows to use in data validation.",
        default=10000, action='store', type=int)
    args = parser.parse_args(arguments)

    # Construct some paths we'll need later...
    input_dir = os.path.join(SETTINGS['pudl_dir'], "scripts",
                             "data_pkgs", "pudl-msha")
    # Generate package output directories based on name of the data package
    output_dir = os.path.join(SETTINGS['pudl_dir'], "results",
                              "data_pkgs", "pudl-msha")
    archive_dir = os.path.join(output_dir, "archive")
    os.makedirs(archive_dir, exist_ok=True)
    scripts_dir = os.path.join(output_dir, "scripts")
    os.makedirs(scripts_dir, exist_ok=True)
    data_dir = os.path.join(output_dir, "data")
    os.makedirs(data_dir, exist_ok=True)

    # Dictionary with one element pertaining to each of the data resources
    # that are going to be part of the output data package. The initial data
    # and defs sub-dictionary elements will be joined by other useful items
    # that exist for each of the data resources as we go.
    # - "data" the filename of the original data file from MSHA
    # - "defs" the filename of the data definition file from MSHA
    # - "data_df" pandas dataframe containing the MSHA data.
    # - "defs_df" pandas dataframe containing the MSHA file definition.
    # - "resource" is a datapackage.Resource() object.
    # - "json" is a JSON data package descriptor
    resources = {
        "mines": {
            "data": "Mines.zip",
            "defs": "Mines_Definition_File.txt"
        },
        "controller-operator-history": {
            "data": "ControllerOperatorHistory.zip",
            "defs": "Controller_Operator_History_Definition_File.txt"
        },
        "employment-production-quarterly": {
            "data": "MinesProdQuarterly.zip",
            "defs": "MineSProdQuarterly_Definition_File.txt"
        }
        # "contractor-employment-production-quarterly": {
        #     "data": "ContractorProdQuarterly.zip",
        #     "defs": "ContractorProdQuarterly_Definition_File.txt"
        # }
    }

    if args.download:
        # Get the data directly from MSHA
        data_path = pc.base_data_urls["msha"]
        for res in resources:
            for d in ["data", "defs"]:
                # Construct the full URL
                url_parts = urllib.parse.urlparse(pc.base_data_urls['msha'])
                new_path = url_parts.path + '/' + resources[res][d]
                res_url = urllib.parse.urlunparse(
                    list(url_parts[0:2]) + [new_path, '', '', ''])
                # Download the data file to archive_dir
                print(f"Downloading {res_url}")
                urllib.request.urlretrieve(  # nosec
                    res_url,
                    filename=os.path.join(archive_dir, resources[res][d]))
    else:
        # Get the data from our local PUDL datastore.
        data_path = os.path.join(SETTINGS['data_dir'], "msha")
        for res in resources:
            for d in ["data", "defs"]:
                src_file = os.path.join(data_path, resources[res][d])
                dst_file = os.path.join(archive_dir, resources[res][d])
                shutil.copyfile(src_file, dst_file)

    for res in resources:
        # Create dataframes from input data & definition files (local or
        # remote):
        for d in ['data', 'defs']:
            resources[res][f"{d}_df"] = pd.read_csv(
                f"{archive_dir}/{resources[res][d]}",
                delimiter="|", encoding="iso-8859-1")
        # Read the input tabular data resource JSON file we've prepared
        resources[res]["json"] = json.load(
            open(os.path.join(input_dir, f"{res}.json")))

    # OMFG even the MSHA data is broken. *sigh*
    resources["employment-production-quarterly"]["data_df"].columns = \
        list(resources["employment-production-quarterly"]
             ["defs_df"]['COLUMN_NAME'])

    # Create a data package to contain our resources, based on the template
    # JSON file that we have already prepared as an input.
    pkg = datapackage.Package(os.path.join(input_dir, "datapackage.json"))

    for res in resources:
        # Convert the definitions to a dictionary of field descriptions
        field_desc = resources[res]["defs_df"].set_index(
            'COLUMN_NAME').to_dict()['FIELD_DESCRIPTION']
        # Set the description attribute of the fields in the schema using
        # field descriptions.
        for field in resources[res]["json"]["schema"]["fields"]:
            field['description'] = field_desc[field['name']]
        resources[res]["resource"] = datapackage.Resource(
            descriptor=resources[res]["json"])

        # Make sure we didn't miss or re-name any fields accidentally
        json_fields = resources[res]["resource"].schema.field_names
        defs_fields = list(resources[res]["defs_df"]['COLUMN_NAME'])
        data_fields = list(resources[res]['data_df'].columns)
        assert json_fields == defs_fields, \
            "json vs. defs missing field: {}".format(
                set(json_fields).symmetric_difference(set(defs_fields)))
        assert data_fields == defs_fields, \
            "data vs. defs missing field: {}".format(
                set(data_fields).symmetric_difference(set(defs_fields)))

        resources[res]["resource"].infer()
        resources[res]["resource"].commit()

        # Need to clean up the integer NA values in the data before
        # outputting:
        for field in resources[res]["resource"].schema.field_names:
            if resources[res]["resource"].schema.get_field(
                    field).type == 'integer':
                resources[res]["data_df"][field] = fix_int_na(
                    resources[res]["data_df"][field])

        # Force boolean values to use canonical True/False values.
        for field in resources[res]["resource"].schema.field_names:
            if resources[res]["resource"].schema.get_field(
                    field).type == 'boolean':
                resources[res]["data_df"][field] = \
                    resources[res]["data_df"][field].replace('Y', True)
                resources[res]["data_df"][field] = \
                    resources[res]["data_df"][field].replace('N', False)

        # the data itself goes in output -- this is what we're packaging up
        output_csv = os.path.join(data_dir, f"{res}.csv")
        resources[res]['data_df'].to_csv(
            output_csv, index=False, encoding='utf-8')

        # Calculate some useful information about the output file, and add
        # it to the resource:
        # resource file size:
        resources[res]["resource"].descriptor["bytes"] = os.path.getsize(
            output_csv)
        # resource file hash:
        blocksize = 65536
        hasher = hashlib.sha256()
        with open(output_csv, 'rb') as afile:
            buf = afile.read(blocksize)
            while len(buf) > 0:
                hasher.update(buf)
                buf = afile.read(blocksize)
        resources[res]["resource"].descriptor[
            "hash"] = f"sha256:{hasher.hexdigest()}"

        # Check our work...
print("Validating {} tabular data resource".format( resources[res]['resource'].descriptor['name'])) if not resources[res]["resource"].valid: print(f"TABULAR DATA RESOURCE {res} IS NOT VALID.") return 1 # Add the completed resource to the data package pkg.add_resource(descriptor=resources[res]["resource"].descriptor) # Automatically fill in some additional metadata pkg.infer() # Timestamp indicating when packaging occured pkg.descriptor['created'] = datetime.datetime.utcnow().replace( microsecond=0).isoformat() + 'Z' # Have to set this to 'data-package' rather than 'tabular-data-package' # due to a DataHub.io bug pkg.descriptor['profile'] = 'data-package' pkg.commit() # save the datapackage print("Validating pudl-msha data package") if not pkg.valid: print("PUDL MSHA DATA PACKAGE IS NOT VALID.") return 1 pkg.save(os.path.join(output_dir, 'datapackage.json')) # Validate some of the data... print("Validating pudl-msha data") report = goodtables.validate(os.path.join(output_dir, 'datapackage.json'), row_limit=args.row_limit) if not report['valid']: print("PUDL MSHA DATA TABLES FAILED TO VALIDATE") pprint(report) return 1 shutil.copyfile(os.path.join(input_dir, "README.md"), os.path.join(output_dir, "README.md")) shutil.copyfile(os.path.join(input_dir, sys.argv[0]), os.path.join(output_dir, "scripts", sys.argv[0])) return 0
def test_data_cant_be_assigned(self):
    resource_dict = {}
    resource = datapackage.Resource(resource_dict)
    with pytest.raises(AttributeError):
        resource.data = 'foo'
def test_data_returns_the_resource_data(self):
    resource_dict = {
        'data': 'foo',
    }
    resource = datapackage.Resource(resource_dict)
    assert resource.data == resource_dict['data']
def test_data_is_none_by_default(self):
    resource_dict = {}
    resource = datapackage.Resource(resource_dict)
    assert resource.data is None
def test_create_datapackage_missing_required_field(self):
    """Checks if DataPackage creation fails if a required field is missing"""
    joker = datapackage.Resource(datapackage_uri='http://gotham.us',
                                 name='joker')
    villains = datapackage.DataPackage(name="villains")
dp.add_contributor("Peter W. Battaglia", "*****@*****.**") dp.add_contributor("Joshua B. Tenenbaum", "*****@*****.**") # add event data, and save it as csv events_G = dp_G.load_resource("events.csv") events_G['version'] = 'G' events_H = dp_H.load_resource("events.csv") events_H['version'] = 'H' events_I = dp_I.load_resource("events.csv") events_I['version'] = 'I' events = pd.concat([events_G, events_H, events_I])\ .set_index(['version', 'timestamp'])\ .sortlevel() r = dpkg.Resource( name="events.csv", fmt="csv", pth="./events.csv", data=events) r['mediaformat'] = 'text/csv' dp.add_resource(r) # add participant info, and save it as csv participants_G = dp_G.load_resource('participants.csv').reset_index() participants_G['version'] = 'G' participants_H = dp_H.load_resource('participants.csv').reset_index() participants_H['version'] = 'H' participants_I = dp_I.load_resource('participants.csv').reset_index() participants_I['version'] = 'I' participants = pd.concat([participants_G, participants_H, participants_I])\ .set_index(['version', 'timestamp'])\ .sortlevel()
def get_tabular_data_resource(tablename, pkg_dir, testing=False):
    """
    Create a Tabular Data Resource descriptor for a PUDL DB table.

    Based on the information in the database, and some additional metadata
    stored elsewhere (Where?!?!), this function will generate a valid Tabular
    Data Resource descriptor, according to the Frictionless Data
    specification, which can be found here:
    https://frictionlessdata.io/specs/tabular-data-resource/
    """
    table = get_table(tablename, testing=testing)

    # Where the CSV file holding the data is, relative to datapackage.json
    # This is the value that has to be embedded in the data package.
    csv_relpath = os.path.join('data', f'{tablename}.csv')
    # We need to access the file to calculate hash and size too:
    csv_abspath = os.path.join(os.path.abspath(pkg_dir), csv_relpath)

    descriptor = {}
    descriptor['profile'] = "tabular-data-resource"
    descriptor['name'] = tablename
    descriptor['path'] = csv_relpath
    descriptor['title'] = tablename  # maybe we should make this pretty...
    if table.comment:
        descriptor['description'] = table.comment
    descriptor['encoding'] = "utf-8"
    descriptor['mediatype'] = "text/csv"
    descriptor['format'] = "csv"
    descriptor['dialect'] = {
        "delimiter": ",",
        "header": True,
        "quoteChar": "\"",
        "doubleQuote": True,
        "lineTerminator": "\r\n",
        "skipInitialSpace": True,
    }
    descriptor['schema'] = get_table_schema(table)
    descriptor['bytes'] = os.path.getsize(csv_abspath)
    descriptor['hash'] = hash_csv(csv_abspath)

    # If omitted, licenses are inherited from the containing data package.
    descriptor["licenses"] = [pudl.constants.licenses['cc-by-4.0'], ]

    data_sources = pudl.helpers.data_sources_from_tables([table.name, ])
    # descriptor["sources"] = \
    #     [pudl.constants.data_sources[src] for src in data_sources]
    descriptor["sources"] = []
    for src in data_sources:
        if src in pudl.constants.data_sources:
            descriptor["sources"].append({"title": src, "path": "idfk"})

    resource = datapackage.Resource(descriptor)
    if not resource.valid:
        raise AssertionError(f"""
            Invalid tabular data resource: {resource.name}

            Errors: {resource.errors}
            """)
    return descriptor
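# A minimal, hypothetical descriptor of the kind this function builds,
# validated the same way. The resource name, path, and schema are invented
# for illustration; datapackage is the frictionless datapackage-py library
# used throughout these snippets.
import datapackage

descriptor = {
    "profile": "tabular-data-resource",
    "name": "mines",
    "path": "data/mines.csv",
    "format": "csv",
    "mediatype": "text/csv",
    "encoding": "utf-8",
    "schema": {"fields": [{"name": "mine_id", "type": "integer"}]},
}
resource = datapackage.Resource(descriptor)
print(resource.valid)  # True when the descriptor satisfies the spec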