Example #1
 def _load_raw_data(self, resource_name):
     """Extract raw data from resource"""
     # Instantiating the resource again as a simple `Resource` ensures that
     # ``data`` will be returned as bytes.
     upcast_resource = datapackage.Resource(
         self.__resources[resource_name].descriptor,
         default_base_path=self.__base_path)
     return upcast_resource.data
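The comment above depends on a plain Resource exposing its contents through the .data property. For descriptors with inline data, .data simply hands that value back (see Examples #12 and #13 below); a minimal usage sketch, assuming the pre-1.0 datapackage-py API used throughout these examples and an illustrative descriptor:

import datapackage

# Inline data is returned as-is through the .data property (pre-1.0 API);
# the descriptor contents here are made up for illustration.
descriptor = {'name': 'inline-example', 'data': {'foo': 'bar'}}
resource = datapackage.Resource(descriptor)
print(resource.data)  # {'foo': 'bar'}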
Example #2
 def test_descriptor_are_available(self):
     resource_dict = {
         'name': 'foo',
         'url': 'http://someplace.com/foo.json',
         'path': 'foo.json',
         'data': {'foo': 'bar'},
     }
     resource = datapackage.Resource(resource_dict)
     assert resource.descriptor == resource_dict
Example #3
 def test_create_new_datapackage(self):
     """Checks if it's possible to create a new datapackage"""
     joker = datapackage.Resource(datapackage_uri='http://gotham.us/',
                                  name="joker",
                                  url="http://gotham.us/villains.csv")
     villains = datapackage.DataPackage(name="villains",
                                        license="ODC-PDDL-1.0",
                                        resources=[joker])
     assert villains.name == "villains"
     assert len(villains.resources) == 1
     assert villains.resources[0].name == "joker"
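Example #3 uses the older keyword-argument constructors. For comparison, the descriptor-based API seen in the later examples builds roughly the same package from plain dictionaries; a hedged sketch (field names and validation requirements vary between library versions, and 'villains.csv' is an illustrative path):

import datapackage

# Hedged sketch: construct the package from a descriptor dict instead of
# keyword arguments; nothing is read from disk here.
villains = datapackage.Package({
    'name': 'villains',
    'resources': [{'name': 'joker', 'path': 'villains.csv'}],
})
assert villains.descriptor['name'] == 'villains'
assert len(villains.resources) == 1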
Example #4
def get_tabular_data_resource(table_name, pkg_dir, partitions=False):
    """
    Create a Tabular Data Resource descriptor for a PUDL table.

    Based on the information in the database and some additional metadata, this
    function will generate a valid Tabular Data Resource descriptor, according
    to the Frictionless Data specification, which can be found here:
    https://frictionlessdata.io/specs/tabular-data-resource/

    Args:
        table_name (string): table name for which you want to generate a
            Tabular Data Resource descriptor
        pkg_dir (path-like): The location of the directory for this package.
            The data package directory will be a subdirectory in the
            `datapackage_dir` directory, with the name of the package as the
            name of the subdirectory.

    Returns:
        dict: A Tabular Data Resource descriptor (a Python dictionary)
        containing key information about the selected table.

    """
    # every time we want to generate the cems table, we want it compressed
    if 'epacems' in table_name:
        abs_path = pathlib.Path(pkg_dir, 'data', f'{table_name}.csv.gz')
    else:
        abs_path = pathlib.Path(pkg_dir, 'data', f'{table_name}.csv')

    # pull the skeleton of the descriptor from the megadata file
    descriptor = pull_resource_from_megadata(table_name)
    descriptor['path'] = str(abs_path.relative_to(abs_path.parent.parent))
    descriptor['bytes'] = abs_path.stat().st_size
    descriptor['hash'] = hash_csv(abs_path)
    descriptor['created'] = (
        datetime.datetime.utcnow().replace(microsecond=0).isoformat() + 'Z')

    if partitions:
        for part in partitions.keys():
            if part in table_name:
                descriptor['group'] = part

    resource = datapackage.Resource(descriptor)
    if resource.valid:
        logger.debug(f"{table_name} is a valid resource")
    else:
        raise AssertionError(f"""
            Invalid tabular data resource: {resource.name}

            Errors:
            {resource.errors}
            """)

    return descriptor
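This example (and Example #9 below) calls a hash_csv() helper that is not shown here. Based on the chunked SHA-256 hashing done inline in Example #10, a sketch of what such a helper might look like; the actual implementation in the source project may differ:

import hashlib

def hash_csv(csv_path):
    """Hedged sketch of a hash_csv() helper (not the original implementation)."""
    blocksize = 65536
    hasher = hashlib.sha256()
    with open(csv_path, 'rb') as f:
        buf = f.read(blocksize)
        while len(buf) > 0:
            hasher.update(buf)
            buf = f.read(blocksize)
    # Example #10 stores the digest with a "sha256:" prefix, mirrored here.
    return f"sha256:{hasher.hexdigest()}"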
Example #5
def save_dpkg(dataset_path, data, meta, events, participants):
    dp = dpkg.DataPackage(name=dataset_path.name, licenses=['odc-by'])
    dp['version'] = '1.0.0'
    dp.add_contributor("Jessica B. Hamrick", "*****@*****.**")
    dp.add_contributor("Thomas L. Griffiths", "*****@*****.**")
    dp.add_contributor("Peter W. Battaglia", "*****@*****.**")
    dp.add_contributor("Joshua B. Tenenbaum", "*****@*****.**")

    # add experiment data, and save it as csv
    r1 = dpkg.Resource(name="experiment.csv",
                       fmt="csv",
                       pth="./experiment.csv",
                       data=data)
    r1['mediaformat'] = 'text/csv'
    dp.add_resource(r1)

    # add metadata, and save it inline as json
    r2 = dpkg.Resource(name="metadata", fmt="json", data=meta)
    r2['mediaformat'] = 'application/json'
    dp.add_resource(r2)

    # add event data, and save it as csv
    r3 = dpkg.Resource(name="events.csv",
                       fmt="csv",
                       pth="./events.csv",
                       data=events)
    r3['mediaformat'] = 'text/csv'
    dp.add_resource(r3)

    # add participant info, and save it as csv
    r3 = dpkg.Resource(name="participants.csv",
                       fmt="csv",
                       pth="./participants.csv",
                       data=participants)
    r3['mediaformat'] = 'text/csv'
    dp.add_resource(r3)

    # save the datapackage
    dp.save(dataset_path.dirname())
    logger.info("Saved to '%s'", dataset_path.relpath())
Example #6
def get_tabular_data_resource_2(table_name, pkg_dir, testing=False):
    """
    Create a Tabular Data Resource descriptor for a PUDL table.

    Based on the information in the database, and some additional metadata,
    stored elsewhere (Where?!?!), this function will generate a valid Tabular
    Data Resource descriptor, according to the Frictionless Data specification,
    which can be found here:

    https://frictionlessdata.io/specs/tabular-data-resource/
    """
    # Where the CSV file holding the data is, relative to datapackage.json
    # This is the value that has to be embedded in the data package.
    csv_relpath = os.path.join('data', f'{table_name}.csv')
    # We need to access the file to calculate hash and size too:
    csv_abspath = os.path.join(os.path.abspath(pkg_dir), csv_relpath)

    # pull the skeleton of the descriptor from the megadata file
    descriptor = pull_resource_from_megadata(table_name,
                                             mega_pkg_dir=os.path.join(
                                                 SETTINGS['meta_dir'],
                                                 'pudl-test'))
    descriptor['path'] = csv_relpath
    descriptor['bytes'] = os.path.getsize(csv_abspath)
    descriptor['hash'] = pudl.output.export.hash_csv(csv_abspath)
    descriptor['created'] = (
        datetime.datetime.utcnow().replace(microsecond=0).isoformat() + 'Z')

    resource = datapackage.Resource(descriptor)
    if resource.valid:
        logger.debug(f"{table_name} is a valid resource")
    else:
        raise AssertionError(f"""
            Invalid tabular data resource: {resource.name}

            Errors:
            {resource.errors}
            """)

    return descriptor
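Descriptors returned by functions like this one ultimately end up inside a data package. A hedged sketch of that final step, using the same datapackage calls that appear in Example #10 (add_resource(), commit(), save()); the table name, package name, and directory are assumptions, and the referenced CSV and megadata file are assumed to already exist:

import datapackage

# Illustrative assembly step; "fuel_ferc1" and "./pudl-pkg" are made up.
descriptor = get_tabular_data_resource_2('fuel_ferc1', pkg_dir='./pudl-pkg')
pkg = datapackage.Package({'name': 'pudl-test', 'resources': []})
pkg.add_resource(descriptor)
pkg.commit()
if pkg.valid:
    pkg.save('./pudl-pkg/datapackage.json')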
Example #7
 def test_create_resource_missing_required_field(self):
     datapackage.Resource()
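As extracted, this test has no explicit failure assertion; presumably the original marks it as expecting an exception. With pytest (which Example #11 already uses), the same intent could be expressed roughly as follows; the exact exception type depends on the datapackage version, so it is left broad here:

import pytest
import datapackage

def test_create_resource_missing_required_field():
    # Hedged sketch: assumes constructing a Resource without its required
    # arguments raises; the concrete exception class is version-dependent.
    with pytest.raises(Exception):
        datapackage.Resource()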
Example #8
 def setup(self):
     self.dpkg = datapackage.DataPackage("tests/test.dpkg")
     kwargs = self.dpkg['resources'][0]
     kwargs['datapackage_uri'] = compat.str(self.dpkg.base)
     self.resource = datapackage.Resource(**kwargs)
Example #9
def get_tabular_data_resource(resource_name,
                              datapkg_dir,
                              datapkg_settings,
                              partitions=False):
    """
    Create a Tabular Data Resource descriptor for a PUDL table.

    Based on the information in the database and some additional metadata, this
    function will generate a valid Tabular Data Resource descriptor, according
    to the Frictionless Data specification, which can be found here:
    https://frictionlessdata.io/specs/tabular-data-resource/

    Args:
        resource_name (string): name of the tabular data resource for which you
            want to generate a Tabular Data Resource descriptor. This is the
            resource name, rather than the database table name, because we
            partition large tables into resource groups consisting of many
            files.
        datapkg_dir (path-like): The location of the directory for this
            package. The data package directory will be a subdirectory in the
            `datapkg_dir` directory, with the name of the package as the name
            of the subdirectory.
        datapkg_settings (dict): Python dictionary representing the ETL
            parameters read in from the settings file, pertaining to the
            tabular datapackage this resource is part of.
        partitions (dict): A dictionary with PUDL database table names as the
            keys (e.g. hourly_emissions_epacems), and lists of partition
            variables (e.g. ["epacems_years", "epacems_states"]) as the values.

    Returns:
        dict: A Python dictionary representing a tabular data resource
        descriptor that complies with the Frictionless Data specification.

    """
    # every time we want to generate the cems table, we want it compressed
    abs_path = pathlib.Path(datapkg_dir, "data", f"{resource_name}.csv")
    if "hourly_emissions_epacems" in resource_name:
        abs_path = pathlib.Path(abs_path.parent, abs_path.name + ".gz")

    # pull the skeleton of the descriptor from the megadata file
    descriptor = pull_resource_from_megadata(resource_name)
    descriptor["path"] = str(abs_path.relative_to(abs_path.parent.parent))
    descriptor["bytes"] = abs_path.stat().st_size
    descriptor["hash"] = hash_csv(abs_path)
    descriptor["created"] = (
        datetime.datetime.utcnow().replace(microsecond=0).isoformat() + "Z")
    unpartitioned_tables = get_unpartitioned_tables([resource_name],
                                                    datapkg_settings)
    data_sources = data_sources_from_tables(unpartitioned_tables)
    descriptor["sources"] = [pc.data_source_info[src] for src in data_sources]
    descriptor["coverage"] = {
        "temporal": temporal_coverage(resource_name, datapkg_settings),
        "spatial": spatial_coverage(resource_name),
    }

    if partitions:
        for part in partitions.keys():
            if part in resource_name:
                descriptor["group"] = part

    resource = datapackage.Resource(descriptor)

    if resource.valid:
        logger.debug(f"{resource_name} is a valid resource")
    else:
        logger.info(resource)
        raise ValueError(f"""
            Invalid tabular data resource descriptor: {resource.name}

            Errors:
            {resource.errors}
            """)

    return descriptor
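The partitions handling at the end is easy to miss; here is a self-contained illustration of just that grouping step. The partition mapping comes from the docstring above, while the resource name is an illustrative assumption:

# Standalone illustration of the "group" assignment above: if a partitioned
# table name is a substring of the resource name, tag the resource with it.
partitions = {'hourly_emissions_epacems': ['epacems_years', 'epacems_states']}
descriptor = {'name': 'hourly_emissions_epacems_2019_ca'}  # illustrative name
for part in partitions:
    if part in descriptor['name']:
        descriptor['group'] = part
print(descriptor['group'])  # hourly_emissions_epacems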
Example #10
def main(arguments):  # noqa: C901
    """The main function."""
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('-d',
                        '--download',
                        help="Download fresh data directly from MSHA.",
                        default=False,
                        action='store_true')
    parser.add_argument(
        '-r',
        '--row_limit',
        help="Maximum number of rows to use in data validation.",
        default=10000,
        action='store',
        type=int)

    args = parser.parse_args(arguments)

    # Construct some paths we'll need later...
    input_dir = os.path.join(SETTINGS['pudl_dir'], "scripts", "data_pkgs",
                             "pudl-msha")

    # Generate package output directories based on name of the data package
    output_dir = os.path.join(SETTINGS['pudl_dir'], "results", "data_pkgs",
                              "pudl-msha")

    archive_dir = os.path.join(output_dir, "archive")
    os.makedirs(archive_dir, exist_ok=True)

    scripts_dir = os.path.join(output_dir, "scripts")
    os.makedirs(scripts_dir, exist_ok=True)

    data_dir = os.path.join(output_dir, "data")
    os.makedirs(data_dir, exist_ok=True)

    # Dictionary with one element pertaining to each of the data resources
    # that are going to be part of the output data package. The initial data
    # and defs sub-dictionary elements will be joined by other useful items
    # that exist for each of the data resources as we go.
    #  - "data" the filename of the original data file from MSHA
    #  - "defs" the filename of the data definition file from MSHA
    #  - "data_df" pandas dataframe containing the MSHA data.
    #  - "defs_df" pandas dataframe containing the MSHA file definition.
    #  - "resource" is a datapackage.Resource() object.
    #  - "json" is a JSON data package descriptor
    resources = {
        "mines": {
            "data": "Mines.zip",
            "defs": "Mines_Definition_File.txt"
        },
        "controller-operator-history": {
            "data": "ControllerOperatorHistory.zip",
            "defs": "Controller_Operator_History_Definition_File.txt"
        },
        "employment-production-quarterly": {
            "data": "MinesProdQuarterly.zip",
            "defs": "MineSProdQuarterly_Definition_File.txt"
        }
        # "contractor-employment-production-quarterly": {
        #       "data": "ContractorProdQuarterly.zip",
        #       "defs": "ContractorProdQuarterly_Definition_File.txt"
        #   }
    }

    if args.download:
        # Get the data directly from MSHA
        data_path = pc.base_data_urls["msha"]
        for res in resources:
            for d in ["data", "defs"]:
                # Construct the full URL
                url_parts = urllib.parse.urlparse(pc.base_data_urls['msha'])
                new_path = url_parts.path + '/' + resources[res][d]
                res_url = urllib.parse.urlunparse(
                    list(url_parts[0:2]) + [new_path, '', '', ''])

                # Download the data file to data_dir
                print(f"Downloading {res_url}")
                urllib.request.urlretrieve(  # nosec
                    res_url,
                    filename=os.path.join(archive_dir, resources[res][d]))
    else:
        # Get the data from our local PUDL datastore.
        data_path = os.path.join(SETTINGS['data_dir'], "msha")
        for res in resources:
            for d in ["data", "defs"]:
                src_file = os.path.join(data_path, resources[res][d])
                dst_file = os.path.join(archive_dir, resources[res][d])
                shutil.copyfile(src_file, dst_file)

    for res in resources:
        # Create dataframes from input data & definition files (local or
        # remote):
        for d in ['data', 'defs']:
            resources[res][f"{d}_df"] = \
                pd.read_csv(f"{archive_dir}/{resources[res][d]}",
                            delimiter="|",
                            encoding="iso-8859-1")
        # Read the input tabular data resource JSON file we've prepared
        resources[res]["json"] = json.load(
            open(os.path.join(input_dir, f"{res}.json")))

    # OMFG even the MSHA data is broken. *sigh*
    resources["employment-production-quarterly"]["data_df"].columns = \
        list(resources["employment-production-quarterly"]
             ["defs_df"]['COLUMN_NAME'])

    # Create a data package to contain our resources, based on the template
    # JSON file that we have already prepared as an input.
    pkg = datapackage.Package(os.path.join(input_dir, "datapackage.json"))

    for res in resources:
        # Convert the definitions to a dictionary of field descriptions
        field_desc = resources[res]["defs_df"].set_index(
            'COLUMN_NAME').to_dict()['FIELD_DESCRIPTION']

        # Set the description attribute of the fields in the schema using field
        # descriptions.
        for field in resources[res]["json"]["schema"]["fields"]:
            field['description'] = field_desc[field['name']]
        resources[res]["resource"] = datapackage.Resource(
            descriptor=resources[res]["json"])

        # Make sure we didn't miss or re-name any fields accidentally
        json_fields = resources[res]["resource"].schema.field_names
        defs_fields = list(resources[res]["defs_df"]['COLUMN_NAME'])
        data_fields = list(resources[res]['data_df'].columns)
        assert json_fields == defs_fields, "json vs. defs missing field: {}".format(
            set(json_fields).symmetric_difference(set(defs_fields)))
        assert data_fields == defs_fields, "data vs. defs missing field: {}".format(
            set(data_fields).symmetric_difference(set(defs_fields)))
        resources[res]["resource"].infer()
        resources[res]["resource"].commit()

        # Need to clean up the integer NA values in the data before outputting:
        for field in resources[res]["resource"].schema.field_names:
            if resources[res]["resource"].schema.get_field(
                    field).type == 'integer':
                resources[res]["data_df"][field] = fix_int_na(
                    resources[res]["data_df"][field])

        # Force boolean values to use canonical True/False values.
        for field in resources[res]["resource"].schema.field_names:
            if resources[res]["resource"].schema.get_field(
                    field).type == 'boolean':
                resources[res]["data_df"][field] = resources[res]["data_df"][
                    field].replace('Y', True)
                resources[res]["data_df"][field] = resources[res]["data_df"][
                    field].replace('N', False)

        # the data itself goes in output -- this is what we're packaging up
        output_csv = os.path.join(data_dir, f"{res}.csv")
        resources[res]['data_df'].to_csv(output_csv,
                                         index=False,
                                         encoding='utf-8')

        # calculate some useful information about the output file, and add it to the resource:
        # resource file size:
        resources[res]["resource"].descriptor["bytes"] = os.path.getsize(
            output_csv)

        # resource file hash:
        blocksize = 65536
        hasher = hashlib.sha256()
        with open(output_csv, 'rb') as afile:
            buf = afile.read(blocksize)
            while len(buf) > 0:
                hasher.update(buf)
                buf = afile.read(blocksize)

        resources[res]["resource"].descriptor[
            "hash"] = f"sha256:{hasher.hexdigest()}"

        # Check our work...
        print("Validating {} tabular data resource".format(
            resources[res]['resource'].descriptor['name']))
        if not resources[res]["resource"].valid:
            print(f"TABULAR DATA RESOURCE {res} IS NOT VALID.")
            return 1

        # Add the completed resource to the data package
        pkg.add_resource(descriptor=resources[res]["resource"].descriptor)

    # Automatically fill in some additional metadata
    pkg.infer()

    # Timestamp indicating when packaging occurred
    pkg.descriptor['created'] = datetime.datetime.utcnow().replace(
        microsecond=0).isoformat() + 'Z'
    # Have to set this to 'data-package' rather than 'tabular-data-package'
    # due to a DataHub.io bug
    pkg.descriptor['profile'] = 'data-package'
    pkg.commit()

    # save the datapackage
    print("Validating pudl-msha data package")
    if not pkg.valid:
        print("PUDL MSHA DATA PACKAGE IS NOT VALID.")
        return 1
    pkg.save(os.path.join(output_dir, 'datapackage.json'))

    # Validate some of the data...
    print("Validating pudl-msha data")
    report = goodtables.validate(os.path.join(output_dir, 'datapackage.json'),
                                 row_limit=args.row_limit)
    if not report['valid']:
        print("PUDL MSHA DATA TABLES FAILED TO VALIDATE")
        pprint(report)
        return 1

    shutil.copyfile(os.path.join(input_dir, "README.md"),
                    os.path.join(output_dir, "README.md"))
    shutil.copyfile(os.path.join(input_dir, sys.argv[0]),
                    os.path.join(output_dir, "scripts", sys.argv[0]))
    return 0
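main() returns shell-style exit codes (0 on success, 1 on any validation failure), so the script presumably ends with a conventional entry point along these lines (not shown in this extract):

import sys  # already imported at the top of the original script

if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))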
Example #11
 def test_data_cant_be_assigned(self):
     resource_dict = {}
     resource = datapackage.Resource(resource_dict)
     with pytest.raises(AttributeError):
         resource.data = 'foo'
Example #12
 def test_data_returns_the_resource_data(self):
     resource_dict = {
         'data': 'foo',
     }
     resource = datapackage.Resource(resource_dict)
     assert resource.data == resource_dict['data']
Example #13
 def test_data_is_none_by_default(self):
     resource_dict = {}
     resource = datapackage.Resource(resource_dict)
     assert resource.data is None
Example #14
 def test_create_datapackage_missing_required_field(self):
     """Checks if DataPackage creation fails if a required field
     is missing"""
     joker = datapackage.Resource(datapackage_uri='http://gotham.us',
                                  name='joker')
     villains = datapackage.DataPackage(name="villains")
Example #15
dp.add_contributor("Peter W. Battaglia", "*****@*****.**")
dp.add_contributor("Joshua B. Tenenbaum", "*****@*****.**")

# add event data, and save it as csv
events_G = dp_G.load_resource("events.csv")
events_G['version'] = 'G'
events_H = dp_H.load_resource("events.csv")
events_H['version'] = 'H'
events_I = dp_I.load_resource("events.csv")
events_I['version'] = 'I'
events = pd.concat([events_G, events_H, events_I])\
           .set_index(['version', 'timestamp'])\
           .sortlevel()

r = dpkg.Resource(
    name="events.csv", fmt="csv",
    pth="./events.csv",
    data=events)
r['mediaformat'] = 'text/csv'
dp.add_resource(r)

# add participant info, and save it as csv
participants_G = dp_G.load_resource('participants.csv').reset_index()
participants_G['version'] = 'G'
participants_H = dp_H.load_resource('participants.csv').reset_index()
participants_H['version'] = 'H'
participants_I = dp_I.load_resource('participants.csv').reset_index()
participants_I['version'] = 'I'
participants = pd.concat([participants_G, participants_H, participants_I])\
                 .set_index(['version', 'timestamp'])\
                 .sortlevel()
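This snippet uses the old pandas DataFrame.sortlevel() method, which has since been replaced by sort_index(). A self-contained sketch of the same concatenate-and-sort pattern with current pandas (the frames here are made up):

import pandas as pd

# Made-up frames standing in for the event tables loaded above.
events_G = pd.DataFrame({'timestamp': [2, 1], 'rt': [0.4, 0.3]})
events_G['version'] = 'G'
events_H = pd.DataFrame({'timestamp': [1, 3], 'rt': [0.5, 0.2]})
events_H['version'] = 'H'

events = (pd.concat([events_G, events_H])
            .set_index(['version', 'timestamp'])
            .sort_index())  # sort_index() replaces the deprecated sortlevel()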
Example #16
def get_tabular_data_resource(tablename, pkg_dir, testing=False):
    """
    Create a Tabular Data Resource descriptor for a PUDL DB table.

    Based on the information in the database, and some additional metadata,
    stored elsewhere (Where?!?!), this function will generate a valid Tabular
    Data Resource descriptor, according to the Frictionless Data specification,
    which can be found here:

    https://frictionlessdata.io/specs/tabular-data-resource/
    """
    table = get_table(tablename, testing=testing)

    # Where the CSV file holding the data is, relative to datapackage.json
    # This is the value that has to be embedded in the data package.
    csv_relpath = os.path.join('data', f'{tablename}.csv')
    # We need to access the file to calculate hash and size too:
    csv_abspath = os.path.join(os.path.abspath(pkg_dir), csv_relpath)

    descriptor = {}
    descriptor['profile'] = "tabular-data-resource"
    descriptor['name'] = tablename
    descriptor['path'] = csv_relpath
    descriptor['title'] = tablename  # maybe we should make this pretty...
    if table.comment:
        descriptor['description'] = table.comment
    descriptor['encoding'] = "utf-8"
    descriptor['mediatype'] = "text/csv"
    descriptor['format'] = "csv"
    descriptor['dialect'] = {
        "delimiter": ",",
        "header": True,
        "quoteChar": "\"",
        "doubleQuote": True,
        "lineTerminator": "\r\n",
        "skipInitialSpace": True,
    }
    descriptor['schema'] = get_table_schema(table)
    descriptor['bytes'] = os.path.getsize(csv_abspath)
    descriptor['hash'] = hash_csv(csv_abspath)

    # If omitted, licenses are inherited from the containing data package.
    descriptor["licenses"] = [
        pudl.constants.licenses['cc-by-4.0'],
    ]

    data_sources = \
        pudl.helpers.data_sources_from_tables([table.name, ])
    # descriptor["sources"] = \
    #    [pudl.constants.data_sources[src] for src in data_sources]
    descriptor["sources"] = []
    for src in data_sources:
        if src in pudl.constants.data_sources:
            descriptor["sources"].append({"title": src, "path": "idfk"})

    resource = datapackage.Resource(descriptor)
    if not resource.valid:
        raise AssertionError(f"""
            Invalid tabular data resource: {resource.name}

            Errors:
            {resource.errors}
            """)

    return descriptor
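Once the descriptor points at a CSV that actually exists on disk, the same datapackage library can read the rows back with schema-aware type casting. A hedged usage sketch, assuming a datapackage-py version that accepts base_path and supports read(); the table name and package directory are illustrative:

import datapackage

# Illustrative names; assumes ./pudl-pkg/data/fuel_ferc1.csv already exists.
pkg_dir = './pudl-pkg'
descriptor = get_tabular_data_resource('fuel_ferc1', pkg_dir)
resource = datapackage.Resource(descriptor, base_path=pkg_dir)
rows = resource.read(keyed=True)  # values are cast using descriptor['schema']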