Example 1
def censusdp1tract_to_sqlite(pudl_settings=None, year=2010):
    """
    Use GDAL's ogr2ogr utility to convert the Census DP1 GeoDB to an SQLite DB.

    The Census DP1 GeoDB is read from the datastore, where it is stored as a
    zipped archive. This archive is unzipped into a temporary directory so
    that ogr2ogr can operate on the ESRI GeoDB, and convert it to SQLite. The
    resulting SQLite DB file is put in the PUDL output directory alongside the
    ferc1 and pudl SQLite databases.

    Args:
        pudl_settings (dict): A PUDL settings dictionary.
        year (int): Year of Census data to extract (currently must be 2010)

    Returns:
        None

    """
    if pudl_settings is None:
        pudl_settings = pudl.workspace.setup.get_defaults()
    ds = Datastore(local_cache_path=pudl_settings["data_dir"])

    # If we're in a conda environment, use the version of ogr2ogr that has been
    # installed by conda. Otherwise, try to use a system-installed version
    # at /usr/bin/ogr2ogr. This allows us to avoid simply running whatever
    # program happens to be in the user's path and named ogr2ogr. This is a
    # fragile solution that will not work on all platforms, but should cover
    # conda environments, Docker, and continuous integration on GitHub.
    ogr2ogr = os.environ.get("CONDA_PREFIX", "/usr") + "/bin/ogr2ogr"

    # Extract the zipped GeoDB archive from the Datastore into a temporary
    # directory so that ogr2ogr can operate on it. Output the resulting SQLite
    # database into the user's PUDL workspace. We do not need to keep the
    # unzipped GeoDB around after this conversion. Using a temporary directory
    # makes the cleanup automatic.
    with TemporaryDirectory() as tmpdir:
        # Use datastore to grab the Census DP1 zipfile
        tmpdir_path = Path(tmpdir)
        zip_ref = ds.get_zipfile_resource("censusdp1tract", year=year)
        extract_root = tmpdir_path / Path(zip_ref.filelist[0].filename)
        out_path = Path(pudl_settings["sqlite_dir"]) / "censusdp1tract.sqlite"
        logger.info("Extracting the Census DP1 GeoDB to %s", out_path)
        zip_ref.extractall(tmpdir_path)
        logger.info("extract_root = %s", extract_root)
        logger.info("out_path = %s", out_path)
        subprocess.run(  # nosec: B603 Trying to use absolute paths.
            [ogr2ogr, str(out_path), str(extract_root)],
            check=True)
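
A minimal usage sketch for this converter (the module path in the import is an assumption; it is not shown in the snippet):

from pudl.convert.censusdp1tract_to_sqlite import censusdp1tract_to_sqlite  # assumed module path

# Passing pudl_settings=None falls back to the workspace defaults, as shown above;
# 2010 is currently the only supported year.
censusdp1tract_to_sqlite(pudl_settings=None, year=2010)
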
Example 2
def extract(epaipm_tables: List[str],
            ds: Datastore) -> Dict[str, pd.DataFrame]:
    """Extracts data from IPM files.

    Args:
        epaipm_tables (iterable): A tuple or list of table names to extract
        ds (:class:`EpaIpmDatastore`): Initialized datastore

    Returns:
        dict: dictionary of DataFrames with extracted (but not yet transformed)
        data from each file.

    """
    # Prep for ingesting EPA IPM
    logger.info('Beginning ETL for EPA IPM.')
    ds = EpaIpmDatastore(ds)

    if "plant_region_map_epaipm" in epaipm_tables:
        # NEEDS is the only IPM data file with multiple sheets. To keep the
        # overall code simpler, this if statement reads both sheets (active and
        # retired by 2021).
        epaipm_tables.remove("plant_region_map_epaipm")
        epaipm_tables.extend([
            "plant_region_map_epaipm_active", "plant_region_map_epaipm_retired"
        ])

    return {f: ds.get_dataframe(f) for f in epaipm_tables}
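
A usage sketch for this extractor. The Datastore import path is an assumption, and plant_region_map_epaipm is the only table name taken from the snippet; note that extract() mutates the table list in place when it expands that name into its active/retired variants:

import pudl.extract.epaipm
import pudl.workspace.setup
from pudl.workspace.datastore import Datastore  # assumed import path

pudl_settings = pudl.workspace.setup.get_defaults()
ds = Datastore(local_cache_path=pudl_settings["data_dir"])

epaipm_tables = ["plant_region_map_epaipm"]
raw_dfs = pudl.extract.epaipm.extract(epaipm_tables, ds)
for name, df in raw_dfs.items():
    print(name, df.shape)
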
Example 3
def extract(epacems_years, states, ds: Datastore):
    """
    Coordinate the extraction of EPA CEMS hourly DataFrames.

    Args:
        epacems_years (list): The years of CEMS data to extract, as 4-digit
            integers.
        states (list): The states whose CEMS data we want to extract, indicated
            by 2-letter US state codes.
        ds (:class:`Datastore`): Initialized datastore

    Yields:
        dict: a dictionary with a single EPA CEMS tabular data resource name as
        the key, having the form "hourly_emissions_epacems_YEAR_STATE" where
        YEAR is a 4 digit number and STATE is a lower case 2-letter code for a
        US state. The value is a :class:`pandas.DataFrame` containing all the
        raw EPA CEMS hourly emissions data for the indicated state and year.
    """
    ds = EpaCemsDatastore(ds)
    for year in epacems_years:
        # The states are given as 2-letter US state abbreviations
        for state in states:
            partition = EpaCemsPartition(state=state, year=year)
            logger.info(f"Performing ETL for EPA CEMS hourly {state}-{year}")
            # Return a dictionary where the key identifies this dataset
            # (just like the other extract functions), but unlike the
            # others, this is yielded as a generator (and it's a one-item
            # dictionary).
            yield {
                ("hourly_emissions_epacems_" + str(year) + "_" + state.lower()):
                ds.get_data_frame(partition)
            }
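
Because this extractor is a generator, callers iterate over it and unpack the one-item dictionaries as they go. A consumption sketch, with an illustrative year and state (the Datastore import path is an assumption):

import pudl.extract.epacems
import pudl.workspace.setup
from pudl.workspace.datastore import Datastore  # assumed import path

ds = Datastore(local_cache_path=pudl.workspace.setup.get_defaults()["data_dir"])

for raw in pudl.extract.epacems.extract([2019], ["TX"], ds):
    # Each yielded item looks like {"hourly_emissions_epacems_2019_tx": DataFrame}
    for name, df in raw.items():
        print(name, len(df))
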
Example 4
def main():  # noqa: C901
    """Clone the FERC Form 1 FoxPro database into SQLite."""
    # Display logged output from the PUDL package:
    pudl_logger = logging.getLogger("pudl")
    log_format = '%(asctime)s [%(levelname)8s] %(name)s:%(lineno)s %(message)s'
    coloredlogs.install(fmt=log_format, level='INFO', logger=pudl_logger)

    args = parse_command_line(sys.argv)
    if args.logfile:
        file_logger = logging.FileHandler(args.logfile)
        file_logger.setFormatter(logging.Formatter(log_format))
        pudl_logger.addHandler(file_logger)
    with pathlib.Path(args.settings_file).open() as f:
        script_settings = yaml.safe_load(f)

    defaults = pudl.workspace.setup.get_defaults()
    pudl_in = script_settings.get("pudl_in", defaults["pudl_in"])
    pudl_out = script_settings.get("pudl_out", defaults["pudl_out"])

    pudl_settings = pudl.workspace.setup.derive_paths(pudl_in=pudl_in,
                                                      pudl_out=pudl_out)

    script_settings = Ferc1ToSqliteSettings().parse_obj(
        script_settings["ferc1_to_sqlite_settings"])

    pudl_settings["sandbox"] = args.sandbox
    pudl.extract.ferc1.dbf2sqlite(
        tables=script_settings.tables,
        years=script_settings.years,
        refyear=script_settings.refyear,
        pudl_settings=pudl_settings,
        bad_cols=script_settings.bad_cols,
        clobber=args.clobber,
        datastore=Datastore(local_cache_path=(Path(pudl_in) / "data"),
                            sandbox=args.sandbox))
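
In this newer, pydantic-based version of the script, the settings file nests everything under a ferc1_to_sqlite_settings key whose fields (tables, years, refyear, and optionally bad_cols) are read above. A sketch of a minimal settings file, loaded from a string for brevity; the table names and years are illustrative:

import yaml

settings_yaml = """
ferc1_to_sqlite_settings:
  refyear: 2020
  years: [2019, 2020]
  tables:
    - f1_respondent_id
    - f1_steam
"""
script_settings = yaml.safe_load(settings_yaml)
# pudl_in and pudl_out are optional top-level keys; they fall back to the
# workspace defaults when omitted.
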
Example 5
def _etl_epaipm(etl_params, datapkg_dir, pudl_settings, ds_kwargs):
    """Extract, transform and load CSVs for EPA IPM.

    Args:
        etl_params (dict): ETL parameters required by this data source.
        datapkg_dir (path-like): The location of the directory for this
            package, which will contain a datapackage.json file and a data
            directory in which the CSV files are stored.
        pudl_settings (dict): a dictionary filled with settings that mostly
            describe paths to various resources and outputs.
        ds_kwargs (dict): Keyword arguments for instantiating a PUDL datastore,
            so that the ETL can access the raw input data.

    Returns:
        list: Names of PUDL DB tables output by the ETL for this data source.

    """
    epaipm_dict = _validate_params_epaipm(etl_params)
    epaipm_tables = epaipm_dict['epaipm_tables']
    if not epaipm_tables:
        logger.info('Not ingesting EPA IPM.')
        return []
    static_tables = _load_static_tables_epaipm(datapkg_dir)

    # Extract IPM tables
    ds = pudl.extract.epaipm.EpaIpmDatastore(Datastore(**ds_kwargs))
    epaipm_raw_dfs = pudl.extract.epaipm.extract(epaipm_tables, ds)

    epaipm_transformed_dfs = pudl.transform.epaipm.transform(
        epaipm_raw_dfs, epaipm_tables)

    pudl.load.csv.dict_dump(epaipm_transformed_dfs,
                            "EPA IPM",
                            datapkg_dir=datapkg_dir)

    return list(epaipm_transformed_dfs.keys()) + static_tables
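
The ds_kwargs dictionary is passed straight through to Datastore(**ds_kwargs). Based on the Datastore arguments that appear elsewhere in these examples (local_cache_path and sandbox), a plausible sketch is:

import pudl.workspace.setup

pudl_settings = pudl.workspace.setup.get_defaults()
ds_kwargs = {
    "local_cache_path": pudl_settings["data_dir"],
    "sandbox": False,
}
# The ETL coordinator would then call, e.g.:
# _etl_epaipm(etl_params, datapkg_dir, pudl_settings, ds_kwargs)
# where etl_params and datapkg_dir come from the datapackage settings (not shown here).
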
Example 6
def main():  # noqa: C901
    """Clone the FERC Form 1 FoxPro database into SQLite."""
    # Display logged output from the PUDL package:
    pudl_logger = logging.getLogger("pudl")
    log_format = '%(asctime)s [%(levelname)8s] %(name)s:%(lineno)s %(message)s'
    coloredlogs.install(fmt=log_format, level='INFO', logger=pudl_logger)

    args = parse_command_line(sys.argv)
    with pathlib.Path(args.settings_file).open() as f:
        script_settings = yaml.safe_load(f)

    try:
        pudl_in = script_settings["pudl_in"]
    except KeyError:
        pudl_in = pudl.workspace.setup.get_defaults()["pudl_in"]
    try:
        pudl_out = script_settings["pudl_out"]
    except KeyError:
        pudl_out = pudl.workspace.setup.get_defaults()["pudl_out"]

    pudl_settings = pudl.workspace.setup.derive_paths(pudl_in=pudl_in,
                                                      pudl_out=pudl_out)

    # Check args for basic validity:
    for table in script_settings['ferc1_to_sqlite_tables']:
        if table not in pc.ferc1_tbl2dbf:
            raise ValueError(f"{table} was not found in the list of "
                             f"available FERC Form 1 tables.")
    if script_settings['ferc1_to_sqlite_refyear'] \
            not in pc.data_years['ferc1']:
        raise ValueError(
            f"Reference year {script_settings['ferc1_to_sqlite_refyear']} "
            f"is outside the range of available FERC Form 1 data "
            f"({min(pc.data_years['ferc1'])}-"
            f"{max(pc.data_years['ferc1'])}).")
    for year in script_settings['ferc1_to_sqlite_years']:
        if year not in pc.data_years['ferc1']:
            raise ValueError(
                f"Requested data from {year} is outside the range of "
                f"available FERC Form 1 data "
                f"({min(pc.data_years['ferc1'])}-"
                f"{max(pc.data_years['ferc1'])}).")

    try:
        # This field is optional and generally unused...
        bad_cols = script_settings['ferc1_to_sqlite_bad_cols']
    except KeyError:
        bad_cols = ()

    pudl_settings["sandbox"] = args.sandbox
    pudl.extract.ferc1.dbf2sqlite(
        tables=script_settings['ferc1_to_sqlite_tables'],
        years=script_settings['ferc1_to_sqlite_years'],
        refyear=script_settings['ferc1_to_sqlite_refyear'],
        pudl_settings=pudl_settings,
        bad_cols=bad_cols,
        clobber=args.clobber,
        datastore=Datastore(local_cache_path=(Path(pudl_in) / "data"),
                            sandbox=args.sandbox))
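
This older version of the script reads flat ferc1_to_sqlite_* keys directly from the YAML settings file instead of a nested, validated settings object. A minimal illustrative file, again loaded from a string for brevity:

import yaml

settings_yaml = """
ferc1_to_sqlite_refyear: 2019
ferc1_to_sqlite_years: [2018, 2019]
ferc1_to_sqlite_tables:
  - f1_respondent_id
  - f1_steam
"""
script_settings = yaml.safe_load(settings_yaml)
# pudl_in, pudl_out, and ferc1_to_sqlite_bad_cols are optional, as handled above.
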
Example 7
def _etl_epacems(etl_params, datapkg_dir, pudl_settings, ds_kwargs):
    """Extract, transform and load CSVs for EPA CEMS.

    Args:
        etl_params (dict): ETL parameters required by this data source.
        datapkg_dir (path-like): The location of the directory for this
            package, which will contain a datapackage.json file and a data
            directory in which the CSV files are stored.
        pudl_settings (dict): a dictionary filled with settings that mostly
            describe paths to various resources and outputs.
        ds_kwargs (dict): Keyword arguments for instantiating a PUDL datastore,
            so that the ETL can access the raw input data.

    Returns:
        list: Names of PUDL DB tables output by the ETL for this data source.

    """
    epacems_dict = pudl.etl._validate_params_epacems(etl_params)
    epacems_years = epacems_dict['epacems_years']
    epacems_states = epacems_dict['epacems_states']
    # If we're not doing CEMS, just stop here to avoid printing messages like
    # "Reading EPA CEMS data...", which could be confusing.
    if not epacems_states or not epacems_years:
        logger.info('Not ingesting EPA CEMS.')
        return []

    # NOTE: This a generator for raw dataframes
    epacems_raw_dfs = pudl.extract.epacems.extract(epacems_years,
                                                   epacems_states,
                                                   Datastore(**ds_kwargs))

    # NOTE: This is a generator for transformed dataframes
    epacems_transformed_dfs = pudl.transform.epacems.transform(
        epacems_raw_dfs=epacems_raw_dfs, datapkg_dir=datapkg_dir)

    logger.info("Loading tables from EPA CEMS into PUDL:")
    if logger.isEnabledFor(logging.INFO):
        start_time = time.monotonic()
    epacems_tables = []
    # run the cems generator dfs through the load step
    for transformed_df_dict in epacems_transformed_dfs:
        pudl.load.csv.dict_dump(transformed_df_dict,
                                "EPA CEMS",
                                datapkg_dir=datapkg_dir)
        epacems_tables.append(list(transformed_df_dict.keys())[0])
    if logger.isEnabledFor(logging.INFO):
        delta_t = time.strftime("%H:%M:%S",
                                time.gmtime(time.monotonic() - start_time))
        time_message = f"Loading EPA CEMS took {delta_t}"
        logger.info(time_message)
        start_time = time.monotonic()

    return epacems_tables
Example 8
    def etl_ferc714(self, update: bool = False):
        """
        A single function that runs the temporary FERC 714 ETL and sets all DFs.

        This is an interim solution, so that we can have a (relatively) standard way of
        accessing the FERC 714 data prior to getting it integrated into the PUDL DB.
        Some of the tables are not yet cleaned up, but there are dummy transform functions
        which pass through the raw DFs with some minor alterations, so all the data is
        available as it exists right now.

        An attempt to access *any* of the dataframes results in all of them being
        populated, since generating all of them is almost the same amount of work as
        generating one of them.

        Args:
            update: Whether to overwrite the existing dataframes if they exist.

        """
        if isinstance(self.ds, Datastore):
            pass
        elif self.ds is None:
            pudl_settings = pudl.workspace.setup.get_defaults()
            if pudl_settings["pudl_in"] is None:
                raise FileNotFoundError(
                    "In order to run the ad-hoc FERC-714 ETL PUDL needs a valid "
                    "Datastore, but none was found. Run 'pudl_setup --help' "
                    "to see how to create one.")
            self.ds = Datastore(local_cache_path=pudl_settings["data_dir"])
        else:
            raise TypeError(
                "PudlTabl needs a PUDL Datastore object, but we got "
                f"a {type(self.ds)}.")

        if update or self._dfs["respondent_id_ferc714"] is None:
            logger.warning("Running the interim FERC 714 ETL process!")
            ferc714_raw_dfs = pudl.extract.ferc714.extract(ds=self.ds)
            ferc714_tfr_dfs = pudl.transform.ferc714.transform(ferc714_raw_dfs)
            self._dfs.update(ferc714_tfr_dfs)
Example 9
def extract(epacems_years, states, ds: Datastore):
    """
    Coordinate the extraction of EPA CEMS hourly DataFrames.

    Args:
        epacems_years (list): The years of CEMS data to extract, as 4-digit
            integers.
        states (list): The states whose CEMS data we want to extract, indicated
            by 2-letter US state codes.
        ds (:class:`Datastore`): Initialized datastore

    Yields:
        pandas.DataFrame: A single state-year of EPA CEMS hourly emissions data.

    """
    ds = EpaCemsDatastore(ds)
    for year in epacems_years:
        for state in states:
            partition = EpaCemsPartition(state=state, year=year)
            logger.info(f"Processing EPA CEMS hourly data for {state}-{year}")
            # We have to assign the reporting year for partitioning purposes
            df = (ds.get_data_frame(partition).assign(year=year))
            yield df
Example 10
    def etl_eia861(self, update: bool = False):
        """
        A single function that runs the temporary EIA 861 ETL and sets all DFs.

        This is an interim solution that provides a (somewhat) standard way of
        accessing the EIA 861 data prior to its being fully integrated into the PUDL
        database. An attempt to access any of the dataframes results in all of them
        being populated. Only the tables that have actual transform functions are
        included, and as new transform functions are completed they will need to be
        added to the list below. Surely there is a way to do this automatically /
        magically but that's beyond my knowledge right now.

        Args:
            update: Whether to overwrite the existing dataframes if they exist.

        """
        if isinstance(self.ds, Datastore):
            pass
        elif self.ds is None:
            pudl_settings = pudl.workspace.setup.get_defaults()
            if pudl_settings["pudl_in"] is None:
                raise FileNotFoundError(
                    "In order to run the ad-hoc EIA-861 ETL PUDL needs a valid "
                    "Datastore, but none was found. Run 'pudl_setup --help' "
                    "to see how to create one.")
            self.ds = Datastore(local_cache_path=pudl_settings["data_dir"])
        else:
            raise TypeError(
                "PudlTabl needs a PUDL Datastore object, but we got "
                f"a {type(self.ds)}.")

        if update or self._dfs["balancing_authority_eia861"] is None:
            logger.warning("Running the interim EIA 861 ETL process!")

            eia861_raw_dfs = (pudl.extract.eia861.Extractor(self.ds).extract(
                year=pc.WORKING_PARTITIONS["eia861"]["years"]))
            self._dfs.update(pudl.transform.eia861.transform(eia861_raw_dfs))
Example 11
    def test_prod_datapackages(self):
        """All datasets point to valid descriptors with 1 or more resources."""
        ds = Datastore(sandbox=False)
        for dataset in ds.get_known_datasets():
            desc = ds.get_datapackage_descriptor(dataset)
            assert list(desc.get_resources())
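
The same Datastore descriptor API exercised by this test can be used interactively to see which raw datasets are available and how many resources each one provides (the import path is an assumption):

from pudl.workspace.datastore import Datastore  # assumed import path

ds = Datastore(sandbox=False)
for dataset in ds.get_known_datasets():
    desc = ds.get_datapackage_descriptor(dataset)
    print(dataset, len(list(desc.get_resources())))
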
Example 12
def _etl_eia(
    etl_settings: EiaSettings,
    ds_kwargs: Dict[str, Any]
) -> Dict[str, pd.DataFrame]:
    """Extract, transform and load CSVs for the EIA datasets.

    Args:
        etl_settings: Validated ETL parameters required by this data source.
        ds_kwargs: Keyword arguments for instantiating a PUDL datastore,
            so that the ETL can access the raw input data.

    Returns:
        A dictionary of EIA dataframes ready for loading into the PUDL DB.

    """
    eia860_tables = etl_settings.eia860.tables
    eia860_years = etl_settings.eia860.years
    eia860m = etl_settings.eia860.eia860m
    eia923_tables = etl_settings.eia923.tables
    eia923_years = etl_settings.eia923.years

    if (
        (not eia923_tables or not eia923_years)
        and (not eia860_tables or not eia860_years)
    ):
        logger.info('Not loading EIA.')
        return {}

    # generate dataframes for the static EIA tables
    out_dfs = _read_static_tables_eia()

    ds = Datastore(**ds_kwargs)
    # Extract EIA forms 923, 860
    eia923_raw_dfs = pudl.extract.eia923.Extractor(ds).extract(
        year=eia923_years)
    eia860_raw_dfs = pudl.extract.eia860.Extractor(ds).extract(
        year=eia860_years)
    # if we are trying to add the EIA 860M YTD data, then extract it and append
    if eia860m:
        eia860m_raw_dfs = pudl.extract.eia860m.Extractor(ds).extract(
            year_month=pc.WORKING_PARTITIONS['eia860m']['year_month'])
        eia860_raw_dfs = pudl.extract.eia860m.append_eia860m(
            eia860_raw_dfs=eia860_raw_dfs, eia860m_raw_dfs=eia860m_raw_dfs)

    # Transform EIA forms 923, 860
    eia860_transformed_dfs = pudl.transform.eia860.transform(
        eia860_raw_dfs, eia860_tables=eia860_tables)
    eia923_transformed_dfs = pudl.transform.eia923.transform(
        eia923_raw_dfs, eia923_tables=eia923_tables)
    # create an eia transformed dfs dictionary
    eia_transformed_dfs = eia860_transformed_dfs.copy()
    eia_transformed_dfs.update(eia923_transformed_dfs.copy())

    # convert types..
    eia_transformed_dfs = pudl.helpers.convert_dfs_dict_dtypes(
        eia_transformed_dfs, 'eia')

    entities_dfs, eia_transformed_dfs = pudl.transform.eia.transform(
        eia_transformed_dfs,
        eia860_years=eia860_years,
        eia923_years=eia923_years,
        eia860m=eia860m,
    )
    # convert types..
    entities_dfs = pudl.helpers.convert_dfs_dict_dtypes(entities_dfs, 'eia')
    for table in entities_dfs:
        entities_dfs[table] = PUDL_META.get_resource(table).encode(entities_dfs[table])

    out_dfs.update(entities_dfs)
    out_dfs.update(eia_transformed_dfs)
    return out_dfs
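
A hedged sketch of how the validated settings object might be constructed, assuming EiaSettings is a pydantic model (parse_obj is used on Ferc1ToSqliteSettings above) whose nested eia860 and eia923 fields expose the tables, years, and eia860m attributes read at the top of this function; the import path, field defaults, and years are all assumptions:

from pudl.settings import EiaSettings  # assumed import path

etl_settings = EiaSettings.parse_obj({
    "eia860": {"years": [2019, 2020], "eia860m": True},  # illustrative years
    "eia923": {"years": [2019, 2020]},
})
# Omitting `tables` assumes the model falls back to sensible defaults.
# eia_dfs = _etl_eia(etl_settings, ds_kwargs)  # ds_kwargs as sketched earlier
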
Example 13
def etl_epacems(
    etl_settings: EpaCemsSettings,
    pudl_settings: Dict[str, Any],
    ds_kwargs: Dict[str, Any],
) -> None:
    """Extract, transform and load CSVs for EPA CEMS.

    Args:
        etl_settings: Validated ETL parameters required by this data source.
        pudl_settings: a dictionary filled with settings that mostly describe paths to
            various resources and outputs.
        ds_kwargs: Keyword arguments for instantiating a PUDL datastore, so that the ETL
            can access the raw input data.

    Returns:
        Unlike the other ETL functions, the EPA CEMS ETL writes its output to Parquet
        as it goes, since the dataset is too large to hold in memory. So it doesn't
        return a dictionary of dataframes.

    """
    epacems_years = etl_settings.years
    epacems_states = etl_settings.states

    # If we're not doing CEMS, just stop here to avoid printing messages like
    # "Reading EPA CEMS data...", which could be confusing.
    if not epacems_states or not epacems_years:
        logger.info('Not ingesting EPA CEMS.')
        return

    pudl_engine = sa.create_engine(pudl_settings["pudl_db"])

    # Verify that we have a PUDL DB with plant attributes:
    inspector = sa.inspect(pudl_engine)
    if "plants_eia860" not in inspector.get_table_names():
        raise RuntimeError(
            "No plants_eia860 available in the PUDL DB! Have you run the ETL? "
            f"Trying to access PUDL DB: {pudl_engine}"
        )

    eia_plant_years = pd.read_sql(
        """
        SELECT DISTINCT strftime('%Y', report_date)
        AS year
        FROM plants_eia860
        ORDER BY year ASC
        """, pudl_engine).year.astype(int)
    missing_years = list(set(epacems_years) - set(eia_plant_years))
    if missing_years:
        logger.info(
            f"EPA CEMS years with no EIA plant data: {missing_years} "
            "Some timezones may be estimated based on plant state."
        )

    # NOTE: This is a generator for raw dataframes
    epacems_raw_dfs = pudl.extract.epacems.extract(
        epacems_years, epacems_states, Datastore(**ds_kwargs))

    # NOTE: This is a generator for transformed dataframes
    epacems_transformed_dfs = pudl.transform.epacems.transform(
        epacems_raw_dfs=epacems_raw_dfs,
        pudl_engine=pudl_engine,
    )

    logger.info("Processing EPA CEMS data and writing it to Apache Parquet.")
    if logger.isEnabledFor(logging.INFO):
        start_time = time.monotonic()

    # run the cems generator dfs through the load step
    for df in epacems_transformed_dfs:
        pudl.load.parquet.epacems_to_parquet(
            df,
            root_path=Path(pudl_settings["parquet_dir"]) / "epacems",
        )

    if logger.isEnabledFor(logging.INFO):
        delta_t = time.strftime("%H:%M:%S", time.gmtime(
            time.monotonic() - start_time))
        time_message = f"Processing EPA CEMS took {delta_t}"
        logger.info(time_message)
        start_time = time.monotonic()
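
Because this function reads plant data from an existing PUDL DB and writes Parquet files rather than returning dataframes, a caller needs pudl_settings that include pudl_db and parquet_dir, plus datastore keyword arguments. A hedged sketch; the EpaCemsSettings import path and constructor are assumptions, and the year/state subset is illustrative:

import pudl.workspace.setup
from pudl.settings import EpaCemsSettings  # assumed import path

pudl_settings = pudl.workspace.setup.get_defaults()
etl_settings = EpaCemsSettings.parse_obj({"years": [2019], "states": ["ID"]})
ds_kwargs = {"local_cache_path": pudl_settings["data_dir"], "sandbox": False}
# etl_epacems(etl_settings, pudl_settings, ds_kwargs)  # requires a populated PUDL DB
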
Example 14
def _etl_eia(etl_params, datapkg_dir, pudl_settings, ds_kwargs):
    """Extract, transform and load CSVs for the EIA datasets.

    Args:
        etl_params (dict): ETL parameters required by this data source.
        datapkg_dir (path-like): The location of the directory for this
            package, which will contain a datapackage.json file and a data
            directory in which the CSV files are stored.
        pudl_settings (dict): a dictionary filled with settings that mostly
            describe paths to various resources and outputs.
        ds_kwargs (dict): Keyword arguments for instantiating a PUDL datastore,
            so that the ETL can access the raw input data.

    Returns:
        list: Names of PUDL DB tables output by the ETL for this data source.

    """
    eia_inputs = _validate_params_eia(etl_params)
    eia860_tables = eia_inputs["eia860_tables"]
    eia860_years = eia_inputs["eia860_years"]
    eia860_ytd = eia_inputs["eia860_ytd"]
    eia923_tables = eia_inputs["eia923_tables"]
    eia923_years = eia_inputs["eia923_years"]

    if ((not eia923_tables or not eia923_years)
            and (not eia860_tables or not eia860_years)):
        logger.info('Not loading EIA.')
        return []

    # generate CSVs for the static EIA tables, return the list of tables
    static_tables = _load_static_tables_eia(datapkg_dir)

    ds = Datastore(**ds_kwargs)
    # Extract EIA forms 923, 860
    eia923_raw_dfs = pudl.extract.eia923.Extractor(ds).extract(
        year=eia923_years)
    eia860_raw_dfs = pudl.extract.eia860.Extractor(ds).extract(
        year=eia860_years)
    # if we are trying to add the EIA 860M YTD data, then extract it and append
    if eia860_ytd:
        eia860m_raw_dfs = pudl.extract.eia860m.Extractor(ds).extract(
            year_month=pc.working_partitions['eia860m']['year_month'])
        eia860_raw_dfs = pudl.extract.eia860m.append_eia860m(
            eia860_raw_dfs=eia860_raw_dfs, eia860m_raw_dfs=eia860m_raw_dfs)

    # Transform EIA forms 923, 860
    eia860_transformed_dfs = pudl.transform.eia860.transform(
        eia860_raw_dfs, eia860_tables=eia860_tables)
    eia923_transformed_dfs = pudl.transform.eia923.transform(
        eia923_raw_dfs, eia923_tables=eia923_tables)
    # create an eia transformed dfs dictionary
    eia_transformed_dfs = eia860_transformed_dfs.copy()
    eia_transformed_dfs.update(eia923_transformed_dfs.copy())

    # Add EIA-EPA crosswalk tables
    eia_transformed_dfs = _add_eia_epacems_crosswalk(eia_transformed_dfs)

    # convert types..
    eia_transformed_dfs = pudl.helpers.convert_dfs_dict_dtypes(
        eia_transformed_dfs, 'eia')

    entities_dfs, eia_transformed_dfs = pudl.transform.eia.transform(
        eia_transformed_dfs,
        eia860_years=eia860_years,
        eia923_years=eia923_years,
        eia860_ytd=eia860_ytd,
    )
    # convert types..
    entities_dfs = pudl.helpers.convert_dfs_dict_dtypes(entities_dfs, 'eia')

    # Compile transformed dfs for loading...
    transformed_dfs = {"Entities": entities_dfs, "EIA": eia_transformed_dfs}
    # Load step
    for data_source, transformed_df in transformed_dfs.items():
        pudl.load.csv.dict_dump(transformed_df,
                                data_source,
                                datapkg_dir=datapkg_dir)

    return (list(eia_transformed_dfs.keys()) + list(entities_dfs.keys()) +
            static_tables)
Example 15
    def test_sandbox_datapackages(self):
        """All datasets point to valid descriptors and each specifies non-zero resources."""
        ds = Datastore(sandbox=True)
        for dataset in ds.get_known_datasets():
            desc = ds.get_datapackage_descriptor(dataset)
            assert list(desc.get_resources())