Example #1
def get_eia923_file(yr, data_dir):
    """Construct the appopriate path for a given year's EIA923 Excel file.

    Args:
        yr (int): The year that we're trying to read data for.
        data_dir (str): Top level datastore directory.

    Returns:
        str: path to EIA 923 spreadsheets corresponding to a given year.

    """
    if yr < min(pc.working_years['eia923']):
        raise ValueError(
            f"EIA923 file selection only works for 2009 & later "
            f"but file for {yr} was requested."
        )

    eia923_dir = datastore.path('eia923', year=yr, file=False,
                                data_dir=data_dir)
    eia923_globs = glob.glob(os.path.join(eia923_dir, '*2_3_4*'))

    # There can only be one!
    if len(eia923_globs) > 1:
        raise AssertionError(
            f'Multiple matching EIA923 spreadsheets found for {yr}!'
        )

    return eia923_globs[0]
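A minimal usage sketch for the function above, assuming a local PUDL datastore; the year and data_dir value are illustrative only:

# Hypothetical usage; the data_dir path below is illustrative.
eia923_xlsx = get_eia923_file(2018, data_dir="/home/user/pudl-work/data")
print(eia923_xlsx)  # path of the single matching *2_3_4* spreadsheet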
Example #2
def get_eia860_file(yr, file, data_dir):
    """
    Construct the appropriate path for a given EIA860 Excel file.

    Args:
        yr (int): The year that we're trying to read data for.
        file (str): A string containing part of the file name for a given EIA
            860 file (e.g. '*Generat*')
        data_dir (str): Top level datastore directory.

    Returns:
        str: Path to EIA 860 spreadsheets corresponding to a given year.

    Raises:
        AssertionError: If the requested year is not in the list of working
            years for EIA 860.

    """
    if yr not in pc.working_years['eia860']:
        raise AssertionError(
            f"Requested non-working EIA 860 year: {yr}.\n"
            f"EIA 860 is only working for: {pc.working_years['eia860']}\n")

    eia860_dir = datastore.path('eia860',
                                year=yr,
                                file=False,
                                data_dir=data_dir)
    eia860_file = glob.glob(os.path.join(eia860_dir, file))[0]

    return eia860_file
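A usage sketch for get_eia860_file, assuming a populated datastore; the year, glob pattern, and data_dir are illustrative:

# Hypothetical usage; '*Generat*' is the example pattern from the docstring.
generators_xlsx = get_eia860_file(2018, '*Generat*',
                                  data_dir="/home/user/pudl-work/data")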
Example #3
    def get_file(self, yr, file_name):
        """
        Construct the appropriate path for a given EIA860 Excel file.

        Args:
            yr (int): The year that we're trying to read data for.
            file_name (str): A string containing part of the file name for a
                given EIA 860 file (e.g. '*Generat*')

        Returns:
            str: Path to EIA 861 spreadsheets corresponding to a given year.

        Raises:
            ValueError: If the requested year is not in the list of working
                years for EIA 861.

        """
        if yr not in pc.working_years[self.dataset_name]:
            raise ValueError(
                f"Requested non-working {self.dataset_name} year: {yr}.\n"
                f"{self.dataset_name} is only working for: {pc.working_years[self.dataset_name]}\n"
            )

        eia860_dir = datastore.path(self.dataset_name, year=yr, file=False,
                                    data_dir=self.data_dir)
        eia860_file = pathlib.Path(
            eia860_dir, self.get_path_name(yr, file_name))

        return eia860_file
Example #4
def init(pudl_in, pudl_out, clobber=False):
    """
    Set up a new PUDL working environment based on the user settings.

    Args:
        pudl_in (os.PathLike): Path to the directory containing the PUDL input
            files, most notably the ``data`` directory which houses the raw
            data downloaded from public agencies by the
            :mod:`pudl.workspace.datastore` tools. ``pudl_in`` may be the same
            directory as ``pudl_out``.
        pudl_out (os.PathLike): Path to the directory where PUDL should write
            the outputs it generates. These will be organized into directories
            according to the output format (sqlite, datapackage, etc.).
        clobber (bool): If True, replace existing files. If False (the
            default), do not replace existing files.

    Returns:
        None

    """
    # Generate paths for the workspace:
    ps = derive_paths(pudl_in, pudl_out)

    # Make directories for all of the data sources, plus the temporary dir:
    for source in list(pc.data_sources):
        src_dir = pathlib.Path(
            datastore.path(source,
                           year=None,
                           file=False,
                           data_dir=ps["data_dir"]))
        src_dir.mkdir(parents=True, exist_ok=True)

    tmp_dir = pathlib.Path(ps["data_dir"], "tmp")
    tmp_dir.mkdir(parents=True, exist_ok=True)

    # These are files that may exist in the package_data directory, but that
    # we do not want to deploy into a user workspace:
    ignore_files = ['__init__.py', '.gitignore']

    # Make a settings directory in the workspace, and deploy settings files:
    settings_dir = pathlib.Path(ps['settings_dir'])
    settings_dir.mkdir(parents=True, exist_ok=True)
    settings_pkg = "pudl.package_data.settings"
    deploy(settings_pkg, settings_dir, ignore_files, clobber=clobber)

    # Make several output directories, and deploy example notebooks:
    for fmt in pc.output_formats:
        format_dir = pathlib.Path(ps["pudl_out"], fmt)
        format_dir.mkdir(parents=True, exist_ok=True)
    notebook_dir = pathlib.Path(ps["notebook_dir"])
    notebook_pkg = "pudl.package_data.notebooks"
    deploy(notebook_pkg, notebook_dir, ignore_files, clobber=clobber)

    # Deploy the pudl user environment file.
    environment_pkg = "pudl.package_data"
    deploy(environment_pkg, ps["pudl_out"], ignore_files, clobber=clobber)
Example #5
    def _get_file_path(self, year, page):
        """Return the full path to the Excel spreadsheet."""
        directory = datastore.path(self._dataset_name,
                                   year=year,
                                   file=False,
                                   data_dir=self._data_dir)
        files = glob.glob(
            os.path.join(directory, self.file_basename_glob(year, page)))
        if len(files) != 1:
            raise FileNotFoundError(
                f'{len(files)} matching files found for '
                f'{self._dataset_name} {page} {year}. Exactly one expected.')
        return files[0]
Example #6
def dbc_filename(year, data_dir):
    """Given a year, returns the path to the master FERC Form 1 .DBC file.

    Args:
        year (int): The year that we're trying to read data for.
        data_dir (str): A string representing the full path to the top level
            of the PUDL datastore containing the FERC Form 1 data to be used.

    Returns:
        str: the file path to the master FERC Form 1 .DBC file for the year
    """
    ferc1_path = datastore.path('ferc1',
                                data_dir=data_dir,
                                year=year,
                                file=False)
    return os.path.join(ferc1_path, 'F1_PUB.DBC')
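A brief usage sketch for the helper above; the year and data_dir are illustrative:

# Hypothetical usage; returns a path ending in 'F1_PUB.DBC'.
dbc_path = dbc_filename(2018, data_dir="/home/user/pudl-work/data")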
Example #7
def get_raw_df(table, dbc_map, data_dir, years=pc.data_years['ferc1']):
    """Combine several years of a given FERC Form 1 DBF table into a dataframe.

    Args:
        table (string): The name of the FERC Form 1 table from which data is
            read.
        dbc_map (dict of dicts): A dictionary of dictionaries, of the kind
            returned by get_dbc_map(), describing the table and column names
            stored within the FERC Form 1 FoxPro database files.
        data_dir (str): A string representing the full path to the top level of
            the PUDL datastore containing the FERC Form 1 data to be used.
        years (list): Range of years to be combined into a single DataFrame.

    Returns:
        :class:`pandas.DataFrame`: A DataFrame containing several years of FERC
        Form 1 data for the given table.

    """
    dbf_name = pc.ferc1_tbl2dbf[table]

    raw_dfs = []
    for yr in years:
        ferc1_dir = datastore.path('ferc1',
                                   year=yr,
                                   file=False,
                                   data_dir=data_dir)
        dbf_path = os.path.join(ferc1_dir, f"{dbf_name}.DBF")

        if os.path.exists(dbf_path):
            new_df = pd.DataFrame(
                iter(
                    dbfread.DBF(dbf_path,
                                encoding='latin1',
                                parserclass=FERC1FieldParser)))
            raw_dfs.append(new_df)

    if raw_dfs:
        return (
            pd.concat(raw_dfs, sort=True)
            .drop('_NullFlags', axis=1, errors='ignore')
            .rename(dbc_map[table], axis=1)
        )
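A usage sketch for get_raw_df, assuming a dbc_map of the kind the docstring describes; the call to get_dbc_map() below uses an assumed signature, and the data_dir is illustrative:

# Hypothetical usage; get_dbc_map()'s signature is assumed, not shown above.
data_dir = "/home/user/pudl-work/data"
dbc_map = get_dbc_map(years=[2017, 2018], data_dir=data_dir)  # assumed call
fuel_df = get_raw_df('f1_fuel', dbc_map, data_dir, years=[2017, 2018])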
Example #8
def get_dbf_path(table, year, data_dir):
    """Given a year and table name, returns the path to its datastore DBF file.

    Args:
        table (string): The name of one of the FERC Form 1 data tables. For
            example 'f1_fuel' or 'f1_steam'
        year (int): The year whose data you wish to find.
        data_dir (str): A string representing the full path to the top level of
            the PUDL datastore containing the FERC Form 1 data to be used.

    Returns:
        str: dbf_path, a (hopefully) OS-independent path, including the
        filename of the DBF file corresponding to the requested year and
        table name.
    """
    dbf_name = pc.ferc1_tbl2dbf[table]
    ferc1_dir = datastore.path('ferc1',
                               year=year,
                               file=False,
                               data_dir=data_dir)
    dbf_path = os.path.join(ferc1_dir, f"{dbf_name}.DBF")
    return dbf_path
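A usage sketch for the path helper above; the table name comes from the docstring's examples and the data_dir is illustrative:

# Hypothetical usage; data_dir is illustrative.
fuel_dbf = get_dbf_path('f1_fuel', 2018,
                        data_dir="/home/user/pudl-work/data")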
Example #9
def extract(epacems_years, states, data_dir):
    """
    Coordinate the extraction of EPA CEMS hourly DataFrames.

    Args:
        epacems_years (list): list of years from which we are trying to read
            CEMS data
        states (list): list of states from which we are trying to read CEMS
            data
        data_dir (path-like): Path to the top directory of the PUDL datastore.

    Yields:
        dict: a one-item dictionary whose key is the resource name
        "hourly_emissions_epacems_YEAR_STATE" and whose value is a DataFrame
        of CEMS data for that state and year

    Todo:
        This is really slow. Can we do some parallel processing?

    """
    for year in epacems_years:
        # The keys of the us_states dictionary are the state abbrevs
        for state in states:
            dfs = []
            for month in range(1, 13):
                filename = datastore.path('epacems',
                                          year=year,
                                          month=month,
                                          state=state,
                                          data_dir=data_dir)
                logger.info(f"Performing ETL for EPA CEMS hourly "
                            f"{state}-{year}-{month:02}")
                dfs.append(read_cems_csv(filename))
            # Return a dictionary where the key identifies this dataset
            # (just like the other extract functions), but unlike the
            # others, this is yielded as a generator (and it's a one-item
            # dictionary).
            yield {
                ("hourly_emissions_epacems_" + str(year) + "_" + state.lower()):
                pd.concat(dfs, sort=True, copy=False, ignore_index=True)
            }
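Since extract() is a generator that yields one-item dictionaries, a caller typically merges them into a single mapping. A sketch with illustrative arguments:

# Hypothetical usage; the years, states, and data_dir are illustrative.
raw_dfs = {}
for partial in extract(epacems_years=[2018], states=["CO", "TX"],
                       data_dir="/home/user/pudl-work/data"):
    raw_dfs.update(partial)
# Keys look like "hourly_emissions_epacems_2018_co".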
Example #10
def extract(epacems_years, states, data_dir):
    """
    Coordinate the extraction of EPA CEMS hourly DataFrames.

    Args:
        epacems_years (list): The years of CEMS data to extract, as 4-digit
            integers.
        states (list): The states whose CEMS data we want to extract, indicated
            by 2-letter US state codes.
        data_dir (path-like): Path to the top directory of the PUDL datastore.

    Yields:
        dict: a dictionary with a single EPA CEMS tabular data resource name as
        the key, having the form "hourly_emissions_epacems_YEAR_STATE" where
        YEAR is a 4 digit number and STATE is a lower case 2-letter code for a
        US state. The value is a :class:`pandas.DataFrame` containing all the
        raw EPA CEMS hourly emissions data for the indicated state and year.

    """
    for year in epacems_years:
        # The keys of the us_states dictionary are the state abbrevs
        for state in states:
            dfs = []
            logger.info(f"Performing ETL for EPA CEMS hourly {state}-{year}")
            for month in range(1, 13):
                filename = datastore.path('epacems',
                                          year=year,
                                          month=month,
                                          state=state,
                                          data_dir=data_dir)
                dfs.append(read_cems_csv(filename))
            # Return a dictionary where the key identifies this dataset
            # (just like the other extract functions), but unlike the
            # others, this is yielded as a generator (and it's a one-item
            # dictionary).
            yield {
                ("hourly_emissions_epacems_" + str(year) + "_" + state.lower()):
                pd.concat(dfs, sort=True, copy=False, ignore_index=True)
            }
Example #11
def get_epaipm_name(file, data_dir):
    """Returns the appropriate EPA IPM excel file.

    Args:
        file (str): The file that we're trying to read data for.
        data_dir (path-like): Path to the top directory of the PUDL datastore.

    Returns:
        str: The path to the requested EPA IPM file.

    """
    # Access the CSV scraped from a PDF & distributed with PUDL:
    if file == 'transmission_joint_epaipm':
        with importlib.resources.path(
                'pudl.package_data.epa.ipm',
                'table_3-5_transmission_joint_ipm.csv') as p:
            name = p
    else:
        epaipm_dir = Path(
            datastore.path('epaipm', file=False, year=None, data_dir=data_dir))
        pattern = pc.files_dict_epaipm[file]
        name = sorted(epaipm_dir.glob(pattern))[0]

    return name
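A usage sketch, assuming pc.files_dict_epaipm maps the requested key to a glob pattern; the file key and data_dir below are illustrative:

# Hypothetical usage; the file key and data_dir are illustrative.
ipm_path = get_epaipm_name('transmission_single_epaipm',
                           data_dir="/home/user/pudl-work/data")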
Example #12
def main():  # noqa: C901
    """Main function controlling flow of the script.

    Assumes you have a local datastore, and need to copy a small subset of it
    over into the Travis CI test data directory.
    """

    args = parse_command_line(sys.argv)

    # If no years were specified, use the most recent year of data.
    # If years were specified, keep only the years which are valid for that
    # data source, and optionally output a message saying which years are
    # being ignored because they aren't valid.
    yrs_by_src = {}
    for src in args.sources:
        if not args.year:
            yrs_by_src[src] = [max(pc.data_years[src])]
        else:
            yrs_by_src[src] = [
                int(yr) for yr in args.year if int(yr) in pc.data_years[src]
            ]
            bad_yrs = [
                int(yr) for yr in args.year
                if int(yr) not in pc.data_years[src]
            ]
            if bad_yrs:
                logger.warning(f"Invalid {src} years ignored: {bad_yrs}.")

    logger.info(f"out_dir: {args.out_dir}")
    pudl_settings = pudl.workspace.setup.derive_paths(
        pudl_in=pudl.workspace.setup.get_defaults()["pudl_in"],
        pudl_out=pudl.workspace.setup.get_defaults()["pudl_in"])

    for src in args.sources:
        for yr in yrs_by_src[src]:
            src_dir = datastore.path(src,
                                     pudl_settings["data_dir"],
                                     year=yr,
                                     file=False)
            tmp_dir = os.path.join(args.out_dir, 'tmp')

            if src == 'ferc1':
                files_to_move = [
                    f"{pc.ferc1_tbl2dbf[f]}.DBF"
                    for f in pc.ferc1_default_tables
                ]
                files_to_move = files_to_move + ['F1_PUB.DBC', 'F1_32.FPT']
            elif src == 'epacems':
                files_to_move = [
                    datastore.path('epacems',
                                   pudl_settings["data_dir"],
                                   year=yr,
                                   state=st,
                                   month=mo) for mo in range(1, 13)
                    for st in args.states
                ]
                files_to_move = [os.path.basename(f) for f in files_to_move]
            else:
                raise AssertionError(f"Unrecognized data source {src}")

            if not os.path.exists(tmp_dir):
                os.makedirs(tmp_dir)
            logger.info(f"src: {src_dir}")
            logger.info(f"tmp: {tmp_dir}")
            src_files = [os.path.join(src_dir, f) for f in files_to_move]
            dst_files = [os.path.join(tmp_dir, f) for f in files_to_move]

            for src_file, dst_file in zip(src_files, dst_files):
                if os.path.exists(dst_file):
                    os.remove(dst_file)
                shutil.copy(src_file, dst_file)

            if src == 'ferc1':
                ferc1_test_zipfile = os.path.join(pudl_settings['data_dir'],
                                                  f"f1_{yr}.zip")
                z = zipfile.ZipFile(ferc1_test_zipfile,
                                    mode='w',
                                    compression=zipfile.ZIP_DEFLATED)
                for root, dirs, files in os.walk(tmp_dir):
                    for filename in files:
                        z.write(os.path.join(root, filename), arcname=filename)
                logger.info(f"closing {ferc1_test_zipfile}")
                z.close()
                shutil.move(ferc1_test_zipfile, tmp_dir)
                for f in dst_files:
                    os.remove(f)

            logger.info(f"organizing datastore for {src} {yr}")
            datastore.organize(src,
                               yr,
                               states=args.states,
                               data_dir=pudl_settings['data_dir'],
                               unzip=False)