def get_eia923_file(yr, data_dir):
    """Construct the appropriate path for a given year's EIA 923 Excel file.

    Args:
        yr (int): The year that we're trying to read data for.
        data_dir (str): Top level datastore directory.

    Returns:
        str: Path to EIA 923 spreadsheets corresponding to a given year.

    Raises:
        ValueError: If the requested year is earlier than the first working
            year for EIA 923.
        AssertionError: If more than one matching spreadsheet is found.

    """
    if yr < min(pc.working_years['eia923']):
        raise ValueError(
            f"EIA923 file selection only works for 2009 & later "
            f"but file for {yr} was requested."
        )

    eia923_dir = datastore.path('eia923', year=yr, file=False,
                                data_dir=data_dir)
    eia923_globs = glob.glob(os.path.join(eia923_dir, '*2_3_4*'))

    # There can only be one!
    if len(eia923_globs) > 1:
        raise AssertionError(
            f'Multiple matching EIA923 spreadsheets found for {yr}!'
        )

    return eia923_globs[0]
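
# A minimal usage sketch (not part of the original module): the year and the
# datastore path below are hypothetical, and assume the EIA 923 data has
# already been downloaded into the local PUDL datastore.
def _example_get_eia923_file():
    data_dir = "/path/to/pudl_workspace/data"  # hypothetical datastore root
    return get_eia923_file(2018, data_dir)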
def get_eia860_file(yr, file, data_dir):
    """Construct the appropriate path for a given EIA 860 Excel file.

    Args:
        yr (int): The year that we're trying to read data for.
        file (str): A string containing part of the file name for a given
            EIA 860 file (e.g. '*Generat*').
        data_dir (str): Top level datastore directory.

    Returns:
        str: Path to EIA 860 spreadsheets corresponding to a given year.

    Raises:
        AssertionError: If the requested year is not in the list of working
            years for EIA 860.

    """
    if yr not in pc.working_years['eia860']:
        raise AssertionError(
            f"Requested non-working EIA 860 year: {yr}.\n"
            f"EIA 860 is only working for: {pc.working_years['eia860']}\n"
        )
    eia860_dir = datastore.path('eia860', year=yr, file=False,
                                data_dir=data_dir)
    eia860_file = glob.glob(os.path.join(eia860_dir, file))[0]
    return eia860_file
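
# A minimal usage sketch (not part of the original module): reuses the
# '*Generat*' glob from the docstring; the year and datastore path are
# hypothetical.
def _example_get_eia860_file():
    data_dir = "/path/to/pudl_workspace/data"  # hypothetical datastore root
    return get_eia860_file(2018, '*Generat*', data_dir)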
def get_file(self, yr, file_name):
    """Construct the appropriate path for a given year's Excel file.

    Args:
        yr (int): The year that we're trying to read data for.
        file_name (str): A string containing part of the file name for a
            given EIA 861 file (e.g. '*Generat*').

    Returns:
        pathlib.Path: Path to EIA 861 spreadsheets corresponding to a given
        year.

    Raises:
        ValueError: If the requested year is not in the list of working
            years for EIA 861.

    """
    if yr not in pc.working_years[self.dataset_name]:
        raise ValueError(
            f"Requested non-working {self.dataset_name} year: {yr}.\n"
            f"{self.dataset_name} is only working for: "
            f"{pc.working_years[self.dataset_name]}\n"
        )
    eia860_dir = datastore.path(self.dataset_name, year=yr, file=False,
                                data_dir=self.data_dir)
    eia860_file = pathlib.Path(
        eia860_dir, self.get_path_name(yr, file_name))
    return eia860_file
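
# A minimal usage sketch (not part of the original class): `extractor` stands
# in for an instance of the extractor class this method belongs to, and the
# year and file name pattern are hypothetical.
def _example_get_file(extractor):
    return extractor.get_file(2018, '*Generat*')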
def init(pudl_in, pudl_out, clobber=False):
    """Set up a new PUDL working environment based on the user settings.

    Args:
        pudl_in (os.PathLike): Path to the directory containing the PUDL
            input files, most notably the ``data`` directory which houses
            the raw data downloaded from public agencies by the
            :mod:`pudl.workspace.datastore` tools. ``pudl_in`` may be the
            same directory as ``pudl_out``.
        pudl_out (os.PathLike): Path to the directory where PUDL should
            write the outputs it generates. These will be organized into
            directories according to the output format (sqlite,
            datapackage, etc.).
        clobber (bool): if True, replace existing files. If False (the
            default) do not replace existing files.

    Returns:
        None

    """
    # Generate paths for the workspace:
    ps = derive_paths(pudl_in, pudl_out)

    # Make directories for all of the data sources, plus the temporary dir:
    for source in list(pc.data_sources):
        src_dir = pathlib.Path(
            datastore.path(source, year=None, file=False,
                           data_dir=ps["data_dir"]))
        src_dir.mkdir(parents=True, exist_ok=True)
    tmp_dir = pathlib.Path(ps["data_dir"], "tmp")
    tmp_dir.mkdir(parents=True, exist_ok=True)

    # These are files that may exist in the package_data directory, but that
    # we do not want to deploy into a user workspace:
    ignore_files = ['__init__.py', '.gitignore']

    # Make a settings directory in the workspace, and deploy settings files:
    settings_dir = pathlib.Path(ps['settings_dir'])
    settings_dir.mkdir(parents=True, exist_ok=True)
    settings_pkg = "pudl.package_data.settings"
    deploy(settings_pkg, settings_dir, ignore_files, clobber=clobber)

    # Make several output directories, and deploy example notebooks:
    for fmt in pc.output_formats:
        format_dir = pathlib.Path(ps["pudl_out"], fmt)
        format_dir.mkdir(parents=True, exist_ok=True)
    notebook_dir = pathlib.Path(ps["notebook_dir"])
    notebook_pkg = "pudl.package_data.notebooks"
    deploy(notebook_pkg, notebook_dir, ignore_files, clobber=clobber)

    # Deploy the pudl user environment file.
    environment_pkg = "pudl.package_data"
    deploy(environment_pkg, ps["pudl_out"], ignore_files, clobber=clobber)
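
# A minimal usage sketch (not part of the original module): the workspace
# path is a hypothetical placeholder; here the same directory is used for
# both inputs and outputs, which the docstring explicitly allows.
def _example_init_workspace():
    workspace = "/path/to/pudl_workspace"  # hypothetical location
    init(pudl_in=workspace, pudl_out=workspace, clobber=False)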
def _get_file_path(self, year, page):
    """Return the full path to the Excel spreadsheet."""
    directory = datastore.path(self._dataset_name, year=year, file=False,
                               data_dir=self._data_dir)
    files = glob.glob(
        os.path.join(directory, self.file_basename_glob(year, page)))
    if len(files) != 1:
        raise FileNotFoundError(
            f'{len(files)} matching files found for '
            f'{self._dataset_name} {page} {year}. Exactly one expected.')
    return files[0]
def dbc_filename(year, data_dir):
    """Given a year, return the path to the master FERC Form 1 .DBC file.

    Args:
        year (int): The year that we're trying to read data for.
        data_dir (str): Top level datastore directory.

    Returns:
        str: The file path to the master FERC Form 1 .DBC file for the year.

    """
    ferc1_path = datastore.path('ferc1', data_dir=data_dir, year=year,
                                file=False)
    return os.path.join(ferc1_path, 'F1_PUB.DBC')
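
# A minimal usage sketch (not part of the original module): the year and
# datastore path are hypothetical, and assume the FERC Form 1 data for that
# year is present in the local datastore.
def _example_dbc_filename():
    data_dir = "/path/to/pudl_workspace/data"  # hypothetical datastore root
    return dbc_filename(2018, data_dir)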
def get_raw_df(table, dbc_map, data_dir, years=pc.data_years['ferc1']):
    """Combine several years of a given FERC Form 1 DBF table into a dataframe.

    Args:
        table (string): The name of the FERC Form 1 table from which data
            is read.
        dbc_map (dict of dicts): A dictionary of dictionaries, of the kind
            returned by get_dbc_map(), describing the table and column
            names stored within the FERC Form 1 FoxPro database files.
        data_dir (str): A string representing the full path to the top
            level of the PUDL datastore containing the FERC Form 1 data to
            be used.
        years (list): Range of years to be combined into a single DataFrame.

    Returns:
        :class:`pandas.DataFrame`: A DataFrame containing several years of
        FERC Form 1 data for the given table.

    """
    dbf_name = pc.ferc1_tbl2dbf[table]

    raw_dfs = []
    for yr in years:
        ferc1_dir = datastore.path(
            'ferc1', year=yr, file=False, data_dir=data_dir)
        dbf_path = os.path.join(ferc1_dir, f"{dbf_name}.DBF")

        if os.path.exists(dbf_path):
            new_df = pd.DataFrame(
                iter(dbfread.DBF(dbf_path,
                                 encoding='latin1',
                                 parserclass=FERC1FieldParser)))
            raw_dfs = raw_dfs + [new_df, ]

    if raw_dfs:
        return (
            pd.concat(raw_dfs, sort=True)
            .drop('_NullFlags', axis=1, errors='ignore')
            .rename(dbc_map[table], axis=1)
        )
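
# A minimal usage sketch (not part of the original module): assumes `dbc_map`
# has already been built (e.g. by get_dbc_map(), per the docstring above);
# the table name, years, and datastore path are hypothetical.
def _example_get_raw_df(dbc_map):
    data_dir = "/path/to/pudl_workspace/data"  # hypothetical datastore root
    return get_raw_df('f1_fuel', dbc_map, data_dir, years=[2017, 2018])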
def get_dbf_path(table, year, data_dir):
    """Given a year and table name, return the path to its datastore DBF file.

    Args:
        table (string): The name of one of the FERC Form 1 data tables. For
            example 'f1_fuel' or 'f1_steam'.
        year (int): The year whose data you wish to find.
        data_dir (str): A string representing the full path to the top
            level of the PUDL datastore containing the FERC Form 1 data to
            be used.

    Returns:
        str: dbf_path, a (hopefully) OS-independent path including the
        filename of the DBF file corresponding to the requested year and
        table name.

    """
    dbf_name = pc.ferc1_tbl2dbf[table]
    ferc1_dir = datastore.path(
        'ferc1', year=year, file=False, data_dir=data_dir)
    dbf_path = os.path.join(ferc1_dir, f"{dbf_name}.DBF")
    return dbf_path
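
# A minimal usage sketch (not part of the original module): uses the
# 'f1_steam' table named as an example in the docstring; the year and
# datastore path are hypothetical.
def _example_get_dbf_path():
    data_dir = "/path/to/pudl_workspace/data"  # hypothetical datastore root
    return get_dbf_path('f1_steam', 2018, data_dir)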
def extract(epacems_years, states, data_dir):
    """Coordinate the extraction of EPA CEMS hourly DataFrames.

    Args:
        epacems_years (list): list of years from which we are trying to
            read CEMS data.
        states (list): list of states from which we are trying to read
            CEMS data.
        data_dir (path-like): Path to the top directory of the PUDL
            datastore.

    Yields:
        dict: a dictionary of states (keys) and DataFrames of CEMS data
        (values).

    Todo:
        This is really slow. Can we do some parallel processing?

    """
    for year in epacems_years:
        # The keys of the us_states dictionary are the state abbrevs
        for state in states:
            dfs = []
            for month in range(1, 13):
                filename = datastore.path('epacems',
                                          year=year,
                                          month=month,
                                          state=state,
                                          data_dir=data_dir)
                logger.info(f"Performing ETL for EPA CEMS hourly "
                            f"{state}-{year}-{month:02}")
                dfs.append(read_cems_csv(filename))
            # Yield a dictionary where the key identifies this dataset
            # (just like the other extract functions), but unlike the
            # others, this is yielded as a generator (and it's a one-item
            # dictionary).
            yield {
                ("hourly_emissions_epacems_" + str(year) + "_" +
                 state.lower()):
                pd.concat(dfs, sort=True, copy=False, ignore_index=True)
            }
def extract(epacems_years, states, data_dir):
    """Coordinate the extraction of EPA CEMS hourly DataFrames.

    Args:
        epacems_years (list): The years of CEMS data to extract, as 4-digit
            integers.
        states (list): The states whose CEMS data we want to extract,
            indicated by 2-letter US state codes.
        data_dir (path-like): Path to the top directory of the PUDL
            datastore.

    Yields:
        dict: a dictionary with a single EPA CEMS tabular data resource
        name as the key, having the form
        "hourly_emissions_epacems_YEAR_STATE" where YEAR is a 4 digit
        number and STATE is a lower case 2-letter code for a US state. The
        value is a :class:`pandas.DataFrame` containing all the raw EPA
        CEMS hourly emissions data for the indicated state and year.

    """
    for year in epacems_years:
        # The keys of the us_states dictionary are the state abbrevs
        for state in states:
            dfs = []
            logger.info(f"Performing ETL for EPA CEMS hourly {state}-{year}")
            for month in range(1, 13):
                filename = datastore.path('epacems',
                                          year=year,
                                          month=month,
                                          state=state,
                                          data_dir=data_dir)
                dfs.append(read_cems_csv(filename))
            # Yield a dictionary where the key identifies this dataset
            # (just like the other extract functions), but unlike the
            # others, this is yielded as a generator (and it's a one-item
            # dictionary).
            yield {
                ("hourly_emissions_epacems_" + str(year) + "_" +
                 state.lower()):
                pd.concat(dfs, sort=True, copy=False, ignore_index=True)
            }
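
# A minimal usage sketch (not part of the original module): the year, state
# codes, and datastore path are hypothetical. Because extract() is a
# generator, the raw DataFrames are consumed one resource at a time.
def _example_extract_epacems():
    data_dir = "/path/to/pudl_workspace/data"  # hypothetical datastore root
    for resource in extract([2018], ["CO", "TX"], data_dir):
        for name, df in resource.items():
            logger.info(f"{name}: {len(df)} rows")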
def get_epaipm_name(file, data_dir):
    """Return the path to the appropriate EPA IPM Excel or CSV file.

    Args:
        file (str): The file that we're trying to read data for.
        data_dir (path-like): Path to the top directory of the PUDL
            datastore.

    Returns:
        str: The path to the EPA IPM spreadsheet.

    """
    # Access the CSV scraped from a PDF & distributed with PUDL:
    if file == 'transmission_joint_epaipm':
        with importlib.resources.path(
                'pudl.package_data.epa.ipm',
                'table_3-5_transmission_joint_ipm.csv') as p:
            name = p
    else:
        epaipm_dir = Path(
            datastore.path('epaipm', file=False, year=None,
                           data_dir=data_dir))
        pattern = pc.files_dict_epaipm[file]
        name = sorted(epaipm_dir.glob(pattern))[0]
    return name
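
# A minimal usage sketch (not part of the original module): the
# 'transmission_joint_epaipm' key comes from the branch above and resolves to
# the CSV packaged with PUDL; the datastore path is hypothetical.
def _example_get_epaipm_name():
    data_dir = "/path/to/pudl_workspace/data"  # hypothetical datastore root
    return get_epaipm_name('transmission_joint_epaipm', data_dir)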
def main():  # noqa: C901
    """Main function controlling flow of the script.

    Assumes you have a local datastore, and need to copy a small subset of
    it over into the Travis CI test data directory.
    """
    args = parse_command_line(sys.argv)

    # If no years were specified, use the most recent year of data.
    # If years were specified, keep only the years which are valid for that
    # data source, and optionally output a message saying which years are
    # being ignored because they aren't valid.
    yrs_by_src = {}
    for src in args.sources:
        if not args.year:
            yrs_by_src[src] = [max(pc.data_years[src])]
        else:
            yrs_by_src[src] = [
                int(yr) for yr in args.year
                if int(yr) in pc.data_years[src]
            ]
            bad_yrs = [
                int(yr) for yr in args.year
                if int(yr) not in pc.data_years[src]
            ]
            if bad_yrs:
                logger.warning(f"Invalid {src} years ignored: {bad_yrs}.")

    logger.info(f"out_dir: {args.out_dir}")
    pudl_settings = pudl.workspace.setup.derive_paths(
        pudl_in=pudl.workspace.setup.get_defaults()["pudl_in"],
        pudl_out=pudl.workspace.setup.get_defaults()["pudl_in"])

    for src in args.sources:
        for yr in yrs_by_src[src]:
            src_dir = datastore.path(src, pudl_settings["data_dir"],
                                     year=yr, file=False)
            tmp_dir = os.path.join(args.out_dir, 'tmp')
            if src == 'ferc1':
                files_to_move = [
                    f"{pc.ferc1_tbl2dbf[f]}.DBF"
                    for f in pc.ferc1_default_tables
                ]
                files_to_move = files_to_move + ['F1_PUB.DBC', 'F1_32.FPT']
            elif src == 'epacems':
                files_to_move = [
                    datastore.path('epacems', pudl_settings["data_dir"],
                                   year=yr, state=st, month=mo)
                    for mo in range(1, 13)
                    for st in args.states
                ]
                files_to_move = [os.path.basename(f) for f in files_to_move]
            else:
                raise AssertionError(f"Unrecognized data source {src}")

            if not os.path.exists(tmp_dir):
                os.makedirs(tmp_dir)

            logger.info(f"src: {src_dir}")
            logger.info(f"tmp: {tmp_dir}")
            src_files = [os.path.join(src_dir, f) for f in files_to_move]
            dst_files = [os.path.join(tmp_dir, f) for f in files_to_move]
            for src_file, dst_file in zip(src_files, dst_files):
                if os.path.exists(dst_file):
                    os.remove(dst_file)
                shutil.copy(src_file, dst_file)

            if src == 'ferc1':
                ferc1_test_zipfile = os.path.join(
                    pudl_settings['data_dir'], f"f1_{yr}.zip")
                z = zipfile.ZipFile(ferc1_test_zipfile, mode='w',
                                    compression=zipfile.ZIP_DEFLATED)
                for root, dirs, files in os.walk(tmp_dir):
                    for filename in files:
                        z.write(os.path.join(root, filename),
                                arcname=filename)
                logger.info(f"closing {ferc1_test_zipfile}")
                z.close()
                shutil.move(ferc1_test_zipfile, tmp_dir)
                for f in dst_files:
                    os.remove(f)

            logger.info(f"organizing datastore for {src} {yr}")
            datastore.organize(src, yr, states=args.states,
                               data_dir=pudl_settings['data_dir'],
                               unzip=False)