def process_raw(self, df, page, **partition):
    """
    Drop reserved columns, rename the rest, and fix generator IDs.

    The input dataframe is not modified; all changes are made on copies.

    Args:
        df: Raw dataframe for a single page.
        page: Page identifier passed to the metadata column-map lookup.
        **partition: Keyword arguments forwarded unchanged to
            ``self._metadata.get_column_map``.

    Returns:
        A new dataframe without any column whose label starts with
        ``'reserved'``, with the remaining columns renamed according to the
        column map, after being passed through ``fix_leading_zero_gen_ids``.

    Side effects:
        Sets ``self.cols_added`` to an empty list, since this step adds no
        columns of its own.
    """
    # Only string labels can be "reserved" columns; guarding on the type keeps
    # non-string labels (e.g. integers) from raising on the prefix check.
    to_drop = [
        c for c in df.columns
        if isinstance(c, str) and c.startswith('reserved')
    ]
    # Non-inplace drop so the caller's dataframe is left untouched.
    df = df.drop(columns=to_drop)
    df = df.rename(
        columns=self._metadata.get_column_map(page, **partition))
    self.cols_added = []
    df = fix_leading_zero_gen_ids(df)
    return df
def process_raw(self, df, page, **partition):
    """
    Rename columns, ensure ``report_year`` exists, and tag the data source.

    Args:
        df: Raw dataframe for a single page.
        page: Page identifier passed to the metadata column-map lookup.
        **partition: Keyword arguments forwarded to
            ``self._metadata.get_column_map``. When ``report_year`` is absent
            after renaming, the first partition value is parsed with the
            ``"%Y-%m"`` format and its year is used.

    Returns:
        The renamed dataframe with a ``data_source`` column set to
        ``'eia860m'``, after being passed through
        ``fix_leading_zero_gen_ids``.

    Side effects:
        Sets ``self.cols_added`` to ``['data_source', 'report_year']``.
    """
    column_map = self._metadata.get_column_map(page, **partition)
    renamed = df.rename(columns=column_map)

    if 'report_year' not in renamed.columns:
        # Derive the year from the first partition value ("YYYY-MM" string).
        first_value = list(partition.values())[0]
        renamed['report_year'] = datetime.strptime(first_value, "%Y-%m").year

    tagged = renamed.assign(data_source='eia860m')
    self.cols_added = ['data_source', 'report_year']
    return fix_leading_zero_gen_ids(tagged)
def process_raw(self, df, page, **partition):
    """
    Rename columns by their location in the dataframe.

    Emits a warning that the EIA 861 integration is experimental, then looks
    up the column map for the page. The map's keys are used to index
    ``df.columns`` (presumably integer positions -- confirm against the
    metadata), and the columns selected that way are renamed to the map's
    values.

    Args:
        df: Raw dataframe for a single page.
        page: Page identifier passed to the metadata column-map lookup.
        **partition: Keyword arguments forwarded unchanged to
            ``self._metadata.get_column_map``.

    Returns:
        The renamed dataframe after being passed through
        ``fix_leading_zero_gen_ids``.

    Side effects:
        Sets ``self.cols_added`` to an empty list.
    """
    warnings.warn(
        "Integration of EIA 861 into PUDL is still experimental and incomplete.\n"
        "The data has not yet been validated, and the structure may change."
    )
    location_map = self._metadata.get_column_map(page, **partition)
    locations = list(location_map.keys())
    new_names = list(location_map.values())
    # Translate each location into the label currently found there, so the
    # rename can be expressed as an old-label -> new-label mapping.
    current_names = df.columns[locations]
    renamed = df.rename(columns=dict(zip(current_names, new_names)))
    self.cols_added = []
    return fix_leading_zero_gen_ids(renamed)
def process_raw(self, df, page, **partition):
    """
    Pre-process a raw dataframe before further extraction steps.

    Steps, in order:

    * Rename columns using the metadata column map for this page.
    * If ``report_year`` is absent, fill it with the first partition value.
    * If the page is also one of the ``eia860m`` metadata pages, add a
      ``data_source`` column set to ``'eia860'``.
    * Pass the result through ``fix_leading_zero_gen_ids``.

    Args:
        df: Raw dataframe for a single page.
        page: Page identifier passed to the metadata column-map lookup.
        **partition: Keyword arguments forwarded to
            ``self._metadata.get_column_map``.

    Returns:
        The processed dataframe.

    Side effects:
        Sets ``self.cols_added`` to ``['report_year']``, with
        ``'data_source'`` appended when that column is added.
    """
    column_map = self._metadata.get_column_map(page, **partition)
    renamed = df.rename(columns=column_map)

    if 'report_year' not in renamed.columns:
        renamed['report_year'] = list(partition.values())[0]
    self.cols_added = ['report_year']

    # Pages shared with EIA 860M get a data_source flag.
    if page in excel.Metadata('eia860m').get_all_pages():
        renamed = renamed.assign(data_source='eia860')
        self.cols_added.append('data_source')

    return fix_leading_zero_gen_ids(renamed)
def test_fix_leading_zero_gen_ids():
    """Check that leading zeroes are stripped from all-numeric generator IDs."""
    # Each pair is (raw generator_id, expected generator_id).
    cases = [
        ("0001", "1"),           # Leading zeroes, all numeric string.
        ("26", "26"),            # Numeric string without leading zeroes.
        (100, "100"),            # Integer, should get stringified.
        (100.0, "100.0"),        # Float, expected to be stringified as-is.
        ("01-A", "01-A"),        # Leading zero, alphanumeric: unchanged.
        ("HRSG-01", "HRSG-01"),  # Alphanumeric: unchanged.
    ]
    raw_ids = [raw for raw, _ in cases]
    wanted_ids = [wanted for _, wanted in cases]

    in_df = pd.DataFrame({"generator_id": raw_ids})
    expected_df = pd.DataFrame({"generator_id": wanted_ids})

    assert_frame_equal(fix_leading_zero_gen_ids(in_df), expected_df)