def createTable(df: pd.DataFrame, engine: sqlalchemy.engine.base.Engine, tablename: str, dbname: str = None, ifExists: str = "error", indexList: list = None, isIndexUnique: bool = True, dtype: dict = {}) -> None: """ ===example=== createTable(df_iris, engine, "iris", "db_data", ifExists="replace") >>> None """ # make query string if dbname is None: dbname = engine.url.database new_names = renameColumns(df.columns.to_series()) col_dtype = [ dtypeParser(col) if name not in dtype else dtype[name] for name, col in df.iteritems() ] column_query_part = "\n ,".join( [f"{name} {dtype}" for name, dtype in zip(new_names, col_dtype)]) if indexList: idxSeries = pd.Series(indexList).astype(str) idx_str = ",".join(renameColumns(idxSeries)) q_unique = "unique " if isIndexUnique else "" index_query_part = q_unique + f"primary index ({idx_str})" else: index_query_part = "no primary index" query = """ create table {dbname}.{tablename} ( {column_query_part} ) {index_query_part} """.format(tablename=tablename, dbname=dbname, column_query_part=column_query_part, index_query_part=index_query_part) # execute query if ifExists == "replace": dropIfExists(tablename, dbname, engine) elif ifExists == "error": if tableIsExist(tablename, dbname, engine): raise ValueError(f"{tablename}.{dbname} is already exists.") elif ifExists == "insert": if tableIsExist(tablename, dbname, engine): return engine.execute(query) print("created table \n" + query)
def dropIfExists(tablename: str, dbname: str, engine: sqlalchemy.engine.base.Engine) -> None: """ ===example=== from sqlalchemy import create_engine engine = create_engine("teradata://*****:*****@hostip:1025") dropIfExists("iris", "db_data", engine) droped table db_data.iris >>> None """ if tableIsExist(tablename, dbname, engine): engine.execute(f"drop table {dbname}.{tablename}") print(f"droped table {dbname}.{tablename}")
def query_and_decrypt_data( db: sqlalchemy.engine.base.Engine, env_aead: tink.aead.KmsEnvelopeAead, table_name: str, ) -> None: with db.connect() as conn: # Execute the query and fetch all results recent_votes = conn.execute( f"SELECT team, time_cast, voter_email FROM {table_name} " "ORDER BY time_cast DESC LIMIT 5").fetchall() print("Team\tEmail\tTime Cast") for row in recent_votes: team = row[0] # Postgres pads CHAR fields with spaces. These will need to be removed before # decrypting. aad = team.rstrip() # Use the envelope AEAD primitive to decrypt the email, using the team name as # associated data. Encryption with associated data ensures authenticity # (who the sender is) and integrity (the data has not been tampered with) of that # data, but not its secrecy. (see RFC 5116 for more info) email = env_aead.decrypt(row[2], aad.encode()).decode() time_cast = row[1] # Print recent votes print(f"{team}\t{email}\t{time_cast}")
def usage_file_is_processed(file_name: str, db_engine: sqlalchemy.engine.base.Engine) -> bool: """ Verifies if the current file has been previously processed Parameters ---------- file_name: str Name of the file db_engine: sqlalchemy.engine.base.Engine Returns ------- bool True if the file has been processed, False if it has not. """ _create_usage_table_if_it_does_not_exist(db_engine) query = f"select exists(select 1 from {USAGE_TABLE_NAME} where FILE_NAME='{file_name}') as 'exists'" db_item = {"exists": False} with db_engine.connect() as con: result: Union[ResultProxy, None] = con.execute(query) db_item = _map_single_result_to_dict(result) return db_item["exists"] == 1
def encrypt_and_insert_data( db: sqlalchemy.engine.base.Engine, env_aead: tink.aead.KmsEnvelopeAead, table_name: str, team: str, email: str, ) -> None: time_cast = datetime.datetime.now(tz=datetime.timezone.utc) # Use the envelope AEAD primitive to encrypt the email, using the team name as # associated data. Encryption with associated data ensures authenticity # (who the sender is) and integrity (the data has not been tampered with) of that # data, but not its secrecy. (see RFC 5116 for more info) encrypted_email = env_aead.encrypt(email.encode(), team.encode()) # Verify that the team is one of the allowed options if team != "TABS" and team != "SPACES": logger.error(f"Invalid team specified: {team}") return # Preparing a statement before hand can help protect against injections. stmt = sqlalchemy.text( f"INSERT INTO {table_name} (time_cast, team, voter_email)" " VALUES (:time_cast, :team, :voter_email)" ) # Using a with statement ensures that the connection is always released # back into the pool at the end of statement (even if an error occurs) with db.connect() as conn: conn.execute( stmt, time_cast=time_cast, team=team, voter_email=encrypted_email) print(f"Vote successfully cast for '{team}' at time {time_cast}!")
def getUserByEmail(email: str, engine: sa.engine.base.Engine) -> User: ## should check username validity user = list(engine.connect().execute( f"select * from {User.__tablename__} where email = '{email}'")) if (len(user) == 0): return None else: user = user[0] user = User.from_row_to_obj(user) return user
def _table_to_csv(engine: sqlalchemy.engine.base.Engine, table_name: str, file_path: str): with open(file_path, 'w') as fh: outcsv = csv.writer(fh) with engine.connect() as con: records = con.execute(f"SELECT * FROM {table_name}") outcsv.writerow(records.keys()) outcsv.writerows(records)
def getUSerById(id: int, engine: sa.engine.base.Engine) -> User: user = list(engine.connect().execute( f"select * from {User.__tablename__} where id = '{id}'")) if (len(user) == 0): return None else: user = user[0] user = User.from_row_to_obj(user) return user
def insert_usage_file_name(file_name: str, db_engine: sqlalchemy.engine.base.Engine) -> None: """ Inserts the name of the file in the table for processed files Parameters ---------- file_name: str Name of the file db_engine: sqlalchemy.engine.base.Engine Returns ------- None """ _create_usage_table_if_it_does_not_exist(db_engine) db_engine.execute(f"INSERT INTO {USAGE_TABLE_NAME} VALUES('{file_name}')")
def tearDown(file: str, started_at: str, engine: sqlalchemy.engine.base.Engine, session: sqlalchemy.orm.session.Session): """ close session & engine, banner :param file: caller, usually __file__ :param started_at: eg, str(datetime.now()) :param engine: eg, nw.logic import session, engine :param session: from nw.logic import session, engine :return: """ session.close() engine.dispose() print("\n") print("**********************") print("** Test complete, SQLAlchemy session/engine closed for: " + file) print("** Started: " + started_at + " Ended: " + str(datetime.now())) print("**********************")
def last_sync_date( sync_db: sqlalchemy.engine.base.Engine) -> Optional[datetime]: with sync_db.connect() as con: try: usage_df = read_sql("SELECT asOfDate FROM Usage", con) if usage_df["asOfDate"].count() == 0: return None return date_parse(usage_df["asOfDate"].max()) # type: ignore except OperationalError: logger.debug("No Usage table yet") return None
def create_tables(config: list, engine: sqlalchemy.engine.base.Engine): con = engine.connect() for table in config: name = table.get('name') schema = table.get('schema') ddl = f"""DROP TABLE IF EXISTS {name}""" con.execute(ddl) ddl = f"""CREATE TABLE {name} ({schema})""" con.execute(ddl)
def build_from_engine(self, engine: sqlalchemy.engine.base.Engine) -> dict: self.setup(engine) # Create a connection to the database conn = engine.connect() # Record the time spent duration = { 'querying': {}, } # Sample each relation self.relations = {} sampling_method = { True: 'SYSTEM', False: 'BERNOULLI' }[self.block_sampling] for rel_name in self.rel_names: # Sample the relation if the number of rows is high enough query = 'SELECT * FROM {}'.format(rel_name) # Add a sampling statement if the sampling ratio is lower than 1 sampling_ratio = max(self.sampling_ratio, self.min_rows / self.rel_cards[rel_name]) if sampling_ratio < 1: # Make sure there won't be less samples then the minimum number of allowed rows query += ' TABLESAMPLE {} ({}) REPEATABLE ({})'.format( sampling_method, sampling_ratio * 100, self.seed) date_atts = [ att for att, typ in self.att_types[rel_name].items() if typ == 'date' ] tic = time.time() rel = pd.read_sql_query(sql=query, con=conn, parse_dates=date_atts) duration['querying'][rel_name] = time.time() - tic # Convert the datetimes to ISO formatted strings for att in date_atts: rel[att] = rel[att].map(lambda x: x.isoformat()) # Strip the whitespace from the string columns for att in rel.columns: if rel[att].dtype == 'object': rel[att] = rel[att].str.rstrip() # Store the relation self.relations[rel_name] = rel # Close the connection to the database conn.close() return duration
def delete_table(engine: sa.engine.base.Engine) -> bool: conn = engine.connect() try: conn.execute(f"drop table {User.__tablename__}") return True except sa.exc.OperationalError as err: if f"no such table: {User.__tablename__}" in err._message(): if (engine.echo): print(f"[!] Table '{User.__tablename__}' does not exist!") return False else: raise err
def cleanup_after_sync(resource_name: str, sync_db: sqlalchemy.engine.base.Engine): """ Delete sync temporary tables if they exist Parameters ---------- resource_name: str the name of the API resource, e.g. "Courses", to be used in SQL sync_db: sqlalchemy.engine.base.Engine an Engine instance for creating database connections """ with sync_db.connect() as con: con.execute(f"DROP TABLE IF EXISTS Sync_{resource_name}") con.execute(f"DROP TABLE IF EXISTS Unmatched_{resource_name}")
def temp_ids_con(engine: sa.engine.base.Engine, ids: set): """Create database connection that makes temp.ids available as single column temp table """ with engine.connect() as con: rows = ", ".join([f"({sql_clause_format(id)})" for id in ids]) queries = [ "DROP TABLE IF EXISTS temp.ids", "CREATE TEMP TABLE temp.ids(id INTEGER)", f"INSERT INTO temp.ids(id) VALUES {rows}", ] for query in queries: con.execute(query) yield con
def _create_sync_table_from_resource_df( resource_df: DataFrame, identity_columns: List[str], resource_name: str, sync_db: sqlalchemy.engine.base.Engine, ): """ Take fetched data and push to a new temporary sync table. Includes hash and tentative extractor CreateDate/LastModifiedDates. Parameters ---------- resource_df: DataFrame a DataFrame with current fetched data. identity_columns: List[str] a List of the identity columns for the resource dataframe. resource_name: str the name of the API resource, e.g. "Courses", to be used in SQL sync_db: sqlalchemy.engine.base.Engine an Engine instance for creating database connections """ with sync_db.connect() as con: # ensure sync table exists, need column ordering to be identical to regular table con.execute(f"DROP TABLE IF EXISTS Sync_{resource_name}") con.execute( f""" CREATE TABLE IF NOT EXISTS Sync_{resource_name} ( {SYNC_COLUMNS_SQL} ) """ ) sync_df: DataFrame = resource_df.copy() sync_df = add_hash_and_json_to(sync_df) # add (possibly composite) primary key, sorting for consistent ordering add_sourceid_to(sync_df, identity_columns) now: datetime = datetime.now() sync_df["CreateDate"] = now sync_df["LastModifiedDate"] = now sync_df["SyncNeeded"] = 1 sync_df = sync_df[SYNC_COLUMNS] sync_df.set_index("SourceId", inplace=True) # push to temporary sync table sync_df.to_sql( f"Sync_{resource_name}", sync_db, if_exists="append", index=True, chunksize=1000 )
def setup(self, engine: sqlalchemy.engine.base.Engine): # Retrieve the metadata to know what tables and joins are available metadata = tools.get_metadata(engine) self.rel_names = tuple(metadata.tables.keys()) # Create a connection to the database conn = engine.connect() # Retrieve relation cardinalities self.rel_cards = {} query = ''' SELECT relname, reltuples FROM pg_class WHERE relname IN :rel_names ''' rows = conn.execute(sqlalchemy.text(query), rel_names=self.rel_names) for (rel_name, card) in rows: self.rel_cards[rel_name] = card # Retrieve attribute cardinalities and number of nulls self.att_cards = defaultdict(dict) self.null_fracs = defaultdict(dict) query = ''' SELECT tablename, attname, n_distinct, null_frac FROM pg_stats WHERE tablename IN :rel_names ''' rows = conn.execute(sqlalchemy.text(query), rel_names=self.rel_names) for (rel_name, att_name, card, null_frac) in rows: self.att_cards[rel_name][att_name] = -card * self.rel_cards[ rel_name] if card < 0 else card self.null_fracs[rel_name][att_name] = null_frac # Retrieve the type of each attribute self.att_types = defaultdict(dict) query = ''' SELECT table_name, column_name, data_type FROM information_schema.columns WHERE table_name IN :rel_names ''' rows = conn.execute(sqlalchemy.text(query), rel_names=self.rel_names) for (rel_name, att_name, att_type) in rows: self.att_types[rel_name][att_name] = att_type # Close the connection to the database conn.close()
def sync_to_db_without_cleanup( resource_df: DataFrame, identity_columns: List[str], resource_name: str, sync_db: sqlalchemy.engine.base.Engine, ): """ Take fetched data and sync with database. Creates tables when necessary, but ok if temporary tables are there to start. Does not delete temporary tables when finished. Parameters ---------- resource_df: DataFrame a DataFrame with current fetched data identity_columns: List[str] a List of the identity columns for the resource dataframe. resource_name: str the name of the API resource, e.g. "Courses", to be used in SQL sync_db: sqlalchemy.engine.base.Engine an Engine instance for creating database connections Returns ------- DataFrame a DataFrame with current fetched data and reconciled CreateDate/LastModifiedDate """ assert (Series(identity_columns).isin( resource_df.columns).all()), "Identity columns missing from dataframe" # In certain cases we can end up with duplicate records, for example # in Canvas when a course belongs to a sub-account. De-duplicate the # DataFrame based on the identity_columns resource_df.drop_duplicates(subset=identity_columns, inplace=True) _create_sync_table_from_resource_df(resource_df, identity_columns, resource_name, sync_db) with sync_db.connect() as con: _ensure_main_table_exists(resource_name, con) _create_unmatched_records_temp_table(resource_name, con) _get_true_create_dates_for_unmatched_records(resource_name, con) _update_resource_table_with_changes(resource_name, con) result_df: DataFrame = _update_dataframe_with_true_dates( resource_df, identity_columns, resource_name, con) return result_df
def _table_exist( table_name: str, db_engine: sqlalchemy.engine.base.Engine, ) -> bool: with db_engine.connect() as con: result: Union[ResultProxy, None] = con.execute( """ SELECT name FROM sqlite_master WHERE type='table' AND name=?; """, (table_name, ), ) if result is None: return False if result.first() is None: return False return True
def load_features(engine: sqlalchemy.engine.base.Engine, processed_data, processed_labels): processed_data.to_sql('features', con=engine, index_label='user_id', if_exists='replace', method=psql_insert_copy) processed_labels = pd.DataFrame({'is_fraudster': processed_labels}, index=processed_labels.index) processed_labels.to_sql('labels', con=engine, index_label='user_id', if_exists='replace', method=psql_insert_copy) with engine.connect() as con: con.execute('ALTER TABLE features ADD PRIMARY KEY (user_id);') con.execute('ALTER TABLE labels ADD PRIMARY KEY (user_id);')
def _drop_everything(engine: sqlalchemy.engine.base.Engine): """(On a live db) drops all foreign key constraints before dropping all tables. Workaround for SQLAlchemy not doing DROP ## CASCADE for drop_all() (https://github.com/pallets/flask-sqlalchemy/issues/722) """ con = engine.connect() trans = con.begin() inspector = Inspector.from_engine(engine) # We need to re-create a minimal metadata with only the required things to # successfully emit drop constraints and tables commands for # postgres (based on the actual schema of the running instance) meta = MetaData() tables = [] all_fkeys = [] for view_name in inspector.get_view_names(): con.execute( "DROP MATERIALIZED VIEW IF EXISTS {} CASCADE".format(view_name)) for table_name in inspector.get_table_names(): fkeys = [] for fkey in inspector.get_foreign_keys(table_name): if not fkey["name"]: continue fkeys.append(ForeignKeyConstraint((), (), name=fkey["name"])) tables.append(Table(table_name, meta, *fkeys)) all_fkeys.extend(fkeys) for fkey in all_fkeys: con.execute(DropConstraint(fkey)) for table in tables: con.execute(DropTable(table)) trans.commit() Base.metadata.drop_all(engine)
def request_all_usage_as_df( resource: Optional[Resource], sync_db: sqlalchemy.engine.base.Engine, env_start_date: str, env_end_date: str, ) -> DataFrame: usage_df: DataFrame = request_latest_usage_as_df( resource, start_date(sync_db, env_start_date), end_date(env_end_date)) if usage_df.empty: return usage_df usage_df.to_sql("Usage", sync_db, if_exists="append", index=False, chunksize=500) # remove duplicates - leave only the most recent with sync_db.connect() as con: con.execute("DELETE from Usage " "WHERE rowid not in (select max(rowid) " "FROM Usage " "GROUP BY email, asOfDate)") return usage_df
def create_user(info: dict, engine: sa.engine.base.Engine) -> bool: user_table = User.get_table_obj(engine) ins = user_table.insert() conn = engine.connect() ## incomplete fields check integrity hashing, email/uname/pass format validation etc try: #user = list( conn.execute( ins.values( name=info['name'], username=info['username'], email=info['email'], password=bcrypt.hashpw(info['password'].encode(), bcrypt.gensalt()).decode(), role=info['role'], )) #.returning(sa.literal_column('*')))) #if(len(user) == 0): return False #else: user = User.from_row_to_obj(engine) #print(user) return True except sa.exc.IntegrityError as err: if (engine.echo): print(f"[!] {err._message()}") return False
def build_from_engine(self, engine: sqlalchemy.engine.base.Engine) -> dict: self.setup(engine) # Create a connection to the database conn = engine.connect() # Record the time spent duration = {'querying': {}, 'parameters': {}} # Create histograms per attribute self.histograms = {} self.n_in_bin = {} sampling_method = { True: 'SYSTEM', False: 'BERNOULLI' }[self.block_sampling] for rel_name in self.rel_names: self.histograms[rel_name] = {} self.n_in_bin[rel_name] = {} rel_card = self.rel_cards[rel_name] # Sample the relation if the number of rows is high enough query = 'SELECT * FROM {}'.format(rel_name) # Add a sampling statement if the sampling ratio is lower than 1 sampling_ratio = max(self.sampling_ratio, self.min_rows / rel_card) if sampling_ratio < 1: # Make sure there won't be less samples then the minimum number of allowed rows query += ' TABLESAMPLE {} ({}) REPEATABLE ({})'.format( sampling_method, sampling_ratio * 100, self.seed) date_atts = [ att for att, typ in self.att_types[rel_name].items() if typ == 'date' ] tic = time.time() rel = pd.read_sql_query(sql=query, con=conn, parse_dates=date_atts) duration['querying'][rel_name] = time.time() - tic # Convert the datetimes to ISO formatted strings for att in date_atts: rel[att] = rel[att].map(lambda x: x.isoformat()) # Strip the whitespace from the string columns for att in rel.columns: if rel[att].dtype == 'object': rel[att] = rel[att].str.rstrip() # Blacklist the ID columns blacklist = [ att for att in rel.columns if '_id' in att or 'id_' in att or att == 'id' or '_sk' in att or self.att_types[rel_name][att] == 'character varying' or round(rel_card * self.null_fracs[rel_name][att] + self.att_cards[rel_name][att]) == rel_card ] # Create one histogram per attribute tic = time.time() for att in set(rel.columns) - set(blacklist): rel[att], self.n_in_bin[rel_name][ att] = tools.discretize_series(rel[att], n_mcv=self.n_mcv, n_bins=self.n_bins) self.histograms[rel_name][att] = distribution.Distribution( on=att, by=None) self.histograms[rel_name][att].build_from_df( rel, types=self.att_types[rel_name]) duration['parameters'][rel_name] = time.time() - tic # Close the connection to the database conn.close() return duration
def update_on_table(df: pd.DataFrame, keys: update_key_type, values: update_key_type, table_name: str, engine: sa.engine.base.Engine, schema: str) -> int: """ :param df: a dataframe with data tha needs to be updated. Must have columns to be used as key and some for values :param keys: the set of columns to use as key, i.e. update when matched :param values: the set of columns to update, i.e. set when matched :param table_name: a table name as in util_function :param engine: the sqlalchemy engine for the database :param schema: a schema of interest - None if default schema of database is ok :return: the number of records updated """ # get table tbl = util_function(table_name, engine, schema) # change nan to None, make sure columns are modified so that we can easily bindparam df_ = df.copy() df_.columns = [f"{el.lower()}_updt" for el in df_.columns] groups = toolz.partition_all( CHUNK_SIZE, df_.where(pd.notnull(df_), None).to_dict(orient='records')) if not isinstance(keys, tuple) and not isinstance(keys, dict): raise BadArgumentType( "keys and values must either be both tuples or both dicts", None) # create where clause, and update statement update_statement: dml.Update if isinstance(keys, tuple): if not isinstance(values, tuple): raise BadArgumentType( "keys and values must either be both tuples or both dicts", None) where = [ tbl.c[el] == sa.bindparam(f"{el.lower()}_updt") for el in keys ] update_statement = tbl.update().where(sa.and_(*where)).values( dict((a, sa.bindparam(f"{a.lower()}_updt")) for a in values)) if isinstance(keys, dict): if not isinstance(values, dict): raise BadArgumentType( "keys and values must either be both tuples or both dicts", None) where = [ tbl.c[k] == sa.bindparam(f"{v.lower()}_updt") for k, v in keys.items() ] update_statement = tbl.update().where(sa.and_(*where)).values( dict((k, sa.bindparam(f"{v.lower()}_updt")) for k, v in values.items())) # update count, last_successful_update = 0, None with engine.connect() as connection: for group in groups: try: result = connection.execute(update_statement, group) last_successful_update = group[-1] count += result.rowcount except exc.OperationalError as _: # try again time.sleep(2) try: result = connection.execute(update_statement, group) last_successful_update = group[-1] count += result.rowcount except exc.OperationalError as e: raise OperationalError( f"Failed to update records. Last successful update: {last_successful_update}", e) return count
def atb_fixed_var_om_existing( results: pd.DataFrame, atb_hr_df: pd.DataFrame, settings: dict, pudl_engine: sqlalchemy.engine.base.Engine, ) -> pd.DataFrame: """Add fixed and variable O&M for existing power plants ATB O&M data for new power plants are used as reference values. Fixed and variable O&M for each technology and heat rate are calculated. Assume that O&M scales with heat rate from new plants to existing generators. A separate multiplier for fixed O&M is specified in the settings file. Parameters ---------- results : DataFrame Compiled results of clustered power plants with weighted average heat rates. Note that column names should include "technology", "Heat_rate_MMBTU_per_MWh", and "region". Technology names should not yet be converted to snake case. atb_hr_df : DataFrame Heat rate data from NREL ATB settings : dict User-defined parameters from a settings file Returns ------- DataFrame Same as incoming "results" dataframe but with new columns "Fixed_OM_cost_per_MWyr" and "Var_OM_cost_per_MWh" """ logger.info("Adding fixed and variable O&M for existing plants") techs = settings["eia_atb_tech_map"] existing_year = settings["atb_existing_year"] # ATB string is <technology>_<tech_detail> techs = {eia: atb.split("_") for eia, atb in techs.items()} df_list = [] grouped_results = results.reset_index().groupby( ["plant_id_eia", "technology"], as_index=False) for group, _df in grouped_results: plant_id, eia_tech = group existing_hr = _df["heat_rate_mmbtu_mwh"].mean() try: atb_tech, tech_detail = techs[eia_tech] except KeyError: if eia_tech in settings["tech_groups"]: raise KeyError( f"{eia_tech} is defined in 'tech_groups' but doesn't have a " "corresponding ATB technology in 'eia_atb_tech_map'") else: raise KeyError( f"{eia_tech} doesn't have a corresponding ATB technology in " "'eia_atb_tech_map'") try: new_build_hr = (atb_hr_df.query( "technology==@atb_tech & tech_detail==@tech_detail" "& basis_year==@existing_year").squeeze().at["heat_rate"]) except ValueError: # Not all technologies have a heat rate. If they don't, just set both values # to 1 existing_hr = 1 new_build_hr = 1 try: s = f""" select parameter_value from technology_costs_nrelatb where technology == "{atb_tech}" AND tech_detail == "{tech_detail}" AND basis_year == "{existing_year}" AND financial_case == "Market" AND cost_case == "Mid" AND atb_year == "{settings['atb_data_year']}" AND parameter == "variable_o_m_mwh" """ atb_var_om_mwh = pudl_engine.execute(s).fetchall()[0][0] except IndexError: # logger.warning(f"No variable O&M for {atb_tech}") atb_var_om_mwh = 0 try: s = f""" select parameter_value from technology_costs_nrelatb where technology == "{atb_tech}" AND tech_detail == "{tech_detail}" AND basis_year == "{existing_year}" AND financial_case == "Market" AND cost_case == "Mid" AND atb_year == "{settings['atb_data_year']}" AND parameter == "fixed_o_m_mw" """ atb_fixed_om_mw_yr = pudl_engine.execute(s).fetchall()[0][0] except IndexError: # logger.warning(f"No fixed O&M for {atb_tech}") atb_fixed_om_mw_yr = 0 nems_o_m_techs = [ "Combined Cycle", "Combustion Turbine", "Coal", "Steam Turbine", "Hydroelectric", "Geothermal", "Nuclear", ] if any(t in eia_tech for t in nems_o_m_techs): # Change CC and CT O&M to EIA NEMS values, which are much higher for CCs and # lower for CTs than a heat rate & linear mulitpler correction to the ATB # values. # Add natural gas steam turbine O&M. # Also using the new values for coal plants, assuming 40-50 yr age and half # FGD # https://www.eia.gov/analysis/studies/powerplants/generationcost/pdf/full_report.pdf # logger.info(f"Using NEMS values for {eia_tech} fixed/variable O&M") target_usd_year = settings["target_usd_year"] simple_o_m = { "Combined Cycle": { # "fixed_o_m_mw": inflation_price_adjustment( # 13.08 * 1000, 2017, target_usd_year # ), # "variable_o_m_mwh": inflation_price_adjustment( # 3.91, 2017, target_usd_year # ) }, # "Combustion Turbine": { # # This includes both the Fixed O&M and Capex. Capex includes # # variable O&M, which is split out in the calculations below. # "fixed_o_m_mw": inflation_price_adjustment( # (5.33 + 6.90) * 1000, 2017, target_usd_year # ), # "variable_o_m_mwh": 0, # }, "Natural Gas Steam Turbine": { # NEMS documenation splits capex and fixed O&M across 2 tables # "fixed_o_m_mw": inflation_price_adjustment( # (15.96 + 24.68) * 1000, 2017, target_usd_year # ), "variable_o_m_mwh": inflation_price_adjustment(1.0, 2017, target_usd_year) }, "Coal": { # "fixed_o_m_mw": inflation_price_adjustment( # ((22.2 + 27.88) / 2 + 46.01) * 1000, 2017, target_usd_year # ), "variable_o_m_mwh": inflation_price_adjustment(1.78, 2017, target_usd_year) }, "Conventional Hydroelectric": { "fixed_o_m_mw": inflation_price_adjustment(44.56 * 1000, 2017, target_usd_year), "variable_o_m_mwh": 0, }, "Geothermal": { "fixed_o_m_mw": inflation_price_adjustment(198.04 * 1000, 2017, target_usd_year), "variable_o_m_mwh": 0, }, "Pumped Hydro": { "fixed_o_m_mw": inflation_price_adjustment((23.63 + 14.83) * 1000, 2017, target_usd_year), "variable_o_m_mwh": 0, }, } if "Combined Cycle" in eia_tech: # https://www.eia.gov/analysis/studies/powerplants/generationcost/pdf/full_report.pdf plant_capacity = _df[settings["capacity_col"]].sum() assert plant_capacity > 0 if plant_capacity < 500: fixed = 15.62 * 1000 variable = 4.31 elif 500 <= plant_capacity < 1000: fixed = 9.27 * 1000 variable = 3.42 else: fixed = 11.68 * 1000 variable = 3.37 _df["Fixed_OM_cost_per_MWyr"] = inflation_price_adjustment( fixed, 2017, target_usd_year) _df["Var_OM_cost_per_MWh"] = inflation_price_adjustment( variable, 2017, target_usd_year) if "Combustion Turbine" in eia_tech: # need to adjust the EIA fixed/variable costs because they have no # variable cost per MWh for existing CTs but they do have per MWh for # new build. Assume $11/MWh from new-build and 4% CF: # (11*8760*0.04/1000)=$3.85/kW-yr. Scale the new-build variable # (~$11/MWh) by relative heat rate and subtract a /kW-yr value as # calculated above from the FOM. # Based on conversation with Jesse J. on Dec 20, 2019. plant_capacity = _df[settings["capacity_col"]].sum() op, op_value = (settings.get("atb_modifiers", {}).get( "ngct", {}).get("Var_OM_cost_per_MWh", (None, None))) if op: f = operator.attrgetter(op) atb_var_om_mwh = f(operator)(atb_var_om_mwh, op_value) variable = atb_var_om_mwh # * (existing_hr / new_build_hr) if plant_capacity < 100: annual_capex = 9.0 * 1000 fixed = annual_capex + 5.96 * 1000 elif 100 <= plant_capacity <= 300: annual_capex = 6.18 * 1000 fixed = annual_capex + 6.43 * 1000 else: annual_capex = 6.95 * 1000 fixed = annual_capex + 3.99 * 1000 fixed = fixed - (variable * 8760 * 0.04) _df["Fixed_OM_cost_per_MWyr"] = inflation_price_adjustment( fixed, 2017, target_usd_year) _df["Var_OM_cost_per_MWh"] = inflation_price_adjustment( variable, 2017, target_usd_year) if "Natural Gas Steam Turbine" in eia_tech: # https://www.eia.gov/analysis/studies/powerplants/generationcost/pdf/full_report.pdf plant_capacity = _df[settings["capacity_col"]].sum() assert plant_capacity > 0 if plant_capacity < 500: annual_capex = 18.86 * 1000 fixed = annual_capex + 29.73 * 1000 elif 500 <= plant_capacity < 1000: annual_capex = 11.57 * 1000 fixed = annual_capex + 17.98 * 1000 else: annual_capex = 10.82 * 1000 fixed = annual_capex + 14.51 * 1000 _df["Fixed_OM_cost_per_MWyr"] = inflation_price_adjustment( fixed, 2017, target_usd_year) _df["Var_OM_cost_per_MWh"] = simple_o_m[ "Natural Gas Steam Turbine"]["variable_o_m_mwh"] if "Coal" in eia_tech: plant_capacity = _df[settings["capacity_col"]].sum() assert plant_capacity > 0 age = settings["model_year"] - _df.operating_date.dt.year age = age.fillna(age.mean()) age = age.fillna(40) # https://www.eia.gov/analysis/studies/powerplants/generationcost/pdf/full_report.pdf annual_capex = (16.53 + (0.126 * age) + (5.68 * 0.5)) * 1000 if plant_capacity < 500: fixed = 44.21 * 1000 elif 500 <= plant_capacity < 1000: fixed = 34.02 * 1000 elif 1000 <= plant_capacity < 2000: fixed = 28.52 * 1000 else: fixed = 33.27 * 1000 _df["Fixed_OM_cost_per_MWyr"] = inflation_price_adjustment( fixed + annual_capex, 2017, target_usd_year) _df["Var_OM_cost_per_MWh"] = simple_o_m["Coal"][ "variable_o_m_mwh"] if "Hydroelectric" in eia_tech: _df["Fixed_OM_cost_per_MWyr"] = simple_o_m[ "Conventional Hydroelectric"]["fixed_o_m_mw"] _df["Var_OM_cost_per_MWh"] = simple_o_m[ "Conventional Hydroelectric"]["variable_o_m_mwh"] if "Geothermal" in eia_tech: _df["Fixed_OM_cost_per_MWyr"] = simple_o_m["Geothermal"][ "fixed_o_m_mw"] _df["Var_OM_cost_per_MWh"] = simple_o_m["Geothermal"][ "variable_o_m_mwh"] if "Pumped" in eia_tech: _df["Fixed_OM_cost_per_MWyr"] = simple_o_m["Pumped Hydro"][ "fixed_o_m_mw"] _df["Var_OM_cost_per_MWh"] = simple_o_m["Pumped Hydro"][ "variable_o_m_mwh"] if "Nuclear" in eia_tech: num_units = len(_df) plant_capacity = _df[settings["capacity_col"]].sum() # Operating costs for different size/num units in 2016 INL report # "Economic and Market Challenges Facing the U.S. Nuclear Fleet" # https://gain.inl.gov/Shared%20Documents/Economics-Nuclear-Fleet.pdf, # table 1. Average of the two costs are used in each case. # The costs in that report include fuel and VOM. Assume $0.66/mmbtu # and $2.32/MWh plus 90% CF (ATB 2020) to get the costs below. # The INL report doesn't give a dollar year for costs, assume 2015. if num_units == 1 and plant_capacity < 900: fixed = 315000 elif num_units == 1 and plant_capacity >= 900: fixed = 252000 else: fixed = 177000 # age = (settings["model_year"] - _df.operating_date.dt.year).values # age = age.fillna(age.mean()) # age = age.fillna(40) # EIA, 2020, "Assumptions to Annual Energy Outlook, Electricity Market Module," # Available: https://www.eia.gov/outlooks/aeo/assumptions/pdf/electricity.pdf # fixed = np.ones_like(age) # fixed[age < 30] *= 27 * 1000 # fixed[age >= 30] *= (27+37) * 1000 _df["Fixed_OM_cost_per_MWyr"] = inflation_price_adjustment( fixed, 2015, target_usd_year) _df["Var_OM_cost_per_MWh"] = atb_var_om_mwh * (existing_hr / new_build_hr) else: _df["Fixed_OM_cost_per_MWyr"] = atb_fixed_om_mw_yr _df["Var_OM_cost_per_MWh"] = atb_var_om_mwh * (existing_hr / new_build_hr) df_list.append(_df) mod_results = pd.concat(df_list, ignore_index=True) # mod_results = mod_results.sort_values(["model_region", "technology", "cluster"]) mod_results.loc[:, "Fixed_OM_cost_per_MWyr"] = mod_results.loc[:, "Fixed_OM_cost_per_MWyr"].astype( int) mod_results.loc[:, "Var_OM_cost_per_MWh"] = mod_results.loc[:, "Var_OM_cost_per_MWh"] return mod_results
def _create_usage_table_if_it_does_not_exist( db_engine: sqlalchemy.engine.base.Engine) -> None: if _table_exist(USAGE_TABLE_NAME, db_engine) is False: db_engine.execute(f"""CREATE TABLE {USAGE_TABLE_NAME} ( FILE_NAME TEXT PRIMARY KEY );""")
def fetch_atb_costs( pudl_engine: sqlalchemy.engine.base.Engine, settings: dict, offshore_spur_costs: pd.DataFrame = None, ) -> pd.DataFrame: """Get NREL ATB power plant cost data from database, filter where applicable. This function can also remove NREL ATB offshore spur costs if more accurate costs will be included elsewhere (e.g. as part of total interconnection costs). Parameters ---------- pudl_engine : sqlalchemy.Engine A sqlalchemy connection for use by pandas settings : dict User-defined parameters from a settings file. Needs to have keys `atb_data_year`, `atb_new_gen`, and `target_usd_year`. If the key `atb_financial_case` is not included, the default value will be "Market". offshore_spur_costs : pd.DataFrame An optional dataframe with spur costs for offshore wind resources. These costs are included in ATB for a fixed distance (same for all sites). PowerGenome interconnection costs for offshore sites include a spur cost calculated using actual distance from shore. Returns ------- pd.DataFrame Power plant cost data with columns: ['technology', 'cap_recovery_years', 'cost_case', 'financial_case', 'basis_year', 'tech_detail', 'fixed_o_m_mw', 'variable_o_m_mwh', 'capex', 'cf', 'fuel', 'lcoe', 'wacc_nominal'] """ logger.info("Loading NREL ATB data") col_names = [ "technology", "tech_detail", "cost_case", "parameter", "basis_year", "parameter_value", "dollar_year", ] atb_year = settings["atb_data_year"] fin_case = settings.get("atb_financial_case", "Market") # Fetch cost data from sqlite and create dataframe. Only get values for techs/cases # listed in the settings file. all_rows = [] wacc_rows = [] tech_list = [] techs = settings["atb_new_gen"] mod_techs = [] if settings.get("modified_atb_new_gen"): for _, m in settings.get("modified_atb_new_gen").items(): mod_techs.append([ m["atb_technology"], m["atb_tech_detail"], m["atb_cost_case"], None ]) cost_params = ( "capex_mw", "fixed_o_m_mw", "variable_o_m_mwh", "capex_mwh", "fixed_o_m_mwh", ) # add_pv_wacc = True for tech in techs + mod_techs: tech, tech_detail, cost_case, _ = tech # if tech == "UtilityPV": # add_pv_wacc = False s = f""" SELECT technology, tech_detail, cost_case, parameter, basis_year, parameter_value, dollar_year from technology_costs_nrelatb where technology == "{tech}" AND tech_detail == "{tech_detail}" AND financial_case == "{fin_case}" AND cost_case == "{cost_case}" AND atb_year == {atb_year} AND parameter IN ({','.join('?'*len(cost_params))}) """ all_rows.extend(pudl_engine.execute(s, cost_params).fetchall()) if tech not in tech_list: # ATB2020 summary file provides a single WACC for each technology and a single # tech detail of "*", so need to fetch this separately from other cost params. # Only need to fetch once per technology. wacc_s = f""" select technology, cost_case, basis_year, parameter_value from technology_costs_nrelatb where technology == "{tech}" AND financial_case == "{fin_case}" AND cost_case == "{cost_case}" AND atb_year == {atb_year} AND parameter == "wacc_nominal" """ wacc_rows.extend(pudl_engine.execute(wacc_s).fetchall()) tech_list.append(tech) if "Battery" not in tech_list: df = pd.DataFrame(all_rows, columns=col_names) wacc_df = pd.DataFrame( wacc_rows, columns=["technology", "cost_case", "basis_year", "wacc_nominal"]) else: # ATB doesn't have a WACC for battery storage. We use UtilityPV WACC as a default # stand-in -- make sure we have it in case. s = 'SELECT DISTINCT("technology") from technology_costs_nrelatb WHERE parameter == "wacc_nominal"' atb_techs = [x[0] for x in pudl_engine.execute(s).fetchall()] battery_wacc_standin = settings.get("atb_battery_wacc") battery_tech = [x for x in techs if x[0] == "Battery"][0] if isinstance(battery_wacc_standin, float): if battery_wacc_standin > 0.1: logger.warning( f"You defined a battery WACC of {battery_wacc_standin}, which seems" " very high. Check settings parameter `atb_battery_wacc`.") battery_wacc_rows = [(battery_tech[0], battery_tech[2], year, battery_wacc_standin) for year in range(2017, 2051)] wacc_rows.extend(battery_wacc_rows) elif battery_wacc_standin in atb_techs: # if battery_wacc_standin in tech_list: # pass # else: logger.info( f"Using {battery_wacc_standin} {fin_case} WACC for Battery storage." ) wacc_s = f""" select technology, cost_case, basis_year, parameter_value from technology_costs_nrelatb where technology == "{battery_wacc_standin}" AND financial_case == "{fin_case}" AND cost_case == "Mid" AND atb_year == {atb_year} AND parameter == "wacc_nominal" """ b_rows = pudl_engine.execute(wacc_s).fetchall() battery_wacc_rows = [(battery_tech[0], battery_tech[2], b_row[2], b_row[3]) for b_row in b_rows] wacc_rows.extend(battery_wacc_rows) else: raise ValueError( f"The settings key `atb_battery_wacc` value is {battery_wacc_standin}. It " f"should either be a float or a string from the list {atb_techs}." ) df = pd.DataFrame(all_rows, columns=col_names) wacc_df = pd.DataFrame( wacc_rows, columns=["technology", "cost_case", "basis_year", "wacc_nominal"]) # Transform from tidy to wide dataframe, which makes it easier to fill generator # rows with the correct values. atb_costs = (df.drop_duplicates().set_index([ "technology", "tech_detail", "cost_case", "dollar_year", "basis_year", "parameter", ]).unstack(level=-1)) atb_costs.columns = atb_costs.columns.droplevel(0) atb_costs = (atb_costs.reset_index().merge( wacc_df, on=["technology", "cost_case", "basis_year"], how="left").drop_duplicates()) atb_costs = atb_costs.fillna(0) usd_columns = [ "fixed_o_m_mw", "fixed_o_m_mwh", "variable_o_m_mwh", "capex_mw", "capex_mwh", ] for col in usd_columns: if col not in atb_costs.columns: atb_costs[col] = 0 atb_target_year = settings["target_usd_year"] atb_costs[usd_columns] = atb_costs.apply( lambda row: inflation_price_adjustment(row[usd_columns], base_year=row["dollar_year"], target_year=atb_target_year), axis=1, ) if any("PV" in tech for tech in tech_list) and atb_year == 2019: print("Inflating ATB 2019 PV costs from DC to AC") atb_costs.loc[atb_costs["technology"].str.contains("PV"), ["capex_mw", "fixed_o_m_mw", "variable_o_m_mwh"], ] *= settings.get("pv_ac_dc_ratio", 1.3) elif atb_year > 2019: logger.info( "PV costs are already in AC units, not inflating the cost.") if offshore_spur_costs is not None and "OffShoreWind" in atb_costs[ "technology"]: idx_cols = ["technology", "tech_detail", "cost_case", "basis_year"] offshore_spur_costs = offshore_spur_costs.set_index(idx_cols) atb_costs = atb_costs.set_index(idx_cols) atb_costs.loc[idx["OffShoreWind", :, :, :], "capex_mw"] = ( atb_costs.loc[idx["OffShoreWind", :, :, :], "capex_mw"] # .values - offshore_spur_costs.loc[idx["OffShoreWind", :, :, :], "capex_mw"] # .values ) atb_costs = atb_costs.reset_index() return atb_costs