def _execute_statement(self, statement: str) -> None:
    """Executes a single raw SQL |statement| in its own short-lived session,
    rolling back (and logging, rather than raising) on failure."""
    session = Session(bind=self.postgres_engine)
    try:
        session.execute(statement)
        session.commit()
    except Exception as e:
        logging.warning("Failed to clean up: %s", e)
        session.rollback()
    finally:
        session.close()
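# A minimal usage sketch (hypothetical; the engine URL and table name below
# are illustrative assumptions, not part of this module). _execute_statement
# suits teardown-style cleanup where a failure should be logged, not raised:
#
#     from sqlalchemy import create_engine
#
#     self.postgres_engine = create_engine(
#         "postgresql://localhost:5432/postgres")
#     self._execute_statement("DROP TABLE IF EXISTS scratch_table;")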
@classmethod
def _alter_session_variables(cls, session: Session) -> None:
    # Postgres uses a query cost analysis heuristic to decide what type of
    # read to use for a particular query. It sometimes chooses to use a
    # sequential read because for hard disk drives (HDDs, as opposed to
    # solid state drives, SSDs) that may be faster than jumping around to
    # random pages of an index. This is especially likely when running over
    # small sets of data. Setting this option changes the heuristic to
    # almost always prefer index reads.
    #
    # Our postgres instances run on SSDs, so this should increase
    # performance for us. This is also important because sequential reads
    # lock an entire table, whereas index reads only lock the particular
    # predicate from a query. See
    # https://www.postgresql.org/docs/12/transaction-iso.html and
    # https://stackoverflow.com/questions/42288808/why-does-postgresql-serializable-transaction-think-this-as-conflict.
    #
    # TODO(#3928): Once defined in code, set this on the SQL instance
    # itself instead of per session.
    if session.bind.dialect.name == "postgresql":
        session.execute("SET random_page_cost=1;")
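# A quick sanity check that the setting took effect, as a hedged sketch
# (assumes an open |session| bound to a postgresql engine; Postgres's SHOW
# returns the current value as text):
#
#     cls._alter_session_variables(session)
#     if session.bind.dialect.name == "postgresql":
#         cost = session.execute("SHOW random_page_cost;").scalar()
#         assert cost == "1"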
def _fetch_most_recent_snapshots_for_entity_type(
    self,
    session: Session,
    master_class: Type,
    entity_ids: Set[int],
    schema: ModuleType,
) -> List[DatabaseEntity]:
    """Returns a list containing the most recent snapshot for each ID in
    |entity_ids| with type |master_class|.
    """

    # Get name of historical table in database (as distinct from name of
    # ORM class representing historical table in code)
    history_table_class = _get_historical_class(master_class, schema)
    history_table_name = history_table_class.__table__.name
    history_table_primary_key_col_name = (
        history_table_class.get_primary_key_column_name())

    # See module assumption #2
    master_table_primary_key_col_name = (
        master_class.get_primary_key_column_name())

    ids_list = ", ".join([str(id) for id in entity_ids])

    # Get snapshot IDs in a separate query. The subquery logic here is ugly
    # and easier to do as a raw string query than through the ORM query,
    # but the return type of a raw string query is just a collection of
    # values rather than an ORM model. Doing this step as a separate query
    # enables passing just the IDs to the second request, which allows
    # proper ORM models to be returned as a result.
    snapshot_ids_query = f"""
    SELECT
      history.{history_table_primary_key_col_name},
      history.{master_table_primary_key_col_name},
      history.valid_to
    FROM {history_table_name} history
    JOIN (
      SELECT
        {master_table_primary_key_col_name},
        MAX(valid_from) AS valid_from
      FROM {history_table_name}
      WHERE {master_table_primary_key_col_name} IN ({ids_list})
      GROUP BY {master_table_primary_key_col_name}
    ) AS most_recent_valid_from
    ON history.{master_table_primary_key_col_name} =
        most_recent_valid_from.{master_table_primary_key_col_name}
    WHERE history.valid_from = most_recent_valid_from.valid_from;
    """

    results = session.execute(text(snapshot_ids_query)).fetchall()

    # Use only results where valid_to is None to exclude any overlapping
    # non-open snapshots
    snapshot_ids = [
        snapshot_id
        for snapshot_id, master_id, valid_to in results
        if valid_to is None
    ]

    # Removing the below early return will pass in tests but fail in
    # production, because SQLite allows "IN ()" but Postgres does not
    if not snapshot_ids:
        return []

    filter_statement = (
        "{historical_table}.{primary_key_column} IN ({ids_list})".format(
            historical_table=history_table_name,
            primary_key_column=(
                history_table_class.get_primary_key_column_name()),
            ids_list=", ".join([str(id) for id in snapshot_ids]),
        ))

    return session.query(history_table_class).filter(
        text(filter_statement)).all()
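# A hypothetical call sketch (the |schema| module and Person master class
# are assumptions for illustration; any master class with a matching
# *History table in |schema| works the same way):
#
#     snapshots = self._fetch_most_recent_snapshots_for_entity_type(
#         session=session,
#         master_class=schema.Person,
#         entity_ids={123, 456},
#         schema=schema,
#     )
#     # Each result is an ORM instance of the historical class (e.g.
#     # PersonHistory) whose valid_to is NULL, i.e. the open snapshot.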