def get_file_metadata_row_for_path(
    session: Session, region_code: str, path: GcsfsFilePath
) -> Union[schema.DirectIngestRawFileMetadata, schema.DirectIngestIngestFileMetadata]:
    """Returns the metadata row for the provided path.

    Raises a ValueError if the file has not yet been registered in the appropriate
    metadata table (i.e. if exactly one row is not found).
    """
    parts = filename_parts_from_path(path)

    if parts.file_type == GcsfsDirectIngestFileType.INGEST_VIEW:
        results = session.query(schema.DirectIngestIngestFileMetadata).filter_by(
            region_code=region_code,
            is_invalidated=False,
            normalized_file_name=path.file_name,
        ).all()
    elif parts.file_type == GcsfsDirectIngestFileType.RAW_DATA:
        results = session.query(schema.DirectIngestRawFileMetadata).filter_by(
            region_code=region_code,
            normalized_file_name=path.file_name,
        ).all()
    else:
        raise ValueError(f'Unexpected path type: {parts.file_type}')

    if len(results) != 1:
        raise ValueError(
            f'Unexpected number of metadata results for path {path.abs_path()}: [{len(results)}]'
        )

    return one(results)
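# Illustrative usage sketch (not part of the original module): looking up the metadata row
# for a discovered file inside an open session. The helper name and the logging behavior
# here are assumptions added for demonstration only.
def _example_log_metadata_for_path(
    session: Session, region_code: str, path: GcsfsFilePath
) -> None:
    """Hypothetical helper that logs the file_id registered for |path|, if any."""
    try:
        metadata = get_file_metadata_row_for_path(session, region_code, path)
        logging.info("Found metadata row with file_id [%s] for path [%s]",
                     metadata.file_id, path.abs_path())
    except ValueError:
        logging.info("No metadata row registered yet for path [%s]", path.abs_path())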
def check_people_do_not_have_multiple_ids_same_type(
        session: Session,
        region_code: str,
        output_people: List[schema.StatePerson]) -> bool:
    """Validates that no person has two ids of the same type (for states configured to
    enforce this invariant)."""
    check_not_dirty(session)

    logging.info(
        "[Invariant validation] Checking that no person has multiple external ids of the same type."
    )
    if state_allows_multiple_ids_same_type(region_code):
        logging.info(
            "[Invariant validation] Multiple external ids of the same type allowed for [%s] - skipping.",
            region_code)
        return True

    person_ids = {p.person_id for p in output_people}
    if not person_ids:
        logging.warning(
            "[Invariant validation] No people in the output set - skipping validations."
        )
        return True

    counts_subquery = session.query(
        schema.StatePersonExternalId.state_code.label('state_code'),
        schema.StatePersonExternalId.person_id.label('person_id'),
        schema.StatePersonExternalId.id_type.label('id_type'),
        func.count().label('cnt'),
    ).filter(
        schema.StatePersonExternalId.state_code == region_code.upper()
    ).filter(
        # Ideally we would not filter by person_ids, but that query takes ~10s on a test of
        # US_PA external ids. Since this check will be run for every file, that sort of
        # performance is prohibitive. We instead filter to just the person ids we think we
        # have touched this session.
        schema.StatePersonExternalId.person_id.in_(person_ids)
    ).group_by(
        schema.StatePersonExternalId.state_code,
        schema.StatePersonExternalId.person_id,
        schema.StatePersonExternalId.id_type,
    ).subquery()

    query = session.query(
        counts_subquery.c.state_code,
        counts_subquery.c.person_id,
        counts_subquery.c.id_type,
        counts_subquery.c.cnt,
    ).filter(counts_subquery.c.cnt > 1).limit(1)

    results = query.all()

    if results:
        _state_code, person_id, id_type, count = one(results)
        logging.error(
            '[Invariant validation] Found people with multiple ids of the same type. First example: '
            'person_id=[%s], id_type=[%s] is used [%s] times.',
            person_id, id_type, count)
        return False

    logging.info(
        "[Invariant validation] Found no people with multiple external ids of the same type."
    )
    return True
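# Illustrative usage sketch (not part of the original module): gating a commit on the
# invariant check. The wrapper name and the rollback-on-failure policy are assumptions for
# demonstration only, and the session is assumed to be clean so that check_not_dirty() passes.
def _example_commit_people_if_valid(
    session: Session, region_code: str, output_people: List[schema.StatePerson]
) -> bool:
    """Hypothetical helper that commits only if the external-id invariant holds."""
    if not check_people_do_not_have_multiple_ids_same_type(
            session, region_code, output_people):
        session.rollback()
        return False
    session.commit()
    return True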
def get_file_metadata_row(
    session: Session, file_type: GcsfsDirectIngestFileType, file_id: int
) -> Union[schema.DirectIngestRawFileMetadata, schema.DirectIngestIngestFileMetadata]:
    """Queries for the file metadata row by the metadata row primary key."""
    if file_type == GcsfsDirectIngestFileType.INGEST_VIEW:
        results = session.query(schema.DirectIngestIngestFileMetadata).filter_by(file_id=file_id).all()
    elif file_type == GcsfsDirectIngestFileType.RAW_DATA:
        results = session.query(schema.DirectIngestRawFileMetadata).filter_by(file_id=file_id).all()
    else:
        raise ValueError(f'Unexpected file type: {file_type}')
    return one(results)
def get_ingest_view_metadata_for_export_job(
    session: Session,
    region_code: str,
    file_tag: str,
    datetimes_contained_lower_bound_exclusive: Optional[datetime.datetime],
    datetimes_contained_upper_bound_inclusive: datetime.datetime,
    ingest_database_name: str,
) -> Optional[schema.DirectIngestIngestFileMetadata]:
    """Returns the ingest file metadata row corresponding to the export job with the
    provided args, or None if no such row exists.
    """
    results = (
        session.query(schema.DirectIngestIngestFileMetadata)
        .filter_by(
            region_code=region_code.upper(),
            file_tag=file_tag,
            is_invalidated=False,
            is_file_split=False,
            datetimes_contained_lower_bound_exclusive=datetimes_contained_lower_bound_exclusive,
            datetimes_contained_upper_bound_inclusive=datetimes_contained_upper_bound_inclusive,
            ingest_database_name=ingest_database_name,
        )
        .all()
    )

    if not results:
        return None
    return one(results)
def get_ingest_file_metadata_row_for_path(
    session: Session,
    region_code: str,
    path: GcsfsFilePath,
    ingest_database_name: str,
) -> schema.DirectIngestIngestFileMetadata:
    """Returns the ingest file metadata row for the provided path.

    Raises a ValueError if the file has not yet been registered in the metadata table
    (i.e. if exactly one row is not found).
    """
    parts = filename_parts_from_path(path)

    if parts.file_type != GcsfsDirectIngestFileType.INGEST_VIEW:
        raise ValueError(f"Unexpected file type [{parts.file_type}]")

    results = (
        session.query(schema.DirectIngestIngestFileMetadata)
        .filter_by(
            region_code=region_code.upper(),
            is_invalidated=False,
            normalized_file_name=path.file_name,
            ingest_database_name=ingest_database_name,
        )
        .all()
    )

    if len(results) != 1:
        raise ValueError(
            f"Unexpected number of metadata results for path {path.abs_path()}: [{len(results)}]"
        )

    return one(results)
def _pending_to_persistent(session: Session, instance: Any) -> None:
    """Called when a SQLAlchemy object transitions to a persistent object. If this function
    throws, the session will be rolled back and that object will not be committed."""
    if not isinstance(instance, DirectIngestIngestFileMetadata):
        return

    results = (
        session.query(DirectIngestIngestFileMetadata)
        .filter_by(
            is_invalidated=False,
            is_file_split=False,
            region_code=instance.region_code,
            file_tag=instance.file_tag,
            ingest_database_name=instance.ingest_database_name,
            datetimes_contained_lower_bound_exclusive=instance.datetimes_contained_lower_bound_exclusive,
            datetimes_contained_upper_bound_inclusive=instance.datetimes_contained_upper_bound_inclusive,
        )
        .all()
    )

    if len(results) > 1:
        raise IntegrityError(
            f"Attempting to commit repeated non-file split DirectIngestIngestFileMetadata row for "
            f"region_code={instance.region_code}, file_tag={instance.file_tag}, "
            f"ingest_database_name={instance.ingest_database_name}, "
            f"datetimes_contained_lower_bound_exclusive={instance.datetimes_contained_lower_bound_exclusive}, "
            f"datetimes_contained_upper_bound_inclusive={instance.datetimes_contained_upper_bound_inclusive}"
        )
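# Hedged sketch of how a hook like _pending_to_persistent is typically registered with
# SQLAlchemy's session event system. The original module may wire this up elsewhere; the
# sessionmaker below is a hypothetical stand-in used only for illustration.
from sqlalchemy import event
from sqlalchemy.orm import sessionmaker

_ExampleSessionMaker = sessionmaker()


@event.listens_for(_ExampleSessionMaker, "pending_to_persistent")
def _example_pending_to_persistent_listener(session: Session, instance: Any) -> None:
    # Delegate to the validation hook defined above; SQLAlchemy invokes this whenever an
    # object moves from pending to persistent in a session produced by this factory.
    _pending_to_persistent(session, instance)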
def get_date_sorted_unprocessed_raw_files_for_region(
    session: Session,
    region_code: str,
) -> List[schema.DirectIngestRawFileMetadata]:
    """Returns metadata for all raw files that do not have a processed_time, sorted from
    earliest to latest."""
    return (
        session.query(schema.DirectIngestRawFileMetadata)
        .filter_by(region_code=region_code.upper(), processed_time=None)
        .order_by(
            schema.DirectIngestRawFileMetadata.datetimes_contained_upper_bound_inclusive.asc()
        )
        .all()
    )
def get_ingest_file_metadata_row(
    session: Session,
    file_id: int,
    ingest_database_name: str,
) -> schema.DirectIngestIngestFileMetadata:
    """Queries for the ingest file metadata row by the metadata row primary key."""
    results = (
        session.query(schema.DirectIngestIngestFileMetadata)
        .filter_by(file_id=file_id, ingest_database_name=ingest_database_name)
        .all()
    )
    return one(results)
def get_ingest_view_metadata_pending_export(
    session: Session, region_code: str
) -> List[schema.DirectIngestIngestFileMetadata]:
    """Returns metadata for all ingest files that have not yet been exported."""
    return session.query(schema.DirectIngestIngestFileMetadata).filter_by(
        region_code=region_code,
        is_invalidated=False,
        is_file_split=False,
        export_time=None,
    ).all()
def get_date_sorted_unprocessed_ingest_view_files_for_region(
    session: Session,
    region_code: str,
    ingest_database_name: str,
) -> List[schema.DirectIngestIngestFileMetadata]:
    """Returns metadata for all ingest view files that do not have a processed_time, sorted
    from earliest to latest."""
    return (
        session.query(schema.DirectIngestIngestFileMetadata)
        .filter_by(
            region_code=region_code.upper(),
            processed_time=None,
            ingest_database_name=ingest_database_name,
            is_invalidated=False,
        )
        .order_by(
            schema.DirectIngestIngestFileMetadata.datetimes_contained_upper_bound_inclusive.asc()
        )
        .all()
    )
def get_metadata_for_raw_files_discovered_after_datetime(
    session: Session,
    region_code: str,
    raw_file_tag: str,
    discovery_time_lower_bound_exclusive: Optional[datetime.datetime],
) -> List[schema.DirectIngestRawFileMetadata]:
    """Returns metadata for all raw files with the given tag that were discovered after the
    provided datetime (or for all such files, if no lower bound is provided)."""
    query = session.query(schema.DirectIngestRawFileMetadata).filter_by(
        region_code=region_code, file_tag=raw_file_tag
    )
    if discovery_time_lower_bound_exclusive:
        query = query.filter(
            schema.DirectIngestRawFileMetadata.discovery_time > discovery_time_lower_bound_exclusive
        )
    return query.all()
def to_query(self, session: Session) -> Query:
    """Create a query to SELECT each column based on the Mapping's name."""
    select_statements = []
    for new_view_column_name, source_column in attr.asdict(self).items():
        # Default unmapped columns to NULL to ensure we SELECT all columns
        select_statement = null().label(new_view_column_name)
        if source_column is not None:
            select_statement = source_column.label(new_view_column_name)
        select_statements.append(select_statement)
    return session.query(*select_statements)
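# Hedged sketch of the kind of attrs-defined column mapping that to_query() is designed to
# operate on; the class name and the unmapped column below are hypothetical. Fields left as
# None are selected as NULL so the resulting query always produces the full set of view
# columns.
@attr.s(frozen=True)
class _ExampleExternalIdViewMapping:
    person_id = attr.ib(default=schema.StatePersonExternalId.person_id)
    id_type = attr.ib(default=schema.StatePersonExternalId.id_type)
    external_id_notes = attr.ib(default=None)  # unmapped -> selected as NULL

    # Reuse the module-level to_query() above as this mapping's query builder.
    to_query = to_query

# Usage (sketch): query = _ExampleExternalIdViewMapping().to_query(session)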
def get_ingest_view_metadata_for_most_recent_valid_job(
    session: Session, region_code: str, file_tag: str
) -> Optional[schema.DirectIngestIngestFileMetadata]:
    """Returns most recently created export metadata row where is_invalidated is False, or
    None if there are no metadata rows for this file tag for this manager's region."""
    results = (
        session.query(schema.DirectIngestIngestFileMetadata)
        .filter_by(
            region_code=region_code,
            is_invalidated=False,
            is_file_split=False,
            file_tag=file_tag,
        )
        .order_by(schema.DirectIngestIngestFileMetadata.job_creation_time.desc())
        .limit(1)
        .all()
    )

    if not results:
        return None
    return one(results)
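# Illustrative sketch (not part of the original module): falling back to the most recent
# valid job's metadata when no row matches an exact export job. The helper name and the
# fallback policy are assumptions for demonstration only.
def _example_metadata_for_job_or_most_recent(
    session: Session,
    region_code: str,
    file_tag: str,
    datetimes_contained_lower_bound_exclusive: Optional[datetime.datetime],
    datetimes_contained_upper_bound_inclusive: datetime.datetime,
    ingest_database_name: str,
) -> Optional[schema.DirectIngestIngestFileMetadata]:
    """Hypothetical helper: exact export-job lookup first, then most recent valid job."""
    metadata = get_ingest_view_metadata_for_export_job(
        session,
        region_code,
        file_tag,
        datetimes_contained_lower_bound_exclusive,
        datetimes_contained_upper_bound_inclusive,
        ingest_database_name,
    )
    if metadata is not None:
        return metadata
    return get_ingest_view_metadata_for_most_recent_valid_job(session, region_code, file_tag)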
def assert_no_unexpected_entities_in_db(
    expected_entities: Sequence[DatabaseEntity], session: Session
) -> None:
    """Counts all of the entities present in the |expected_entities| graph by type and
    ensures that the same number of entities exists in the DB for each type.
    """
    entity_counter: Dict[Type, List[DatabaseEntity]] = defaultdict(list)
    get_entities_by_type(expected_entities, entity_counter)
    for cls, entities_of_cls in entity_counter.items():
        # Standalone classes do not need to be attached to a person by design,
        # so it is valid if some standalone entities are not reachable from the
        # provided |expected_entities|
        if is_standalone_class(cls):
            continue

        expected_ids = set()
        for entity in entities_of_cls:
            expected_ids.add(entity.get_id())
        db_entities = session.query(cls).all()
        db_ids = set()
        for entity in db_entities:
            db_ids.add(entity.get_id())

        if expected_ids != db_ids:
            print("\n********** Entities from |found_persons| **********\n")
            for entity in sorted(entities_of_cls, key=lambda x: x.get_id()):
                print_entity_tree(entity)
            print("\n********** Entities from db **********\n")
            for entity in sorted(db_entities, key=lambda x: x.get_id()):
                print_entity_tree(entity)
            raise ValueError(
                f"For cls {cls.__name__}, found difference in primary keys from "
                f"expected entities and those of entities read from db.\n"
                f"Expected ids not present in db: "
                f"{str(expected_ids - db_ids)}\n"
                f"Db ids not present in expected entities: "
                f"{str(db_ids - expected_ids)}\n"
            )
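# Illustrative usage sketch (not part of the original module): a test-side check that a
# just-persisted entity graph round-trips with no extra rows. The helper name is
# hypothetical.
def _example_round_trip_check(
    session: Session, expected_people: Sequence[DatabaseEntity]
) -> None:
    """Hypothetical test helper asserting the DB holds exactly the expected entities."""
    # Raises ValueError (and prints both entity trees) on any primary-key mismatch.
    assert_no_unexpected_entities_in_db(expected_people, session)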
def _get_status_using_session(
    self, session: Session
) -> DirectIngestInstanceStatus:
    """Returns the status row for this manager's region code and ingest instance."""
    return (
        session.query(DirectIngestInstanceStatus)
        .filter(
            DirectIngestInstanceStatus.region_code == self.region_code,
            DirectIngestInstanceStatus.instance == self.ingest_instance.value,
        )
        .one()
    )
def _fetch_most_recent_snapshots_for_entity_type(
    self,
    session: Session,
    master_class: Type,
    entity_ids: Set[int],
    schema: ModuleType,
) -> List[DatabaseEntity]:
    """Returns a list containing the most recent snapshot for each ID in |entity_ids| with
    type |master_class|
    """

    # Get name of historical table in database (as distinct from name of ORM
    # class representing historical table in code)
    history_table_class = _get_historical_class(master_class, schema)
    history_table_name = history_table_class.__table__.name
    history_table_primary_key_col_name = (
        history_table_class.get_primary_key_column_name())

    # See module assumption #2
    master_table_primary_key_col_name = master_class.get_primary_key_column_name()
    ids_list = ", ".join([str(id) for id in entity_ids])

    # Get snapshot IDs in a separate query. The subquery logic here is ugly
    # and easier to do as a raw string query than through the ORM query, but
    # the return type of a raw string query is just a collection of values
    # rather than an ORM model. Doing this step as a separate query enables
    # passing just the IDs to the second request, which allows proper ORM
    # models to be returned as a result.
    snapshot_ids_query = f"""
    SELECT
      history.{history_table_primary_key_col_name},
      history.{master_table_primary_key_col_name},
      history.valid_to
    FROM {history_table_name} history
    JOIN (
      SELECT
        {master_table_primary_key_col_name},
        MAX(valid_from) AS valid_from
      FROM {history_table_name}
      WHERE {master_table_primary_key_col_name} IN ({ids_list})
      GROUP BY {master_table_primary_key_col_name}
    ) AS most_recent_valid_from
    ON history.{master_table_primary_key_col_name} = most_recent_valid_from.{master_table_primary_key_col_name}
    WHERE history.valid_from = most_recent_valid_from.valid_from;
    """

    results = session.execute(text(snapshot_ids_query)).fetchall()

    # Use only results where valid_to is None to exclude any overlapping
    # non-open snapshots
    snapshot_ids = [
        snapshot_id for snapshot_id, master_id, valid_to in results
        if valid_to is None
    ]

    # Removing the below early return will pass in tests but fail in
    # production, because SQLite allows "IN ()" but Postgres does not
    if not snapshot_ids:
        return []

    filter_statement = (
        "{historical_table}.{primary_key_column} IN ({ids_list})".format(
            historical_table=history_table_name,
            primary_key_column=history_table_class.get_primary_key_column_name(),
            ids_list=", ".join([str(id) for id in snapshot_ids]),
        ))

    return session.query(history_table_class).filter(
        text(filter_statement)).all()
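# Illustrative sketch (not part of the original module): pairing each master entity ID with
# its most recent open historical snapshot using the fetch above. The method name and the
# idea of returning a dict keyed by master primary key are assumptions for demonstration
# only; it would live on the same class as _fetch_most_recent_snapshots_for_entity_type.
def _example_open_snapshots_by_master_id(
    self,
    session: Session,
    master_class: Type,
    entity_ids: Set[int],
    schema: ModuleType,
) -> Dict[int, DatabaseEntity]:
    """Hypothetical helper mapping each master primary key to its most recent snapshot."""
    snapshots = self._fetch_most_recent_snapshots_for_entity_type(
        session, master_class, entity_ids, schema)
    # The historical table carries the master table's primary key column, so it can be read
    # directly off each snapshot row.
    key_col = master_class.get_primary_key_column_name()
    return {getattr(snapshot, key_col): snapshot for snapshot in snapshots}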