def add_daemon_heartbeat(self, daemon_heartbeat):
    with self.connect() as conn:
        # insert, or update if already present
        try:
            conn.execute(
                DaemonHeartbeatsTable.insert().values(  # pylint: disable=no-value-for-parameter
                    timestamp=utc_datetime_from_timestamp(daemon_heartbeat.timestamp),
                    daemon_type=daemon_heartbeat.daemon_type.value,
                    daemon_id=daemon_heartbeat.daemon_id,
                    body=serialize_dagster_namedtuple(daemon_heartbeat),
                )
            )
        except db.exc.IntegrityError:
            conn.execute(
                DaemonHeartbeatsTable.update()  # pylint: disable=no-value-for-parameter
                .where(DaemonHeartbeatsTable.c.daemon_type == daemon_heartbeat.daemon_type.value)
                .values(  # pylint: disable=no-value-for-parameter
                    timestamp=utc_datetime_from_timestamp(daemon_heartbeat.timestamp),
                    daemon_id=daemon_heartbeat.daemon_id,
                    body=serialize_dagster_namedtuple(daemon_heartbeat),
                )
            )
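For reference, here is a minimal, self-contained sketch of the insert-then-update-on-IntegrityError upsert pattern used above. The table, column names, and engine URL are assumptions for illustration only (not Dagster's schema), and it assumes SQLAlchemy 1.x implicit-autocommit connections; it is not the project's actual implementation.

import datetime

import sqlalchemy as db

metadata = db.MetaData()
# Hypothetical heartbeat table for illustration; not Dagster's actual schema.
heartbeats = db.Table(
    "heartbeats",
    metadata,
    db.Column("daemon_type", db.String(64), primary_key=True),
    db.Column("timestamp", db.DateTime),
)

engine = db.create_engine("sqlite://")
metadata.create_all(engine)


def upsert_heartbeat(conn, daemon_type, timestamp):
    # Try the insert first; if the primary key already exists, the database raises
    # IntegrityError and we fall back to an update. This is portable across backends,
    # unlike the dialect-specific upserts (MySQL/Postgres) shown further below.
    try:
        conn.execute(heartbeats.insert().values(daemon_type=daemon_type, timestamp=timestamp))
    except db.exc.IntegrityError:
        conn.execute(
            heartbeats.update()
            .where(heartbeats.c.daemon_type == daemon_type)
            .values(timestamp=timestamp)
        )


with engine.connect() as conn:
    upsert_heartbeat(conn, "SCHEDULER", datetime.datetime.utcnow())
    upsert_heartbeat(conn, "SCHEDULER", datetime.datetime.utcnow())  # second call takes the update path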
def store_asset(self, event):
    check.inst_param(event, "event", EventLogEntry)
    if not event.is_dagster_event or not event.dagster_event.asset_key:
        return

    materialization = event.dagster_event.step_materialization_data.materialization
    # We switched to storing the entire event record of the last materialization instead of just
    # the AssetMaterialization object, so that we have access to metadata like timestamp,
    # pipeline, run_id, etc.
    #
    # This should make certain asset queries way more performant, without having to do extra
    # queries against the event log.
    #
    # This should be accompanied by a schema change in 0.12.0, renaming `last_materialization`
    # to `last_materialization_event`, for clarity. For now, we should do some back-compat.
    #
    # https://github.com/dagster-io/dagster/issues/3945
    if self.has_secondary_index(ASSET_KEY_INDEX_COLS):
        insert_statement = (
            AssetKeyTable.insert().values(  # pylint: disable=no-value-for-parameter
                asset_key=event.dagster_event.asset_key.to_string(),
                last_materialization=serialize_dagster_namedtuple(event),
                last_materialization_timestamp=utc_datetime_from_timestamp(event.timestamp),
                last_run_id=event.run_id,
                tags=seven.json.dumps(materialization.tags) if materialization.tags else None,
            )
        )
        update_statement = (
            AssetKeyTable.update()
            .values(  # pylint: disable=no-value-for-parameter
                last_materialization=serialize_dagster_namedtuple(event),
                last_materialization_timestamp=utc_datetime_from_timestamp(event.timestamp),
                last_run_id=event.run_id,
                tags=seven.json.dumps(materialization.tags) if materialization.tags else None,
            )
            .where(
                AssetKeyTable.c.asset_key == event.dagster_event.asset_key.to_string(),
            )
        )
    else:
        insert_statement = (
            AssetKeyTable.insert().values(  # pylint: disable=no-value-for-parameter
                asset_key=event.dagster_event.asset_key.to_string(),
                last_materialization=serialize_dagster_namedtuple(event),
                last_run_id=event.run_id,
            )
        )
        update_statement = (
            AssetKeyTable.update()
            .values(  # pylint: disable=no-value-for-parameter
                last_materialization=serialize_dagster_namedtuple(event),
                last_run_id=event.run_id,
            )
            .where(
                AssetKeyTable.c.asset_key == event.dagster_event.asset_key.to_string(),
            )
        )

    with self.index_connection() as conn:
        try:
            conn.execute(insert_statement)
        except db.exc.IntegrityError:
            conn.execute(update_statement)
def _add_filter_limit(self, query, before=None, after=None, limit=None):
    check.opt_float_param(before, "before")
    check.opt_float_param(after, "after")
    check.opt_int_param(limit, "limit")

    if before:
        query = query.where(JobTickTable.c.timestamp < utc_datetime_from_timestamp(before))
    if after:
        query = query.where(JobTickTable.c.timestamp > utc_datetime_from_timestamp(after))
    if limit:
        query = query.limit(limit)
    return query
def add_daemon_heartbeat(self, daemon_heartbeat):
    with self.connect() as conn:
        conn.execute(
            db.dialects.mysql.insert(DaemonHeartbeatsTable)
            .values(
                timestamp=utc_datetime_from_timestamp(daemon_heartbeat.timestamp),
                daemon_type=daemon_heartbeat.daemon_type,
                daemon_id=daemon_heartbeat.daemon_id,
                body=serialize_dagster_namedtuple(daemon_heartbeat),
            )
            .on_duplicate_key_update(
                timestamp=utc_datetime_from_timestamp(daemon_heartbeat.timestamp),
                daemon_id=daemon_heartbeat.daemon_id,
                body=serialize_dagster_namedtuple(daemon_heartbeat),
            )
        )
def purge_ticks(self, origin_id, selector_id, tick_status, before):
    check.str_param(origin_id, "origin_id")
    check.inst_param(tick_status, "tick_status", TickStatus)
    check.float_param(before, "before")

    utc_before = utc_datetime_from_timestamp(before)

    base_query = (
        JobTickTable.delete()  # pylint: disable=no-value-for-parameter
        .where(JobTickTable.c.status == tick_status.value)
        .where(JobTickTable.c.timestamp < utc_before)
    )

    if self.has_instigators_table():
        query = base_query.where(
            db.or_(
                JobTickTable.c.selector_id == selector_id,
                db.and_(
                    JobTickTable.c.selector_id == None,
                    JobTickTable.c.job_origin_id == origin_id,
                ),
            )
        )
    else:
        query = base_query.where(JobTickTable.c.job_origin_id == origin_id)

    with self.connect() as conn:
        conn.execute(query)
def _add_cursor_limit_to_query(
    self,
    query,
    before_cursor,
    after_cursor,
    limit,
    ascending=False,
    before_timestamp=None,
):
    """Helper function to deal with cursor/limit pagination args"""

    if before_cursor:
        before_query = db.select([SqlEventLogStorageTable.c.id]).where(
            SqlEventLogStorageTable.c.id == before_cursor
        )
        query = query.where(SqlEventLogStorageTable.c.id < before_query)
    if after_cursor:
        after_query = db.select([SqlEventLogStorageTable.c.id]).where(
            SqlEventLogStorageTable.c.id == after_cursor
        )
        query = query.where(SqlEventLogStorageTable.c.id > after_query)

    if before_timestamp:
        query = query.where(
            SqlEventLogStorageTable.c.timestamp < utc_datetime_from_timestamp(before_timestamp)
        )

    if limit:
        query = query.limit(limit)

    if ascending:
        query = query.order_by(SqlEventLogStorageTable.c.timestamp.asc())
    else:
        query = query.order_by(SqlEventLogStorageTable.c.timestamp.desc())

    return query
def store_event(self, event):
    '''Store an event corresponding to a pipeline run.

    Args:
        event (EventRecord): The event to store.
    '''
    check.inst_param(event, 'event', EventRecord)

    dagster_event_type = None
    if event.is_dagster_event:
        dagster_event_type = event.dagster_event.event_type_value

    run_id = event.run_id

    # https://stackoverflow.com/a/54386260/324449
    event_insert = SqlEventLogStorageTable.insert().values(  # pylint: disable=no-value-for-parameter
        run_id=run_id,
        event=serialize_dagster_namedtuple(event),
        dagster_event_type=dagster_event_type,
        timestamp=utc_datetime_from_timestamp(event.timestamp),
        step_key=event.step_key,
    )
    with self.connect(run_id) as conn:
        conn.execute(event_insert)
def create_schedule_tick(self, repository_name, schedule_tick_data):
    check.str_param(repository_name, 'repository_name')
    check.inst_param(schedule_tick_data, 'schedule_tick_data', ScheduleTickData)

    with self.connect() as conn:
        try:
            tick_insert = ScheduleTickTable.insert().values(  # pylint: disable=no-value-for-parameter
                repository_name=repository_name,
                schedule_name=schedule_tick_data.schedule_name,
                status=schedule_tick_data.status.value,
                timestamp=utc_datetime_from_timestamp(schedule_tick_data.timestamp),
                tick_body=serialize_dagster_namedtuple(schedule_tick_data),
            )
            result = conn.execute(tick_insert)
            tick_id = result.inserted_primary_key[0]
            return ScheduleTick(tick_id, schedule_tick_data)
        except db.exc.IntegrityError as exc:
            six.raise_from(
                DagsterInvariantViolationError(
                    'Unable to insert ScheduleTick for schedule {schedule_name} in storage'.format(
                        schedule_name=schedule_tick_data.schedule_name,
                    )
                ),
                exc,
            )
def _add_asset_wipe_filter_to_query(self, query, asset_details):
    if not asset_details or not asset_details.last_wipe_timestamp:
        return query

    return query.where(
        SqlEventLogStorageTable.c.timestamp
        > utc_datetime_from_timestamp(asset_details.last_wipe_timestamp)
    )
def has_asset_key(self, asset_key: AssetKey) -> bool: check.inst_param(asset_key, "asset_key", AssetKey) query = (db.select([ AssetKeyTable.c.asset_key, AssetKeyTable.c.asset_details ]).where( db.or_( AssetKeyTable.c.asset_key == asset_key.to_string(), AssetKeyTable.c.asset_key == asset_key.to_string(legacy=True), )).limit(1)) with self.index_connection() as conn: row = conn.execute(query).fetchone() if not row: return False asset_details: Optional[ AssetDetails] = AssetDetails.from_db_string(row[1]) if not asset_details or not asset_details.last_wipe_timestamp: return True materialization_row = conn.execute( db.select([SqlEventLogStorageTable.c.timestamp]).where( db.or_( AssetKeyTable.c.asset_key == asset_key.to_string(), AssetKeyTable.c.asset_key == asset_key.to_string( legacy=True), )).order_by( SqlEventLogStorageTable.c.timestamp.desc()).limit( 1)).fetchone() if not materialization_row: return False return utc_datetime_from_naive( materialization_row[0]) > utc_datetime_from_timestamp( asset_details.last_wipe_timestamp)
def prepare_insert_statement(self, event):
    '''Helper method for preparing the event log SQL insertion statement. Abstracted away to
    have a single place for the logical table representation of the event, while having a way
    for SQL backends to implement different execution implementations for `store_event`. See
    the `dagster-postgres` implementation which overrides the generic SQL implementation of
    `store_event`.
    '''
    dagster_event_type = None
    asset_key_str = None
    step_key = event.step_key

    if event.is_dagster_event:
        dagster_event_type = event.dagster_event.event_type_value
        step_key = event.dagster_event.step_key
        if event.dagster_event.asset_key:
            check.inst_param(event.dagster_event.asset_key, 'asset_key', AssetKey)
            asset_key_str = event.dagster_event.asset_key.to_string()

    # https://stackoverflow.com/a/54386260/324449
    return SqlEventLogStorageTable.insert().values(  # pylint: disable=no-value-for-parameter
        run_id=event.run_id,
        event=serialize_dagster_namedtuple(event),
        dagster_event_type=dagster_event_type,
        timestamp=utc_datetime_from_timestamp(event.timestamp),
        step_key=step_key,
        asset_key=asset_key_str,
    )
def store_asset(self, event): check.inst_param(event, "event", EventLogEntry) if not event.is_dagster_event or not event.dagster_event.asset_key: return materialization = event.dagster_event.step_materialization_data.materialization if self.has_secondary_index(ASSET_KEY_INDEX_COLS): with self.index_connection() as conn: conn.execute( db.dialects.postgresql.insert(AssetKeyTable).values( asset_key=event.dagster_event.asset_key.to_string(), last_materialization=serialize_dagster_namedtuple( materialization), last_materialization_timestamp= utc_datetime_from_timestamp(event.timestamp), last_run_id=event.run_id, tags=seven.json.dumps(materialization.tags) if materialization.tags else None, ).on_conflict_do_update( index_elements=[AssetKeyTable.c.asset_key], set_=dict( last_materialization=serialize_dagster_namedtuple( materialization), last_materialization_timestamp= utc_datetime_from_timestamp(event.timestamp), last_run_id=event.run_id, tags=seven.json.dumps(materialization.tags) if materialization.tags else None, ), )) else: with self.index_connection() as conn: conn.execute( db.dialects.postgresql.insert(AssetKeyTable).values( asset_key=event.dagster_event.asset_key.to_string(), last_materialization=serialize_dagster_namedtuple( materialization), last_run_id=event.run_id, ).on_conflict_do_update( index_elements=[AssetKeyTable.c.asset_key], set_=dict( last_materialization=serialize_dagster_namedtuple( materialization), last_run_id=event.run_id, ), ))
def all_asset_keys(self):
    with self.index_connection() as conn:
        results = conn.execute(
            db.select([AssetKeyTable.c.asset_key, AssetKeyTable.c.asset_details])
        ).fetchall()

        asset_keys = set()
        wiped = set()
        wiped_timestamps = {}
        for result in results:
            asset_key = AssetKey.from_db_string(result[0])
            asset_details: Optional[AssetDetails] = AssetDetails.from_db_string(result[1])
            asset_keys.add(asset_key)
            if asset_details and asset_details.last_wipe_timestamp:
                wiped_timestamps[asset_key] = asset_details.last_wipe_timestamp

        if wiped_timestamps:
            materialized_timestamps = {}

            # fetch the last materialization timestamp per asset key
            materialization_results = conn.execute(
                db.select(
                    [
                        SqlEventLogStorageTable.c.asset_key,
                        db.func.max(SqlEventLogStorageTable.c.timestamp),
                    ]
                )
                .where(
                    SqlEventLogStorageTable.c.asset_key.in_(
                        [asset_key.to_string() for asset_key in wiped_timestamps.keys()]
                    )
                )
                .group_by(SqlEventLogStorageTable.c.asset_key)
                .order_by(db.func.max(SqlEventLogStorageTable.c.timestamp).asc())
            ).fetchall()

            for result in materialization_results:
                asset_key = AssetKey.from_db_string(result[0])
                last_materialized_timestamp = result[1]
                materialized_timestamps[asset_key] = last_materialized_timestamp

            # calculate the set of wiped asset keys that have not had a materialization since
            # the wipe timestamp
            wiped = set(
                [
                    asset_key
                    for asset_key in wiped_timestamps.keys()
                    if not materialized_timestamps.get(asset_key)
                    or utc_datetime_from_naive(materialized_timestamps.get(asset_key))
                    < utc_datetime_from_timestamp(wiped_timestamps[asset_key])
                ]
            )

    return list(asset_keys.difference(wiped))
def add_backfill(self, partition_backfill: PartitionBackfill):
    check.inst_param(partition_backfill, "partition_backfill", PartitionBackfill)

    with self.connect() as conn:
        conn.execute(
            BulkActionsTable.insert().values(  # pylint: disable=no-value-for-parameter
                key=partition_backfill.backfill_id,
                status=partition_backfill.status.value,
                timestamp=utc_datetime_from_timestamp(partition_backfill.backfill_timestamp),
                body=serialize_dagster_namedtuple(partition_backfill),
            )
        )
def add_daemon_heartbeat(self, daemon_heartbeat):
    with self.connect() as conn:
        # insert or update if already present, using postgres specific on_conflict
        conn.execute(
            db.dialects.postgresql.insert(DaemonHeartbeatsTable)
            .values(  # pylint: disable=no-value-for-parameter
                timestamp=utc_datetime_from_timestamp(daemon_heartbeat.timestamp),
                daemon_type=daemon_heartbeat.daemon_type,
                daemon_id=daemon_heartbeat.daemon_id,
                body=serialize_dagster_namedtuple(daemon_heartbeat),
            )
            .on_conflict_do_update(
                index_elements=[DaemonHeartbeatsTable.c.daemon_type],
                set_={
                    "timestamp": utc_datetime_from_timestamp(daemon_heartbeat.timestamp),
                    "daemon_id": daemon_heartbeat.daemon_id,
                    "body": serialize_dagster_namedtuple(daemon_heartbeat),
                },
            )
        )
def has_asset_key(self, asset_key: AssetKey) -> bool: check.inst_param(asset_key, "asset_key", AssetKey) if self.has_secondary_index(ASSET_KEY_INDEX_COLS): query = (db.select([AssetKeyTable.c.asset_key]).where( db.or_( AssetKeyTable.c.asset_key == asset_key.to_string(), AssetKeyTable.c.asset_key == asset_key.to_string( legacy=True), )).where( db.or_( AssetKeyTable.c.wipe_timestamp == None, AssetKeyTable.c.last_materialization_timestamp > AssetKeyTable.c.wipe_timestamp, )).limit(1)) with self.index_connection() as conn: row = conn.execute(query).fetchone() return bool(row) # has not migrated, need to pull asset_details to get wipe status query = (db.select([ AssetKeyTable.c.asset_key, AssetKeyTable.c.asset_details ]).where( db.or_( AssetKeyTable.c.asset_key == asset_key.to_string(), AssetKeyTable.c.asset_key == asset_key.to_string(legacy=True), )).limit(1)) with self.index_connection() as conn: row = conn.execute(query).fetchone() if not row: return False asset_details: Optional[ AssetDetails] = AssetDetails.from_db_string(row[1]) if not asset_details or not asset_details.last_wipe_timestamp: return True materialization_row = conn.execute( db.select([SqlEventLogStorageTable.c.timestamp]).where( db.or_( AssetKeyTable.c.asset_key == asset_key.to_string(), AssetKeyTable.c.asset_key == asset_key.to_string( legacy=True), )).order_by( SqlEventLogStorageTable.c.timestamp.desc()).limit( 1)).fetchone() if not materialization_row: return False return utc_datetime_from_naive( materialization_row[0]) > utc_datetime_from_timestamp( asset_details.last_wipe_timestamp)
def purge_job_ticks(self, job_origin_id, tick_status, before):
    check.str_param(job_origin_id, "job_origin_id")
    check.inst_param(tick_status, "tick_status", JobTickStatus)
    check.float_param(before, "before")

    utc_before = utc_datetime_from_timestamp(before)

    with self.connect() as conn:
        conn.execute(
            JobTickTable.delete()  # pylint: disable=no-value-for-parameter
            .where(JobTickTable.c.status == tick_status.value)
            .where(JobTickTable.c.timestamp < utc_before)
            .where(JobTickTable.c.job_origin_id == job_origin_id)
        )
def update_job_tick(self, tick):
    check.inst_param(tick, "tick", JobTick)

    with self.connect() as conn:
        conn.execute(
            JobTickTable.update()  # pylint: disable=no-value-for-parameter
            .where(JobTickTable.c.id == tick.tick_id)
            .values(
                status=tick.status.value,
                type=tick.job_type.value,
                timestamp=utc_datetime_from_timestamp(tick.timestamp),
                tick_body=serialize_dagster_namedtuple(tick.job_tick_data),
            )
        )

    return tick
def _add_filter_limit(self, query, before=None, after=None, limit=None, statuses=None):
    check.opt_float_param(before, "before")
    check.opt_float_param(after, "after")
    check.opt_int_param(limit, "limit")
    check.opt_list_param(statuses, "statuses", of_type=TickStatus)

    if before:
        query = query.where(JobTickTable.c.timestamp < utc_datetime_from_timestamp(before))
    if after:
        query = query.where(JobTickTable.c.timestamp > utc_datetime_from_timestamp(after))
    if limit:
        query = query.limit(limit)
    if statuses:
        query = query.where(JobTickTable.c.status.in_([status.value for status in statuses]))
    return query
def update_event_log_record(self, record_id, event):
    '''Utility method for migration scripts to update SQL representation of event records.'''
    check.int_param(record_id, 'record_id')
    check.inst_param(event, 'event', EventRecord)
    dagster_event_type = None
    if event.is_dagster_event:
        dagster_event_type = event.dagster_event.event_type_value

    with self.connect(run_id=event.run_id) as conn:
        conn.execute(
            SqlEventLogStorageTable.update()  # pylint: disable=no-value-for-parameter
            .where(SqlEventLogStorageTable.c.id == record_id)
            .values(
                event=serialize_dagster_namedtuple(event),
                dagster_event_type=dagster_event_type,
                timestamp=utc_datetime_from_timestamp(event.timestamp),
                step_key=event.step_key,
            )
        )
def store_asset_event(self, event):
    asset_key = event.dagster_event.asset_key
    asset = self._assets[asset_key] if asset_key in self._assets else {"id": len(self._assets)}

    asset["last_materialization_timestamp"] = utc_datetime_from_timestamp(event.timestamp)
    if event.dagster_event.is_step_materialization:
        materialization = event.dagster_event.step_materialization_data.materialization
        asset["last_materialization"] = event
        asset["tags"] = materialization.tags if materialization.tags else None
    if (
        event.dagster_event.is_step_materialization
        or event.dagster_event.is_asset_materialization_planned
    ):
        asset["last_run_id"] = event.run_id

    self._assets[asset_key] = asset
def update_tick(self, tick):
    check.inst_param(tick, "tick", InstigatorTick)

    values = {
        "status": tick.status.value,
        "type": tick.instigator_type.value,
        "timestamp": utc_datetime_from_timestamp(tick.timestamp),
        "tick_body": serialize_dagster_namedtuple(tick.tick_data),
    }
    if self.has_instigators_table() and tick.selector_id:
        values["selector_id"] = tick.selector_id

    with self.connect() as conn:
        conn.execute(
            JobTickTable.update()  # pylint: disable=no-value-for-parameter
            .where(JobTickTable.c.id == tick.tick_id)
            .values(**values)
        )

    return tick
def wipe_asset(self, asset_key):
    check.inst_param(asset_key, "asset_key", AssetKey)

    wipe_timestamp = pendulum.now("UTC").timestamp()

    if self.has_secondary_index(ASSET_KEY_INDEX_COLS):
        with self.index_connection() as conn:
            conn.execute(
                AssetKeyTable.update()  # pylint: disable=no-value-for-parameter
                .where(
                    db.or_(
                        AssetKeyTable.c.asset_key == asset_key.to_string(),
                        AssetKeyTable.c.asset_key == asset_key.to_string(legacy=True),
                    )
                )
                .values(
                    last_materialization=None,
                    last_run_id=None,
                    last_materialization_timestamp=None,
                    tags=None,
                    asset_details=serialize_dagster_namedtuple(
                        AssetDetails(last_wipe_timestamp=wipe_timestamp)
                    ),
                    wipe_timestamp=utc_datetime_from_timestamp(wipe_timestamp),
                )
            )
    else:
        with self.index_connection() as conn:
            conn.execute(
                AssetKeyTable.update()  # pylint: disable=no-value-for-parameter
                .where(
                    db.or_(
                        AssetKeyTable.c.asset_key == asset_key.to_string(),
                        AssetKeyTable.c.asset_key == asset_key.to_string(legacy=True),
                    )
                )
                .values(
                    last_materialization=None,
                    last_run_id=None,
                    asset_details=serialize_dagster_namedtuple(
                        AssetDetails(last_wipe_timestamp=wipe_timestamp)
                    ),
                )
            )
def create_job_tick(self, job_tick_data):
    check.inst_param(job_tick_data, "job_tick_data", JobTickData)

    with self.connect() as conn:
        try:
            tick_insert = JobTickTable.insert().values(  # pylint: disable=no-value-for-parameter
                job_origin_id=job_tick_data.job_origin_id,
                status=job_tick_data.status.value,
                type=job_tick_data.job_type.value,
                timestamp=utc_datetime_from_timestamp(job_tick_data.timestamp),
                tick_body=serialize_dagster_namedtuple(job_tick_data),
            )
            result = conn.execute(tick_insert)
            tick_id = result.inserted_primary_key[0]
            return JobTick(tick_id, job_tick_data)
        except db.exc.IntegrityError as exc:
            raise DagsterInvariantViolationError(
                f"Unable to insert JobTick for job {job_tick_data.job_name} in storage"
            ) from exc
def create_tick(self, tick_data):
    check.inst_param(tick_data, "tick_data", TickData)

    values = {
        "job_origin_id": tick_data.instigator_origin_id,
        "status": tick_data.status.value,
        "type": tick_data.instigator_type.value,
        "timestamp": utc_datetime_from_timestamp(tick_data.timestamp),
        "tick_body": serialize_dagster_namedtuple(tick_data),
    }
    if self.has_instigators_table() and tick_data.selector_id:
        values["selector_id"] = tick_data.selector_id

    with self.connect() as conn:
        try:
            tick_insert = JobTickTable.insert().values(**values)  # pylint: disable=no-value-for-parameter
            result = conn.execute(tick_insert)
            tick_id = result.inserted_primary_key[0]
            return InstigatorTick(tick_id, tick_data)
        except db.exc.IntegrityError as exc:
            raise DagsterInvariantViolationError(
                f"Unable to insert InstigatorTick for job {tick_data.instigator_name} in storage"
            ) from exc
def update_event_log_record(self, record_id, event): """ Utility method for migration scripts to update SQL representation of event records. """ check.int_param(record_id, "record_id") check.inst_param(event, "event", EventRecord) dagster_event_type = None asset_key_str = None if event.is_dagster_event: dagster_event_type = event.dagster_event.event_type_value if event.dagster_event.asset_key: check.inst_param(event.dagster_event.asset_key, "asset_key", AssetKey) asset_key_str = event.dagster_event.asset_key.to_string() with self.run_connection(run_id=event.run_id) as conn: conn.execute( SqlEventLogStorageTable.update() # pylint: disable=no-value-for-parameter .where(SqlEventLogStorageTable.c.id == record_id).values( event=serialize_dagster_namedtuple(event), dagster_event_type=dagster_event_type, timestamp=utc_datetime_from_timestamp(event.timestamp), step_key=event.step_key, asset_key=asset_key_str, ))
def get_asset_runs_count_by_step(graphene_info, asset_nodes):
    from ..schema.pipelines.pipeline import GrapheneJobRunsCount

    instance = graphene_info.context.instance
    jobs_runs_count: Dict[str, GrapheneJobRunsCount] = {}
    if len(asset_nodes) == 0:
        return jobs_runs_count

    step_key_to_job_names: Dict[str, List[str]] = {
        asset_node.op_name: asset_node.job_names for asset_node in asset_nodes
    }

    materializations = instance.get_latest_materialization_events(
        [asset_node.asset_key for asset_node in asset_nodes]
    )

    for asset_node in asset_nodes:
        event = materializations.get(asset_node.asset_key)
        step_key = asset_node.op_name
        job_names = step_key_to_job_names[step_key]

        runs_count = sum(
            [
                instance.get_runs_count(
                    RunsFilter(
                        pipeline_name=job_name,
                        updated_after=utc_datetime_from_timestamp(event.timestamp)
                        if event
                        else None,
                    )
                )
                for job_name in job_names
            ]
        )

        # If a materialization has occurred, we subtract one so that the runs count
        # does not include the run that generated the materialization.
        if event:
            runs_count -= 1

        jobs_runs_count[step_key] = GrapheneJobRunsCount(
            step_key, job_names, runs_count, True if event else False
        )

    return jobs_runs_count
def migrate_asset_keys_index_columns(event_log_storage, print_fn=None):
    from dagster.core.storage.event_log.sql_event_log import SqlEventLogStorage
    from dagster.serdes import serialize_dagster_namedtuple

    from .schema import AssetKeyTable, SqlEventLogStorageTable

    if not isinstance(event_log_storage, SqlEventLogStorage):
        return

    with event_log_storage.index_connection() as conn:
        if print_fn:
            print_fn("Querying asset keys.")

        results = conn.execute(
            db.select(
                [
                    AssetKeyTable.c.asset_key,
                    AssetKeyTable.c.asset_details,
                    AssetKeyTable.c.last_materialization,
                ]
            )
        ).fetchall()

        if print_fn:
            print_fn(f"Found {len(results)} assets to reindex.")
            results = tqdm(results)

        for row in results:
            asset_key_str, asset_details_str, last_materialization_str = row
            wipe_timestamp = None
            event = None

            asset_key = AssetKey.from_db_string(asset_key_str)

            if asset_details_str:
                asset_details = deserialize_json_to_dagster_namedtuple(asset_details_str)
                wipe_timestamp = asset_details.last_wipe_timestamp if asset_details else None

            if last_materialization_str:
                event_or_materialization = deserialize_json_to_dagster_namedtuple(
                    last_materialization_str
                )
                if isinstance(event_or_materialization, EventLogEntry):
                    event = event_or_materialization

            if not event:
                materialization_query = (
                    db.select([SqlEventLogStorageTable.c.event])
                    .where(
                        db.or_(
                            SqlEventLogStorageTable.c.asset_key == asset_key.to_string(),
                            SqlEventLogStorageTable.c.asset_key == asset_key.to_string(legacy=True),
                        )
                    )
                    .order_by(SqlEventLogStorageTable.c.timestamp.desc())
                    .limit(1)
                )
                row = conn.execute(materialization_query).fetchone()
                if row:
                    event = deserialize_json_to_dagster_namedtuple(row[0])

            if not event:
                # this must be a wiped asset
                conn.execute(
                    AssetKeyTable.update()  # pylint: disable=no-value-for-parameter
                    .values(
                        last_materialization=None,
                        last_materialization_timestamp=None,
                        wipe_timestamp=utc_datetime_from_timestamp(wipe_timestamp)
                        if wipe_timestamp
                        else None,
                        tags=None,
                    )
                    .where(
                        AssetKeyTable.c.asset_key == asset_key.to_string(),
                    )
                )
            else:
                tags = event.dagster_event.step_materialization_data.materialization.tags
                conn.execute(
                    AssetKeyTable.update()  # pylint: disable=no-value-for-parameter
                    .values(
                        last_materialization=serialize_dagster_namedtuple(event),
                        last_materialization_timestamp=utc_datetime_from_timestamp(event.timestamp),
                        wipe_timestamp=utc_datetime_from_timestamp(wipe_timestamp)
                        if wipe_timestamp
                        else None,
                        tags=seven.json.dumps(tags) if tags else None,
                    )
                    .where(
                        AssetKeyTable.c.asset_key == asset_key.to_string(),
                    )
                )
def _fetch_raw_asset_rows(self, asset_keys=None, prefix=None, limit=None, cursor=None):
    # fetches rows containing asset_key, last_materialization, and asset_details from the DB,
    # applying the filters specified in the arguments. Does not guarantee that the number of
    # rows returned will match the limit specified. This helper function is used to fetch a
    # chunk of asset key rows, which may or may not be wiped.
    #
    # Returns a tuple of (rows, has_more, cursor), where each row is a tuple of serialized
    # asset_key, materialization, and asset_details
    columns = [
        AssetKeyTable.c.asset_key,
        AssetKeyTable.c.last_materialization,
        AssetKeyTable.c.asset_details,
    ]

    is_partial_query = bool(asset_keys) or bool(prefix) or bool(limit) or bool(cursor)
    if self.has_asset_key_index_cols() and not is_partial_query:
        # if the schema has been migrated, fetch the last_materialization_timestamp to see if
        # we can lazily migrate the data table
        columns.append(AssetKeyTable.c.last_materialization_timestamp)
        columns.append(AssetKeyTable.c.wipe_timestamp)

    query = db.select(columns).order_by(AssetKeyTable.c.asset_key.asc())
    query = self._apply_asset_filter_to_query(query, asset_keys, prefix, limit, cursor)

    if self.has_secondary_index(ASSET_KEY_INDEX_COLS):
        query = query.where(
            db.or_(
                AssetKeyTable.c.wipe_timestamp == None,
                AssetKeyTable.c.last_materialization_timestamp > AssetKeyTable.c.wipe_timestamp,
            )
        )
        with self.index_connection() as conn:
            rows = conn.execute(query).fetchall()

        return rows, False, None

    with self.index_connection() as conn:
        rows = conn.execute(query).fetchall()

    wiped_timestamps_by_asset_key = {}
    row_by_asset_key = OrderedDict()

    for row in rows:
        asset_key = AssetKey.from_db_string(row[0])
        if not asset_key:
            continue
        asset_details = AssetDetails.from_db_string(row[2])
        if not asset_details or not asset_details.last_wipe_timestamp:
            row_by_asset_key[asset_key] = row
            continue
        materialization_or_event = (
            deserialize_json_to_dagster_namedtuple(row[1]) if row[1] else None
        )
        if isinstance(materialization_or_event, EventLogEntry):
            if asset_details.last_wipe_timestamp > materialization_or_event.timestamp:
                # this asset has not been materialized since being wiped, skip
                continue
            else:
                # add the key
                row_by_asset_key[asset_key] = row
        else:
            row_by_asset_key[asset_key] = row
            wiped_timestamps_by_asset_key[asset_key] = asset_details.last_wipe_timestamp

    if wiped_timestamps_by_asset_key:
        materialization_times = self._fetch_backcompat_materialization_times(
            wiped_timestamps_by_asset_key.keys()
        )
        for asset_key, wiped_timestamp in wiped_timestamps_by_asset_key.items():
            materialization_time = materialization_times.get(asset_key)
            if not materialization_time or utc_datetime_from_naive(
                materialization_time
            ) < utc_datetime_from_timestamp(wiped_timestamp):
                # remove rows that have not been materialized since being wiped
                row_by_asset_key.pop(asset_key)

    has_more = limit and len(rows) == limit
    new_cursor = rows[-1][0] if rows else None

    return row_by_asset_key.values(), has_more, new_cursor
def _wrapped_fn(context: SensorEvaluationContext):
    # initiate the cursor to (most recent event id, current timestamp) when:
    # * it's the first time starting the sensor
    # * or, the cursor isn't in valid format (backcompat)
    if context.cursor is None or not RunStatusSensorCursor.is_valid(context.cursor):
        most_recent_event_records = list(
            context.instance.get_event_records(ascending=False, limit=1)
        )
        most_recent_event_id = (
            most_recent_event_records[0].storage_id
            if len(most_recent_event_records) == 1
            else -1
        )

        new_cursor = RunStatusSensorCursor(
            update_timestamp=pendulum.now("UTC").isoformat(),
            record_id=most_recent_event_id,
        )
        context.update_cursor(new_cursor.to_json())
        yield SkipReason(f"Initiating {name}. Set cursor to {new_cursor}")
        return

    record_id, update_timestamp = RunStatusSensorCursor.from_json(context.cursor)

    # Fetch events after the cursor id
    # * we move the cursor forward to the latest visited event's id to avoid revisits
    # * when the daemon is down, bc we persist the cursor info, we can go back to where we
    #   left off and backfill alerts for the qualified events (up to 5 at a time) during the downtime
    # Note: this is a cross-run query which requires extra handling in sqlite, see details in SqliteEventLogStorage.
    event_records = context.instance.get_event_records(
        EventRecordsFilter(
            after_cursor=RunShardedEventsCursor(
                id=record_id,
                run_updated_after=cast(datetime, pendulum.parse(update_timestamp)),
            ),
            event_type=PIPELINE_RUN_STATUS_TO_EVENT_TYPE[pipeline_run_status],
        ),
        ascending=True,
        limit=5,
    )

    for event_record in event_records:
        event_log_entry = event_record.event_log_entry
        storage_id = event_record.storage_id

        # get run info
        run_records = context.instance.get_run_records(
            filters=RunsFilter(run_ids=[event_log_entry.run_id])
        )

        # skip if we couldn't find the right run
        if len(run_records) != 1:
            # bc we couldn't find the run, we use the event timestamp as the approximate
            # run update timestamp
            approximate_update_timestamp = utc_datetime_from_timestamp(
                event_log_entry.timestamp
            )
            context.update_cursor(
                RunStatusSensorCursor(
                    record_id=storage_id,
                    update_timestamp=approximate_update_timestamp.isoformat(),
                ).to_json()
            )
            continue

        pipeline_run = run_records[0].pipeline_run
        update_timestamp = run_records[0].update_timestamp

        # skip if any of the following happens:
        if (
            # the pipeline does not have a repository (manually executed)
            not pipeline_run.external_pipeline_origin
            or
            # the pipeline does not belong to the current repository
            pipeline_run.external_pipeline_origin.external_repository_origin.repository_name
            != context.repository_name
            or
            # if pipeline is not selected
            (pipeline_selection and pipeline_run.pipeline_name not in pipeline_selection)
            or
            # if job not selected
            (
                job_selection
                and pipeline_run.pipeline_name not in map(lambda x: x.name, job_selection)
            )
        ):
            context.update_cursor(
                RunStatusSensorCursor(
                    record_id=storage_id, update_timestamp=update_timestamp.isoformat()
                ).to_json()
            )
            continue

        serializable_error = None

        try:
            with user_code_error_boundary(
                RunStatusSensorExecutionError,
                lambda: f'Error occurred during the execution sensor "{name}".',
            ):
                # one user code invocation maps to one failure event
                run_status_sensor_fn(
                    RunStatusSensorContext(
                        sensor_name=name,
                        dagster_run=pipeline_run,
                        dagster_event=event_log_entry.dagster_event,
                        instance=context.instance,
                    )
                )
        except RunStatusSensorExecutionError as run_status_sensor_execution_error:
            # When the user code errors, we report error to the sensor tick not the original run.
            serializable_error = serializable_error_info_from_exc_info(
                run_status_sensor_execution_error.original_exc_info
            )

        context.update_cursor(
            RunStatusSensorCursor(
                record_id=storage_id, update_timestamp=update_timestamp.isoformat()
            ).to_json()
        )

        # Yield PipelineRunReaction to indicate the execution success/failure.
        # The sensor machinery would
        # * report back to the original run if success
        # * update cursor and job state
        yield PipelineRunReaction(
            pipeline_run=pipeline_run,
            run_status=pipeline_run_status,
            error=serializable_error,
        )
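All of the snippets above funnel float POSIX timestamps through utc_datetime_from_timestamp (and read naive column values back through utc_datetime_from_naive). Below is a minimal sketch of what such helpers can look like using only the standard library; this is an illustrative assumption, not Dagster's actual implementation.

import datetime


def utc_datetime_from_timestamp(timestamp: float) -> datetime.datetime:
    # Convert a float POSIX timestamp into a timezone-aware UTC datetime,
    # suitable for comparing against or storing in the timestamp columns above.
    return datetime.datetime.fromtimestamp(timestamp, tz=datetime.timezone.utc)


def utc_datetime_from_naive(dt: datetime.datetime) -> datetime.datetime:
    # Interpret a naive datetime (as returned by many DB drivers) as UTC so it
    # can be compared against an aware datetime without raising TypeError.
    return dt.replace(tzinfo=datetime.timezone.utc)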