    def add_daemon_heartbeat(self, daemon_heartbeat):
        with self.connect() as conn:

            # insert, or update if already present
            try:
                conn.execute(
                    DaemonHeartbeatsTable.insert().values(  # pylint: disable=no-value-for-parameter
                        timestamp=utc_datetime_from_timestamp(daemon_heartbeat.timestamp),
                        daemon_type=daemon_heartbeat.daemon_type.value,
                        daemon_id=daemon_heartbeat.daemon_id,
                        body=serialize_dagster_namedtuple(daemon_heartbeat),
                    )
                )
            except db.exc.IntegrityError:
                conn.execute(
                    DaemonHeartbeatsTable.update()  # pylint: disable=no-value-for-parameter
                    .where(
                        DaemonHeartbeatsTable.c.daemon_type == daemon_heartbeat.daemon_type.value
                    )
                    .values(  # pylint: disable=no-value-for-parameter
                        timestamp=utc_datetime_from_timestamp(daemon_heartbeat.timestamp),
                        daemon_id=daemon_heartbeat.daemon_id,
                        body=serialize_dagster_namedtuple(daemon_heartbeat),
                    )
                )
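Every example on this page funnels a float Unix timestamp through `utc_datetime_from_timestamp` before writing it to a datetime column. A minimal sketch of what that helper is assumed to do — the real Dagster implementation may differ in details:

from datetime import datetime, timezone

def utc_datetime_from_timestamp(timestamp):
    # Interpret the float as seconds since the Unix epoch and attach UTC tzinfo.
    return datetime.fromtimestamp(timestamp, tz=timezone.utc)

print(utc_datetime_from_timestamp(1609459200.0))  # 2021-01-01 00:00:00+00:00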
Example #2
    def store_asset(self, event):
        check.inst_param(event, "event", EventLogEntry)
        if not event.is_dagster_event or not event.dagster_event.asset_key:
            return

        materialization = event.dagster_event.step_materialization_data.materialization
        # We switched to storing the entire event record of the last materialization instead of just
        # the AssetMaterialization object, so that we have access to metadata like timestamp,
        # pipeline, run_id, etc.
        #
        # This should make certain asset queries significantly more performant, without having to
        # run extra queries against the event log.
        #
        # This should be accompanied by a schema change in 0.12.0, renaming `last_materialization`
        # to `last_materialization_event` for clarity. For now, we keep some back-compat handling.
        #
        # https://github.com/dagster-io/dagster/issues/3945
        if self.has_secondary_index(ASSET_KEY_INDEX_COLS):
            insert_statement = (
                AssetKeyTable.insert().values(  # pylint: disable=no-value-for-parameter
                    asset_key=event.dagster_event.asset_key.to_string(),
                    last_materialization=serialize_dagster_namedtuple(event),
                    last_materialization_timestamp=utc_datetime_from_timestamp(
                        event.timestamp),
                    last_run_id=event.run_id,
                    tags=seven.json.dumps(materialization.tags)
                    if materialization.tags else None,
                ))
            update_statement = (
                AssetKeyTable.update().values(  # pylint: disable=no-value-for-parameter
                    last_materialization=serialize_dagster_namedtuple(event),
                    last_materialization_timestamp=utc_datetime_from_timestamp(
                        event.timestamp),
                    last_run_id=event.run_id,
                    tags=seven.json.dumps(materialization.tags)
                    if materialization.tags else None,
                ).where(
                    AssetKeyTable.c.asset_key ==
                    event.dagster_event.asset_key.to_string(), ))
        else:
            insert_statement = (
                AssetKeyTable.insert().values(  # pylint: disable=no-value-for-parameter
                    asset_key=event.dagster_event.asset_key.to_string(),
                    last_materialization=serialize_dagster_namedtuple(event),
                    last_run_id=event.run_id,
                ))
            update_statement = (
                AssetKeyTable.update().values(  # pylint: disable=no-value-for-parameter
                    last_materialization=serialize_dagster_namedtuple(event),
                    last_run_id=event.run_id,
                ).where(
                    AssetKeyTable.c.asset_key ==
                    event.dagster_event.asset_key.to_string(), ))

        with self.index_connection() as conn:
            try:
                conn.execute(insert_statement)
            except db.exc.IntegrityError:
                conn.execute(update_statement)
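The try/except `IntegrityError` pattern above is a portable upsert: attempt the INSERT, and fall back to an UPDATE when the row already exists. A self-contained sketch of the same idea using only the standard library (the table and column names here are made up for illustration):

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE asset_keys (asset_key TEXT PRIMARY KEY, last_run_id TEXT)")

def upsert_asset(asset_key, run_id):
    try:
        conn.execute(
            "INSERT INTO asset_keys (asset_key, last_run_id) VALUES (?, ?)",
            (asset_key, run_id),
        )
    except sqlite3.IntegrityError:
        # primary-key conflict: the row exists, so update it instead
        conn.execute(
            "UPDATE asset_keys SET last_run_id = ? WHERE asset_key = ?",
            (run_id, asset_key),
        )

upsert_asset("my_asset", "run_1")
upsert_asset("my_asset", "run_2")  # second call hits the conflict and takes the UPDATE branch
print(conn.execute("SELECT last_run_id FROM asset_keys").fetchone())  # ('run_2',)

Examples #4, #12, and #15 below achieve the same effect in a single statement with dialect-specific upserts (`on_duplicate_key_update` for MySQL, `on_conflict_do_update` for Postgres).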
Example #3
    def _add_filter_limit(self, query, before=None, after=None, limit=None):
        check.opt_float_param(before, "before")
        check.opt_float_param(after, "after")
        check.opt_int_param(limit, "limit")

        if before:
            query = query.where(JobTickTable.c.timestamp < utc_datetime_from_timestamp(before))
        if after:
            query = query.where(JobTickTable.c.timestamp > utc_datetime_from_timestamp(after))
        if limit:
            query = query.limit(limit)
        return query
Example #4
    def add_daemon_heartbeat(self, daemon_heartbeat):
        with self.connect() as conn:
            conn.execute(
                db.dialects.mysql.insert(DaemonHeartbeatsTable)
                .values(
                    timestamp=utc_datetime_from_timestamp(daemon_heartbeat.timestamp),
                    daemon_type=daemon_heartbeat.daemon_type,
                    daemon_id=daemon_heartbeat.daemon_id,
                    body=serialize_dagster_namedtuple(daemon_heartbeat),
                )
                .on_duplicate_key_update(
                    timestamp=utc_datetime_from_timestamp(daemon_heartbeat.timestamp),
                    daemon_id=daemon_heartbeat.daemon_id,
                    body=serialize_dagster_namedtuple(daemon_heartbeat),
                )
            )
Example #5
    def purge_ticks(self, origin_id, selector_id, tick_status, before):
        check.str_param(origin_id, "origin_id")
        check.inst_param(tick_status, "tick_status", TickStatus)
        check.float_param(before, "before")

        utc_before = utc_datetime_from_timestamp(before)

        base_query = (
            JobTickTable.delete()  # pylint: disable=no-value-for-parameter
            .where(JobTickTable.c.status == tick_status.value).where(
                JobTickTable.c.timestamp < utc_before))

        if self.has_instigators_table():
            query = base_query.where(
                db.or_(
                    JobTickTable.c.selector_id == selector_id,
                    db.and_(
                        JobTickTable.c.selector_id == None,
                        JobTickTable.c.job_origin_id == origin_id,
                    ),
                ))
        else:
            query = base_query.where(JobTickTable.c.job_origin_id == origin_id)

        with self.connect() as conn:
            conn.execute(query)
Example #6
    def _add_cursor_limit_to_query(
        self,
        query,
        before_cursor,
        after_cursor,
        limit,
        ascending=False,
        before_timestamp=None,
    ):
        """ Helper function to deal with cursor/limit pagination args """

        if before_cursor:
            before_query = db.select([SqlEventLogStorageTable.c.id]).where(
                SqlEventLogStorageTable.c.id == before_cursor
            )
            query = query.where(SqlEventLogStorageTable.c.id < before_query)
        if after_cursor:
            after_query = db.select([SqlEventLogStorageTable.c.id]).where(
                SqlEventLogStorageTable.c.id == after_cursor
            )
            query = query.where(SqlEventLogStorageTable.c.id > after_query)
        if before_timestamp:
            query = query.where(
                SqlEventLogStorageTable.c.timestamp < utc_datetime_from_timestamp(before_timestamp)
            )

        if limit:
            query = query.limit(limit)

        if ascending:
            query = query.order_by(SqlEventLogStorageTable.c.timestamp.asc())
        else:
            query = query.order_by(SqlEventLogStorageTable.c.timestamp.desc())

        return query
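A hedged usage sketch of the same incremental query construction against a toy table, written against the 1.x-style SQLAlchemy API used throughout these examples; printing the statement shows the accumulated WHERE/ORDER BY/LIMIT clauses without needing a database:

import sqlalchemy as db
from datetime import datetime, timezone

metadata = db.MetaData()
event_logs = db.Table(
    "event_logs",
    metadata,
    db.Column("id", db.Integer, primary_key=True),
    db.Column("timestamp", db.DateTime),
)

query = db.select([event_logs.c.id, event_logs.c.timestamp])
query = query.where(event_logs.c.id > 42)  # after_cursor
query = query.where(event_logs.c.timestamp < datetime(2021, 1, 1, tzinfo=timezone.utc))  # before_timestamp
query = query.limit(5).order_by(event_logs.c.timestamp.desc())
print(query)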
Example #7
    def store_event(self, event):
        '''Store an event corresponding to a pipeline run.

        Args:
            event (EventRecord): The event to store.
        '''
        check.inst_param(event, 'event', EventRecord)

        dagster_event_type = None
        if event.is_dagster_event:
            dagster_event_type = event.dagster_event.event_type_value

        run_id = event.run_id

        # https://stackoverflow.com/a/54386260/324449
        event_insert = SqlEventLogStorageTable.insert().values(  # pylint: disable=no-value-for-parameter
            run_id=run_id,
            event=serialize_dagster_namedtuple(event),
            dagster_event_type=dagster_event_type,
            timestamp=utc_datetime_from_timestamp(event.timestamp),
            step_key=event.step_key,
        )

        with self.connect(run_id) as conn:
            conn.execute(event_insert)
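`serialize_dagster_namedtuple` shows up in every insert above. A rough, illustrative sketch of the idea behind it — turning a namedtuple into a JSON string tagged with its class name so it can be reconstructed later — not the actual Dagster serdes format, which also handles nesting, enums, and versioning:

import json
from collections import namedtuple

Heartbeat = namedtuple("Heartbeat", "timestamp daemon_type daemon_id")

def serialize_namedtuple_sketch(nt):
    # tag the payload with the class name so a reader knows how to reconstruct it
    return json.dumps({"__class__": type(nt).__name__, **nt._asdict()})

print(serialize_namedtuple_sketch(Heartbeat(1609459200.0, "SCHEDULER", "abc123")))
# {"__class__": "Heartbeat", "timestamp": 1609459200.0, "daemon_type": "SCHEDULER", "daemon_id": "abc123"}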
Example #8
    def create_schedule_tick(self, repository_name, schedule_tick_data):
        check.str_param(repository_name, 'repository_name')
        check.inst_param(schedule_tick_data, 'schedule_tick_data', ScheduleTickData)

        with self.connect() as conn:
            try:
                tick_insert = ScheduleTickTable.insert().values(  # pylint: disable=no-value-for-parameter
                    repository_name=repository_name,
                    schedule_name=schedule_tick_data.schedule_name,
                    status=schedule_tick_data.status.value,
                    timestamp=utc_datetime_from_timestamp(schedule_tick_data.timestamp),
                    tick_body=serialize_dagster_namedtuple(schedule_tick_data),
                )
                result = conn.execute(tick_insert)
                tick_id = result.inserted_primary_key[0]
                return ScheduleTick(tick_id, schedule_tick_data)
            except db.exc.IntegrityError as exc:
                six.raise_from(
                    DagsterInvariantViolationError(
                        'Unable to insert ScheduleTick for schedule {schedule_name} in storage'.format(
                            schedule_name=schedule_tick_data.schedule_name,
                        )
                    ),
                    exc,
                )
Example #9
    def _add_asset_wipe_filter_to_query(self, query, asset_details):
        if not asset_details or not asset_details.last_wipe_timestamp:
            return query

        return query.where(
            SqlEventLogStorageTable.c.timestamp > utc_datetime_from_timestamp(
                asset_details.last_wipe_timestamp))
Example #10
    def has_asset_key(self, asset_key: AssetKey) -> bool:
        check.inst_param(asset_key, "asset_key", AssetKey)
        query = (
            db.select([AssetKeyTable.c.asset_key, AssetKeyTable.c.asset_details])
            .where(
                db.or_(
                    AssetKeyTable.c.asset_key == asset_key.to_string(),
                    AssetKeyTable.c.asset_key == asset_key.to_string(legacy=True),
                )
            )
            .limit(1)
        )

        with self.index_connection() as conn:
            row = conn.execute(query).fetchone()
            if not row:
                return False

            asset_details: Optional[AssetDetails] = AssetDetails.from_db_string(row[1])
            if not asset_details or not asset_details.last_wipe_timestamp:
                return True

            # filter the event log rows on their own asset_key column; filtering on
            # AssetKeyTable here would produce an unintended cross join
            materialization_row = conn.execute(
                db.select([SqlEventLogStorageTable.c.timestamp])
                .where(
                    db.or_(
                        SqlEventLogStorageTable.c.asset_key == asset_key.to_string(),
                        SqlEventLogStorageTable.c.asset_key == asset_key.to_string(legacy=True),
                    )
                )
                .order_by(SqlEventLogStorageTable.c.timestamp.desc())
                .limit(1)
            ).fetchone()
            if not materialization_row:
                return False

            return utc_datetime_from_naive(materialization_row[0]) > utc_datetime_from_timestamp(
                asset_details.last_wipe_timestamp
            )
Example #11
    def prepare_insert_statement(self, event):
        '''Helper method for preparing the event log SQL insertion statement. Abstracted away to
        have a single place for the logical table representation of the event, while allowing
        SQL backends to provide their own execution strategies for `store_event`. See the
        `dagster-postgres` implementation, which overrides the generic SQL implementation of
        `store_event`.
        '''

        dagster_event_type = None
        asset_key_str = None
        step_key = event.step_key

        if event.is_dagster_event:
            dagster_event_type = event.dagster_event.event_type_value
            step_key = event.dagster_event.step_key
            if event.dagster_event.asset_key:
                check.inst_param(event.dagster_event.asset_key, 'asset_key',
                                 AssetKey)
                asset_key_str = event.dagster_event.asset_key.to_string()

        # https://stackoverflow.com/a/54386260/324449
        return SqlEventLogStorageTable.insert().values(  # pylint: disable=no-value-for-parameter
            run_id=event.run_id,
            event=serialize_dagster_namedtuple(event),
            dagster_event_type=dagster_event_type,
            timestamp=utc_datetime_from_timestamp(event.timestamp),
            step_key=step_key,
            asset_key=asset_key_str,
        )
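An illustrative sketch (not the actual Dagster class hierarchy) of the pattern described in the docstring: the base class owns the logical statement construction, and a backend-specific subclass decides how to execute it:

class SqlEventLogStorageSketch:
    def prepare_insert_statement(self, event):
        ...  # build and return the INSERT statement, as above

    def store_event(self, event):
        # generic execution path shared by most SQL backends
        insert_statement = self.prepare_insert_statement(event)
        with self.connect(event.run_id) as conn:
            conn.execute(insert_statement)


class PostgresEventLogStorageSketch(SqlEventLogStorageSketch):
    def store_event(self, event):
        # a backend can reuse prepare_insert_statement but execute it differently,
        # e.g. to return the new id or to notify listeners after the insert
        ...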
Example #12
    def store_asset(self, event):
        check.inst_param(event, "event", EventLogEntry)
        if not event.is_dagster_event or not event.dagster_event.asset_key:
            return

        materialization = event.dagster_event.step_materialization_data.materialization
        if self.has_secondary_index(ASSET_KEY_INDEX_COLS):
            with self.index_connection() as conn:
                conn.execute(
                    db.dialects.postgresql.insert(AssetKeyTable).values(
                        asset_key=event.dagster_event.asset_key.to_string(),
                        last_materialization=serialize_dagster_namedtuple(
                            materialization),
                        last_materialization_timestamp=
                        utc_datetime_from_timestamp(event.timestamp),
                        last_run_id=event.run_id,
                        tags=seven.json.dumps(materialization.tags)
                        if materialization.tags else None,
                    ).on_conflict_do_update(
                        index_elements=[AssetKeyTable.c.asset_key],
                        set_=dict(
                            last_materialization=serialize_dagster_namedtuple(
                                materialization),
                            last_materialization_timestamp=
                            utc_datetime_from_timestamp(event.timestamp),
                            last_run_id=event.run_id,
                            tags=seven.json.dumps(materialization.tags)
                            if materialization.tags else None,
                        ),
                    ))

        else:
            with self.index_connection() as conn:
                conn.execute(
                    db.dialects.postgresql.insert(AssetKeyTable).values(
                        asset_key=event.dagster_event.asset_key.to_string(),
                        last_materialization=serialize_dagster_namedtuple(
                            materialization),
                        last_run_id=event.run_id,
                    ).on_conflict_do_update(
                        index_elements=[AssetKeyTable.c.asset_key],
                        set_=dict(
                            last_materialization=serialize_dagster_namedtuple(
                                materialization),
                            last_run_id=event.run_id,
                        ),
                    ))
Example #13
    def all_asset_keys(self):
        with self.index_connection() as conn:
            results = conn.execute(
                db.select([AssetKeyTable.c.asset_key, AssetKeyTable.c.asset_details])
            ).fetchall()

            asset_keys = set()
            wiped = set()
            wiped_timestamps = {}
            for result in results:
                asset_key = AssetKey.from_db_string(result[0])
                asset_details: Optional[AssetDetails] = AssetDetails.from_db_string(result[1])
                asset_keys.add(asset_key)
                if asset_details and asset_details.last_wipe_timestamp:
                    wiped_timestamps[asset_key] = asset_details.last_wipe_timestamp

            if wiped_timestamps:
                materialized_timestamps = {}

                # fetch the last materialization timestamp per asset key
                materialization_results = conn.execute(
                    db.select(
                        [
                            SqlEventLogStorageTable.c.asset_key,
                            db.func.max(SqlEventLogStorageTable.c.timestamp),
                        ]
                    )
                    .where(
                        SqlEventLogStorageTable.c.asset_key.in_(
                            [asset_key.to_string() for asset_key in wiped_timestamps.keys()]
                        )
                    )
                    .group_by(SqlEventLogStorageTable.c.asset_key)
                    .order_by(db.func.max(SqlEventLogStorageTable.c.timestamp).asc())
                ).fetchall()

                for result in materialization_results:
                    asset_key = AssetKey.from_db_string(result[0])
                    last_materialized_timestamp = result[1]
                    materialized_timestamps[asset_key] = last_materialized_timestamp

                # calculate the set of wiped asset keys that have not had a materialization since
                # the wipe timestamp
                wiped = set(
                    [
                        asset_key
                        for asset_key in wiped_timestamps.keys()
                        if not materialized_timestamps.get(asset_key)
                        or utc_datetime_from_naive(materialized_timestamps.get(asset_key))
                        < utc_datetime_from_timestamp(wiped_timestamps[asset_key])
                    ]
                )

        return list(asset_keys.difference(wiped))
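A tiny concrete illustration of the wipe filter computed above, using plain floats in place of database rows: a wiped asset key stays hidden unless it has a materialization newer than its wipe timestamp.

wiped_timestamps = {"asset_a": 100.0, "asset_b": 100.0}
materialized_timestamps = {"asset_a": 150.0}  # asset_b was never re-materialized

wiped = {
    key
    for key, wiped_at in wiped_timestamps.items()
    if materialized_timestamps.get(key, 0.0) < wiped_at
}
print(wiped)  # {'asset_b'}: asset_a re-materialized after the wipe, so it stays visible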
Example #14
    def add_backfill(self, partition_backfill: PartitionBackfill):
        check.inst_param(partition_backfill, "partition_backfill", PartitionBackfill)
        with self.connect() as conn:
            conn.execute(
                BulkActionsTable.insert().values(  # pylint: disable=no-value-for-parameter
                    key=partition_backfill.backfill_id,
                    status=partition_backfill.status.value,
                    timestamp=utc_datetime_from_timestamp(partition_backfill.backfill_timestamp),
                    body=serialize_dagster_namedtuple(partition_backfill),
                )
            )
Example #15
    def add_daemon_heartbeat(self, daemon_heartbeat):
        with self.connect() as conn:

            # insert, or update if already present, using the Postgres-specific on_conflict clause
            conn.execute(
                db.dialects.postgresql.insert(DaemonHeartbeatsTable)
                .values(  # pylint: disable=no-value-for-parameter
                    timestamp=utc_datetime_from_timestamp(daemon_heartbeat.timestamp),
                    daemon_type=daemon_heartbeat.daemon_type,
                    daemon_id=daemon_heartbeat.daemon_id,
                    body=serialize_dagster_namedtuple(daemon_heartbeat),
                )
                .on_conflict_do_update(
                    index_elements=[DaemonHeartbeatsTable.c.daemon_type],
                    set_={
                        "timestamp": utc_datetime_from_timestamp(daemon_heartbeat.timestamp),
                        "daemon_id": daemon_heartbeat.daemon_id,
                        "body": serialize_dagster_namedtuple(daemon_heartbeat),
                    },
                )
            )
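A standalone sketch of what a dialect-specific upsert like the one above compiles to, using a toy table; compiling only needs the postgresql dialect module, not a running server (the table here is an assumption for illustration):

import sqlalchemy as db
from sqlalchemy.dialects import postgresql

metadata = db.MetaData()
heartbeats = db.Table(
    "heartbeats",
    metadata,
    db.Column("daemon_type", db.String, primary_key=True),
    db.Column("body", db.String),
)

stmt = postgresql.insert(heartbeats).values(daemon_type="SCHEDULER", body="{}")
stmt = stmt.on_conflict_do_update(
    index_elements=[heartbeats.c.daemon_type],
    set_={"body": stmt.excluded.body},
)
print(stmt.compile(dialect=postgresql.dialect()))
# roughly: INSERT INTO heartbeats (daemon_type, body) VALUES (...)
#          ON CONFLICT (daemon_type) DO UPDATE SET body = excluded.body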
Example #16
    def has_asset_key(self, asset_key: AssetKey) -> bool:
        check.inst_param(asset_key, "asset_key", AssetKey)
        if self.has_secondary_index(ASSET_KEY_INDEX_COLS):
            query = (
                db.select([AssetKeyTable.c.asset_key])
                .where(
                    db.or_(
                        AssetKeyTable.c.asset_key == asset_key.to_string(),
                        AssetKeyTable.c.asset_key == asset_key.to_string(legacy=True),
                    )
                )
                .where(
                    db.or_(
                        AssetKeyTable.c.wipe_timestamp == None,
                        AssetKeyTable.c.last_materialization_timestamp > AssetKeyTable.c.wipe_timestamp,
                    )
                )
                .limit(1)
            )
            with self.index_connection() as conn:
                row = conn.execute(query).fetchone()
                return bool(row)

        # has not migrated, need to pull asset_details to get wipe status
        query = (
            db.select([AssetKeyTable.c.asset_key, AssetKeyTable.c.asset_details])
            .where(
                db.or_(
                    AssetKeyTable.c.asset_key == asset_key.to_string(),
                    AssetKeyTable.c.asset_key == asset_key.to_string(legacy=True),
                )
            )
            .limit(1)
        )

        with self.index_connection() as conn:
            row = conn.execute(query).fetchone()
            if not row:
                return False

            asset_details: Optional[AssetDetails] = AssetDetails.from_db_string(row[1])
            if not asset_details or not asset_details.last_wipe_timestamp:
                return True

            # as above, filter event log rows on their own asset_key column rather than AssetKeyTable
            materialization_row = conn.execute(
                db.select([SqlEventLogStorageTable.c.timestamp])
                .where(
                    db.or_(
                        SqlEventLogStorageTable.c.asset_key == asset_key.to_string(),
                        SqlEventLogStorageTable.c.asset_key == asset_key.to_string(legacy=True),
                    )
                )
                .order_by(SqlEventLogStorageTable.c.timestamp.desc())
                .limit(1)
            ).fetchone()
            if not materialization_row:
                return False

            return utc_datetime_from_naive(materialization_row[0]) > utc_datetime_from_timestamp(
                asset_details.last_wipe_timestamp
            )
Example #17
    def purge_job_ticks(self, job_origin_id, tick_status, before):
        check.str_param(job_origin_id, "job_origin_id")
        check.inst_param(tick_status, "tick_status", JobTickStatus)
        check.float_param(before, "before")

        utc_before = utc_datetime_from_timestamp(before)

        with self.connect() as conn:
            conn.execute(
                JobTickTable.delete()  # pylint: disable=no-value-for-parameter
                .where(JobTickTable.c.status == tick_status.value).where(
                    JobTickTable.c.timestamp < utc_before).where(
                        JobTickTable.c.job_origin_id == job_origin_id))
Example #18
    def update_job_tick(self, tick):
        check.inst_param(tick, "tick", JobTick)

        with self.connect() as conn:
            conn.execute(
                JobTickTable.update()  # pylint: disable=no-value-for-parameter
                .where(JobTickTable.c.id == tick.tick_id).values(
                    status=tick.status.value,
                    type=tick.job_type.value,
                    timestamp=utc_datetime_from_timestamp(tick.timestamp),
                    tick_body=serialize_dagster_namedtuple(tick.job_tick_data),
                ))

        return tick
Example #19
    def _add_filter_limit(self,
                          query,
                          before=None,
                          after=None,
                          limit=None,
                          statuses=None):
        check.opt_float_param(before, "before")
        check.opt_float_param(after, "after")
        check.opt_int_param(limit, "limit")
        check.opt_list_param(statuses, "statuses", of_type=TickStatus)

        if before:
            query = query.where(
                JobTickTable.c.timestamp < utc_datetime_from_timestamp(before))
        if after:
            query = query.where(
                JobTickTable.c.timestamp > utc_datetime_from_timestamp(after))
        if limit:
            query = query.limit(limit)
        if statuses:
            query = query.where(
                JobTickTable.c.status.in_(
                    [status.value for status in statuses]))
        return query
Example #20
    def update_event_log_record(self, record_id, event):
        '''Utility method for migration scripts to update SQL representation of event records.'''
        check.int_param(record_id, 'record_id')
        check.inst_param(event, 'event', EventRecord)
        dagster_event_type = None
        if event.is_dagster_event:
            dagster_event_type = event.dagster_event.event_type_value
        with self.connect(run_id=event.run_id) as conn:
            conn.execute(
                SqlEventLogStorageTable.update()  # pylint: disable=no-value-for-parameter
                .where(SqlEventLogStorageTable.c.id == record_id)
                .values(
                    event=serialize_dagster_namedtuple(event),
                    dagster_event_type=dagster_event_type,
                    timestamp=utc_datetime_from_timestamp(event.timestamp),
                    step_key=event.step_key,
                )
            )
Example #21
    def store_asset_event(self, event):
        asset_key = event.dagster_event.asset_key
        asset = self._assets[asset_key] if asset_key in self._assets else {
            "id": len(self._assets)
        }

        asset["last_materialization_timestamp"] = utc_datetime_from_timestamp(
            event.timestamp)
        if event.dagster_event.is_step_materialization:
            materialization = event.dagster_event.step_materialization_data.materialization
            asset["last_materialization"] = event
            asset[
                "tags"] = materialization.tags if materialization.tags else None
        if (event.dagster_event.is_step_materialization
                or event.dagster_event.is_asset_materialization_planned):
            asset["last_run_id"] = event.run_id

        self._assets[asset_key] = asset
Example #22
    def update_tick(self, tick):
        check.inst_param(tick, "tick", InstigatorTick)

        values = {
            "status": tick.status.value,
            "type": tick.instigator_type.value,
            "timestamp": utc_datetime_from_timestamp(tick.timestamp),
            "tick_body": serialize_dagster_namedtuple(tick.tick_data),
        }
        if self.has_instigators_table() and tick.selector_id:
            values["selector_id"] = tick.selector_id

        with self.connect() as conn:
            conn.execute(
                JobTickTable.update()  # pylint: disable=no-value-for-parameter
                .where(JobTickTable.c.id == tick.tick_id).values(**values))

        return tick
Example #23
    def wipe_asset(self, asset_key):
        check.inst_param(asset_key, "asset_key", AssetKey)

        wipe_timestamp = pendulum.now("UTC").timestamp()

        if self.has_secondary_index(ASSET_KEY_INDEX_COLS):
            with self.index_connection() as conn:
                conn.execute(
                    AssetKeyTable.update()  # pylint: disable=no-value-for-parameter
                    .where(
                        db.or_(
                            AssetKeyTable.c.asset_key == asset_key.to_string(),
                            AssetKeyTable.c.asset_key == asset_key.to_string(
                                legacy=True),
                        )).values(
                            last_materialization=None,
                            last_run_id=None,
                            last_materialization_timestamp=None,
                            tags=None,
                            asset_details=serialize_dagster_namedtuple(
                                AssetDetails(
                                    last_wipe_timestamp=wipe_timestamp)),
                            wipe_timestamp=utc_datetime_from_timestamp(
                                wipe_timestamp),
                        ))

        else:
            with self.index_connection() as conn:
                conn.execute(
                    AssetKeyTable.update()  # pylint: disable=no-value-for-parameter
                    .where(
                        db.or_(
                            AssetKeyTable.c.asset_key == asset_key.to_string(),
                            AssetKeyTable.c.asset_key == asset_key.to_string(
                                legacy=True),
                        )).values(
                            last_materialization=None,
                            last_run_id=None,
                            asset_details=serialize_dagster_namedtuple(
                                AssetDetails(
                                    last_wipe_timestamp=wipe_timestamp)),
                        ))
Example #24
    def create_job_tick(self, job_tick_data):
        check.inst_param(job_tick_data, "job_tick_data", JobTickData)

        with self.connect() as conn:
            try:
                tick_insert = JobTickTable.insert().values(  # pylint: disable=no-value-for-parameter
                    job_origin_id=job_tick_data.job_origin_id,
                    status=job_tick_data.status.value,
                    type=job_tick_data.job_type.value,
                    timestamp=utc_datetime_from_timestamp(
                        job_tick_data.timestamp),
                    tick_body=serialize_dagster_namedtuple(job_tick_data),
                )
                result = conn.execute(tick_insert)
                tick_id = result.inserted_primary_key[0]
                return JobTick(tick_id, job_tick_data)
            except db.exc.IntegrityError as exc:
                raise DagsterInvariantViolationError(
                    f"Unable to insert JobTick for job {job_tick_data.job_name} in storage"
                ) from exc
Example #25
    def create_tick(self, tick_data):
        check.inst_param(tick_data, "tick_data", TickData)

        values = {
            "job_origin_id": tick_data.instigator_origin_id,
            "status": tick_data.status.value,
            "type": tick_data.instigator_type.value,
            "timestamp": utc_datetime_from_timestamp(tick_data.timestamp),
            "tick_body": serialize_dagster_namedtuple(tick_data),
        }
        if self.has_instigators_table() and tick_data.selector_id:
            values["selector_id"] = tick_data.selector_id

        with self.connect() as conn:
            try:
                tick_insert = JobTickTable.insert().values(**values)  # pylint: disable=no-value-for-parameter
                result = conn.execute(tick_insert)
                tick_id = result.inserted_primary_key[0]
                return InstigatorTick(tick_id, tick_data)
            except db.exc.IntegrityError as exc:
                raise DagsterInvariantViolationError(
                    f"Unable to insert InstigatorTick for job {tick_data.instigator_name} in storage"
                ) from exc
Example #26
    def update_event_log_record(self, record_id, event):
        """ Utility method for migration scripts to update SQL representation of event records. """
        check.int_param(record_id, "record_id")
        check.inst_param(event, "event", EventRecord)
        dagster_event_type = None
        asset_key_str = None
        if event.is_dagster_event:
            dagster_event_type = event.dagster_event.event_type_value
            if event.dagster_event.asset_key:
                check.inst_param(event.dagster_event.asset_key, "asset_key",
                                 AssetKey)
                asset_key_str = event.dagster_event.asset_key.to_string()

        with self.run_connection(run_id=event.run_id) as conn:
            conn.execute(
                SqlEventLogStorageTable.update()  # pylint: disable=no-value-for-parameter
                .where(SqlEventLogStorageTable.c.id == record_id).values(
                    event=serialize_dagster_namedtuple(event),
                    dagster_event_type=dagster_event_type,
                    timestamp=utc_datetime_from_timestamp(event.timestamp),
                    step_key=event.step_key,
                    asset_key=asset_key_str,
                ))
Example #27
def get_asset_runs_count_by_step(graphene_info, asset_nodes):
    from ..schema.pipelines.pipeline import GrapheneJobRunsCount

    instance = graphene_info.context.instance

    jobs_runs_count: Dict[str, GrapheneJobRunsCount] = {}

    if len(asset_nodes) == 0:
        return jobs_runs_count

    step_key_to_job_names: Dict[str, List[str]] = {
        asset_node.op_name: asset_node.job_names
        for asset_node in asset_nodes
    }
    materializations = instance.get_latest_materialization_events(
        [asset_node.asset_key for asset_node in asset_nodes])
    for asset_node in asset_nodes:
        event = materializations.get(asset_node.asset_key)
        step_key = asset_node.op_name
        job_names = step_key_to_job_names[step_key]
        runs_count = sum([
            instance.get_runs_count(
                RunsFilter(
                    pipeline_name=job_name,
                    updated_after=utc_datetime_from_timestamp(event.timestamp)
                    if event else None,
                )) for job_name in job_names
        ])

        # If a materialization has occurred, we subtract one so that the runs count
        # does not include the run that generated the materialization.
        if event:
            runs_count -= 1

        jobs_runs_count[step_key] = GrapheneJobRunsCount(
            step_key, job_names, runs_count, True if event else False)
    return jobs_runs_count
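A tiny worked illustration of the adjustment in the comment above: the run that produced the latest materialization is itself updated after that materialization's timestamp, so it appears in the filtered count and is subtracted back out.

runs_updated_after_materialization = 3  # includes the run that emitted the materialization
has_materialization = True

runs_count = runs_updated_after_materialization - (1 if has_materialization else 0)
print(runs_count)  # 2 runs since the materialization, excluding the materializing run itself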
Example #28
def migrate_asset_keys_index_columns(event_log_storage, print_fn=None):
    from dagster.core.storage.event_log.sql_event_log import SqlEventLogStorage
    from dagster.serdes import serialize_dagster_namedtuple
    from .schema import AssetKeyTable, SqlEventLogStorageTable

    if not isinstance(event_log_storage, SqlEventLogStorage):
        return

    with event_log_storage.index_connection() as conn:
        if print_fn:
            print_fn("Querying asset keys.")
        results = conn.execute(
            db.select(
                [
                    AssetKeyTable.c.asset_key,
                    AssetKeyTable.c.asset_details,
                    AssetKeyTable.c.last_materialization,
                ]
            )
        ).fetchall()

        if print_fn:
            print_fn(f"Found {len(results)} assets to reindex.")
            results = tqdm(results)

        for row in results:
            asset_key_str, asset_details_str, last_materialization_str = row
            wipe_timestamp = None
            event = None

            asset_key = AssetKey.from_db_string(asset_key_str)

            if asset_details_str:
                asset_details = deserialize_json_to_dagster_namedtuple(asset_details_str)
                wipe_timestamp = asset_details.last_wipe_timestamp if asset_details else None

            if last_materialization_str:
                event_or_materialization = deserialize_json_to_dagster_namedtuple(
                    last_materialization_str
                )

                if isinstance(event_or_materialization, EventLogEntry):
                    event = event_or_materialization

            if not event:
                materialization_query = (
                    db.select([SqlEventLogStorageTable.c.event])
                    .where(
                        db.or_(
                            SqlEventLogStorageTable.c.asset_key == asset_key.to_string(),
                            SqlEventLogStorageTable.c.asset_key == asset_key.to_string(legacy=True),
                        )
                    )
                    .order_by(SqlEventLogStorageTable.c.timestamp.desc())
                    .limit(1)
                )
                row = conn.execute(materialization_query).fetchone()
                if row:
                    event = deserialize_json_to_dagster_namedtuple(row[0])

            if not event:
                # this must be a wiped asset
                conn.execute(
                    AssetKeyTable.update()
                    .values(  # pylint: disable=no-value-for-parameter
                        last_materialization=None,
                        last_materialization_timestamp=None,
                        wipe_timestamp=utc_datetime_from_timestamp(wipe_timestamp)
                        if wipe_timestamp
                        else None,
                        tags=None,
                    )
                    .where(
                        AssetKeyTable.c.asset_key == asset_key.to_string(),
                    )
                )
            else:
                tags = event.dagster_event.step_materialization_data.materialization.tags
                conn.execute(
                    AssetKeyTable.update()
                    .values(  # pylint: disable=no-value-for-parameter
                        last_materialization=serialize_dagster_namedtuple(event),
                        last_materialization_timestamp=utc_datetime_from_timestamp(event.timestamp),
                        wipe_timestamp=utc_datetime_from_timestamp(wipe_timestamp)
                        if wipe_timestamp
                        else None,
                        tags=seven.json.dumps(tags) if tags else None,
                    )
                    .where(
                        AssetKeyTable.c.asset_key == asset_key.to_string(),
                    )
                )
Example #29
    def _fetch_raw_asset_rows(self, asset_keys=None, prefix=None, limit=None, cursor=None):
        # fetches rows containing asset_key, last_materialization, and asset_details from the DB,
        # applying the filters specified in the arguments.  Does not guarantee that the number of
        # rows returned will match the limit specified.  This helper function is used to fetch a
        # chunk of asset key rows, which may or may not be wiped.
        #
        # Returns a tuple of (rows, has_more, cursor), where each row is a tuple of serialized
        # asset_key, materialization, and asset_details

        columns = [
            AssetKeyTable.c.asset_key,
            AssetKeyTable.c.last_materialization,
            AssetKeyTable.c.asset_details,
        ]

        is_partial_query = bool(asset_keys) or bool(prefix) or bool(limit) or bool(cursor)
        if self.has_asset_key_index_cols() and not is_partial_query:
            # if the schema has been migrated, fetch the last_materialization_timestamp to see if
            # we can lazily migrate the data table
            columns.append(AssetKeyTable.c.last_materialization_timestamp)
            columns.append(AssetKeyTable.c.wipe_timestamp)

        query = db.select(columns).order_by(AssetKeyTable.c.asset_key.asc())
        query = self._apply_asset_filter_to_query(query, asset_keys, prefix, limit, cursor)

        if self.has_secondary_index(ASSET_KEY_INDEX_COLS):
            query = query.where(
                db.or_(
                    AssetKeyTable.c.wipe_timestamp == None,
                    AssetKeyTable.c.last_materialization_timestamp > AssetKeyTable.c.wipe_timestamp,
                )
            )
            with self.index_connection() as conn:
                rows = conn.execute(query).fetchall()

            return rows, False, None

        with self.index_connection() as conn:
            rows = conn.execute(query).fetchall()

        wiped_timestamps_by_asset_key = {}
        row_by_asset_key = OrderedDict()

        for row in rows:
            asset_key = AssetKey.from_db_string(row[0])
            if not asset_key:
                continue
            asset_details = AssetDetails.from_db_string(row[2])
            if not asset_details or not asset_details.last_wipe_timestamp:
                row_by_asset_key[asset_key] = row
                continue
            materialization_or_event = (
                deserialize_json_to_dagster_namedtuple(row[1]) if row[1] else None
            )
            if isinstance(materialization_or_event, EventLogEntry):
                if asset_details.last_wipe_timestamp > materialization_or_event.timestamp:
                    # this asset has not been materialized since being wiped, skip
                    continue
                else:
                    # add the key
                    row_by_asset_key[asset_key] = row
            else:
                row_by_asset_key[asset_key] = row
                wiped_timestamps_by_asset_key[asset_key] = asset_details.last_wipe_timestamp

        if wiped_timestamps_by_asset_key:
            materialization_times = self._fetch_backcompat_materialization_times(
                wiped_timestamps_by_asset_key.keys()
            )
            for asset_key, wiped_timestamp in wiped_timestamps_by_asset_key.items():
                materialization_time = materialization_times.get(asset_key)
                if not materialization_time or utc_datetime_from_naive(
                    materialization_time
                ) < utc_datetime_from_timestamp(wiped_timestamp):
                    # remove rows that have not been materialized since being wiped
                    row_by_asset_key.pop(asset_key)

        has_more = limit and len(rows) == limit
        new_cursor = rows[-1][0] if rows else None

        return row_by_asset_key.values(), has_more, new_cursor
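A hedged sketch of how a caller could drive the (rows, has_more, cursor) contract described in the comment above to page through every asset row; the `storage` argument and the chunk size are assumptions for illustration:

def iterate_asset_rows(storage, chunk_size=100):
    cursor = None
    while True:
        rows, has_more, cursor = storage._fetch_raw_asset_rows(limit=chunk_size, cursor=cursor)
        for row in rows:
            yield row
        if not has_more:
            break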
Example #30
        def _wrapped_fn(context: SensorEvaluationContext):
            # initialize the cursor to (most recent event id, current timestamp) when:
            # * it's the first time starting the sensor
            # * or, the cursor isn't in a valid format (back-compat)
            if context.cursor is None or not RunStatusSensorCursor.is_valid(context.cursor):
                most_recent_event_records = list(
                    context.instance.get_event_records(ascending=False, limit=1)
                )
                most_recent_event_id = (
                    most_recent_event_records[0].storage_id
                    if len(most_recent_event_records) == 1
                    else -1
                )

                new_cursor = RunStatusSensorCursor(
                    update_timestamp=pendulum.now("UTC").isoformat(),
                    record_id=most_recent_event_id,
                )
                context.update_cursor(new_cursor.to_json())
                yield SkipReason(f"Initiating {name}. Set cursor to {new_cursor}")
                return

            record_id, update_timestamp = RunStatusSensorCursor.from_json(context.cursor)

            # Fetch events after the cursor id
            # * we move the cursor forward to the latest visited event's id to avoid revisits
            # * because we persist the cursor info, we can pick up where we left off when the daemon
            #   comes back up and backfill alerts for qualifying events (up to 5 at a time) from the downtime
            # Note: this is a cross-run query which requires extra handling in sqlite; see details in SqliteEventLogStorage.
            event_records = context.instance.get_event_records(
                EventRecordsFilter(
                    after_cursor=RunShardedEventsCursor(
                        id=record_id,
                        run_updated_after=cast(datetime, pendulum.parse(update_timestamp)),
                    ),
                    event_type=PIPELINE_RUN_STATUS_TO_EVENT_TYPE[pipeline_run_status],
                ),
                ascending=True,
                limit=5,
            )

            for event_record in event_records:
                event_log_entry = event_record.event_log_entry
                storage_id = event_record.storage_id

                # get run info
                run_records = context.instance.get_run_records(
                    filters=RunsFilter(run_ids=[event_log_entry.run_id])
                )

                # skip if we couldn't find the right run
                if len(run_records) != 1:
                    # because we couldn't find the run, we use the event timestamp as the approximate
                    # run update timestamp
                    approximate_update_timestamp = utc_datetime_from_timestamp(
                        event_log_entry.timestamp
                    )
                    context.update_cursor(
                        RunStatusSensorCursor(
                            record_id=storage_id,
                            update_timestamp=approximate_update_timestamp.isoformat(),
                        ).to_json()
                    )
                    continue

                pipeline_run = run_records[0].pipeline_run
                update_timestamp = run_records[0].update_timestamp

                # skip if any of the following happens:
                if (
                    # the pipeline does not have a repository (manually executed)
                    not pipeline_run.external_pipeline_origin
                    or
                    # the pipeline does not belong to the current repository
                    pipeline_run.external_pipeline_origin.external_repository_origin.repository_name
                    != context.repository_name
                    or
                    # if pipeline is not selected
                    (pipeline_selection and pipeline_run.pipeline_name not in pipeline_selection)
                    or
                    # if job not selected
                    (
                        job_selection
                        and pipeline_run.pipeline_name not in map(lambda x: x.name, job_selection)
                    )
                ):
                    context.update_cursor(
                        RunStatusSensorCursor(
                            record_id=storage_id, update_timestamp=update_timestamp.isoformat()
                        ).to_json()
                    )
                    continue

                serializable_error = None

                try:
                    with user_code_error_boundary(
                        RunStatusSensorExecutionError,
                        lambda: f'Error occurred during the execution of sensor "{name}".',
                    ):
                        # one user code invocation maps to one failure event
                        run_status_sensor_fn(
                            RunStatusSensorContext(
                                sensor_name=name,
                                dagster_run=pipeline_run,
                                dagster_event=event_log_entry.dagster_event,
                                instance=context.instance,
                            )
                        )
                except RunStatusSensorExecutionError as run_status_sensor_execution_error:
                    # When the user code errors, we report error to the sensor tick not the original run.
                    serializable_error = serializable_error_info_from_exc_info(
                        run_status_sensor_execution_error.original_exc_info
                    )

                context.update_cursor(
                    RunStatusSensorCursor(
                        record_id=storage_id, update_timestamp=update_timestamp.isoformat()
                    ).to_json()
                )

                # Yield PipelineRunReaction to indicate the execution success/failure.
                # The sensor machinery would
                # * report back to the original run if success
                # * update cursor and job state
                yield PipelineRunReaction(
                    pipeline_run=pipeline_run,
                    run_status=pipeline_run_status,
                    error=serializable_error,
                )
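A minimal stand-in for the cursor round-trip the sensor above relies on: persist (record_id, update_timestamp) as JSON between evaluations. This is a simplified sketch, not the actual RunStatusSensorCursor implementation:

import json
from collections import namedtuple

SensorCursorSketch = namedtuple("SensorCursorSketch", "record_id update_timestamp")

def cursor_to_json(cursor):
    return json.dumps(cursor._asdict())

def cursor_from_json(raw):
    return SensorCursorSketch(**json.loads(raw))

raw = cursor_to_json(SensorCursorSketch(record_id=123, update_timestamp="2021-01-01T00:00:00+00:00"))
record_id, update_timestamp = cursor_from_json(raw)  # unpacks in field order, like the code above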