Esempio n. 1
0
    def get_latest_materialization_events(
        self, asset_keys: Sequence[AssetKey]
    ) -> Mapping[AssetKey, Optional[EventLogEntry]]:
        """Return the most recent materialization event for each requested asset key.

        Rows from ``self._fetch_asset_rows`` are consulted first.  When a row's stored
        payload is empty or does not deserialize to an ``EventLogEntry``, the key is
        resolved against the raw event log table instead.

        Args:
            asset_keys: The asset keys to look up; validated as a list of ``AssetKey``.

        Returns:
            A mapping from asset key to its latest materialization ``EventLogEntry``.
            Keys for which nothing was found are absent from the mapping.
        """
        check.list_param(asset_keys, "asset_keys", AssetKey)

        latest_by_key: Dict[AssetKey, Optional[EventLogEntry]] = {}
        needs_event_log_lookup = set()
        for asset_row in self._fetch_asset_rows(asset_keys=asset_keys):
            parsed_key = AssetKey.from_db_string(asset_row[0])
            if not parsed_key:
                continue
            serialized = asset_row[1]
            stored = deserialize_json_to_dagster_namedtuple(serialized) if serialized else None
            if isinstance(stored, EventLogEntry):
                latest_by_key[parsed_key] = stored
            else:
                needs_event_log_lookup.add(parsed_key)

        if not needs_event_log_lookup:
            return latest_by_key

        events = SqlEventLogStorageTable.c

        # Newest materialization timestamp per asset key, restricted to the keys that
        # could not be resolved from the asset rows.
        newest_per_key = (
            db.select([events.asset_key, db.func.max(events.timestamp).label("timestamp")])
            .where(
                db.and_(
                    events.asset_key.in_([key.to_string() for key in needs_event_log_lookup]),
                    events.dagster_event_type == DagsterEventType.ASSET_MATERIALIZATION.value,
                )
            )
            .group_by(events.asset_key)
            .subquery()
        )

        # Join back onto the event log to pick up the serialized event at that timestamp.
        join_condition = db.and_(
            events.asset_key == newest_per_key.c.asset_key,
            events.timestamp == newest_per_key.c.timestamp,
        )
        event_query = db.select([events.asset_key, events.event]).join(
            newest_per_key, join_condition
        )
        with self.index_connection() as conn:
            fetched = conn.execute(event_query).fetchall()

        for raw_key, raw_event in fetched:
            parsed_key = AssetKey.from_db_string(raw_key)
            if parsed_key:
                latest_by_key[parsed_key] = cast(
                    EventLogEntry, deserialize_json_to_dagster_namedtuple(raw_event)
                )

        return latest_by_key
Esempio n. 2
0
    def all_asset_keys(self):
        """Return every known asset key that is not currently wiped.

        An asset key counts as wiped when its asset details carry a
        ``last_wipe_timestamp`` and the newest event-log timestamp recorded for that key
        is missing or older than the wipe.

        Returns:
            A list of ``AssetKey`` objects, in no particular order.  Rows whose stored
            key string cannot be parsed into an ``AssetKey`` are skipped.
        """
        with self.index_connection() as conn:
            key_rows = conn.execute(
                db.select([AssetKeyTable.c.asset_key, AssetKeyTable.c.asset_details])
            ).fetchall()

            asset_keys = set()
            wiped_timestamps = {}
            for row in key_rows:
                asset_key = AssetKey.from_db_string(row[0])
                if not asset_key:
                    # An unparseable key would otherwise leak ``None`` into the result
                    # (and break ``to_string()`` below).
                    continue
                asset_details: Optional[AssetDetails] = AssetDetails.from_db_string(row[1])
                asset_keys.add(asset_key)
                if asset_details and asset_details.last_wipe_timestamp:
                    wiped_timestamps[asset_key] = asset_details.last_wipe_timestamp

            wiped = set()
            if wiped_timestamps:
                # Fetch the newest event-log timestamp per wiped asset key.
                timestamp_rows = conn.execute(
                    db.select(
                        [
                            SqlEventLogStorageTable.c.asset_key,
                            db.func.max(SqlEventLogStorageTable.c.timestamp),
                        ]
                    )
                    .where(
                        SqlEventLogStorageTable.c.asset_key.in_(
                            [asset_key.to_string() for asset_key in wiped_timestamps]
                        )
                    )
                    .group_by(SqlEventLogStorageTable.c.asset_key)
                    .order_by(db.func.max(SqlEventLogStorageTable.c.timestamp).asc())
                ).fetchall()

                latest_timestamps = {}
                for key_string, latest_timestamp in timestamp_rows:
                    parsed_key = AssetKey.from_db_string(key_string)
                    if parsed_key:
                        latest_timestamps[parsed_key] = latest_timestamp

                # A key stays wiped unless an event was recorded at or after the wipe.
                for asset_key, wipe_timestamp in wiped_timestamps.items():
                    latest_timestamp = latest_timestamps.get(asset_key)
                    if not latest_timestamp or utc_datetime_from_naive(
                        latest_timestamp
                    ) < utc_datetime_from_timestamp(wipe_timestamp):
                        wiped.add(asset_key)

        return list(asset_keys.difference(wiped))
Esempio n. 3
0
    def get_all_asset_keys(self, prefix_path=None):
        """List asset keys, optionally restricted to those under ``prefix_path``.

        Reads from ``AssetKeyTable`` when the asset-key secondary index is available,
        and otherwise from the distinct non-null asset keys of the event log table.

        Args:
            prefix_path: Optional path prefix.  When truthy, only keys whose stored
                string starts with ``AssetKey.get_db_prefix(prefix_path)`` are returned.

        Returns:
            A list of ``AssetKey`` objects parsed from the non-empty stored key strings.
        """
        if self.has_secondary_index(SECONDARY_INDEX_ASSET_KEY):
            key_column = AssetKeyTable.c.asset_key
            query = db.select([key_column])
            if prefix_path:
                query = query.where(key_column.startswith(AssetKey.get_db_prefix(prefix_path)))
        else:
            key_column = SqlEventLogStorageTable.c.asset_key
            # `!= None` is deliberate: SQLAlchemy renders it as `IS NOT NULL`.
            query = db.select([key_column]).where(key_column != None)  # noqa: E711
            if prefix_path:
                query = query.where(key_column.startswith(AssetKey.get_db_prefix(prefix_path)))
            query = query.distinct()

        with self.connect() as conn:
            fetched = conn.execute(query).fetchall()

        parsed_keys = []
        for (key_string,) in fetched:
            if key_string:
                parsed_keys.append(AssetKey.from_db_string(key_string))
        return parsed_keys
Esempio n. 4
0
def asset_wipe_command(key, **cli_args):
    """Wipe asset key indexes from the event logs after an interactive confirmation.

    The user must type ``DELETE`` at the prompt for anything to be removed.

    Args:
        key: Sequence of asset key strings to wipe; must be empty when ``all`` is set.
        **cli_args: Remaining CLI options; only ``all`` is consulted.

    Raises:
        click.UsageError: If neither asset keys nor ``--all`` is given, or if both are.
    """
    wipe_all = cli_args.get("all")

    if not wipe_all and not key:
        raise click.UsageError(
            "Error, you must specify an asset key or use `--all` to wipe all asset keys."
        )

    if wipe_all and key:
        raise click.UsageError("Error, cannot use more than one of: asset key, `--all`.")

    with DagsterInstance.get() as instance:
        if key:
            asset_keys = [AssetKey.from_db_string(key_string) for key_string in key]
            prompt = (
                "Are you sure you want to remove the asset key indexes for these keys from the event "
                "logs? Type DELETE"
            )
        else:
            asset_keys = instance.all_asset_keys()
            prompt = "Are you sure you want to remove all asset key indexes from the event logs? Type DELETE"

        confirmation = click.prompt(prompt)
        if confirmation == "DELETE":
            # Reuse the instance that is already open instead of acquiring a second one.
            instance.wipe_assets(asset_keys)
            click.echo("Removed asset indexes from event logs")
        else:
            click.echo("Exiting without removing asset indexes")
Esempio n. 5
0
def asset_wipe_command(key, **cli_args):
    """Wipe asset key indexes from the event logs after an interactive confirmation.

    With explicit keys only those assets are wiped; with ``--all`` every asset is
    wiped.  Nothing is removed unless the user types ``DELETE`` at the prompt.

    Args:
        key: Sequence of asset key strings to wipe; must be empty when ``all`` is set.
        **cli_args: Remaining CLI options; only ``all`` is consulted.

    Raises:
        click.UsageError: If neither asset keys nor ``--all`` is given, or if both are.
    """
    wipe_all = cli_args.get('all')
    has_keys = len(key) > 0

    if not wipe_all and not has_keys:
        raise click.UsageError(
            'Error, you must specify an asset key or use `--all` to wipe all asset keys.'
        )
    if wipe_all and has_keys:
        raise click.UsageError(
            'Error, cannot use more than one of: asset key, `--all`.')

    if has_keys:
        asset_keys = [AssetKey.from_db_string(raw_key) for raw_key in key]
        prompt = (
            'Are you sure you want to remove the asset key indexes for these keys from the event '
            'logs? Type DELETE')
    else:
        asset_keys = None
        prompt = (
            'Are you sure you want to remove all asset key indexes from the event logs? Type DELETE'
        )

    if click.prompt(prompt) != 'DELETE':
        click.echo('Exiting without removing asset indexes')
        return

    instance = DagsterInstance.get()
    if asset_keys:
        instance.wipe_assets(asset_keys)
    else:
        instance.wipe_all_assets()
    click.echo('Removed asset indexes from event logs')
Esempio n. 6
0
 def get_asset_keys(
     self,
     prefix: Optional[List[str]] = None,
     limit: Optional[int] = None,
     cursor: Optional[str] = None,
 ) -> Iterable[AssetKey]:
     """Return the asset keys matching the given filters, ordered by stored key string.

     Args:
         prefix: Optional filter forwarded to ``self._fetch_asset_rows``.
         limit: Optional filter forwarded to ``self._fetch_asset_rows``.
         cursor: Optional filter forwarded to ``self._fetch_asset_rows``.

     Returns:
         A list of ``AssetKey`` objects; rows whose first column does not parse into
         an asset key are dropped.
     """
     fetched = self._fetch_asset_rows(prefix=prefix, limit=limit, cursor=cursor)
     ordered_keys = []
     for asset_row in sorted(fetched, key=lambda r: r[0]):
         parsed_key = AssetKey.from_db_string(asset_row[0])
         if parsed_key:
             ordered_keys.append(parsed_key)
     return ordered_keys
Esempio n. 7
0
    def get_all_asset_keys(self):
        """Return the distinct asset keys recorded in the event log table.

        Returns:
            A list of ``AssetKey`` objects parsed from the non-empty stored key strings.
        """
        distinct_keys_query = db.select([SqlEventLogStorageTable.c.asset_key]).distinct()
        with self.connect() as conn:
            fetched = conn.execute(distinct_keys_query).fetchall()

        parsed_keys = []
        for (key_string,) in fetched:
            # Rows without an asset key (NULL / empty) are not assets.
            if key_string:
                parsed_keys.append(AssetKey.from_db_string(key_string))
        return parsed_keys
Esempio n. 8
0
    def get_asset_keys(self, prefix_path=None):
        """List the unique asset keys, optionally restricted to those under ``prefix_path``.

        Reads from ``AssetKeyTable`` when the asset-key secondary index is available,
        and otherwise from the distinct non-null asset keys of the event log table.
        Prefix matching accepts both the current and the legacy stored key format.

        Args:
            prefix_path: Optional path prefix used to filter the stored key strings.

        Returns:
            A de-duplicated list of ``AssetKey`` objects, in no particular order.
        """
        uses_index_table = self.has_secondary_index(SECONDARY_INDEX_ASSET_KEY)
        key_column = (
            AssetKeyTable.c.asset_key if uses_index_table else SqlEventLogStorageTable.c.asset_key
        )

        query = db.select([key_column])
        if not uses_index_table:
            # `!= None` is deliberate: SQLAlchemy renders it as `IS NOT NULL`.
            query = query.where(key_column != None)  # noqa: E711
        if prefix_path:
            query = query.where(
                db.or_(
                    key_column.startswith(AssetKey.get_db_prefix(prefix_path)),
                    key_column.startswith(AssetKey.get_db_prefix(prefix_path, legacy=True)),
                )
            )
        if not uses_index_table:
            query = query.distinct()

        # This is in place to migrate everyone to using the secondary index table for asset
        # keys.  Performing this migration should result in a big performance boost for
        # any subsequent asset-catalog reads.
        #
        # After a sufficient amount of time (>= 0.11.0?), we can remove the checks
        # for has_secondary_index(SECONDARY_INDEX_ASSET_KEY) and always read from the
        # AssetKeyTable, since we are already writing to the table. Tracking the conditional
        # check removal here: https://github.com/dagster-io/dagster/issues/3507
        #
        # The migration is only triggered by an unfiltered read of the event log table.
        lazy_migrate = not prefix_path and not uses_index_table

        with self.index_connection() as conn:
            fetched = conn.execute(query).fetchall()

        if lazy_migrate:
            self._lazy_migrate_secondary_index_asset_key(
                [key_string for (key_string,) in fetched if key_string]
            )

        unique_keys = {
            AssetKey.from_db_string(key_string) for (key_string,) in fetched if key_string
        }
        return list(unique_keys)
Esempio n. 9
0
    def delete_events_for_run(self, conn, run_id):
        """Delete a run's event log rows and prune asset keys left without any events.

        Args:
            conn: The database connection on which all statements are executed.
            run_id: The id of the run whose events are deleted; must be a string.
        """
        check.str_param(run_id, "run_id")

        events = SqlEventLogStorageTable.c

        def _all_db_strings(keys):
            # Current-format strings first, followed by the legacy-format strings.
            return [k.to_string() for k in keys] + [k.to_string(legacy=True) for k in keys]

        # Collect the asset keys this run touched *before* its rows are deleted.
        run_asset_keys_query = (
            db.select([events.asset_key])
            .where(events.run_id == run_id)
            .where(events.asset_key != None)  # noqa: E711 - rendered as IS NOT NULL
            .group_by(events.asset_key)
        )
        removed_asset_keys = [
            AssetKey.from_db_string(row[0])
            for row in conn.execute(run_asset_keys_query).fetchall()
        ]

        conn.execute(
            SqlEventLogStorageTable.delete().where(  # pylint: disable=no-value-for-parameter
                events.run_id == run_id
            )
        )

        if not removed_asset_keys:
            return

        # Any of those keys that still has events (from other runs) must be kept.
        still_referenced = conn.execute(
            db.select([events.asset_key])
            .where(events.asset_key.in_(_all_db_strings(removed_asset_keys)))
            .group_by(events.asset_key)
        )
        remaining_asset_keys = {AssetKey.from_db_string(row[0]) for row in still_referenced}

        to_remove = set(removed_asset_keys) - remaining_asset_keys
        if to_remove:
            conn.execute(
                AssetKeyTable.delete().where(  # pylint: disable=no-value-for-parameter
                    AssetKeyTable.c.asset_key.in_(_all_db_strings(to_remove))
                )
            )
Esempio n. 10
0
    def all_asset_tags(self):
        """Collect the tags of the last materialization stored for each asset key.

        Returns:
            A ``defaultdict(dict)`` mapping ``AssetKey`` to a copy of its tags.  Only
            assets whose stored materialization has non-empty tags get an entry.
        """
        tags_by_asset_key = defaultdict(dict)
        query = db.select([AssetKeyTable.c.asset_key, AssetKeyTable.c.last_materialization])
        with self.index_connection() as conn:
            for key_string, serialized in conn.execute(query).fetchall():
                materialization = self._asset_materialization_from_json_column(serialized)
                if not materialization or not materialization.tags:
                    continue
                tags_by_asset_key[AssetKey.from_db_string(key_string)] = {
                    tag: value for tag, value in materialization.tags.items()
                }

        return tags_by_asset_key
Esempio n. 11
0
    def all_asset_tags(self):
        """Collect the tags of the last materialization stored for each asset key.

        Returns:
            A ``defaultdict(dict)`` mapping ``AssetKey`` to a copy of its tags.  Every
            asset with a stored materialization gets an entry, possibly an empty dict.
        """
        tags_by_asset_key = defaultdict(dict)
        query = db.select([AssetKeyTable.c.asset_key, AssetKeyTable.c.last_materialization])
        with self.index_connection() as conn:
            for key_string, serialized in conn.execute(query).fetchall():
                if not serialized:
                    continue
                materialization = deserialize_json_to_dagster_namedtuple(serialized)
                tags = materialization.tags or {}
                tags_by_asset_key[AssetKey.from_db_string(key_string)] = {
                    tag: value for tag, value in tags.items()
                }

        return tags_by_asset_key
Esempio n. 12
0
 def _lazy_migrate_secondary_index_asset_key(self, conn, asset_keys):
     """Backfill ``AssetKeyTable`` with missing keys, then enable the secondary index.

     Args:
         conn: The database connection on which the statements are executed.
         asset_keys: Stored asset key strings that should be present in the table.
     """
     select_existing = db.select([AssetKeyTable.c.asset_key])
     already_indexed = {
         key_string for (key_string,) in conn.execute(select_existing).fetchall() if key_string
     }

     for key_string in set(asset_keys) - already_indexed:
         # Round-trip through AssetKey so the row is written in the current format.
         normalized = AssetKey.from_db_string(key_string).to_string()
         try:
             conn.execute(
                 AssetKeyTable.insert().values(  # pylint: disable=no-value-for-parameter
                     asset_key=normalized
                 )
             )
         except db.exc.IntegrityError:
             # asset key already present
             pass

     self.enable_secondary_index(SECONDARY_INDEX_ASSET_KEY)
Esempio n. 13
0
    def get_all_asset_keys(self, prefix_path=None):
        """List the distinct asset keys in the event log, optionally filtered by prefix.

        Args:
            prefix_path: Optional path prefix.  When truthy, only keys whose stored
                string starts with ``AssetKey.get_db_prefix(prefix_path)`` are returned.

        Returns:
            A list of ``AssetKey`` objects parsed from the non-empty stored key strings.
        """
        key_column = SqlEventLogStorageTable.c.asset_key
        query = db.select([key_column])
        if prefix_path:
            query = query.where(key_column.startswith(AssetKey.get_db_prefix(prefix_path)))
        query = query.distinct()

        with self.connect() as conn:
            fetched = conn.execute(query).fetchall()

        parsed_keys = []
        for (key_string,) in fetched:
            if key_string:
                parsed_keys.append(AssetKey.from_db_string(key_string))
        return parsed_keys
Esempio n. 14
0
    def all_asset_tags(self):
        """Collect the tags stored for each asset key.

        When the asset-key index columns are available, tags are read from the ``tags``
        column for assets that were never wiped or were materialized after their wipe.
        Otherwise the tags are taken from each row's serialized last materialization.

        Returns:
            A ``defaultdict(dict)`` mapping ``AssetKey`` to its tags; assets with empty
            or missing tags get no entry.
        """
        tags_by_asset_key = defaultdict(dict)
        cols = AssetKeyTable.c

        if self.has_secondary_index(ASSET_KEY_INDEX_COLS):
            # `== None` / `!= None` are deliberate: SQLAlchemy renders IS [NOT] NULL.
            not_wiped = db.or_(
                cols.wipe_timestamp == None,  # noqa: E711
                cols.last_materialization_timestamp > cols.wipe_timestamp,
            )
            query = (
                db.select([cols.asset_key, cols.tags])
                .where(cols.tags != None)  # noqa: E711
                .where(not_wiped)
            )
            with self.index_connection() as conn:
                for key_string, tags_json in conn.execute(query).fetchall():
                    tags = seven.json.loads(tags_json)
                    if tags:
                        tags_by_asset_key[AssetKey.from_db_string(key_string)] = tags
            return tags_by_asset_key

        query = db.select([cols.asset_key, cols.last_materialization])
        with self.index_connection() as conn:
            for key_string, serialized in conn.execute(query).fetchall():
                materialization = self._asset_materialization_from_json_column(serialized)
                if not materialization or not materialization.tags:
                    continue
                tags_by_asset_key[AssetKey.from_db_string(key_string)] = {
                    tag: value for tag, value in materialization.tags.items()
                }

        return tags_by_asset_key
Esempio n. 15
0
    def get_materialization_count_by_partition(
        self, asset_keys: Sequence[AssetKey]
    ) -> Mapping[AssetKey, Mapping[str, int]]:
        """Count event log rows per partition for each of the given asset keys.

        Rows are matched on either the current or the legacy serialization of each
        asset key, must have a non-null partition, and are additionally restricted by
        ``self._add_assets_wipe_filter_to_query``.

        Args:
            asset_keys: The asset keys to count for; validated as a list of ``AssetKey``.

        Returns:
            A mapping from every requested asset key to a ``{partition: count}`` dict
            (empty when no matching rows exist).
        """
        check.list_param(asset_keys, "asset_keys", AssetKey)

        query = (
            db.select(
                [
                    SqlEventLogStorageTable.c.asset_key,
                    SqlEventLogStorageTable.c.partition,
                    db.func.count(SqlEventLogStorageTable.c.id),
                ]
            )
            .where(
                db.and_(
                    db.or_(
                        SqlEventLogStorageTable.c.asset_key.in_(
                            [asset_key.to_string() for asset_key in asset_keys]
                        ),
                        SqlEventLogStorageTable.c.asset_key.in_(
                            [asset_key.to_string(legacy=True) for asset_key in asset_keys]
                        ),
                    ),
                    SqlEventLogStorageTable.c.partition != None,
                )
            )
            .group_by(SqlEventLogStorageTable.c.asset_key, SqlEventLogStorageTable.c.partition)
        )

        assets_details = self._get_assets_details(asset_keys)
        query = self._add_assets_wipe_filter_to_query(query, assets_details, asset_keys)

        with self.index_connection() as conn:
            results = conn.execute(query).fetchall()

        materialization_count_by_partition: Dict[AssetKey, Dict[str, int]] = {
            asset_key: {} for asset_key in asset_keys
        }
        for key_string, partition, count in results:
            asset_key = AssetKey.from_db_string(key_string)
            if not asset_key:
                continue
            # The query groups by the raw stored string, so the current and the legacy
            # serialization of one asset key arrive as separate rows.  Add them up
            # rather than letting the later row overwrite the earlier one.
            partition_counts = materialization_count_by_partition[asset_key]
            partition_counts[partition] = partition_counts.get(partition, 0) + count

        return materialization_count_by_partition
Esempio n. 16
0
 def get_asset_keys(
     self,
     prefix: Optional[List[str]] = None,
     limit: Optional[int] = None,
     cursor: Optional[str] = None,
 ) -> Iterable[AssetKey]:
     """Base implementation: filter the output of ``all_asset_keys`` in memory.

     Args:
         prefix: When truthy, keep only keys whose ``path`` starts with this list.
         limit: When truthy, return at most this many keys.
         cursor: Stored string of an asset key.  When it parses to a key present in
             the (prefix-filtered) results, only the keys after it are returned.

     Returns:
         The matching asset keys, sorted by their ``str`` representation.
     """
     candidates = sorted(self.all_asset_keys(), key=str)

     if prefix:
         prefix_length = len(prefix)
         candidates = [
             candidate for candidate in candidates if candidate.path[:prefix_length] == prefix
         ]

     if cursor:
         cursor_key = AssetKey.from_db_string(cursor)
         if cursor_key and cursor_key in candidates:
             # Resume strictly after the cursor position.
             resume_at = candidates.index(cursor_key) + 1
             candidates = candidates[resume_at:]

     if limit:
         candidates = candidates[:limit]

     return candidates
Esempio n. 17
0
 def _fetch_backcompat_materialization_times(self, asset_keys):
     """Look up the newest event-log timestamp for each of the given asset keys.

     Uses the (slower) raw event log table.

     Args:
         asset_keys: Iterable of ``AssetKey`` objects to look up.

     Returns:
         A dict mapping the asset key parsed from each result row to the maximum
         timestamp stored for it.
     """
     events = SqlEventLogStorageTable.c
     key_strings = [asset_key.to_string() for asset_key in asset_keys]
     query = (
         db.select([events.asset_key, db.func.max(events.timestamp)])
         .where(events.asset_key.in_(key_strings))
         .group_by(events.asset_key)
         .order_by(db.func.max(events.timestamp).asc())
     )
     with self.index_connection() as conn:
         fetched = conn.execute(query).fetchall()

     times_by_key = {}
     for key_string, latest_timestamp in fetched:
         times_by_key[AssetKey.from_db_string(key_string)] = latest_timestamp
     return times_by_key
Esempio n. 18
0
    def _fetch_raw_asset_rows(self, asset_keys=None, prefix=None, limit=None, cursor=None):
        """Fetch one chunk of asset-key rows from ``AssetKeyTable``, dropping wiped assets.

        Each row starts with the serialized asset_key, last_materialization and
        asset_details columns.  The number of rows returned is not guaranteed to match
        ``limit``: rows for assets that were wiped and not materialized since are
        filtered out after the fetch.

        Args:
            asset_keys: Filter forwarded to ``self._apply_asset_filter_to_query``.
            prefix: Filter forwarded to ``self._apply_asset_filter_to_query``.
            limit: Filter forwarded to ``self._apply_asset_filter_to_query``; also used
                to decide whether more rows may be available.
            cursor: Filter forwarded to ``self._apply_asset_filter_to_query``.

        Returns:
            A tuple ``(rows, has_more, cursor)``.  ``rows`` is a list and ``has_more`` a
            ``bool``.  On the secondary-index path ``has_more`` is ``False`` and the
            cursor is ``None``.  Otherwise ``has_more`` is ``True`` only when a limit was
            given and the database returned exactly that many rows, and the cursor is
            the stored asset key string of the last fetched row (``None`` if no rows).
        """
        columns = [
            AssetKeyTable.c.asset_key,
            AssetKeyTable.c.last_materialization,
            AssetKeyTable.c.asset_details,
        ]

        is_partial_query = bool(asset_keys) or bool(prefix) or bool(limit) or bool(cursor)
        if self.has_asset_key_index_cols() and not is_partial_query:
            # if the schema has been migrated, fetch the last_materialization_timestamp to see if
            # we can lazily migrate the data table
            columns.append(AssetKeyTable.c.last_materialization_timestamp)
            columns.append(AssetKeyTable.c.wipe_timestamp)

        query = db.select(columns).order_by(AssetKeyTable.c.asset_key.asc())
        query = self._apply_asset_filter_to_query(query, asset_keys, prefix, limit, cursor)

        if self.has_secondary_index(ASSET_KEY_INDEX_COLS):
            # With the index columns populated the wipe filter can run in SQL.
            query = query.where(
                db.or_(
                    AssetKeyTable.c.wipe_timestamp == None,
                    AssetKeyTable.c.last_materialization_timestamp > AssetKeyTable.c.wipe_timestamp,
                )
            )
            with self.index_connection() as conn:
                rows = conn.execute(query).fetchall()

            return rows, False, None

        with self.index_connection() as conn:
            rows = conn.execute(query).fetchall()

        # Wiped keys whose stored payload carries no usable timestamp; these need a
        # follow-up lookup against the raw event log.
        wiped_timestamps_by_asset_key = {}
        row_by_asset_key = OrderedDict()

        for row in rows:
            asset_key = AssetKey.from_db_string(row[0])
            if not asset_key:
                continue
            asset_details = AssetDetails.from_db_string(row[2])
            if not asset_details or not asset_details.last_wipe_timestamp:
                # never wiped: always keep
                row_by_asset_key[asset_key] = row
                continue
            materialization_or_event = (
                deserialize_json_to_dagster_namedtuple(row[1]) if row[1] else None
            )
            if isinstance(materialization_or_event, EventLogEntry):
                if asset_details.last_wipe_timestamp > materialization_or_event.timestamp:
                    # this asset has not been materialized since being wiped, skip
                    continue
                else:
                    # add the key
                    row_by_asset_key[asset_key] = row
            else:
                # keep provisionally; resolved by the backcompat lookup below
                row_by_asset_key[asset_key] = row
                wiped_timestamps_by_asset_key[asset_key] = asset_details.last_wipe_timestamp

        if wiped_timestamps_by_asset_key:
            materialization_times = self._fetch_backcompat_materialization_times(
                wiped_timestamps_by_asset_key.keys()
            )
            for asset_key, wiped_timestamp in wiped_timestamps_by_asset_key.items():
                materialization_time = materialization_times.get(asset_key)
                if not materialization_time or utc_datetime_from_naive(
                    materialization_time
                ) < utc_datetime_from_timestamp(wiped_timestamp):
                    # remove rows that have not been materialized since being wiped
                    row_by_asset_key.pop(asset_key)

        # `limit and ...` would leak None/0 to callers when no limit is given.
        has_more = bool(limit) and len(rows) == limit
        new_cursor = rows[-1][0] if rows else None

        # Materialize the view so both return paths hand back a list.
        return list(row_by_asset_key.values()), has_more, new_cursor
Esempio n. 19
0
 def all_asset_keys(self):
     """Return all asset keys from ``self._fetch_asset_rows``, ordered by stored key string.

     Returns:
         A list of ``AssetKey`` objects; rows whose first column does not parse into
         an asset key are dropped.
     """
     ordered_rows = sorted(self._fetch_asset_rows(), key=lambda r: r[0])
     parsed_keys = [AssetKey.from_db_string(asset_row[0]) for asset_row in ordered_rows]
     return [parsed_key for parsed_key in parsed_keys if parsed_key]