def _make_dashboard_and_chart_mces(
    self, looker_dashboard: LookerDashboard
) -> List[MetadataChangeEvent]:
    """Build MetadataChangeEvents for a Looker dashboard and its charts.

    Emits one MCE per dashboard element (chart) plus one MCE for the
    dashboard itself; chart MCEs come first in the returned list so the
    chart URNs the dashboard references are emitted before the dashboard.
    """
    actor = self.source_config.actor
    # FIX: was `int(time.time()) * 1000`, which truncated to whole seconds
    # before scaling; keep sub-second precision (matches the MongoDB source).
    sys_time = int(time.time() * 1000)

    chart_mces = [
        self._make_chart_mce(element)
        for element in looker_dashboard.dashboard_elements
    ]

    dashboard_urn = f"urn:li:dashboard:({self.source_config.platform_name},{looker_dashboard.get_urn_dashboard_id()})"
    dashboard_snapshot = DashboardSnapshot(
        urn=dashboard_urn,
        aspects=[],
    )

    last_modified = ChangeAuditStamps(
        created=AuditStamp(time=sys_time, actor=actor),
        lastModified=AuditStamp(time=sys_time, actor=actor),
    )

    dashboard_info = DashboardInfoClass(
        description=looker_dashboard.description
        if looker_dashboard.description is not None
        else "",
        title=looker_dashboard.title,
        charts=[mce.proposedSnapshot.urn for mce in chart_mces],
        lastModified=last_modified,
        dashboardUrl=looker_dashboard.url(self.source_config.base_url),
    )
    dashboard_snapshot.aspects.append(dashboard_info)

    owners = [OwnerClass(owner=actor, type=OwnershipTypeClass.DATAOWNER)]
    dashboard_snapshot.aspects.append(
        OwnershipClass(
            owners=owners,
            # Use the local `actor` (same value) for consistency with the
            # audit stamps above.
            lastModified=AuditStampClass(time=sys_time, actor=actor),
        )
    )

    # Deleted dashboards are surfaced via the Status aspect, not skipped.
    dashboard_snapshot.aspects.append(Status(removed=looker_dashboard.is_deleted))

    dashboard_mce = MetadataChangeEvent(proposedSnapshot=dashboard_snapshot)

    return chart_mces + [dashboard_mce]
def construct_dashboard_from_api_data(self, dashboard_data):
    """Translate one Superset dashboard API payload into a DashboardSnapshot."""
    snapshot = DashboardSnapshot(
        urn=f"urn:li:dashboard:({self.platform},{dashboard_data['id']})",
        aspects=[],
    )

    changed_by = dashboard_data.get("changed_by") or {}
    modified_actor = f"urn:li:corpuser:{changed_by.get('username', 'unknown')}"
    modified_ts = int(
        dp.parse(dashboard_data.get("changed_on_utc", "now")).timestamp() * 1000
    )
    title = dashboard_data.get("dashboard_title", "")
    # note: the API does not currently supply created_by usernames due to a bug, but we are required to
    # provide a created AuditStamp to comply with ChangeAuditStamp model. For now, I sub in the last
    # modified actor urn
    last_modified = ChangeAuditStamps(
        created=AuditStamp(time=modified_ts, actor=modified_actor),
        lastModified=AuditStamp(time=modified_ts, actor=modified_actor),
    )
    dashboard_url = f"{self.config.connect_uri}{dashboard_data.get('url', '')}"

    # position_json encodes the dashboard layout; "CHART-*" keys carry
    # the ids of the charts placed on the dashboard.
    raw_layout = dashboard_data.get("position_json", "{}")
    layout = json.loads(raw_layout) if raw_layout is not None else {}
    chart_urns = [
        f"urn:li:chart:({self.platform},{entry.get('meta', {}).get('chartId', 'unknown')})"
        for key, entry in layout.items()
        if key.startswith("CHART-")
    ]

    snapshot.aspects.append(
        DashboardInfoClass(
            description="",
            title=title,
            charts=chart_urns,
            lastModified=last_modified,
            dashboardUrl=dashboard_url,
            customProperties={},
        )
    )
    return snapshot
def get_schema_metadata(
    report: SourceReport, node: DBTNode, platform: str
) -> SchemaMetadata:
    """Build a SchemaMetadata aspect from a dbt node's column metadata.

    A column's description merges the warehouse comment with the dbt model
    description when both exist and differ; otherwise whichever is set wins.
    lastModified is taken from the node's max_loaded_at timestamp when known.
    """
    canonical_schema: List[SchemaField] = []
    for column in node.columns:
        description = None
        if (
            column.comment
            and column.description
            and column.comment != column.description
        ):
            description = f"{platform} comment: {column.comment}\n\ndbt model description: {column.description}"
        elif column.comment:
            description = column.comment
        elif column.description:
            description = column.description

        globalTags = None
        if column.tags:
            globalTags = GlobalTagsClass(
                tags=[
                    TagAssociationClass(f"urn:li:tag:{tag}") for tag in column.tags
                ]
            )

        field = SchemaField(
            fieldPath=column.name,
            nativeDataType=column.data_type,
            type=get_column_type(report, node.dbt_name, column.data_type),
            description=description,
            nullable=False,  # TODO: actually autodetect this
            recursive=False,
            globalTags=globalTags,
        )
        canonical_schema.append(field)

    last_modified = None
    if node.max_loaded_at is not None:
        # Synthetic actor representing the dbt run itself.
        actor = "urn:li:corpuser:dbt_executor"
        last_modified = AuditStamp(
            time=int(dateutil.parser.parse(node.max_loaded_at).timestamp() * 1000),
            actor=actor,
        )

    # FIX: removed a dead `description = None` assignment here that was
    # never read after the loop.
    return SchemaMetadata(
        schemaName=node.dbt_name,
        platform=f"urn:li:dataPlatform:{platform}",
        version=0,
        hash="",
        platformSchema=MySqlDDL(tableSchema=""),
        lastModified=last_modified,
        fields=canonical_schema,
    )
def _get_dashboard_snapshot(self, dashboard_data):
    """Translate one Redash dashboard API payload into a DashboardSnapshot."""
    dashboard_id = dashboard_data["id"]
    dashboard_urn = f"urn:li:dashboard:({self.platform},{dashboard_id})"
    dashboard_snapshot = DashboardSnapshot(
        urn=dashboard_urn,
        aspects=[],
    )

    # FIX: `changed_by` may be present but explicitly null in the API
    # response; `.get('changed_by', {})` would then return None and the
    # chained .get() would raise. Guard with `or {}` (matches the Superset
    # source's handling).
    modified_actor = f"urn:li:corpuser:{(dashboard_data.get('changed_by') or {}).get('username', 'unknown')}"
    modified_ts = int(
        dp.parse(dashboard_data.get("updated_at", "now")).timestamp() * 1000
    )
    title = dashboard_data.get("name", "")

    last_modified = ChangeAuditStamps(
        created=AuditStamp(time=modified_ts, actor=modified_actor),
        lastModified=AuditStamp(time=modified_ts, actor=modified_actor),
    )

    dashboard_url = (
        f"{self.config.connect_uri}/dashboard/{dashboard_data.get('slug', '')}"
    )

    widgets = dashboard_data.get("widgets", [])
    description = self._get_dashboard_description_from_widgets(widgets)
    chart_urns = self._get_dashboard_chart_urns_from_widgets(widgets)

    dashboard_info = DashboardInfoClass(
        description=description,
        title=title,
        charts=chart_urns,
        lastModified=last_modified,
        dashboardUrl=dashboard_url,
        customProperties={},
    )
    dashboard_snapshot.aspects.append(dashboard_info)
    return dashboard_snapshot
def get_upstream_lineage(upstream_urns: List[str]) -> UpstreamLineage:
    """Wrap upstream dataset URNs in an UpstreamLineage aspect.

    Every upstream edge is stamped with the synthetic dbt_executor actor at
    the current time and typed as a TRANSFORMED lineage relationship.
    """
    actor = "urn:li:corpuser:dbt_executor"
    # FIX: was `int(time.time()) * 1000`, which truncated to whole seconds
    # before scaling; keep sub-second precision.
    sys_time = int(time.time() * 1000)
    ucl: List[UpstreamClass] = [
        UpstreamClass(
            dataset=dep,
            auditStamp=AuditStamp(actor=actor, time=sys_time),
            type=DatasetLineageTypeClass.TRANSFORMED,
        )
        for dep in upstream_urns
    ]
    return UpstreamLineage(upstreams=ucl)
def get_schema_metadata(
    sql_report: SQLSourceReport, dataset_name: str, platform: str, columns
) -> SchemaMetadata:
    """Build a SchemaMetadata aspect from SQLAlchemy-style column dicts.

    Each entry in `columns` is expected to carry at least "name" and "type"
    keys; "comment" is optional and used as the field description.
    """
    canonical_schema: List[SchemaField] = []
    for column in columns:
        field = SchemaField(
            fieldPath=column["name"],
            nativeDataType=repr(column["type"]),
            type=get_column_type(sql_report, dataset_name, column["type"]),
            description=column.get("comment", None),
        )
        canonical_schema.append(field)

    actor = "urn:li:corpuser:etl"
    # FIX: was `int(time.time()) * 1000`, which truncated to whole seconds
    # before scaling; keep sub-second precision.
    sys_time = int(time.time() * 1000)
    schema_metadata = SchemaMetadata(
        schemaName=dataset_name,
        platform=f"urn:li:dataPlatform:{platform}",
        version=0,
        hash="",
        platformSchema=MySqlDDL(tableSchema=""),
        created=AuditStamp(time=sys_time, actor=actor),
        lastModified=AuditStamp(time=sys_time, actor=actor),
        fields=canonical_schema,
    )
    return schema_metadata
def _get_upsteam_lineage(
    self, looker_view: LookerView, actor: str, sys_time: int
) -> UpstreamLineage:
    """Build an UpstreamLineage aspect from the SQL tables a Looker view reads.

    NOTE(review): method name has a typo ("upsteam"); kept unchanged for
    caller compatibility.
    """
    upstream_list = [
        UpstreamClass(
            dataset=self._construct_datalineage_urn(
                table_name, looker_view.connection
            ),
            auditStamp=AuditStamp(actor=actor, time=sys_time),
            type=DatasetLineageTypeClass.TRANSFORMED,
        )
        for table_name in looker_view.sql_table_names
    ]
    return UpstreamLineage(upstreams=upstream_list)
def _get_schema(
    self, looker_view: LookerView, actor: str, sys_time: int
) -> SchemaMetadataClass:
    """Assemble the SchemaMetadata aspect for a Looker view."""
    fields, primary_keys = self._get_fields_and_primary_keys(looker_view)
    # One shared stamp for both created and lastModified.
    audit = AuditStamp(time=sys_time, actor=actor)
    return SchemaMetadata(
        schemaName=looker_view.view_name,
        platform=f"urn:li:dataPlatform:{self.source_config.platform_name}",
        version=0,
        fields=fields,
        primaryKeys=primary_keys,
        created=audit,
        lastModified=audit,
        hash="",
        platformSchema=OtherSchema(rawSchema="looker-view"),
    )
def construct_card_from_api_data(self, card_data: dict) -> Optional[ChartSnapshot]:
    """Fetch full card details from Metabase and build a ChartSnapshot.

    Returns None (after reporting a failure) when the card cannot be
    retrieved from the API.
    """
    card_id = card_data.get("id", "")
    card_url = f"{self.config.connect_uri}/api/card/{card_id}"
    try:
        card_response = self.session.get(card_url)
        card_response.raise_for_status()
        card_details = card_response.json()
    except HTTPError as http_error:
        self.report.report_failure(
            key=f"metabase-card-{card_id}",
            reason=f"Unable to retrieve Card info. "
            f"Reason: {str(http_error)}",
        )
        return None

    chart_snapshot = ChartSnapshot(
        urn=builder.make_chart_urn(self.platform, card_id),
        aspects=[],
    )

    editor = card_details.get("last-edit-info") or {}
    modified_actor = builder.make_user_urn(editor.get("email", "unknown"))
    modified_ts = self.get_timestamp_millis_from_ts_string(
        f"{editor.get('timestamp')}"
    )
    last_modified = ChangeAuditStamps(
        created=AuditStamp(time=modified_ts, actor=modified_actor),
        lastModified=AuditStamp(time=modified_ts, actor=modified_actor),
    )

    chart_info = ChartInfoClass(
        type=self._get_chart_type(
            card_details.get("id", ""), card_details.get("display")
        ),
        description=card_details.get("description") or "",
        title=card_details.get("name") or "",
        lastModified=last_modified,
        chartUrl=f"{self.config.connect_uri}/card/{card_id}",
        inputs=self.get_datasource_urn(card_details),
        customProperties=self.construct_card_custom_properties(card_details),
    )
    chart_snapshot.aspects.append(chart_info)

    # Native (hand-written SQL) cards additionally carry a ChartQuery aspect.
    if card_details.get("query_type", "") == "native":
        raw_query = (
            card_details.get("dataset_query", {}).get("native", {}).get("query", "")
        )
        chart_snapshot.aspects.append(
            ChartQueryClass(
                rawQuery=raw_query,
                type=ChartQueryTypeClass.SQL,
            )
        )

    # Ownership
    ownership = self._get_ownership(card_details.get("creator_id", ""))
    if ownership is not None:
        chart_snapshot.aspects.append(ownership)

    return chart_snapshot
def construct_chart_from_api_data(
    self, chart_data: dict, query: dict, path: str
) -> ChartSnapshot:
    """Build a ChartSnapshot from a Mode chart payload and its backing query."""
    token = chart_data.get("token", "")
    chart_snapshot = ChartSnapshot(
        urn=builder.make_chart_urn(self.platform, token),
        aspects=[],
    )

    # Audit stamps default to empty when the creator cannot be resolved.
    last_modified = ChangeAuditStamps()
    creator = self._get_creator(
        chart_data.get("_links", {}).get("creator", {}).get("href", "")
    )
    if creator is not None:
        modified_actor = builder.make_user_urn(creator)
        created_ts = int(
            dp.parse(chart_data.get("created_at", "now")).timestamp() * 1000
        )
        modified_ts = int(
            dp.parse(chart_data.get("updated_at", "now")).timestamp() * 1000
        )
        last_modified = ChangeAuditStamps(
            created=AuditStamp(time=created_ts, actor=modified_actor),
            lastModified=AuditStamp(time=modified_ts, actor=modified_actor),
        )

    # Mode stores chart config under "view" or, for older charts, "view_vegas".
    view = chart_data.get("view", {})
    chart_detail = view if len(view) != 0 else chart_data.get("view_vegas", {})

    mode_chart_type = chart_detail.get("chartType", "") or chart_detail.get(
        "selectedChart", ""
    )
    chart_type = self._get_chart_type(token, mode_chart_type)
    description = (
        chart_detail.get("description")
        or chart_detail.get("chartDescription")
        or ""
    )
    title = chart_detail.get("title") or chart_detail.get("chartTitle") or ""

    # create datasource urn
    platform, db_name = self._get_platform_and_dbname(query.get("data_source_id"))
    source_tables = self._get_source_from_query(query.get("raw_query"))
    datasource_urn = self._get_datasource_urn(platform, db_name, source_tables)
    custom_properties = self.construct_chart_custom_properties(
        chart_detail, mode_chart_type
    )

    # Chart Info
    chart_snapshot.aspects.append(
        ChartInfoClass(
            type=chart_type,
            description=description,
            title=title,
            lastModified=last_modified,
            chartUrl=f"{self.config.connect_uri}"
            f"{chart_data.get('_links', {}).get('report_viz_web', {}).get('href', '')}",
            inputs=datasource_urn,
            customProperties=custom_properties,
        )
    )

    # Browse Path
    chart_snapshot.aspects.append(BrowsePathsClass(paths=[path]))

    # Query
    chart_snapshot.aspects.append(
        ChartQueryClass(
            rawQuery=query.get("raw_query", ""),
            type=ChartQueryTypeClass.SQL,
        )
    )

    # Ownership
    ownership = self._get_ownership(
        self._get_creator(
            chart_data.get("_links", {}).get("creator", {}).get("href", "")
        )
    )
    if ownership is not None:
        chart_snapshot.aspects.append(ownership)

    return chart_snapshot
def get_workunits(self) -> Iterable[MetadataWorkUnit]:
    """Yield one MetadataWorkUnit per allowed MongoDB collection.

    Databases and collections are visited in sorted order so the output is
    deterministic across runs; denied/filtered names are reported as dropped.
    """
    platform = "mongodb"

    for database_name in sorted(self.mongo_client.list_database_names()):
        if database_name in DENY_DATABASE_LIST:
            continue
        if not self.config.database_pattern.allowed(database_name):
            self.report.report_dropped(database_name)
            continue

        database = self.mongo_client[database_name]

        for collection_name in sorted(database.list_collection_names()):
            dataset_name = f"{database_name}.{collection_name}"
            if not self.config.collection_pattern.allowed(dataset_name):
                self.report.report_dropped(dataset_name)
                continue

            dataset_snapshot = DatasetSnapshot(
                urn=f"urn:li:dataset:(urn:li:dataPlatform:{platform},{dataset_name},{self.config.env})",
                aspects=[],
            )
            dataset_snapshot.aspects.append(
                DatasetPropertiesClass(
                    tags=[],
                    customProperties={},
                )
            )

            if self.config.enableSchemaInference:
                collection_schema = construct_schema_pymongo(
                    database[collection_name],
                    delimiter=".",
                    sample_size=self.config.schemaSamplingSize,
                )

                # Build fields sorted by delimited name so output is consistent.
                canonical_schema: List[SchemaField] = [
                    SchemaField(
                        fieldPath=schema_field["delimited_name"],
                        nativeDataType=self.get_pymongo_type_string(
                            schema_field["type"], dataset_name
                        ),
                        type=self.get_field_type(
                            schema_field["type"], dataset_name
                        ),
                        description=None,
                        nullable=schema_field["nullable"],
                        recursive=False,
                    )
                    for schema_field in sorted(
                        collection_schema.values(),
                        key=lambda x: x["delimited_name"],
                    )
                ]

                # One stamp value shared by created and lastModified.
                actor = "urn:li:corpuser:etl"
                sys_time = int(time.time() * 1000)
                dataset_snapshot.aspects.append(
                    SchemaMetadata(
                        schemaName=collection_name,
                        platform=f"urn:li:dataPlatform:{platform}",
                        version=0,
                        hash="",
                        platformSchema=SchemalessClass(),
                        created=AuditStamp(time=sys_time, actor=actor),
                        lastModified=AuditStamp(time=sys_time, actor=actor),
                        fields=canonical_schema,
                    )
                )

            # TODO: use list_indexes() or index_information() to get index information
            # See https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.list_indexes.

            wu = MetadataWorkUnit(
                id=dataset_name,
                mce=MetadataChangeEvent(proposedSnapshot=dataset_snapshot),
            )
            self.report.report_workunit(wu)
            yield wu