Example #1
 def _get_urn(self, strip_user_ids_from_email: bool) -> Optional[str]:
     if self.email is not None:
         if strip_user_ids_from_email:
             return builder.make_user_urn(self.email.split("@")[0])
         else:
             return builder.make_user_urn(self.email)
     return None
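
For reference, make_user_urn simply wraps a username in the corpuser URN scheme, so the helper above yields a URN built from either the full email or only its local part:

# Quick check of the URN format produced by the builder.
import datahub.emitter.mce_builder as builder

assert builder.make_user_urn("jdoe") == "urn:li:corpuser:jdoe"
assert builder.make_user_urn("jdoe@example.com") == "urn:li:corpuser:jdoe@example.com"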
Example #2
def test_simple_dataset_ownership_transformation(mock_time):
    no_owner_aspect = make_generic_dataset()

    with_owner_aspect = make_dataset_with_owner()

    not_a_dataset = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataJobSnapshotClass(
            urn="urn:li:dataJob:(urn:li:dataFlow:(airflow,dag_abc,PROD),task_456)",
            aspects=[
                models.DataJobInfoClass(
                    name="User Deletions",
                    description="Constructs the fct_users_deleted from logging_events",
                    type=models.AzkabanJobTypeClass.SQL,
                )
            ],
        )
    )

    inputs = [
        no_owner_aspect,
        with_owner_aspect,
        not_a_dataset,
    ]

    transformer = SimpleAddDatasetOwnership.create(
        {
            "owner_urns": [
                builder.make_user_urn("person1"),
                builder.make_user_urn("person2"),
            ]
        },
        PipelineContext(run_id="test"),
    )

    outputs = list(
        transformer.transform([RecordEnvelope(input, metadata={}) for input in inputs])
    )

    assert len(outputs) == len(inputs)

    # Check the first entry.
    first_ownership_aspect = builder.get_aspect_if_available(
        outputs[0].record, models.OwnershipClass
    )
    assert first_ownership_aspect
    assert len(first_ownership_aspect.owners) == 2

    # Check the second entry.
    second_ownership_aspect = builder.get_aspect_if_available(
        outputs[1].record, models.OwnershipClass
    )
    assert second_ownership_aspect
    assert len(second_ownership_aspect.owners) == 3

    # Verify that the third entry is unchanged.
    assert inputs[2] == outputs[2].record
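
This test leans on a make_generic_dataset helper that is not shown here. A minimal sketch consistent with the pattern rules in the later examples (which match on ".*example1.*") might look like:

def make_generic_dataset() -> models.MetadataChangeEventClass:
    # hypothetical fixture: a bare dataset MCE with no ownership aspect
    return models.MetadataChangeEventClass(
        proposedSnapshot=models.DatasetSnapshotClass(
            urn="urn:li:dataset:(urn:li:dataPlatform:bigquery,example1,PROD)",
            aspects=[models.StatusClass(removed=False)],
        ),
    )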
Example #3
def test_simple_dataset_ownership_with_type_transformation(mock_time):
    input = make_generic_dataset()

    transformer = SimpleAddDatasetOwnership.create(
        {
            "owner_urns": [
                builder.make_user_urn("person1"),
            ],
            "ownership_type": "PRODUCER",
        },
        PipelineContext(run_id="test"),
    )

    output = list(
        transformer.transform([
            RecordEnvelope(input, metadata={}),
            RecordEnvelope(EndOfStream(), metadata={}),
        ]))

    assert len(output) == 3

    # original MCE is unchanged
    assert input == output[0].record

    ownership_aspect = output[1].record.aspect

    assert isinstance(ownership_aspect, OwnershipClass)
    assert len(ownership_aspect.owners) == 1
    assert ownership_aspect.owners[0].type == models.OwnershipTypeClass.PRODUCER
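
Unlike Example #2, this transformer emits the added ownership as a standalone aspect record instead of patching the original MCE, which is why two input envelopes produce three outputs: the untouched MCE, the new ownership aspect, and the EndOfStream marker.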
Example #4
    def construct_dashboard(self, space_name: str,
                            report_info: dict) -> DashboardSnapshot:
        report_token = report_info.get("token", "")
        dashboard_urn = builder.make_dashboard_urn(self.platform,
                                                   report_info.get("id", ""))
        dashboard_snapshot = DashboardSnapshot(
            urn=dashboard_urn,
            aspects=[],
        )

        # Title/description are used below even when no creator is found,
        # so define them before the conditional block.
        title = report_info.get("name", "") or ""
        description = report_info.get("description", "") or ""

        last_modified = ChangeAuditStamps()
        creator = self._get_creator(
            report_info.get("_links", {}).get("creator", {}).get("href", ""))
        if creator is not None:
            modified_actor = builder.make_user_urn(creator)
            modified_ts = int(
                dp.parse(f"{report_info.get('last_saved_at', 'now')}").timestamp()
                * 1000)
            created_ts = int(
                dp.parse(f"{report_info.get('created_at', 'now')}").timestamp()
                * 1000)
            last_modified = ChangeAuditStamps(
                created=AuditStamp(time=created_ts, actor=modified_actor),
                lastModified=AuditStamp(time=modified_ts, actor=modified_actor),
            )

        dashboard_info_class = DashboardInfoClass(
            description=description,
            title=title,
            charts=self._get_chart_urns(report_token),
            lastModified=last_modified,
            dashboardUrl=f"{self.config.connect_uri}/"
            f"{self.config.workspace}/"
            f"reports/{report_token}",
            customProperties={},
        )
        dashboard_snapshot.aspects.append(dashboard_info_class)

        # browse path
        browse_path = BrowsePathsClass(paths=[
            f"/mode/{self.config.workspace}/"
            f"{space_name}/"
            f"{report_info.get('name')}"
        ])
        dashboard_snapshot.aspects.append(browse_path)

        # Ownership
        ownership = self._get_ownership(
            self._get_creator(
                report_info.get("_links", {}).get("creator",
                                                  {}).get("href", "")))
        if ownership is not None:
            dashboard_snapshot.aspects.append(ownership)

        return dashboard_snapshot
Example #5
def test_pattern_dataset_ownership_with_type_transformation(mock_time):
    input = make_generic_dataset()

    transformer = PatternAddDatasetOwnership.create(
        {
            "owner_pattern": {
                "rules": {
                    ".*example1.*": [builder.make_user_urn("person1")],
                }
            },
            "ownership_type": "PRODUCER",
        },
        PipelineContext(run_id="test"),
    )

    output = list(transformer.transform([RecordEnvelope(input, metadata={})]))

    assert len(output) == 1

    ownership_aspect = builder.get_aspect_if_available(output[0].record,
                                                       models.OwnershipClass)
    assert ownership_aspect
    assert len(ownership_aspect.owners) == 1
    assert ownership_aspect.owners[0].type == models.OwnershipTypeClass.PRODUCER
Example #6
    def emit_workbook_as_container(
            self, workbook: Dict) -> Iterable[MetadataWorkUnit]:

        workbook_container_key = self.gen_workbook_key(workbook)
        creator = workbook.get("owner", {}).get("username")

        owner_urn = (builder.make_user_urn(creator) if
                     (creator and self.config.ingest_owner) else None)

        site_part = f"/site/{self.config.site}" if self.config.site else ""
        workbook_uri = workbook.get("uri", "")
        workbook_part = (workbook_uri[workbook_uri.index("/workbooks/"):]
                         if workbook.get("uri") else None)
        workbook_external_url = (
            f"{self.config.connect_uri}/#{site_part}{workbook_part}"
            if workbook_part else None)

        tag_list = workbook.get("tags", [])
        tag_list_str = (
            [t.get("name", "").upper() for t in tag_list if t is not None] if
            (tag_list and self.config.ingest_tags) else None)

        container_workunits = gen_containers(
            container_key=workbook_container_key,
            name=workbook.get("name", ""),
            sub_types=["Workbook"],
            description=workbook.get("description"),
            owner_urn=owner_urn,
            external_url=workbook_external_url,
            tags=tag_list_str,
        )

        for wu in container_workunits:
            self.report.report_workunit(wu)
            yield wu
Example #7
    def _aggregate_operation_aspect_events(
        self,
        events: List[RedshiftJoinedAccessEvent],
        operation_type: Union[str, "OperationTypeClass"],
    ) -> Iterable[MetadataWorkUnit]:
        for event in events:
            if (event.database and event.usename and event.schema_
                    and event.table and event.endtime):
                resource = f"{event.database}.{event.schema_}.{event.table}"
                last_updated_timestamp: int = int(event.endtime.timestamp() *
                                                  1000)
                user_email = event.usename

                operation_aspect = OperationClass(
                    timestampMillis=last_updated_timestamp,
                    lastUpdatedTimestamp=last_updated_timestamp,
                    actor=builder.make_user_urn(user_email.split("@")[0]),
                    operationType=operation_type,
                )
                mcp = MetadataChangeProposalWrapper(
                    entityType="dataset",
                    aspectName="operation",
                    changeType=ChangeTypeClass.UPSERT,
                    entityUrn=builder.make_dataset_urn("redshift",
                                                       resource.lower(),
                                                       self.config.env),
                    aspect=operation_aspect,
                )
                wu = MetadataWorkUnit(
                    id=f"operation-aspect-{event.table}-{event.endtime.isoformat()}",
                    mcp=mcp,
                )
                yield wu
Example #8
 def _get_operation_aspect_work_units(
     self, events: Iterable[SnowflakeJoinedAccessEvent]
 ) -> Iterable[MetadataWorkUnit]:
     for event in events:
         if event.query_start_time and event.query_type in OPERATION_STATEMENT_TYPES:
             start_time = event.query_start_time
             query_type = event.query_type
             user_email = event.email
             operation_type = OPERATION_STATEMENT_TYPES[query_type]
             last_updated_timestamp: int = int(start_time.timestamp() *
                                               1000)
             user_urn = builder.make_user_urn(user_email.split("@")[0])
             for obj in event.base_objects_accessed:
                 resource = obj.objectName
                 dataset_urn = builder.make_dataset_urn(
                     "snowflake", resource.lower(), self.config.env)
                 operation_aspect = OperationClass(
                     timestampMillis=last_updated_timestamp,
                     lastUpdatedTimestamp=last_updated_timestamp,
                     actor=user_urn,
                     operationType=operation_type,
                 )
                 mcp = MetadataChangeProposalWrapper(
                     entityType="dataset",
                     aspectName="operation",
                     changeType=ChangeTypeClass.UPSERT,
                     entityUrn=dataset_urn,
                     aspect=operation_aspect,
                 )
                 wu = MetadataWorkUnit(
                     id=f"operation-aspect-{resource}-{start_time.isoformat()}",
                     mcp=mcp,
                 )
                 yield wu
Example #9
    def _get_ownership(self, creator_id: int) -> Optional[OwnershipClass]:
        user_info_url = f"{self.config.connect_uri}/api/user/{creator_id}"
        try:
            user_info_response = self.session.get(user_info_url)
            user_info_response.raise_for_status()
            user_details = user_info_response.json()
        except HTTPError as http_error:
            self.report.report_failure(
                key=f"metabase-user-{creator_id}",
                reason=f"Unable to retrieve User info. "
                f"Reason: {str(http_error)}",
            )
            return None

        owner_urn = builder.make_user_urn(user_details.get("email", ""))
        if owner_urn is not None:
            ownership: OwnershipClass = OwnershipClass(owners=[
                OwnerClass(
                    owner=owner_urn,
                    type=OwnershipTypeClass.DATAOWNER,
                )
            ])
            return ownership

        return None
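
Note that builder.make_user_urn always returns a string, so the owner_urn None-check never fails in practice; the effective error handling here is the HTTPError branch above, which bails out before the URN is built.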
Example #10
    def construct_dashboard_from_api_data(
            self, dashboard_info: dict) -> Optional[DashboardSnapshot]:

        dashboard_id = dashboard_info.get("id", "")
        dashboard_url = f"{self.config.connect_uri}/api/dashboard/{dashboard_id}"
        try:
            dashboard_response = self.session.get(dashboard_url)
            dashboard_response.raise_for_status()
            dashboard_details = dashboard_response.json()
        except HTTPError as http_error:
            self.report.report_failure(
                key=f"metabase-dashboard-{dashboard_id}",
                reason=f"Unable to retrieve dashboard. "
                f"Reason: {str(http_error)}",
            )
            return None

        dashboard_urn = builder.make_dashboard_urn(
            self.platform, dashboard_details.get("id", ""))
        dashboard_snapshot = DashboardSnapshot(
            urn=dashboard_urn,
            aspects=[],
        )
        last_edit_by = dashboard_details.get("last-edit-info") or {}
        modified_actor = builder.make_user_urn(
            last_edit_by.get("email", "unknown"))
        modified_ts = self.get_timestamp_millis_from_ts_string(
            f"{last_edit_by.get('timestamp')}")
        title = dashboard_details.get("name", "") or ""
        description = dashboard_details.get("description", "") or ""
        last_modified = ChangeAuditStamps(
            created=AuditStamp(time=modified_ts, actor=modified_actor),
            lastModified=AuditStamp(time=modified_ts, actor=modified_actor),
        )

        chart_urns = []
        cards_data = dashboard_details.get("ordered_cards") or []
        for card_info in cards_data:
            chart_urn = builder.make_chart_urn(self.platform,
                                               card_info.get("id", ""))
            chart_urns.append(chart_urn)

        dashboard_info_class = DashboardInfoClass(
            description=description,
            title=title,
            charts=chart_urns,
            lastModified=last_modified,
            dashboardUrl=f"{self.config.connect_uri}/dashboard/{dashboard_id}",
            customProperties={},
        )
        dashboard_snapshot.aspects.append(dashboard_info_class)

        # Ownership
        ownership = self._get_ownership(dashboard_details.get(
            "creator_id", ""))
        if ownership is not None:
            dashboard_snapshot.aspects.append(ownership)

        return dashboard_snapshot
Example #11
 def generate_ownership_aspect(self):
     ownership = OwnershipClass(
         owners=[
             OwnerClass(
                 owner=builder.make_user_urn(owner),
                 type=OwnershipTypeClass.DEVELOPER,
                 source=OwnershipSourceClass(
                     type=OwnershipSourceTypeClass.SERVICE,
                     # url=dag.filepath,
                 ),
             ) for owner in (self.owners or [])
         ],
         lastModified=AuditStampClass(time=0,
                                      actor=builder.make_user_urn(
                                          self.orchestrator)),
     )
     return [ownership]
Example #12
class AddDatasetOwnershipConfig(ConfigModel):
    # Workaround for https://github.com/python/mypy/issues/708.
    # Suggested by https://stackoverflow.com/a/64528725/5004662.
    get_owners_to_add: Union[
        Callable[[DatasetSnapshotClass], List[OwnerClass]],
        Callable[[DatasetSnapshotClass], List[OwnerClass]],
    ]
    default_actor: str = builder.make_user_urn("etl")
Example #13
class SimpleDatasetOwnershipConfig(DatasetOwnershipBaseConfig):
    owner_urns: List[str]
    default_actor: str = builder.make_user_urn("etl")
    semantics: Semantics = Semantics.OVERWRITE

    @validator("semantics", pre=True)
    def upper_case_semantics(cls, v):
        if isinstance(v, str):
            return v.upper()
        return v
Example #14
def test_simple_dataset_ownership_with_invalid_type_transformation(mock_time):
    with pytest.raises(ValueError):
        SimpleAddDatasetOwnership.create(
            {
                "owner_urns": [
                    builder.make_user_urn("person1"),
                ],
                "ownership_type": "INVALID_TYPE",
            },
            PipelineContext(run_id="test"),
        )
Example #15
    def _get_ownership(self, user: str) -> Optional[OwnershipClass]:
        if user is not None:
            owner_urn = builder.make_user_urn(user)
            ownership: OwnershipClass = OwnershipClass(owners=[
                OwnerClass(
                    owner=owner_urn,
                    type=OwnershipTypeClass.DATAOWNER,
                )
            ])
            return ownership

        return None
Example #16
 def _create_operation_aspect_work_unit(
         self, event: QueryEvent) -> Optional[MetadataWorkUnit]:
     if event.statementType in OPERATION_STATEMENT_TYPES and event.destinationTable:
         destination_table: BigQueryTableRef
         try:
             destination_table = event.destinationTable.remove_extras()
         except Exception as e:
             self.report.report_warning(
                 str(event.destinationTable),
                 f"Failed to clean up destination table, {e}",
             )
             return None
         reported_time: int = int(time.time() * 1000)
         last_updated_timestamp: int = int(event.timestamp.timestamp() *
                                           1000)
         affected_datasets = []
         if event.referencedTables:
             for table in event.referencedTables:
                 try:
                     affected_datasets.append(
                         _table_ref_to_urn(
                             table.remove_extras(),
                             self.config.env,
                         ))
                 except Exception as e:
                     self.report.report_warning(
                         str(table),
                         f"Failed to clean up table, {e}",
                     )
         operation_aspect = OperationClass(
             timestampMillis=reported_time,
             lastUpdatedTimestamp=last_updated_timestamp,
             actor=builder.make_user_urn(event.actor_email.split("@")[0]),
             operationType=OPERATION_STATEMENT_TYPES[event.statementType],
             affectedDatasets=affected_datasets,
         )
         mcp = MetadataChangeProposalWrapper(
             entityType="dataset",
             aspectName="operation",
             changeType=ChangeTypeClass.UPSERT,
             entityUrn=_table_ref_to_urn(
                 destination_table,
                 env=self.config.env,
             ),
             aspect=operation_aspect,
         )
         return MetadataWorkUnit(
             id=f"{event.timestamp.isoformat()}-operation-aspect-{destination_table}",
             mcp=mcp,
         )
     return None
Example #17
 def get_last_modified(self, creator: str, created_at: bytes,
                       updated_at: bytes) -> ChangeAuditStamps:
     last_modified = ChangeAuditStamps()
     if creator:
         modified_actor = builder.make_user_urn(creator)
         created_ts = int(dp.parse(created_at).timestamp() * 1000)
         modified_ts = int(dp.parse(updated_at).timestamp() * 1000)
         last_modified = ChangeAuditStamps(
             created=AuditStamp(time=created_ts, actor=modified_actor),
             lastModified=AuditStamp(time=modified_ts,
                                     actor=modified_actor),
         )
     return last_modified
Example #18
def test_pattern_dataset_ownership_with_invalid_type_transformation(mock_time):
    with pytest.raises(ValueError):
        PatternAddDatasetOwnership.create(
            {
                "owner_pattern": {
                    "rules": {
                        ".*example1.*": [builder.make_user_urn("person1")],
                    }
                },
                "ownership_type": "INVALID_TYPE",
            },
            PipelineContext(run_id="test"),
        )
Example #19
    def _gen_operation_aspect_workunits_from_access_events(
        self,
        events_iterable: Iterable[RedshiftAccessEvent],
    ) -> Iterable[MetadataWorkUnit]:
        self.report.num_operational_stats_workunits_emitted = 0
        for event in events_iterable:
            if not (
                event.database
                and event.username
                and event.schema_
                and event.table
                and event.endtime
                and event.operation_type
            ):
                continue

            assert event.operation_type in ["insert", "delete"]

            resource: str = f"{event.database}.{event.schema_}.{event.table}"
            reported_time: int = int(time.time() * 1000)
            last_updated_timestamp: int = int(event.endtime.timestamp() * 1000)
            user_email: str = event.username
            operation_aspect = OperationClass(
                timestampMillis=reported_time,
                lastUpdatedTimestamp=last_updated_timestamp,
                actor=builder.make_user_urn(user_email.split("@")[0]),
                operationType=(
                    OperationTypeClass.INSERT
                    if event.operation_type == "insert"
                    else OperationTypeClass.DELETE
                ),
            )
            mcp = MetadataChangeProposalWrapper(
                entityType="dataset",
                aspectName="operation",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=builder.make_dataset_urn_with_platform_instance(
                    "redshift",
                    resource.lower(),
                    self.config.platform_instance,
                    self.config.env,
                ),
                aspect=operation_aspect,
            )
            wu = MetadataWorkUnit(
                id=f"operation-aspect-{event.table}-{event.endtime.isoformat()}",
                mcp=mcp,
            )
            self.report.report_workunit(wu)
            self.report.num_operational_stats_workunits_emitted += 1
            yield wu
Example #20
    def make_usage_workunit(
        self,
        bucket_duration: BucketDuration,
        urn_builder: Callable[[ResourceType], str],
        top_n_queries: int,
        format_sql_queries: bool,
    ) -> MetadataWorkUnit:
        budget_per_query: int = int(self.total_budget_for_query_list / top_n_queries)

        usageStats = DatasetUsageStatisticsClass(
            timestampMillis=int(self.bucket_start_time.timestamp() * 1000),
            eventGranularity=TimeWindowSizeClass(unit=bucket_duration, multiple=1),
            uniqueUserCount=len(self.userFreq),
            totalSqlQueries=self.queryCount,
            topSqlQueries=[
                self.trim_query(
                    format_sql_query(query, keyword_case="upper", reindent_aligned=True)
                    if format_sql_queries
                    else query,
                    budget_per_query,
                )
                for query, _ in self.queryFreq.most_common(top_n_queries)
            ],
            userCounts=[
                DatasetUserUsageCountsClass(
                    user=builder.make_user_urn(user_email.split("@")[0]),
                    count=count,
                    userEmail=user_email,
                )
                for user_email, count in self.userFreq.most_common()
            ],
            fieldCounts=[
                DatasetFieldUsageCountsClass(
                    fieldPath=column,
                    count=count,
                )
                for column, count in self.columnFreq.most_common()
            ],
        )

        mcp = MetadataChangeProposalWrapper(
            entityType="dataset",
            aspectName="datasetUsageStatistics",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=urn_builder(self.resource),
            aspect=usageStats,
        )

        return MetadataWorkUnit(
            id=f"{self.bucket_start_time.isoformat()}-{self.resource}", mcp=mcp
        )
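
trim_query and total_budget_for_query_list are defined elsewhere on this aggregate class. A plausible sketch, assuming the budget is a simple character count per query, would be:

    def trim_query(self, query: str, budget_per_query: int) -> str:
        # hypothetical helper: cap each stored query at its share of the
        # character budget so topSqlQueries stays within aspect size limits
        if len(query) > budget_per_query:
            return query[:budget_per_query] + "..."
        return query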
Example #21
def get_owners(owners: Owners) -> models.OwnershipClass:
    owners_meta: List[models.OwnerClass] = []
    if owners.users is not None:
        owners_meta = owners_meta + [
            models.OwnerClass(
                owner=make_user_urn(o),
                type=models.OwnershipTypeClass.DEVELOPER,
            ) for o in owners.users
        ]
    if owners.groups is not None:
        owners_meta = owners_meta + [
            models.OwnerClass(
                owner=make_group_urn(o),
                type=models.OwnershipTypeClass.DEVELOPER,
            ) for o in owners.groups
        ]
    return models.OwnershipClass(owners=owners_meta)
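
get_owners expects an Owners object exposing optional users and groups lists. A minimal pydantic-style sketch, with field names inferred from the attribute access above, would be:

from typing import List, Optional

from pydantic import BaseModel

class Owners(BaseModel):
    # hypothetical model: usernames become corpuser URNs, group names corpGroup URNs
    users: Optional[List[str]] = None
    groups: Optional[List[str]] = None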
Example #22
class AddDatasetOwnershipConfig(ConfigModel):
    # Workaround for https://github.com/python/mypy/issues/708.
    # Suggested by https://stackoverflow.com/a/64528725/5004662.
    get_owners_to_add: Union[
        Callable[[DatasetSnapshotClass], List[OwnerClass]],
        Callable[[DatasetSnapshotClass], List[OwnerClass]],
    ]
    default_actor: str = builder.make_user_urn("etl")
    semantics: Semantics = Semantics.OVERWRITE

    _resolve_owner_fn = pydantic_resolve_key("get_owners_to_add")

    @validator("semantics", pre=True)
    def ensure_semantics_is_upper_case(cls, v):
        if isinstance(v, str):
            return v.upper()
        return v
Example #23
def make_dataset_with_owner() -> models.MetadataChangeEventClass:
    return models.MetadataChangeEventClass(
        proposedSnapshot=models.DatasetSnapshotClass(
            urn="urn:li:dataset:(urn:li:dataPlatform:bigquery,example2,PROD)",
            aspects=[
                models.OwnershipClass(
                    owners=[
                        models.OwnerClass(
                            owner=builder.make_user_urn("fake_owner"),
                            type=models.OwnershipTypeClass.DATAOWNER,
                        ),
                    ],
                    lastModified=models.AuditStampClass(
                        time=1625266033123, actor="urn:li:corpuser:datahub"),
                )
            ],
        ),
    )
Example #24
    def to_datahub_user(
            self,
            user: PowerBiAPI.User) -> List[MetadataChangeProposalWrapper]:
        """
        Map PowerBi user to datahub user
        """

        LOGGER.info("Converting user {}(id={}) to datahub's user".format(
            user.displayName, user.id))

        # Create an URN for user
        user_urn = builder.make_user_urn(user.get_urn_part())

        user_info_instance = CorpUserInfoClass(
            displayName=user.displayName,
            email=user.emailAddress,
            title=user.displayName,
            active=True,
        )

        info_mcp = self.new_mcp(
            entity_type=Constant.CORP_USER,
            entity_urn=user_urn,
            aspect_name=Constant.CORP_USER_INFO,
            aspect=user_info_instance,
        )

        # removed status mcp
        status_mcp = self.new_mcp(
            entity_type=Constant.CORP_USER,
            entity_urn=user_urn,
            aspect_name=Constant.STATUS,
            aspect=StatusClass(removed=False),
        )

        user_key = CorpUserKeyClass(username=user.id)

        user_key_mcp = self.new_mcp(
            entity_type=Constant.CORP_USER,
            entity_urn=user_urn,
            aspect_name=Constant.CORP_USER_KEY,
            aspect=user_key,
        )

        return [info_mcp, status_mcp, user_key_mcp]
Example #25
    def _gen_operation_aspect_workunits_by_type_from_access_events(
        self,
        events_iterable: Iterable[RedshiftAccessEvent],
        operation_type: Union[str, "OperationTypeClass"],
    ) -> Iterable[MetadataWorkUnit]:
        for event in events_iterable:
            if not (event.database and event.username and event.schema_
                    and event.table and event.endtime):
                continue

            resource: str = f"{event.database}.{event.schema_}.{event.table}"
            last_updated_timestamp: int = int(event.endtime.timestamp() * 1000)
            user_email: str = event.username

            operation_aspect = OperationClass(
                timestampMillis=last_updated_timestamp,
                lastUpdatedTimestamp=last_updated_timestamp,
                actor=builder.make_user_urn(user_email.split("@")[0]),
                operationType=operation_type,
            )
            mcp = MetadataChangeProposalWrapper(
                entityType="dataset",
                aspectName="operation",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=builder.make_dataset_urn_with_platform_instance(
                    "redshift",
                    resource.lower(),
                    self.config.platform_instance,
                    self.config.env,
                ),
                aspect=operation_aspect,
            )
            wu = MetadataWorkUnit(
                id=f"operation-aspect-{event.table}-{event.endtime.isoformat()}",
                mcp=mcp,
            )
            self.report.report_workunit(wu)
            yield wu
Example #26
    def construct_card_from_api_data(
            self, card_data: dict) -> Optional[ChartSnapshot]:
        card_id = card_data.get("id", "")
        card_url = f"{self.config.connect_uri}/api/card/{card_id}"
        try:
            card_response = self.session.get(card_url)
            card_response.raise_for_status()
            card_details = card_response.json()
        except HTTPError as http_error:
            self.report.report_failure(
                key=f"metabase-card-{card_id}",
                reason=f"Unable to retrieve Card info. "
                f"Reason: {str(http_error)}",
            )
            return None

        chart_urn = builder.make_chart_urn(self.platform, card_id)
        chart_snapshot = ChartSnapshot(
            urn=chart_urn,
            aspects=[],
        )

        last_edit_by = card_details.get("last-edit-info") or {}
        modified_actor = builder.make_user_urn(
            last_edit_by.get("email", "unknown"))
        modified_ts = self.get_timestamp_millis_from_ts_string(
            f"{last_edit_by.get('timestamp')}")
        last_modified = ChangeAuditStamps(
            created=AuditStamp(time=modified_ts, actor=modified_actor),
            lastModified=AuditStamp(time=modified_ts, actor=modified_actor),
        )

        chart_type = self._get_chart_type(card_details.get("id", ""),
                                          card_details.get("display"))
        description = card_details.get("description") or ""
        title = card_details.get("name") or ""
        datasource_urn = self.get_datasource_urn(card_details)
        custom_properties = self.construct_card_custom_properties(card_details)

        chart_info = ChartInfoClass(
            type=chart_type,
            description=description,
            title=title,
            lastModified=last_modified,
            chartUrl=f"{self.config.connect_uri}/card/{card_id}",
            inputs=datasource_urn,
            customProperties=custom_properties,
        )
        chart_snapshot.aspects.append(chart_info)

        if card_details.get("query_type", "") == "native":
            raw_query = (card_details.get("dataset_query",
                                          {}).get("native",
                                                  {}).get("query", ""))
            chart_query_native = ChartQueryClass(
                rawQuery=raw_query,
                type=ChartQueryTypeClass.SQL,
            )
            chart_snapshot.aspects.append(chart_query_native)

        # Ownership
        ownership = self._get_ownership(card_details.get("creator_id", ""))
        if ownership is not None:
            chart_snapshot.aspects.append(ownership)

        return chart_snapshot
Example #27
    def send_lineage(
        operator: "BaseOperator",
        inlets: Optional[List] = None,
        outlets: Optional[List] = None,
        context: Optional[Dict] = None,
    ) -> None:
        context = context or {}  # ensure not None to satisfy mypy

        dag: "DAG" = context["dag"]
        task = context["task"]
        # task_instance: "TaskInstance" = context["task_instance"]

        # TODO: verify if task and operator are the same?
        # TODO: use dag serialization to just save the whole thing.
        # TODO: save context.get("conf")
        # TODO: save DAG tags
        # TODO: save context.get("dag_run")
        # TODO: save all the data from task_instance
        # TODO: capture raw sql from db operators

        flow_urn = builder.make_data_flow_urn("airflow", dag.dag_id)
        job_urn = builder.make_data_job_urn_with_flow(flow_urn, task.task_id)

        timestamp = int(
            dateutil.parser.parse(context["ts"]).timestamp() * 1000)
        ownership = models.OwnershipClass(
            owners=[
                models.OwnerClass(
                    owner=dag.owner,
                    type=models.OwnershipTypeClass.DEVELOPER,
                    source=models.OwnershipSourceClass(
                        type=models.OwnershipSourceTypeClass.SERVICE,
                        url=dag.filepath,
                    ),
                )
            ],
            lastModified=models.AuditStampClass(
                time=timestamp, actor=builder.make_user_urn("airflow")),
        )

        flow_mce = models.MetadataChangeEventClass(
            proposedSnapshot=models.DataFlowSnapshotClass(
                urn=flow_urn,
                aspects=[
                    models.DataFlowInfoClass(
                        name=dag.dag_id,
                        description=f"{dag.description}\n\n{dag.doc_md}",
                    ),
                    ownership,
                ],
            ))

        job_mce = models.MetadataChangeEventClass(
            proposedSnapshot=models.DataJobSnapshotClass(
                urn=job_urn,
                aspects=[
                    models.DataJobInfoClass(
                        name=task.task_id,
                        type=models.AzkabanJobTypeClass.COMMAND,
                        description=None,  # TODO: add datajob description
                    ),
                    models.DataJobInputOutputClass(
                        inputDatasets=_entities_to_urn_list(inlets or []),
                        outputDatasets=_entities_to_urn_list(outlets or []),
                    ),
                    ownership,
                ],
            ))

        lineage_mces = [
            builder.make_lineage_mce(_entities_to_urn_list(inlets or []),
                                     outlet)
            for outlet in _entities_to_urn_list(outlets or [])
        ]

        hook = make_emitter_hook()

        mces = [
            flow_mce,
            job_mce,
            *lineage_mces,
        ]
        operator.log.info("DataHub lineage backend - emitting metadata:\n" +
                          "\n".join(json.dumps(mce.to_obj()) for mce in mces))
        hook.emit_mces(mces)
Example #28
# read-modify-write requires access to the DataHubGraph (RestEmitter is not enough)
import logging
from typing import Optional

from datahub.emitter.mce_builder import make_dataset_urn, make_user_urn
from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph

# Imports for metadata model classes
from datahub.metadata.schema_classes import (
    ChangeTypeClass,
    OwnerClass,
    OwnershipClass,
    OwnershipTypeClass,
)

log = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Inputs -> owner, ownership_type, dataset
owner_to_add = make_user_urn("jdoe")
ownership_type = OwnershipTypeClass.DATAOWNER
dataset_urn = make_dataset_urn(platform="hive",
                               name="realestate_db.sales",
                               env="PROD")

# Some objects to help with conditional pathways later
owner_class_to_add = OwnerClass(owner=owner_to_add, type=ownership_type)
ownership_to_add = OwnershipClass(owners=[owner_class_to_add])

# First we get the current owners
gms_endpoint = "http://localhost:8080"
graph = DataHubGraph(DatahubClientConfig(server=gms_endpoint))

current_owners: Optional[OwnershipClass] = graph.get_aspect_v2(
    entity_urn=dataset_urn,
    aspect="ownership",
    aspect_type=OwnershipClass,
)
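
The snippet is truncated at this call. A plausible completion of the read-modify-write flow, assuming MetadataChangeProposalWrapper from datahub.emitter.mcp, appends the owner only if it is missing and emits the merged aspect:

from datahub.emitter.mcp import MetadataChangeProposalWrapper

need_write = False
if current_owners:
    if (owner_to_add, ownership_type) not in [
        (owner.owner, owner.type) for owner in current_owners.owners
    ]:
        # owners exist, but this one is not among them yet
        current_owners.owners.append(owner_class_to_add)
        need_write = True
else:
    # no ownership aspect on the dataset yet; start from the one built above
    current_owners = ownership_to_add
    need_write = True

if need_write:
    graph.emit(
        MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=dataset_urn,
            aspectName="ownership",
            aspect=current_owners,
        )
    )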
Example #29
def send_lineage_to_datahub(
    config: DatahubBasicLineageConfig,
    operator: "BaseOperator",
    inlets: List[_Entity],
    outlets: List[_Entity],
    context: Dict,
) -> None:
    # This is necessary to avoid issues with circular imports.
    from airflow.serialization.serialized_objects import (
        SerializedBaseOperator,
        SerializedDAG,
    )

    dag: "DAG" = context["dag"]
    task: "BaseOperator" = context["task"]

    # resolve URNs for upstream nodes in subdags upstream of the current task.
    upstream_subdag_task_urns: List[str] = []

    for upstream_task_id in task.upstream_task_ids:
        upstream_task = dag.task_dict[upstream_task_id]

        # if upstream task is not a subdag, then skip it
        if upstream_task.subdag is None:
            continue

        # else, link the leaf tasks of the upstream subdag as upstream tasks
        upstream_subdag = upstream_task.subdag

        upstream_subdag_flow_urn = builder.make_data_flow_urn(
            "airflow", upstream_subdag.dag_id, config.cluster)

        for upstream_subdag_task_id in upstream_subdag.task_dict:
            upstream_subdag_task = upstream_subdag.task_dict[
                upstream_subdag_task_id]

            upstream_subdag_task_urn = builder.make_data_job_urn_with_flow(
                upstream_subdag_flow_urn, upstream_subdag_task_id)

            # if subdag task is a leaf task, then link it as an upstream task
            if len(upstream_subdag_task._downstream_task_ids) == 0:

                upstream_subdag_task_urns.append(upstream_subdag_task_urn)

    # resolve URNs for upstream nodes that trigger the subdag containing the current task.
    # (if it is in a subdag at all)
    upstream_subdag_triggers: List[str] = []

    # subdags are always named with 'parent.child' style or Airflow won't run them
    # add connection from subdag trigger(s) if subdag task has no upstreams
    if (dag.is_subdag and dag.parent_dag is not None
            and len(task._upstream_task_ids) == 0):

        # filter through the parent dag's tasks and find the subdag trigger(s)
        subdags = [
            x for x in dag.parent_dag.task_dict.values()
            if x.subdag is not None
        ]
        matched_subdags = [
            x for x in subdags
            if getattr(getattr(x, "subdag"), "dag_id") == dag.dag_id
        ]

        # id of the task containing the subdag
        subdag_task_id = matched_subdags[0].task_id

        parent_dag_urn = builder.make_data_flow_urn("airflow",
                                                    dag.parent_dag.dag_id,
                                                    config.cluster)

        # iterate through the parent dag's tasks and find the ones that trigger the subdag
        for upstream_task_id in dag.parent_dag.task_dict:
            upstream_task = dag.parent_dag.task_dict[upstream_task_id]

            upstream_task_urn = builder.make_data_job_urn_with_flow(
                parent_dag_urn, upstream_task_id)

            # if the task triggers the subdag, link it to this node in the subdag
            if subdag_task_id in upstream_task._downstream_task_ids:
                upstream_subdag_triggers.append(upstream_task_urn)

    # TODO: capture context
    # context dag_run
    # task_instance: "TaskInstance" = context["task_instance"]
    # TODO: capture raw sql from db operators

    flow_urn = builder.make_data_flow_urn("airflow", dag.dag_id,
                                          config.cluster)
    job_urn = builder.make_data_job_urn_with_flow(flow_urn, task.task_id)

    base_url = conf.get("webserver", "base_url")
    flow_url = f"{base_url}/tree?dag_id={dag.dag_id}"
    job_url = f"{base_url}/taskinstance/list/?flt1_dag_id_equals={dag.dag_id}&_flt_3_task_id={task.task_id}"
    # operator.log.info(f"{flow_url=}")
    # operator.log.info(f"{job_url=}")
    # operator.log.info(f"{dag.get_serialized_fields()=}")
    # operator.log.info(f"{task.get_serialized_fields()=}")
    # operator.log.info(f"{SerializedDAG.serialize_dag(dag)=}")

    flow_property_bag: Dict[str, str] = {
        key: repr(value)
        for (key, value) in SerializedDAG.serialize_dag(dag).items()
    }
    for key in dag.get_serialized_fields():
        if key not in flow_property_bag:
            flow_property_bag[key] = repr(getattr(dag, key))
    job_property_bag: Dict[str, str] = {
        key: repr(value)
        for (key,
             value) in SerializedBaseOperator.serialize_operator(task).items()
    }
    for key in task.get_serialized_fields():
        if key not in job_property_bag:
            job_property_bag[key] = repr(getattr(task, key))
    # operator.log.info(f"{flow_property_bag=}")
    # operator.log.info(f"{job_property_bag=}")
    allowed_task_keys = [
        "_downstream_task_ids",
        "_inlets",
        "_outlets",
        "_task_type",
        "_task_module",
        "depends_on_past",
        "email",
        "label",
        "execution_timeout",
        "end_date",
        "start_date",
        "sla",
        "sql",
        "task_id",
        "trigger_rule",
        "wait_for_downstream",
    ]
    job_property_bag = {
        k: v
        for (k, v) in job_property_bag.items() if k in allowed_task_keys
    }
    allowed_flow_keys = [
        "_access_control",
        "_concurrency",
        "_default_view",
        "catchup",
        "fileloc",
        "is_paused_upon_creation",
        "start_date",
        "tags",
        "timezone",
    ]
    flow_property_bag = {
        k: v
        for (k, v) in flow_property_bag.items() if k in allowed_flow_keys
    }

    if config.capture_ownership_info:
        ownership = models.OwnershipClass(
            owners=[
                models.OwnerClass(
                    owner=builder.make_user_urn(dag.owner),
                    type=models.OwnershipTypeClass.DEVELOPER,
                    source=models.OwnershipSourceClass(
                        type=models.OwnershipSourceTypeClass.SERVICE,
                        url=dag.filepath,
                    ),
                )
            ],
            lastModified=models.AuditStampClass(
                time=0, actor=builder.make_user_urn("airflow")),
        )
        # operator.log.info(f"{ownership=}")
        ownership_aspect = [ownership]
    else:
        ownership_aspect = []

    if config.capture_tags_info:
        tags = models.GlobalTagsClass(tags=[
            models.TagAssociationClass(tag=builder.make_tag_urn(tag))
            for tag in (dag.tags or [])
        ])
        # operator.log.info(f"{tags=}")
        tags_aspect = [tags]
    else:
        tags_aspect = []

    flow_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataFlowSnapshotClass(
            urn=flow_urn,
            aspects=[
                models.DataFlowInfoClass(
                    name=dag.dag_id,
                    description=f"{dag.description}\n\n{dag.doc_md or ''}",
                    customProperties=flow_property_bag,
                    externalUrl=flow_url,
                ),
                *ownership_aspect,
                *tags_aspect,
            ],
        ))

    # exclude subdag operator tasks since these are not emitted, resulting in empty metadata
    upstream_tasks = ([
        builder.make_data_job_urn_with_flow(flow_urn, task_id)
        for task_id in task.upstream_task_ids
        if dag.task_dict[task_id].subdag is None
    ] + upstream_subdag_task_urns + upstream_subdag_triggers)

    job_doc = ((operator.doc or operator.doc_md or operator.doc_json
                or operator.doc_yaml or operator.doc_rst)
               if not AIRFLOW_1 else None)

    job_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataJobSnapshotClass(
            urn=job_urn,
            aspects=[
                models.DataJobInfoClass(
                    name=task.task_id,
                    type=models.AzkabanJobTypeClass.COMMAND,
                    description=job_doc,
                    customProperties=job_property_bag,
                    externalUrl=job_url,
                ),
                models.DataJobInputOutputClass(
                    inputDatasets=_entities_to_urn_list(inlets or []),
                    outputDatasets=_entities_to_urn_list(outlets or []),
                    inputDatajobs=upstream_tasks,
                ),
                *ownership_aspect,
                *tags_aspect,
            ],
        ))

    force_entity_materialization = [
        models.MetadataChangeEventClass(
            proposedSnapshot=models.DatasetSnapshotClass(
                urn=iolet,
                aspects=[
                    models.StatusClass(removed=False),
                ],
            ))
        for iolet in _entities_to_urn_list((inlets or []) + (outlets or []))
    ]

    hook = config.make_emitter_hook()

    mces = [
        flow_mce,
        job_mce,
        *force_entity_materialization,
    ]
    operator.log.info("DataHub lineage backend - emitting metadata:\n" +
                      "\n".join(json.dumps(mce.to_obj()) for mce in mces))
    hook.emit_mces(mces)
Example #30
    def construct_chart_from_api_data(self, chart_data: dict, query: dict,
                                      path: str) -> ChartSnapshot:
        chart_urn = builder.make_chart_urn(self.platform,
                                           chart_data.get("token", ""))
        chart_snapshot = ChartSnapshot(
            urn=chart_urn,
            aspects=[],
        )

        last_modified = ChangeAuditStamps()
        creator = self._get_creator(
            chart_data.get("_links", {}).get("creator", {}).get("href", ""))
        if creator is not None:
            modified_actor = builder.make_user_urn(creator)
            created_ts = int(
                dp.parse(chart_data.get("created_at", "now")).timestamp() *
                1000)
            modified_ts = int(
                dp.parse(chart_data.get("updated_at", "now")).timestamp() *
                1000)
            last_modified = ChangeAuditStamps(
                created=AuditStamp(time=created_ts, actor=modified_actor),
                lastModified=AuditStamp(time=modified_ts,
                                        actor=modified_actor),
            )

        chart_detail = (chart_data.get("view", {})
                        if len(chart_data.get("view", {})) != 0 else
                        chart_data.get("view_vegas", {}))

        mode_chart_type = chart_detail.get(
            "chartType", "") or chart_detail.get("selectedChart", "")
        chart_type = self._get_chart_type(chart_data.get("token", ""),
                                          mode_chart_type)
        description = (chart_detail.get("description")
                       or chart_detail.get("chartDescription") or "")
        title = chart_detail.get("title") or chart_detail.get(
            "chartTitle") or ""

        # create datasource urn
        platform, db_name = self._get_platform_and_dbname(
            query.get("data_source_id"))
        source_tables = self._get_source_from_query(query.get("raw_query"))
        datasource_urn = self._get_datasource_urn(platform, db_name,
                                                  source_tables)
        custom_properties = self.construct_chart_custom_properties(
            chart_detail, mode_chart_type)

        # Chart Info
        chart_info = ChartInfoClass(
            type=chart_type,
            description=description,
            title=title,
            lastModified=last_modified,
            chartUrl=f"{self.config.connect_uri}"
            f"{chart_data.get('_links', {}).get('report_viz_web', {}).get('href', '')}",
            inputs=datasource_urn,
            customProperties=custom_properties,
        )
        chart_snapshot.aspects.append(chart_info)

        # Browse Path
        browse_path = BrowsePathsClass(paths=[path])
        chart_snapshot.aspects.append(browse_path)

        # Query
        chart_query = ChartQueryClass(
            rawQuery=query.get("raw_query", ""),
            type=ChartQueryTypeClass.SQL,
        )
        chart_snapshot.aspects.append(chart_query)

        # Ownership
        ownership = self._get_ownership(
            self._get_creator(
                chart_data.get("_links", {}).get("creator",
                                                 {}).get("href", "")))
        if ownership is not None:
            chart_snapshot.aspects.append(ownership)

        return chart_snapshot