Example #1
    def __init__(self, ctx: PipelineContext, config: DatahubRestSinkConfig):
        super().__init__(ctx)
        self.config = config
        self.report = DataHubRestSinkReport()
        self.emitter = DatahubRestEmitter(
            self.config.server,
            self.config.token,
            connect_timeout_sec=self.config.timeout_sec,  # reuse timeout_sec for connect timeout
            read_timeout_sec=self.config.timeout_sec,
            retry_status_codes=self.config.retry_status_codes,
            retry_max_times=self.config.retry_max_times,
            extra_headers=self.config.extra_headers,
            ca_certificate_path=self.config.ca_certificate_path,
        )
        try:
            gms_config = self.emitter.test_connection()
        except Exception as exc:
            raise ConfigurationError(
                f"💥 Failed to connect to DataHub@{self.config.server} (token:{'XXX-redacted' if self.config.token else 'empty'}) over REST",
                exc,
            )

        self.report.gms_version = (
            gms_config.get("versions", {})
            .get("linkedin/datahub", {})
            .get("version", "")
        )
        logger.debug("Setting env variables to override config")
        set_env_variables_override_config(self.config.server, self.config.token)
        logger.debug("Setting gms config")
        set_gms_config(gms_config)
        self.executor = concurrent.futures.ThreadPoolExecutor(
            max_workers=self.config.max_threads
        )
Example #2
class DatahubRestSink(Sink):
    config: DatahubRestSinkConfig
    emitter: DatahubRestEmitter
    report: SinkReport

    def __init__(self, ctx: PipelineContext, config: DatahubRestSinkConfig):
        super().__init__(ctx)
        self.config = config
        self.report = SinkReport()
        self.emitter = DatahubRestEmitter(
            self.config.server,
            self.config.token,
            connect_timeout_sec=self.config.timeout_sec,  # reuse timeout_sec for connect timeout
            read_timeout_sec=self.config.timeout_sec,
        )
        self.emitter.test_connection()

    @classmethod
    def create(cls, config_dict: dict, ctx: PipelineContext) -> "DatahubRestSink":
        config = DatahubRestSinkConfig.parse_obj(config_dict)
        return cls(ctx, config)

    def handle_work_unit_start(self, workunit: WorkUnit) -> None:
        pass

    def handle_work_unit_end(self, workunit: WorkUnit) -> None:
        pass

    def write_record_async(
        self,
        record_envelope: RecordEnvelope[
            Union[
                MetadataChangeEvent,
                MetadataChangeProposal,
                MetadataChangeProposalWrapper,
                UsageAggregation,
            ]
        ],
        write_callback: WriteCallback,
    ) -> None:
        record = record_envelope.record

        try:
            self.emitter.emit(record)
            self.report.report_record_written(record_envelope)
            write_callback.on_success(record_envelope, {})
        except OperationalError as e:
            self.report.report_failure({"error": e.message, "info": e.info})
            write_callback.on_failure(record_envelope, e, e.info)
        except Exception as e:
            self.report.report_failure({"e": e})
            write_callback.on_failure(record_envelope, e, {})

    def get_report(self) -> SinkReport:
        return self.report

    def close(self):
        pass
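
A minimal usage sketch for wiring this sink up directly (assumptions: PipelineContext accepts a run_id keyword, the config keys below mirror the DatahubRestSinkConfig fields referenced in __init__, and a GMS instance is reachable at the given server since __init__ calls test_connection()):

# Hypothetical wiring of DatahubRestSink via its create() classmethod; values are placeholders.
from datahub.ingestion.api.common import PipelineContext

sink = DatahubRestSink.create(
    {"server": "http://localhost:8080", "token": None},
    PipelineContext(run_id="manual-run"),
)
print(sink.get_report())
sink.close()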
Example #3
 def __init__(self, ctx: PipelineContext, config: DatahubRestSinkConfig):
     super().__init__(ctx)
     self.config = config
     self.report = SinkReport()
     self.emitter = DatahubRestEmitter(
         self.config.server,
         self.config.token,
         connect_timeout_sec=self.config.timeout_sec,  # reuse timeout_sec for connect timeout
         read_timeout_sec=self.config.timeout_sec,
     )
     self.emitter.test_connection()
 def __init__(self, ctx: PipelineContext, config: DatahubRestSinkConfig):
     super().__init__(ctx)
     self.config = config
     self.report = SinkReport()
     self.emitter = DatahubRestEmitter(
         self.config.server,
         self.config.token,
         connect_timeout_sec=self.config.timeout_sec,  # reuse timeout_sec for connect timeout
         read_timeout_sec=self.config.timeout_sec,
         extra_headers=self.config.extra_headers,
         ca_certificate_path=self.config.ca_certificate_path,
     )
     self.emitter.test_connection()
     self.executor = concurrent.futures.ThreadPoolExecutor(
         max_workers=self.config.max_threads)
Example #5
def test_datahub_rest_emitter(requests_mock, record, path, snapshot):
    def match_request_text(request: requests.Request) -> bool:
        requested_snapshot = request.json()
        assert (
            requested_snapshot == snapshot
        ), f"Expected snapshot to be {json.dumps(snapshot)}, got {json.dumps(requested_snapshot)}"
        return True

    requests_mock.post(
        f"{MOCK_GMS_ENDPOINT}{path}",
        request_headers={"X-RestLi-Protocol-Version": "2.0.0"},
        additional_matcher=match_request_text,
    )

    emitter = DatahubRestEmitter(MOCK_GMS_ENDPOINT)
    emitter.emit(record)
def test_datahub_rest_emitter_retry_construction():
    emitter = DatahubRestEmitter(
        MOCK_GMS_ENDPOINT,
        retry_status_codes=[418],
        retry_max_times=42,
    )
    assert emitter._retry_status_codes == [418]
    assert emitter._retry_max_times == 42
def test_datahub_rest_emitter_extra_params():
    emitter = DatahubRestEmitter(MOCK_GMS_ENDPOINT,
                                 extra_headers={
                                     "key1": "value1",
                                     "key2": "value2"
                                 })
    assert emitter._session.headers.get("key1") == "value1"
    assert emitter._session.headers.get("key2") == "value2"
Example #8
class DatahubRestSink(Sink):
    config: DatahubRestSinkConfig
    emitter: DatahubRestEmitter
    report: SinkReport

    def __init__(self, ctx: PipelineContext, config: DatahubRestSinkConfig):
        super().__init__(ctx)
        self.config = config
        self.report = SinkReport()
        self.emitter = DatahubRestEmitter(self.config.server)

    @classmethod
    def create(cls, config_dict: dict,
               ctx: PipelineContext) -> "DatahubRestSink":
        config = DatahubRestSinkConfig.parse_obj(config_dict)
        return cls(ctx, config)

    def handle_work_unit_start(self, workunit: WorkUnit) -> None:
        pass

    def handle_work_unit_end(self, workunit: WorkUnit) -> None:
        pass

    def write_record_async(
        self,
        record_envelope: RecordEnvelope[MetadataChangeEvent],
        write_callback: WriteCallback,
    ) -> None:
        mce = record_envelope.record

        try:
            self.emitter.emit_mce(mce)
            self.report.report_record_written(record_envelope)
            write_callback.on_success(record_envelope, {})
        except OperationalError as e:
            self.report.report_failure({"error": e.message, "info": e.info})
            write_callback.on_failure(record_envelope, e, e.info)
        except Exception as e:
            self.report.report_failure({"e": e})
            write_callback.on_failure(record_envelope, e, {})

    def get_report(self) -> SinkReport:
        return self.report

    def close(self):
        pass
Example #9
def process_container_relationships(
    container_id_map: Dict[str, str],
    dry_run: bool,
    src_urn: str,
    dst_urn: str,
    migration_report: MigrationReport,
    rest_emitter: DatahubRestEmitter,
) -> None:
    relationships = migration_utils.get_incoming_relationships(urn=src_urn)
    for relationship in relationships:
        log.debug(f"Incoming Relationship: {relationship}")
        target_urn = relationship["entity"]

        # We should use the new id if we already migrated it
        if target_urn in container_id_map:
            target_urn = container_id_map.get(target_urn)

        entity_type = _get_type_from_urn(target_urn)
        relationshipType = relationship["type"]
        aspect_name = migration_utils.get_aspect_name_from_relationship(
            relationshipType, entity_type)
        aspect_map = cli_utils.get_aspects_for_entity(target_urn,
                                                      aspects=[aspect_name],
                                                      typed=True)
        if aspect_name in aspect_map:
            aspect = aspect_map[aspect_name]
            assert isinstance(aspect, DictWrapper)
            aspect = migration_utils.modify_urn_list_for_aspect(
                aspect_name, aspect, relationshipType, src_urn, dst_urn)
            # use mcpw
            mcp = MetadataChangeProposalWrapper(
                entityType=entity_type,
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=target_urn,
                aspectName=aspect_name,
                aspect=aspect,
            )

            if not dry_run:
                rest_emitter.emit_mcp(mcp)
            migration_report.on_entity_affected(mcp.entityUrn,
                                                mcp.aspectName)  # type: ignore
        else:
            log.debug(f"Didn't find aspect {aspect_name} for urn {target_urn}")
Example #10
 def __init__(self, ctx: PipelineContext, config: DatahubRestSinkConfig):
     super().__init__(ctx)
     self.config = config
     self.report = DataHubRestSinkReport()
     self.emitter = DatahubRestEmitter(
         self.config.server,
         self.config.token,
         connect_timeout_sec=self.config.timeout_sec,  # reuse timeout_sec for connect timeout
         read_timeout_sec=self.config.timeout_sec,
         retry_status_codes=self.config.retry_status_codes,
         retry_max_times=self.config.retry_max_times,
         extra_headers=self.config.extra_headers,
         ca_certificate_path=self.config.ca_certificate_path,
     )
     gms_config = self.emitter.test_connection()
     self.report.gms_version = (gms_config.get("versions", {}).get(
         "linkedin/datahub", {}).get("version", ""))
     logger.info("Setting gms config")
     set_gms_config(gms_config)
     self.executor = concurrent.futures.ThreadPoolExecutor(
         max_workers=self.config.max_threads)
class DatahubRestSink(Sink):
    config: DatahubRestSinkConfig
    emitter: DatahubRestEmitter
    report: SinkReport
    treat_errors_as_warnings: bool = False

    def __init__(self, ctx: PipelineContext, config: DatahubRestSinkConfig):
        super().__init__(ctx)
        self.config = config
        self.report = SinkReport()
        self.emitter = DatahubRestEmitter(
            self.config.server,
            self.config.token,
            connect_timeout_sec=self.config.timeout_sec,  # reuse timeout_sec for connect timeout
            read_timeout_sec=self.config.timeout_sec,
            extra_headers=self.config.extra_headers,
        )
        self.emitter.test_connection()

    @classmethod
    def create(cls, config_dict: dict,
               ctx: PipelineContext) -> "DatahubRestSink":
        config = DatahubRestSinkConfig.parse_obj(config_dict)
        return cls(ctx, config)

    def handle_work_unit_start(self, workunit: WorkUnit) -> None:
        if isinstance(workunit, MetadataWorkUnit):
            mwu: MetadataWorkUnit = cast(MetadataWorkUnit, workunit)
            self.treat_errors_as_warnings = mwu.treat_errors_as_warnings

    def handle_work_unit_end(self, workunit: WorkUnit) -> None:
        pass

    def write_record_async(
        self,
        record_envelope: RecordEnvelope[Union[MetadataChangeEvent,
                                              MetadataChangeProposal,
                                              MetadataChangeProposalWrapper,
                                              UsageAggregation, ]],
        write_callback: WriteCallback,
    ) -> None:
        record = record_envelope.record

        try:
            self.emitter.emit(record)
            self.report.report_record_written(record_envelope)
            write_callback.on_success(record_envelope, {})
        except OperationalError as e:
            # only OperationalErrors should be ignored
            if not self.treat_errors_as_warnings:
                self.report.report_failure({
                    "error": e.message,
                    "info": e.info
                })
            else:
                # trim exception stacktraces when reporting warnings
                if "stackTrace" in e.info:
                    try:
                        e.info["stackTrace"] = "\n".join(
                            e.info["stackTrace"].split("\n")[0:2])
                    except Exception:
                        # ignore failures in trimming
                        pass
                if isinstance(record, MetadataChangeProposalWrapper):
                    # include information about the entity that failed
                    entity_id = cast(MetadataChangeProposalWrapper,
                                     record).entityUrn
                    e.info["id"] = entity_id
                else:
                    entity_id = None
                self.report.report_warning({
                    "warning": e.message,
                    "info": e.info
                })
            write_callback.on_failure(record_envelope, e, e.info)
        except Exception as e:
            self.report.report_failure({"e": e})
            write_callback.on_failure(record_envelope, e, {})

    def get_report(self) -> SinkReport:
        return self.report

    def close(self):
        pass
# Construct upstream tables.
upstream_tables: List[UpstreamClass] = []
upstream_table_1 = UpstreamClass(
    dataset=builder.make_dataset_urn("bigquery", "upstream_table_1", "PROD"),
    type=DatasetLineageTypeClass.TRANSFORMED,
)
upstream_tables.append(upstream_table_1)
upstream_table_2 = UpstreamClass(
    dataset=builder.make_dataset_urn("bigquery", "upstream_table_2", "PROD"),
    type=DatasetLineageTypeClass.TRANSFORMED,
)
upstream_tables.append(upstream_table_2)

# Construct a lineage object.
upstream_lineage = UpstreamLineage(upstreams=upstream_tables)

# Construct a MetadataChangeProposalWrapper object.
lineage_mcp = MetadataChangeProposalWrapper(
    entityType="dataset",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn=builder.make_dataset_urn("bigquery", "downstream"),
    aspectName="upstreamLineage",
    aspect=upstream_lineage,
)

# Create an emitter to the GMS REST API.
emitter = DatahubRestEmitter("http://localhost:8080")

# Emit metadata!
emitter.emit_mcp(lineage_mcp)
Example #13
def emitAssertionResult(assertionResult: AssertionRunEvent) -> None:

    dataset_assertionRunEvent_mcp = MetadataChangeProposalWrapper(
        entityType="assertion",
        changeType=ChangeType.UPSERT,
        entityUrn=assertionResult.assertionUrn,
        aspectName="assertionRunEvent",
        aspect=assertionResult,
    )

    # Emit BatchAssertion Result! (timeseries aspect)
    emitter.emit_mcp(dataset_assertionRunEvent_mcp)


# Create an emitter to the GMS REST API.
emitter = DatahubRestEmitter("http://localhost:8080")

datasetProperties = DatasetProperties(
    name="bazTable",
)
# Construct a MetadataChangeProposalWrapper object for dataset
dataset_mcp = MetadataChangeProposalWrapper(
    entityType="dataset",
    changeType=ChangeType.UPSERT,
    entityUrn=datasetUrn("bazTable"),
    aspectName="datasetProperties",
    aspect=datasetProperties,
)

# Emit Dataset entity properties aspect! (Skip if dataset is already present)
emitter.emit_mcp(dataset_mcp)
Example #14
def test_datahub_rest_emitter_timeout_construction():
    emitter = DatahubRestEmitter(
        MOCK_GMS_ENDPOINT, connect_timeout_sec=2, read_timeout_sec=4
    )
    assert emitter._connect_timeout_sec == 2
    assert emitter._read_timeout_sec == 4
        nativeParameters={"max_value": "99"},
    ),
    customProperties={"suite_name": "demo_suite"},
)

# Construct a MetadataChangeProposalWrapper object.
assertion_maxVal_mcp = MetadataChangeProposalWrapper(
    entityType="assertion",
    changeType=ChangeType.UPSERT,
    entityUrn=assertionUrn(assertion_maxVal),
    aspectName="assertionInfo",
    aspect=assertion_maxVal,
)

# Create an emitter to the GMS REST API.
emitter = DatahubRestEmitter("http://localhost:8080")

# Emit Assertion entity info object!
emitter.emit_mcp(assertion_maxVal_mcp)

# Construct batch assertion result object for partition 1 batch
assertionResult_maxVal_batch_partition1 = AssertionRunEvent(
    timestampMillis=int(time.time() * 1000),
    assertionUrn=assertionUrn(assertion_maxVal),
    asserteeUrn=datasetUrn("bazTable"),
    partitionSpec=PartitionSpecClass(partition=str([{
        "country": "IN"
    }])),
    runId="uuid1",
    status=AssertionRunStatus.COMPLETE,
    result=AssertionResult(
from datahub.emitter.rest_emitter import DatahubRestEmitter

# Imports for metadata model classes
from datahub.metadata.schema_classes import (
    AuditStampClass,
    ChangeTypeClass,
    GlossaryTermAssociationClass,
    GlossaryTermsClass,
)

log = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# First we get the current terms
gms_endpoint = "http://localhost:8080"
rest_emitter = DatahubRestEmitter(gms_server=gms_endpoint)

dataset_urn = make_dataset_urn(platform="hive", name="realestate_db.sales", env="PROD")

term_to_add = make_term_urn("Classification.HighlyConfidential")
term_association_to_add = GlossaryTermAssociationClass(urn=term_to_add)
# an audit stamp that basically says we have no idea when these terms were added to this dataset
# change the time value to int(time.time() * 1000) if you want to record the moment this code runs as the time the term was applied
unknown_audit_stamp = AuditStampClass(time=0, actor="urn:li:corpuser:ingestion")

# create a brand new terms aspect
terms_aspect = GlossaryTermsClass(
    terms=[term_association_to_add],
    auditStamp=unknown_audit_stamp,
)
Example #17
 def make_emitter(self) -> DatahubRestEmitter:
     return DatahubRestEmitter(*self._get_config())
        platformSchema=OtherSchemaClass(
            rawSchema="__insert raw schema here__"),
        fields=[
            SchemaFieldClass(
                fieldPath="address.zipcode",
                type=SchemaFieldDataTypeClass(type=StringTypeClass()),
                nativeDataType="VARCHAR(50)",  # use this to provide the type of the field in the source system's vernacular
                description="This is the zipcode of the address. Specified using extended form and limited to addresses in the United States",
            ),
            SchemaFieldClass(
                fieldPath="address.street",
                type=SchemaFieldDataTypeClass(type=StringTypeClass()),
                nativeDataType="VARCHAR(100)",
                description="Street corresponding to the address",
            ),
            SchemaFieldClass(
                fieldPath="last_sold_date",
                type=SchemaFieldDataTypeClass(type=DateTypeClass()),
                nativeDataType="Date",
                description="Date of the last sale date for this property",
            ),
        ],
    ),
)

# Create rest emitter
rest_emitter = DatahubRestEmitter(gms_server="http://localhost:8080")
rest_emitter.emit(event)
    entityType="dataflow",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn=dataflow_urn,
    aspectName="dataFlowInfo",
    aspect=dataflow_info,
)

datajob_info = DataJobInfoClass(name="My Job 1",
                                type="AIRFLOW",
                                flowUrn=dataflow_urn)

# Construct a MetadataChangeProposalWrapper object with the DataJobInfo aspect.
# NOTE: This will overwrite all of the existing dataJobInfo aspect information associated with this job.
datajob_info_mcp = MetadataChangeProposalWrapper(
    entityType="dataJob",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn=builder.make_data_job_urn(orchestrator="airflow",
                                        flow_id="flow_old_api",
                                        job_id="job1",
                                        cluster="prod"),
    aspectName="dataJobInfo",
    aspect=datajob_info,
)

# Create an emitter to the GMS REST API.
emitter = DatahubRestEmitter("http://localhost:8080")

# Emit metadata!
emitter.emit_mcp(dataflow_info_mcp)
emitter.emit_mcp(datajob_info_mcp)
Example #20
        {
            "field": "timestamp",
            "isFieldLevel": True,
            "type": "increasing",
            "checkDefinition": "n/a",
            "url": "https://github.com/datahub-project/datahub/blob/master/checks/increasing.sql",
        },
    ]
}

emitter: DatahubRestEmitter = DatahubRestEmitter(
    gms_server="http://localhost:8080")

dataset_urn = "urn:li:dataset:(urn:li:dataPlatform:hive,logging_events,PROD)"
mcp_raw: MetadataChangeProposalClass = MetadataChangeProposalClass(
    entityType="dataset",
    entityUrn=dataset_urn,
    changeType=ChangeTypeClass.UPSERT,
    aspectName="customDataQualityRules",
    aspect=GenericAspectClass(
        contentType="application/json",
        value=json.dumps(dq_aspect).encode("utf-8"),
    ),
)

try:
    emitter.emit(mcp_raw)
Example #21
    def _run(
        self,
        validation_result_suite: ExpectationSuiteValidationResult,
        validation_result_suite_identifier: Union[
            ValidationResultIdentifier, GeCloudIdentifier
        ],
        data_asset: Union[Validator, DataAsset, Batch],
        payload: Any = None,
        expectation_suite_identifier: Optional[ExpectationSuiteIdentifier] = None,
        checkpoint_identifier: Any = None,
    ) -> Dict:
        datasets = []
        try:
            emitter = DatahubRestEmitter(
                gms_server=self.server_url,
                token=self.token,
                read_timeout_sec=self.timeout_sec,
                connect_timeout_sec=self.timeout_sec,
                retry_status_codes=self.retry_status_codes,
                retry_max_times=self.retry_max_times,
                extra_headers=self.extra_headers,
            )

            expectation_suite_name = validation_result_suite.meta.get(
                "expectation_suite_name"
            )
            run_id = validation_result_suite.meta.get("run_id")
            if hasattr(data_asset, "active_batch_id"):
                batch_identifier = data_asset.active_batch_id
            else:
                batch_identifier = data_asset.batch_id

            if isinstance(
                validation_result_suite_identifier, ValidationResultIdentifier
            ):
                expectation_suite_name = (
                    validation_result_suite_identifier.expectation_suite_identifier.expectation_suite_name
                )
                run_id = validation_result_suite_identifier.run_id
                batch_identifier = validation_result_suite_identifier.batch_identifier

            # Returns datasets and corresponding batch requests
            datasets = self.get_dataset_partitions(batch_identifier, data_asset)

            if len(datasets) == 0 or datasets[0]["dataset_urn"] is None:
                logger.info("Metadata not sent to datahub. No datasets found.")
                return {"datahub_notification_result": "none required"}

            # Returns assertion info and assertion results
            assertions = self.get_assertions_with_results(
                validation_result_suite,
                expectation_suite_name,
                run_id,
                payload,
                datasets,
            )

            for assertion in assertions:
                # Construct a MetadataChangeProposalWrapper object.
                assertion_info_mcp = MetadataChangeProposalWrapper(
                    entityType="assertion",
                    changeType=ChangeType.UPSERT,
                    entityUrn=assertion["assertionUrn"],
                    aspectName="assertionInfo",
                    aspect=assertion["assertionInfo"],
                )
                emitter.emit_mcp(assertion_info_mcp)

                # Construct a MetadataChangeProposalWrapper object.
                assertion_platform_mcp = MetadataChangeProposalWrapper(
                    entityType="assertion",
                    changeType=ChangeType.UPSERT,
                    entityUrn=assertion["assertionUrn"],
                    aspectName="dataPlatformInstance",
                    aspect=assertion["assertionPlatform"],
                )
                emitter.emit_mcp(assertion_platform_mcp)

                for assertionResult in assertion["assertionResults"]:
                    dataset_assertionResult_mcp = MetadataChangeProposalWrapper(
                        entityType="assertion",
                        changeType=ChangeType.UPSERT,
                        entityUrn=assertionResult.assertionUrn,
                        aspectName="assertionRunEvent",
                        aspect=assertionResult,
                    )

                    # Emit Result! (timeseries aspect)
                    emitter.emit_mcp(dataset_assertionResult_mcp)

            result = "DataHub notification succeeded"
        except Exception as e:
            result = "DataHub notification failed"
            if self.graceful_exceptions:
                logger.error(e)
                logger.info("Supressing error because graceful_exceptions is set")
            else:
                raise

        return {"datahub_notification_result": result}
Example #22
        fldUrn("bar3", "c2"),
        fldUrn("bar4", "c1"),
    ],
    outputDatasetFields=[
        fldUrn("bar", "c1"),
        fldUrn("bar", "c2"),
        fldUrn("bar", "c3"),
        fldUrn("bar", "c4"),
        fldUrn("bar", "c5"),
        fldUrn("bar", "c6"),
        fldUrn("bar", "c7"),
        fldUrn("bar", "c9"),
        fldUrn("bar2", "c9"),
    ],
    fineGrainedLineages=fineGrainedLineages,
)

dataJobLineageMcp = MetadataChangeProposalWrapper(
    entityType="dataJob",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn=builder.make_data_job_urn("spark", "Flow1", "Task1"),
    aspectName="dataJobInputOutput",
    aspect=dataJobInputOutput,
)

# Create an emitter to the GMS REST API.
emitter = DatahubRestEmitter("http://localhost:8080")

# Emit metadata!
emitter.emit_mcp(dataJobLineageMcp)
Example #23
class DatahubRestSink(Sink):
    config: DatahubRestSinkConfig
    emitter: DatahubRestEmitter
    report: DataHubRestSinkReport
    treat_errors_as_warnings: bool = False

    def __init__(self, ctx: PipelineContext, config: DatahubRestSinkConfig):
        super().__init__(ctx)
        self.config = config
        self.report = DataHubRestSinkReport()
        self.emitter = DatahubRestEmitter(
            self.config.server,
            self.config.token,
            connect_timeout_sec=self.config.timeout_sec,  # reuse timeout_sec for connect timeout
            read_timeout_sec=self.config.timeout_sec,
            retry_status_codes=self.config.retry_status_codes,
            retry_max_times=self.config.retry_max_times,
            extra_headers=self.config.extra_headers,
            ca_certificate_path=self.config.ca_certificate_path,
        )
        try:
            gms_config = self.emitter.test_connection()
        except Exception as exc:
            raise ConfigurationError(
                f"💥 Failed to connect to DataHub@{self.config.server} (token:{'XXX-redacted' if self.config.token else 'empty'}) over REST",
                exc,
            )

        self.report.gms_version = (
            gms_config.get("versions", {})
            .get("linkedin/datahub", {})
            .get("version", "")
        )
        logger.debug("Setting env variables to override config")
        set_env_variables_override_config(self.config.server, self.config.token)
        logger.debug("Setting gms config")
        set_gms_config(gms_config)
        self.executor = concurrent.futures.ThreadPoolExecutor(
            max_workers=self.config.max_threads
        )

    @classmethod
    def create(cls, config_dict: dict, ctx: PipelineContext) -> "DatahubRestSink":
        config = DatahubRestSinkConfig.parse_obj(config_dict)
        return cls(ctx, config)

    def handle_work_unit_start(self, workunit: WorkUnit) -> None:
        if isinstance(workunit, MetadataWorkUnit):
            mwu: MetadataWorkUnit = cast(MetadataWorkUnit, workunit)
            self.treat_errors_as_warnings = mwu.treat_errors_as_warnings

    def handle_work_unit_end(self, workunit: WorkUnit) -> None:
        pass

    def _write_done_callback(
        self,
        record_envelope: RecordEnvelope,
        write_callback: WriteCallback,
        future: concurrent.futures.Future,
    ) -> None:
        if future.cancelled():
            self.report.report_failure({"error": "future was cancelled"})
            write_callback.on_failure(
                record_envelope, OperationalError("future was cancelled"), {}
            )
        elif future.done():
            e = future.exception()
            if not e:
                self.report.report_record_written(record_envelope)
                start_time, end_time = future.result()
                self.report.report_downstream_latency(start_time, end_time)
                write_callback.on_success(record_envelope, {})
            elif isinstance(e, OperationalError):
                # only OperationalErrors should be ignored
                if not self.treat_errors_as_warnings:
                    self.report.report_failure({"error": e.message, "info": e.info})
                else:
                    # trim exception stacktraces when reporting warnings
                    if "stackTrace" in e.info:
                        try:
                            e.info["stackTrace"] = "\n".join(
                                e.info["stackTrace"].split("\n")[0:2]
                            )
                        except Exception:
                            # ignore failures in trimming
                            pass
                    record = record_envelope.record
                    if isinstance(record, MetadataChangeProposalWrapper):
                        # include information about the entity that failed
                        entity_id = cast(
                            MetadataChangeProposalWrapper, record
                        ).entityUrn
                        e.info["id"] = entity_id
                    else:
                        entity_id = None
                    self.report.report_warning({"warning": e.message, "info": e.info})
                write_callback.on_failure(record_envelope, e, e.info)
            else:
                self.report.report_failure({"e": e})
                write_callback.on_failure(record_envelope, Exception(e), {})

    def write_record_async(
        self,
        record_envelope: RecordEnvelope[
            Union[
                MetadataChangeEvent,
                MetadataChangeProposal,
                MetadataChangeProposalWrapper,
                UsageAggregation,
            ]
        ],
        write_callback: WriteCallback,
    ) -> None:
        record = record_envelope.record

        write_future = self.executor.submit(self.emitter.emit, record)
        write_future.add_done_callback(
            functools.partial(
                self._write_done_callback, record_envelope, write_callback
            )
        )

    def get_report(self) -> SinkReport:
        return self.report

    def close(self):
        self.executor.shutdown(wait=True)
Example #24
from datahub.api.entities.datajob import DataFlow, DataJob
from datahub.emitter.rest_emitter import DatahubRestEmitter

# Create an emitter to the GMS REST API.

emitter = DatahubRestEmitter("http://localhost:8080")

jobFlow = DataFlow(cluster="prod", orchestrator="airflow", id="flow_new_api")
jobFlow.emit(emitter)

dataJob = DataJob(flow_urn=jobFlow.urn, id="job1")
dataJob.emit(emitter)
Example #25
def dataplatform2instance_func(
    instance: str,
    platform: str,
    dry_run: bool,
    env: str,
    force: bool,
    hard: bool,
    keep: bool,
) -> None:
    click.echo(
        f"Starting migration: platform:{platform}, instance={instance}, force={force}, dry-run={dry_run}"
    )
    run_id: str = f"migrate-{uuid.uuid4()}"
    migration_report = MigrationReport(run_id, dry_run, keep)
    system_metadata = SystemMetadataClass(runId=run_id)

    all_aspects = [
        "schemaMetadata",
        "datasetProperties",
        "viewProperties",
        "subTypes",
        "editableDatasetProperties",
        "ownership",
        "datasetDeprecation",
        "institutionalMemory",
        "editableSchemaMetadata",
        "globalTags",
        "glossaryTerms",
        "upstreamLineage",
        "datasetUpstreamLineage",
        "status",
    ]

    if not dry_run:
        rest_emitter = DatahubRestEmitter(
            gms_server=cli_utils.get_session_and_host()[1]
        )

    urns_to_migrate = []
    # we first calculate all the urns we will be migrating
    for src_entity_urn in cli_utils.get_urns_by_filter(platform=platform, env=env):
        key = dataset_urn_to_key(src_entity_urn)
        assert key
        # Does this urn already have a platform instance associated with it?
        response = cli_utils.get_aspects_for_entity(
            entity_urn=src_entity_urn, aspects=["dataPlatformInstance"], typed=True
        )
        if "dataPlatformInstance" in response:
            assert isinstance(
                response["dataPlatformInstance"], DataPlatformInstanceClass
            )
            data_platform_instance: DataPlatformInstanceClass = response[
                "dataPlatformInstance"
            ]
            if data_platform_instance.instance:
                log.debug("This is already an instance-specific urn, will skip")
                continue
            else:
                log.debug(
                    f"{src_entity_urn} is not an instance specific urn. {response}"
                )
                urns_to_migrate.append(src_entity_urn)

    if not force and not dry_run:
        # get a confirmation from the operator before proceeding if this is not a dry run
        sampled_urns_to_migrate = random.choices(
            urns_to_migrate, k=min(10, len(urns_to_migrate))
        )
        sampled_new_urns: List[str] = [
            make_dataset_urn_with_platform_instance(
                platform=key.platform,
                name=key.name,
                platform_instance=instance,
                env=str(key.origin),
            )
            for key in [dataset_urn_to_key(x) for x in sampled_urns_to_migrate]
            if key
        ]
        click.echo(
            f"Will migrate {len(urns_to_migrate)} urns such as {random.choices(urns_to_migrate, k=min(10, len(urns_to_migrate)))}"
        )
        click.echo(f"New urns will look like {sampled_new_urns}")
        click.confirm("Ok to proceed?", abort=True)

    for src_entity_urn in progressbar.progressbar(
        urns_to_migrate, redirect_stdout=True
    ):
        key = dataset_urn_to_key(src_entity_urn)
        assert key
        new_urn = make_dataset_urn_with_platform_instance(
            platform=key.platform,
            name=key.name,
            platform_instance=instance,
            env=str(key.origin),
        )
        log.debug(f"Will migrate {src_entity_urn} to {new_urn}")
        relationships = migration_utils.get_incoming_relationships_dataset(
            src_entity_urn
        )

        for mcp in migration_utils.clone_aspect(
            src_entity_urn,
            aspect_names=all_aspects,
            dst_urn=new_urn,
            dry_run=dry_run,
            run_id=run_id,
        ):
            if not dry_run:
                rest_emitter.emit_mcp(mcp)
            migration_report.on_entity_create(mcp.entityUrn, mcp.aspectName)  # type: ignore

        if not dry_run:
            rest_emitter.emit_mcp(
                MetadataChangeProposalWrapper(
                    entityType="dataset",
                    changeType=ChangeTypeClass.UPSERT,
                    entityUrn=new_urn,
                    aspectName="dataPlatformInstance",
                    aspect=DataPlatformInstanceClass(
                        platform=make_data_platform_urn(platform),
                        instance=make_dataplatform_instance_urn(platform, instance),
                    ),
                    systemMetadata=system_metadata,
                )
            )
        migration_report.on_entity_create(new_urn, "dataPlatformInstance")

        for relationship in relationships:
            target_urn = relationship["entity"]
            entity_type = _get_type_from_urn(target_urn)
            relationshipType = relationship["type"]
            aspect_name = (
                migration_utils.get_aspect_name_from_relationship_type_and_entity(
                    relationshipType, entity_type
                )
            )
            aspect_map = cli_utils.get_aspects_for_entity(
                target_urn, aspects=[aspect_name], typed=True
            )
            if aspect_name in aspect_map:
                aspect = aspect_map[aspect_name]
                assert isinstance(aspect, DictWrapper)
                aspect = migration_utils.modify_urn_list_for_aspect(
                    aspect_name, aspect, relationshipType, src_entity_urn, new_urn
                )
                # use mcpw
                mcp = MetadataChangeProposalWrapper(
                    entityType=entity_type,
                    changeType=ChangeTypeClass.UPSERT,
                    entityUrn=target_urn,
                    aspectName=aspect_name,
                    aspect=aspect,
                )
                if not dry_run:
                    rest_emitter.emit_mcp(mcp)
                migration_report.on_entity_affected(mcp.entityUrn, mcp.aspectName)  # type: ignore
            else:
                log.debug(f"Didn't find aspect {aspect_name} for urn {target_urn}")

        if not dry_run and not keep:
            log.info(f"will {'hard' if hard else 'soft'} delete {src_entity_urn}")
            delete_cli._delete_one_urn(src_entity_urn, soft=not hard, run_id=run_id)
        migration_report.on_entity_migrated(src_entity_urn, "status")  # type: ignore

    print(f"{migration_report}")
import datahub.emitter.mce_builder as builder
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DatahubRestEmitter
from datahub.metadata.com.linkedin.pegasus2avro.datajob import DataJobInfoClass
from datahub.metadata.schema_classes import ChangeTypeClass

# Construct the DataJobInfo aspect with the job -> flow lineage.
dataflow_urn = builder.make_data_flow_urn(
    orchestrator="airflow", flow_id="flow1", cluster="prod"
)

datajob_info = DataJobInfoClass(name="My Job 1", type="AIRFLOW", flowUrn=dataflow_urn)

# Construct a MetadataChangeProposalWrapper object with the DataJobInfo aspect.
# NOTE: This will overwrite all of the existing dataJobInfo aspect information associated with this job.
chart_info_mcp = MetadataChangeProposalWrapper(
    entityType="dataJob",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn=builder.make_data_job_urn(
        orchestrator="airflow", flow_id="flow1", job_id="job1", cluster="prod"
    ),
    aspectName="dataJobInfo",
    aspect=datajob_info,
)

# Create an emitter to the GMS REST API.
emitter = DatahubRestEmitter("http://localhost:8080")

# Emit metadata!
emitter.emit_mcp(chart_info_mcp)
Example #27
 def __init__(self, ctx: PipelineContext, config: DatahubRestSinkConfig):
     super().__init__(ctx)
     self.config = config
     self.report = SinkReport()
     self.emitter = DatahubRestEmitter(self.config.server,
                                       self.config.token)
def test_datahub_rest_emitter_construction():
    emitter = DatahubRestEmitter(MOCK_GMS_ENDPOINT)
    assert emitter._connect_timeout_sec == emitter.DEFAULT_CONNECT_TIMEOUT_SEC
    assert emitter._read_timeout_sec == emitter.DEFAULT_READ_TIMEOUT_SEC
    assert emitter._retry_status_codes == emitter.DEFAULT_RETRY_STATUS_CODES
    assert emitter._retry_max_times == emitter.DEFAULT_RETRY_MAX_TIMES
Example #29
def test_datahub_rest_emitter_construction():
    emitter = DatahubRestEmitter(MOCK_GMS_ENDPOINT)
    assert emitter._connect_timeout_sec == emitter.DEFAULT_CONNECT_TIMEOUT_SEC
    assert emitter._read_timeout_sec == emitter.DEFAULT_READ_TIMEOUT_SEC
Example #30
 def make_emitter(self) -> DatahubRestEmitter:
     return DatahubRestEmitter(self._gms_endpoint())
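
For completeness, a minimal emitter round-trip sketch assembled only from the constructor arguments and methods shown in the examples above (endpoint, token, and timeout values are placeholders; a reachable GMS instance is assumed):

# Sketch only: construct an emitter with the keyword arguments used throughout these examples
# and verify connectivity before emitting anything (test_connection raises on failure,
# as handled by the try/except in Example #23).
from datahub.emitter.rest_emitter import DatahubRestEmitter

emitter = DatahubRestEmitter(
    gms_server="http://localhost:8080",
    token=None,  # supply a personal access token here if GMS requires authentication
    connect_timeout_sec=5,
    read_timeout_sec=10,
)
emitter.test_connection()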