def __init__(self, ctx: PipelineContext, config: DatahubRestSinkConfig):
    super().__init__(ctx)
    self.config = config
    self.report = DataHubRestSinkReport()
    self.emitter = DatahubRestEmitter(
        self.config.server,
        self.config.token,
        connect_timeout_sec=self.config.timeout_sec,  # reuse timeout_sec for connect timeout
        read_timeout_sec=self.config.timeout_sec,
        retry_status_codes=self.config.retry_status_codes,
        retry_max_times=self.config.retry_max_times,
        extra_headers=self.config.extra_headers,
        ca_certificate_path=self.config.ca_certificate_path,
    )
    try:
        gms_config = self.emitter.test_connection()
    except Exception as exc:
        raise ConfigurationError(
            f"💥 Failed to connect to DataHub@{self.config.server} (token:{'XXX-redacted' if self.config.token else 'empty'}) over REST",
            exc,
        )

    self.report.gms_version = (
        gms_config.get("versions", {})
        .get("linkedin/datahub", {})
        .get("version", "")
    )
    logger.debug("Setting env variables to override config")
    set_env_variables_override_config(self.config.server, self.config.token)
    logger.debug("Setting gms config")
    set_gms_config(gms_config)
    self.executor = concurrent.futures.ThreadPoolExecutor(
        max_workers=self.config.max_threads
    )

class DatahubRestSink(Sink):
    config: DatahubRestSinkConfig
    emitter: DatahubRestEmitter
    report: SinkReport

    def __init__(self, ctx: PipelineContext, config: DatahubRestSinkConfig):
        super().__init__(ctx)
        self.config = config
        self.report = SinkReport()
        self.emitter = DatahubRestEmitter(
            self.config.server,
            self.config.token,
            connect_timeout_sec=self.config.timeout_sec,  # reuse timeout_sec for connect timeout
            read_timeout_sec=self.config.timeout_sec,
        )
        self.emitter.test_connection()

    @classmethod
    def create(cls, config_dict: dict, ctx: PipelineContext) -> "DatahubRestSink":
        config = DatahubRestSinkConfig.parse_obj(config_dict)
        return cls(ctx, config)

    def handle_work_unit_start(self, workunit: WorkUnit) -> None:
        pass

    def handle_work_unit_end(self, workunit: WorkUnit) -> None:
        pass

    def write_record_async(
        self,
        record_envelope: RecordEnvelope[
            Union[
                MetadataChangeEvent,
                MetadataChangeProposal,
                MetadataChangeProposalWrapper,
                UsageAggregation,
            ]
        ],
        write_callback: WriteCallback,
    ) -> None:
        record = record_envelope.record
        try:
            self.emitter.emit(record)
            self.report.report_record_written(record_envelope)
            write_callback.on_success(record_envelope, {})
        except OperationalError as e:
            self.report.report_failure({"error": e.message, "info": e.info})
            write_callback.on_failure(record_envelope, e, e.info)
        except Exception as e:
            self.report.report_failure({"e": e})
            write_callback.on_failure(record_envelope, e, {})

    def get_report(self) -> SinkReport:
        return self.report

    def close(self):
        pass

def __init__(self, ctx: PipelineContext, config: DatahubRestSinkConfig):
    super().__init__(ctx)
    self.config = config
    self.report = SinkReport()
    self.emitter = DatahubRestEmitter(
        self.config.server,
        self.config.token,
        connect_timeout_sec=self.config.timeout_sec,  # reuse timeout_sec for connect timeout
        read_timeout_sec=self.config.timeout_sec,
    )
    self.emitter.test_connection()

def __init__(self, ctx: PipelineContext, config: DatahubRestSinkConfig):
    super().__init__(ctx)
    self.config = config
    self.report = SinkReport()
    self.emitter = DatahubRestEmitter(
        self.config.server,
        self.config.token,
        connect_timeout_sec=self.config.timeout_sec,  # reuse timeout_sec for connect timeout
        read_timeout_sec=self.config.timeout_sec,
        extra_headers=self.config.extra_headers,
        ca_certificate_path=self.config.ca_certificate_path,
    )
    self.emitter.test_connection()
    self.executor = concurrent.futures.ThreadPoolExecutor(
        max_workers=self.config.max_threads
    )

def test_datahub_rest_emitter(requests_mock, record, path, snapshot):
    def match_request_text(request: requests.Request) -> bool:
        requested_snapshot = request.json()
        assert (
            requested_snapshot == snapshot
        ), f"Expected snapshot to be {json.dumps(snapshot)}, got {json.dumps(requested_snapshot)}"
        return True

    requests_mock.post(
        f"{MOCK_GMS_ENDPOINT}{path}",
        request_headers={"X-RestLi-Protocol-Version": "2.0.0"},
        additional_matcher=match_request_text,
    )

    emitter = DatahubRestEmitter(MOCK_GMS_ENDPOINT)
    emitter.emit(record)

def test_datahub_rest_emitter_retry_construction():
    emitter = DatahubRestEmitter(
        MOCK_GMS_ENDPOINT,
        retry_status_codes=[418],
        retry_max_times=42,
    )
    assert emitter._retry_status_codes == [418]
    assert emitter._retry_max_times == 42

def test_datahub_rest_emitter_extra_params():
    emitter = DatahubRestEmitter(
        MOCK_GMS_ENDPOINT, extra_headers={"key1": "value1", "key2": "value2"}
    )
    assert emitter._session.headers.get("key1") == "value1"
    assert emitter._session.headers.get("key2") == "value2"

class DatahubRestSink(Sink):
    config: DatahubRestSinkConfig
    emitter: DatahubRestEmitter
    report: SinkReport

    def __init__(self, ctx: PipelineContext, config: DatahubRestSinkConfig):
        super().__init__(ctx)
        self.config = config
        self.report = SinkReport()
        self.emitter = DatahubRestEmitter(self.config.server)

    @classmethod
    def create(cls, config_dict: dict, ctx: PipelineContext) -> "DatahubRestSink":
        config = DatahubRestSinkConfig.parse_obj(config_dict)
        return cls(ctx, config)

    def handle_work_unit_start(self, workunit: WorkUnit) -> None:
        pass

    def handle_work_unit_end(self, workunit: WorkUnit) -> None:
        pass

    def write_record_async(
        self,
        record_envelope: RecordEnvelope[MetadataChangeEvent],
        write_callback: WriteCallback,
    ) -> None:
        mce = record_envelope.record
        try:
            self.emitter.emit_mce(mce)
            self.report.report_record_written(record_envelope)
            write_callback.on_success(record_envelope, {})
        except OperationalError as e:
            self.report.report_failure({"error": e.message, "info": e.info})
            write_callback.on_failure(record_envelope, e, e.info)
        except Exception as e:
            self.report.report_failure({"e": e})
            write_callback.on_failure(record_envelope, e, {})

    def get_report(self) -> SinkReport:
        return self.report

    def close(self):
        pass

def process_container_relationships(
    container_id_map: Dict[str, str],
    dry_run: bool,
    src_urn: str,
    dst_urn: str,
    migration_report: MigrationReport,
    rest_emitter: DatahubRestEmitter,
) -> None:
    relationships = migration_utils.get_incoming_relationships(urn=src_urn)
    for relationship in relationships:
        log.debug(f"Incoming Relationship: {relationship}")
        target_urn = relationship["entity"]

        # We should use the new id if we already migrated it
        if target_urn in container_id_map:
            target_urn = container_id_map.get(target_urn)

        entity_type = _get_type_from_urn(target_urn)
        relationshipType = relationship["type"]
        aspect_name = migration_utils.get_aspect_name_from_relationship(
            relationshipType, entity_type
        )
        aspect_map = cli_utils.get_aspects_for_entity(
            target_urn, aspects=[aspect_name], typed=True
        )
        if aspect_name in aspect_map:
            aspect = aspect_map[aspect_name]
            assert isinstance(aspect, DictWrapper)
            aspect = migration_utils.modify_urn_list_for_aspect(
                aspect_name, aspect, relationshipType, src_urn, dst_urn
            )
            # use mcpw
            mcp = MetadataChangeProposalWrapper(
                entityType=entity_type,
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=target_urn,
                aspectName=aspect_name,
                aspect=aspect,
            )
            if not dry_run:
                rest_emitter.emit_mcp(mcp)
            migration_report.on_entity_affected(mcp.entityUrn, mcp.aspectName)  # type: ignore
        else:
            log.debug(f"Didn't find aspect {aspect_name} for urn {target_urn}")

def __init__(self, ctx: PipelineContext, config: DatahubRestSinkConfig):
    super().__init__(ctx)
    self.config = config
    self.report = DataHubRestSinkReport()
    self.emitter = DatahubRestEmitter(
        self.config.server,
        self.config.token,
        connect_timeout_sec=self.config.timeout_sec,  # reuse timeout_sec for connect timeout
        read_timeout_sec=self.config.timeout_sec,
        retry_status_codes=self.config.retry_status_codes,
        retry_max_times=self.config.retry_max_times,
        extra_headers=self.config.extra_headers,
        ca_certificate_path=self.config.ca_certificate_path,
    )
    gms_config = self.emitter.test_connection()

    self.report.gms_version = (
        gms_config.get("versions", {})
        .get("linkedin/datahub", {})
        .get("version", "")
    )
    logger.info("Setting gms config")
    set_gms_config(gms_config)
    self.executor = concurrent.futures.ThreadPoolExecutor(
        max_workers=self.config.max_threads
    )

class DatahubRestSink(Sink):
    config: DatahubRestSinkConfig
    emitter: DatahubRestEmitter
    report: SinkReport
    treat_errors_as_warnings: bool = False

    def __init__(self, ctx: PipelineContext, config: DatahubRestSinkConfig):
        super().__init__(ctx)
        self.config = config
        self.report = SinkReport()
        self.emitter = DatahubRestEmitter(
            self.config.server,
            self.config.token,
            connect_timeout_sec=self.config.timeout_sec,  # reuse timeout_sec for connect timeout
            read_timeout_sec=self.config.timeout_sec,
            extra_headers=self.config.extra_headers,
        )
        self.emitter.test_connection()

    @classmethod
    def create(cls, config_dict: dict, ctx: PipelineContext) -> "DatahubRestSink":
        config = DatahubRestSinkConfig.parse_obj(config_dict)
        return cls(ctx, config)

    def handle_work_unit_start(self, workunit: WorkUnit) -> None:
        if isinstance(workunit, MetadataWorkUnit):
            mwu: MetadataWorkUnit = cast(MetadataWorkUnit, workunit)
            self.treat_errors_as_warnings = mwu.treat_errors_as_warnings

    def handle_work_unit_end(self, workunit: WorkUnit) -> None:
        pass

    def write_record_async(
        self,
        record_envelope: RecordEnvelope[
            Union[
                MetadataChangeEvent,
                MetadataChangeProposal,
                MetadataChangeProposalWrapper,
                UsageAggregation,
            ]
        ],
        write_callback: WriteCallback,
    ) -> None:
        record = record_envelope.record
        try:
            self.emitter.emit(record)
            self.report.report_record_written(record_envelope)
            write_callback.on_success(record_envelope, {})
        except OperationalError as e:
            # only OperationalErrors should be ignored
            if not self.treat_errors_as_warnings:
                self.report.report_failure({"error": e.message, "info": e.info})
            else:
                # trim exception stacktraces when reporting warnings
                if "stackTrace" in e.info:
                    try:
                        e.info["stackTrace"] = "\n".join(
                            e.info["stackTrace"].split("\n")[0:2]
                        )
                    except Exception:
                        # ignore failures in trimming
                        pass
                if isinstance(record, MetadataChangeProposalWrapper):
                    # include information about the entity that failed
                    entity_id = cast(MetadataChangeProposalWrapper, record).entityUrn
                    e.info["id"] = entity_id
                else:
                    entity_id = None
                self.report.report_warning({"warning": e.message, "info": e.info})
            write_callback.on_failure(record_envelope, e, e.info)
        except Exception as e:
            self.report.report_failure({"e": e})
            write_callback.on_failure(record_envelope, e, {})

    def get_report(self) -> SinkReport:
        return self.report

    def close(self):
        pass

# Construct upstream tables.
upstream_tables: List[UpstreamClass] = []
upstream_table_1 = UpstreamClass(
    dataset=builder.make_dataset_urn("bigquery", "upstream_table_1", "PROD"),
    type=DatasetLineageTypeClass.TRANSFORMED,
)
upstream_tables.append(upstream_table_1)
upstream_table_2 = UpstreamClass(
    dataset=builder.make_dataset_urn("bigquery", "upstream_table_2", "PROD"),
    type=DatasetLineageTypeClass.TRANSFORMED,
)
upstream_tables.append(upstream_table_2)

# Construct a lineage object.
upstream_lineage = UpstreamLineage(upstreams=upstream_tables)

# Construct a MetadataChangeProposalWrapper object.
lineage_mcp = MetadataChangeProposalWrapper(
    entityType="dataset",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn=builder.make_dataset_urn("bigquery", "downstream"),
    aspectName="upstreamLineage",
    aspect=upstream_lineage,
)

# Create an emitter to the GMS REST API.
emitter = DatahubRestEmitter("http://localhost:8080")

# Emit metadata!
emitter.emit_mcp(lineage_mcp)

def emitAssertionResult(assertionResult: AssertionRunEvent) -> None:
    dataset_assertionRunEvent_mcp = MetadataChangeProposalWrapper(
        entityType="assertion",
        changeType=ChangeType.UPSERT,
        entityUrn=assertionResult.assertionUrn,
        aspectName="assertionRunEvent",
        aspect=assertionResult,
    )

    # Emit the batch assertion result (a timeseries aspect).
    emitter.emit_mcp(dataset_assertionRunEvent_mcp)


# Create an emitter to the GMS REST API.
emitter = DatahubRestEmitter("http://localhost:8080")

datasetProperties = DatasetProperties(
    name="bazTable",
)

# Construct a MetadataChangeProposalWrapper object for the dataset.
dataset_mcp = MetadataChangeProposalWrapper(
    entityType="dataset",
    changeType=ChangeType.UPSERT,
    entityUrn=datasetUrn("bazTable"),
    aspectName="datasetProperties",
    aspect=datasetProperties,
)

# Emit the dataset properties aspect (skip if the dataset is already present).
emitter.emit_mcp(dataset_mcp)

def test_datahub_rest_emitter_timeout_construction():
    emitter = DatahubRestEmitter(
        MOCK_GMS_ENDPOINT, connect_timeout_sec=2, read_timeout_sec=4
    )
    assert emitter._connect_timeout_sec == 2
    assert emitter._read_timeout_sec == 4

        nativeParameters={"max_value": "99"},
    ),
    customProperties={"suite_name": "demo_suite"},
)

# Construct a MetadataChangeProposalWrapper object.
assertion_maxVal_mcp = MetadataChangeProposalWrapper(
    entityType="assertion",
    changeType=ChangeType.UPSERT,
    entityUrn=assertionUrn(assertion_maxVal),
    aspectName="assertionInfo",
    aspect=assertion_maxVal,
)

# Create an emitter to the GMS REST API.
emitter = DatahubRestEmitter("http://localhost:8080")

# Emit Assertion entity info object!
emitter.emit_mcp(assertion_maxVal_mcp)

# Construct batch assertion result object for partition 1 batch
assertionResult_maxVal_batch_partition1 = AssertionRunEvent(
    timestampMillis=int(time.time() * 1000),
    assertionUrn=assertionUrn(assertion_maxVal),
    asserteeUrn=datasetUrn("bazTable"),
    partitionSpec=PartitionSpecClass(partition=str([{"country": "IN"}])),
    runId="uuid1",
    status=AssertionRunStatus.COMPLETE,
    result=AssertionResult(

from datahub.emitter.rest_emitter import DatahubRestEmitter

# Imports for metadata model classes
from datahub.metadata.schema_classes import (
    AuditStampClass,
    ChangeTypeClass,
    GlossaryTermAssociationClass,
    GlossaryTermsClass,
)

log = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# First we get the current terms
gms_endpoint = "http://localhost:8080"
rest_emitter = DatahubRestEmitter(gms_server=gms_endpoint)

dataset_urn = make_dataset_urn(platform="hive", name="realestate_db.sales", env="PROD")

term_to_add = make_term_urn("Classification.HighlyConfidential")
term_association_to_add = GlossaryTermAssociationClass(urn=term_to_add)

# An audit stamp that says we have no idea when these terms were added to this dataset.
# Change the time value to int(time.time() * 1000) to record the time this code runs instead.
unknown_audit_stamp = AuditStampClass(time=0, actor="urn:li:corpuser:ingestion")

# Create a brand new terms aspect.
terms_aspect = GlossaryTermsClass(
    terms=[term_association_to_add],
    auditStamp=unknown_audit_stamp,
)

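# --- Added sketch (not part of the original excerpt) ---
# The snippet above stops after building terms_aspect. A minimal, hedged sketch of the
# remaining step: wrap the aspect in a MetadataChangeProposalWrapper and emit it with the
# same rest_emitter and dataset_urn. The MetadataChangeProposalWrapper import and the
# "glossaryTerms" aspect name are assumptions based on the other examples in this collection.
from datahub.emitter.mcp import MetadataChangeProposalWrapper

term_event = MetadataChangeProposalWrapper(
    entityType="dataset",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn=dataset_urn,
    aspectName="glossaryTerms",
    aspect=terms_aspect,
)
rest_emitter.emit(term_event)
log.info(f"Attached term {term_to_add} to dataset {dataset_urn}")
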
def make_emitter(self) -> DatahubRestEmitter:
    return DatahubRestEmitter(*self._get_config())

        platformSchema=OtherSchemaClass(rawSchema="__insert raw schema here__"),
        fields=[
            SchemaFieldClass(
                fieldPath="address.zipcode",
                type=SchemaFieldDataTypeClass(type=StringTypeClass()),
                nativeDataType="VARCHAR(50)",  # use this to provide the type of the field in the source system's vernacular
                description="This is the zipcode of the address. Specified using extended form and limited to addresses in the United States",
            ),
            SchemaFieldClass(
                fieldPath="address.street",
                type=SchemaFieldDataTypeClass(type=StringTypeClass()),
                nativeDataType="VARCHAR(100)",
                description="Street corresponding to the address",
            ),
            SchemaFieldClass(
                fieldPath="last_sold_date",
                type=SchemaFieldDataTypeClass(type=DateTypeClass()),
                nativeDataType="Date",
                description="Date of the last sale date for this property",
            ),
        ],
    ),
)

# Create rest emitter
rest_emitter = DatahubRestEmitter(gms_server="http://localhost:8080")
rest_emitter.emit(event)

    entityType="dataflow",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn=dataflow_urn,
    aspectName="dataFlowInfo",
    aspect=dataflow_info,
)

datajob_info = DataJobInfoClass(name="My Job 1", type="AIRFLOW", flowUrn=dataflow_urn)

# Construct a MetadataChangeProposalWrapper object with the DataJobInfo aspect.
# NOTE: This will overwrite all of the existing dataJobInfo aspect information associated with this job.
datajob_info_mcp = MetadataChangeProposalWrapper(
    entityType="dataJob",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn=builder.make_data_job_urn(
        orchestrator="airflow", flow_id="flow_old_api", job_id="job1", cluster="prod"
    ),
    aspectName="dataJobInfo",
    aspect=datajob_info,
)

# Create an emitter to the GMS REST API.
emitter = DatahubRestEmitter("http://localhost:8080")

# Emit metadata!
emitter.emit_mcp(dataflow_info_mcp)
emitter.emit_mcp(datajob_info_mcp)

        {
            "field": "timestamp",
            "isFieldLevel": True,
            "type": "increasing",
            "checkDefinition": "n/a",
            "url": "https://github.com/datahub-project/datahub/blob/master/checks/increasing.sql",
        },
    ]
}

emitter: DatahubRestEmitter = DatahubRestEmitter(gms_server="http://localhost:8080")

dataset_urn = "urn:li:dataset:(urn:li:dataPlatform:hive,logging_events,PROD)"

mcp_raw: MetadataChangeProposalClass = MetadataChangeProposalClass(
    entityType="dataset",
    entityUrn=dataset_urn,
    changeType=ChangeTypeClass.UPSERT,
    aspectName="customDataQualityRules",
    aspect=GenericAspectClass(
        contentType="application/json",
        value=json.dumps(dq_aspect).encode("utf-8"),
    ),
)

try:
    emitter.emit(mcp_raw)

def _run(
    self,
    validation_result_suite: ExpectationSuiteValidationResult,
    validation_result_suite_identifier: Union[
        ValidationResultIdentifier, GeCloudIdentifier
    ],
    data_asset: Union[Validator, DataAsset, Batch],
    payload: Any = None,
    expectation_suite_identifier: Optional[ExpectationSuiteIdentifier] = None,
    checkpoint_identifier: Any = None,
) -> Dict:
    datasets = []
    try:
        emitter = DatahubRestEmitter(
            gms_server=self.server_url,
            token=self.token,
            read_timeout_sec=self.timeout_sec,
            connect_timeout_sec=self.timeout_sec,
            retry_status_codes=self.retry_status_codes,
            retry_max_times=self.retry_max_times,
            extra_headers=self.extra_headers,
        )

        expectation_suite_name = validation_result_suite.meta.get(
            "expectation_suite_name"
        )
        run_id = validation_result_suite.meta.get("run_id")
        if hasattr(data_asset, "active_batch_id"):
            batch_identifier = data_asset.active_batch_id
        else:
            batch_identifier = data_asset.batch_id

        if isinstance(
            validation_result_suite_identifier, ValidationResultIdentifier
        ):
            expectation_suite_name = (
                validation_result_suite_identifier.expectation_suite_identifier.expectation_suite_name
            )
            run_id = validation_result_suite_identifier.run_id
            batch_identifier = validation_result_suite_identifier.batch_identifier

        # Returns datasets and corresponding batch requests
        datasets = self.get_dataset_partitions(batch_identifier, data_asset)

        if len(datasets) == 0 or datasets[0]["dataset_urn"] is None:
            logger.info("Metadata not sent to datahub. No datasets found.")
            return {"datahub_notification_result": "none required"}

        # Returns assertion info and assertion results
        assertions = self.get_assertions_with_results(
            validation_result_suite,
            expectation_suite_name,
            run_id,
            payload,
            datasets,
        )

        for assertion in assertions:
            # Construct a MetadataChangeProposalWrapper object.
            assertion_info_mcp = MetadataChangeProposalWrapper(
                entityType="assertion",
                changeType=ChangeType.UPSERT,
                entityUrn=assertion["assertionUrn"],
                aspectName="assertionInfo",
                aspect=assertion["assertionInfo"],
            )
            emitter.emit_mcp(assertion_info_mcp)

            # Construct a MetadataChangeProposalWrapper object.
            assertion_platform_mcp = MetadataChangeProposalWrapper(
                entityType="assertion",
                changeType=ChangeType.UPSERT,
                entityUrn=assertion["assertionUrn"],
                aspectName="dataPlatformInstance",
                aspect=assertion["assertionPlatform"],
            )
            emitter.emit_mcp(assertion_platform_mcp)

            for assertionResult in assertion["assertionResults"]:
                dataset_assertionResult_mcp = MetadataChangeProposalWrapper(
                    entityType="assertion",
                    changeType=ChangeType.UPSERT,
                    entityUrn=assertionResult.assertionUrn,
                    aspectName="assertionRunEvent",
                    aspect=assertionResult,
                )
                # Emit the result (a timeseries aspect).
                emitter.emit_mcp(dataset_assertionResult_mcp)

        result = "DataHub notification succeeded"
    except Exception as e:
        result = "DataHub notification failed"
        if self.graceful_exceptions:
            logger.error(e)
            logger.info("Suppressing error because graceful_exceptions is set")
        else:
            raise

    return {"datahub_notification_result": result}

        fldUrn("bar3", "c2"),
        fldUrn("bar4", "c1"),
    ],
    outputDatasetFields=[
        fldUrn("bar", "c1"),
        fldUrn("bar", "c2"),
        fldUrn("bar", "c3"),
        fldUrn("bar", "c4"),
        fldUrn("bar", "c5"),
        fldUrn("bar", "c6"),
        fldUrn("bar", "c7"),
        fldUrn("bar", "c9"),
        fldUrn("bar2", "c9"),
    ],
    fineGrainedLineages=fineGrainedLineages,
)

dataJobLineageMcp = MetadataChangeProposalWrapper(
    entityType="dataJob",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn=builder.make_data_job_urn("spark", "Flow1", "Task1"),
    aspectName="dataJobInputOutput",
    aspect=dataJobInputOutput,
)

# Create an emitter to the GMS REST API.
emitter = DatahubRestEmitter("http://localhost:8080")

# Emit metadata!
emitter.emit_mcp(dataJobLineageMcp)

class DatahubRestSink(Sink):
    config: DatahubRestSinkConfig
    emitter: DatahubRestEmitter
    report: DataHubRestSinkReport
    treat_errors_as_warnings: bool = False

    def __init__(self, ctx: PipelineContext, config: DatahubRestSinkConfig):
        super().__init__(ctx)
        self.config = config
        self.report = DataHubRestSinkReport()
        self.emitter = DatahubRestEmitter(
            self.config.server,
            self.config.token,
            connect_timeout_sec=self.config.timeout_sec,  # reuse timeout_sec for connect timeout
            read_timeout_sec=self.config.timeout_sec,
            retry_status_codes=self.config.retry_status_codes,
            retry_max_times=self.config.retry_max_times,
            extra_headers=self.config.extra_headers,
            ca_certificate_path=self.config.ca_certificate_path,
        )
        try:
            gms_config = self.emitter.test_connection()
        except Exception as exc:
            raise ConfigurationError(
                f"💥 Failed to connect to DataHub@{self.config.server} (token:{'XXX-redacted' if self.config.token else 'empty'}) over REST",
                exc,
            )

        self.report.gms_version = (
            gms_config.get("versions", {})
            .get("linkedin/datahub", {})
            .get("version", "")
        )
        logger.debug("Setting env variables to override config")
        set_env_variables_override_config(self.config.server, self.config.token)
        logger.debug("Setting gms config")
        set_gms_config(gms_config)
        self.executor = concurrent.futures.ThreadPoolExecutor(
            max_workers=self.config.max_threads
        )

    @classmethod
    def create(cls, config_dict: dict, ctx: PipelineContext) -> "DatahubRestSink":
        config = DatahubRestSinkConfig.parse_obj(config_dict)
        return cls(ctx, config)

    def handle_work_unit_start(self, workunit: WorkUnit) -> None:
        if isinstance(workunit, MetadataWorkUnit):
            mwu: MetadataWorkUnit = cast(MetadataWorkUnit, workunit)
            self.treat_errors_as_warnings = mwu.treat_errors_as_warnings

    def handle_work_unit_end(self, workunit: WorkUnit) -> None:
        pass

    def _write_done_callback(
        self,
        record_envelope: RecordEnvelope,
        write_callback: WriteCallback,
        future: concurrent.futures.Future,
    ) -> None:
        if future.cancelled():
            self.report.report_failure({"error": "future was cancelled"})
            write_callback.on_failure(
                record_envelope, OperationalError("future was cancelled"), {}
            )
        elif future.done():
            e = future.exception()
            if not e:
                self.report.report_record_written(record_envelope)
                start_time, end_time = future.result()
                self.report.report_downstream_latency(start_time, end_time)
                write_callback.on_success(record_envelope, {})
            elif isinstance(e, OperationalError):
                # only OperationalErrors should be ignored
                if not self.treat_errors_as_warnings:
                    self.report.report_failure({"error": e.message, "info": e.info})
                else:
                    # trim exception stacktraces when reporting warnings
                    if "stackTrace" in e.info:
                        try:
                            e.info["stackTrace"] = "\n".join(
                                e.info["stackTrace"].split("\n")[0:2]
                            )
                        except Exception:
                            # ignore failures in trimming
                            pass
                    record = record_envelope.record
                    if isinstance(record, MetadataChangeProposalWrapper):
                        # include information about the entity that failed
                        entity_id = cast(
                            MetadataChangeProposalWrapper, record
                        ).entityUrn
                        e.info["id"] = entity_id
                    else:
                        entity_id = None
                    self.report.report_warning({"warning": e.message, "info": e.info})
                write_callback.on_failure(record_envelope, e, e.info)
            else:
                self.report.report_failure({"e": e})
                write_callback.on_failure(record_envelope, Exception(e), {})

    def write_record_async(
        self,
        record_envelope: RecordEnvelope[
            Union[
                MetadataChangeEvent,
                MetadataChangeProposal,
                MetadataChangeProposalWrapper,
                UsageAggregation,
            ]
        ],
        write_callback: WriteCallback,
    ) -> None:
        record = record_envelope.record
        write_future = self.executor.submit(self.emitter.emit, record)
        write_future.add_done_callback(
            functools.partial(
                self._write_done_callback, record_envelope, write_callback
            )
        )

    def get_report(self) -> SinkReport:
        return self.report

    def close(self):
        self.executor.shutdown(wait=True)

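# --- Added sketch (not part of the original excerpt) ---
# A minimal, hedged example of constructing this sink outside of an ingestion recipe,
# using the create() classmethod shown above. It assumes PipelineContext(run_id=...) is a
# valid constructor and that a DataHub GMS instance is reachable at the configured server;
# __init__ calls test_connection(), so construction fails fast if GMS is down or the token
# is wrong. The run id and config values below are illustrative only.
from datahub.ingestion.api.common import PipelineContext

ctx = PipelineContext(run_id="manual-sink-example")  # hypothetical run id
sink = DatahubRestSink.create(
    {
        "server": "http://localhost:8080",
        # "token": "<personal access token>",  # only needed when auth is enabled
        "max_threads": 4,
    },
    ctx,
)
print(sink.get_report())
sink.close()
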
from datahub.api.entities.datajob import DataFlow, DataJob
from datahub.emitter.rest_emitter import DatahubRestEmitter

# Create an emitter to the GMS REST API.
emitter = DatahubRestEmitter("http://localhost:8080")

jobFlow = DataFlow(cluster="prod", orchestrator="airflow", id="flow_new_api")
jobFlow.emit(emitter)

dataJob = DataJob(flow_urn=jobFlow.urn, id="job1")
dataJob.emit(emitter)

def dataplatform2instance_func(
    instance: str,
    platform: str,
    dry_run: bool,
    env: str,
    force: bool,
    hard: bool,
    keep: bool,
) -> None:
    click.echo(
        f"Starting migration: platform:{platform}, instance={instance}, force={force}, dry-run={dry_run}"
    )
    run_id: str = f"migrate-{uuid.uuid4()}"
    migration_report = MigrationReport(run_id, dry_run, keep)
    system_metadata = SystemMetadataClass(runId=run_id)

    all_aspects = [
        "schemaMetadata",
        "datasetProperties",
        "viewProperties",
        "subTypes",
        "editableDatasetProperties",
        "ownership",
        "datasetDeprecation",
        "institutionalMemory",
        "editableSchemaMetadata",
        "globalTags",
        "glossaryTerms",
        "upstreamLineage",
        "datasetUpstreamLineage",
        "status",
    ]

    if not dry_run:
        rest_emitter = DatahubRestEmitter(
            gms_server=cli_utils.get_session_and_host()[1]
        )

    urns_to_migrate = []

    # we first calculate all the urns we will be migrating
    for src_entity_urn in cli_utils.get_urns_by_filter(platform=platform, env=env):
        key = dataset_urn_to_key(src_entity_urn)
        assert key
        # Does this urn already have a platform instance associated with it?
        response = cli_utils.get_aspects_for_entity(
            entity_urn=src_entity_urn, aspects=["dataPlatformInstance"], typed=True
        )
        if "dataPlatformInstance" in response:
            assert isinstance(
                response["dataPlatformInstance"], DataPlatformInstanceClass
            )
            data_platform_instance: DataPlatformInstanceClass = response[
                "dataPlatformInstance"
            ]
            if data_platform_instance.instance:
                log.debug("This is already an instance-specific urn, will skip")
                continue
        else:
            log.debug(
                f"{src_entity_urn} is not an instance specific urn. {response}"
            )
        urns_to_migrate.append(src_entity_urn)

    if not force and not dry_run:
        # get a confirmation from the operator before proceeding if this is not a dry run
        sampled_urns_to_migrate = random.choices(
            urns_to_migrate, k=min(10, len(urns_to_migrate))
        )
        sampled_new_urns: List[str] = [
            make_dataset_urn_with_platform_instance(
                platform=key.platform,
                name=key.name,
                platform_instance=instance,
                env=str(key.origin),
            )
            for key in [dataset_urn_to_key(x) for x in sampled_urns_to_migrate]
            if key
        ]
        click.echo(
            f"Will migrate {len(urns_to_migrate)} urns such as {random.choices(urns_to_migrate, k=min(10, len(urns_to_migrate)))}"
        )
        click.echo(f"New urns will look like {sampled_new_urns}")
        click.confirm("Ok to proceed?", abort=True)

    for src_entity_urn in progressbar.progressbar(
        urns_to_migrate, redirect_stdout=True
    ):
        key = dataset_urn_to_key(src_entity_urn)
        assert key
        new_urn = make_dataset_urn_with_platform_instance(
            platform=key.platform,
            name=key.name,
            platform_instance=instance,
            env=str(key.origin),
        )
        log.debug(f"Will migrate {src_entity_urn} to {new_urn}")
        relationships = migration_utils.get_incoming_relationships_dataset(
            src_entity_urn
        )

        for mcp in migration_utils.clone_aspect(
            src_entity_urn,
            aspect_names=all_aspects,
            dst_urn=new_urn,
            dry_run=dry_run,
            run_id=run_id,
        ):
            if not dry_run:
                rest_emitter.emit_mcp(mcp)
            migration_report.on_entity_create(mcp.entityUrn, mcp.aspectName)  # type: ignore

        if not dry_run:
            rest_emitter.emit_mcp(
                MetadataChangeProposalWrapper(
                    entityType="dataset",
                    changeType=ChangeTypeClass.UPSERT,
                    entityUrn=new_urn,
                    aspectName="dataPlatformInstance",
                    aspect=DataPlatformInstanceClass(
                        platform=make_data_platform_urn(platform),
                        instance=make_dataplatform_instance_urn(platform, instance),
                    ),
                    systemMetadata=system_metadata,
                )
            )
        migration_report.on_entity_create(new_urn, "dataPlatformInstance")

        for relationship in relationships:
            target_urn = relationship["entity"]
            entity_type = _get_type_from_urn(target_urn)
            relationshipType = relationship["type"]
            aspect_name = (
                migration_utils.get_aspect_name_from_relationship_type_and_entity(
                    relationshipType, entity_type
                )
            )
            aspect_map = cli_utils.get_aspects_for_entity(
                target_urn, aspects=[aspect_name], typed=True
            )
            if aspect_name in aspect_map:
                aspect = aspect_map[aspect_name]
                assert isinstance(aspect, DictWrapper)
                aspect = migration_utils.modify_urn_list_for_aspect(
                    aspect_name, aspect, relationshipType, src_entity_urn, new_urn
                )
                # use mcpw
                mcp = MetadataChangeProposalWrapper(
                    entityType=entity_type,
                    changeType=ChangeTypeClass.UPSERT,
                    entityUrn=target_urn,
                    aspectName=aspect_name,
                    aspect=aspect,
                )
                if not dry_run:
                    rest_emitter.emit_mcp(mcp)
                migration_report.on_entity_affected(mcp.entityUrn, mcp.aspectName)  # type: ignore
            else:
                log.debug(f"Didn't find aspect {aspect_name} for urn {target_urn}")

        if not dry_run and not keep:
            log.info(f"will {'hard' if hard else 'soft'} delete {src_entity_urn}")
            delete_cli._delete_one_urn(src_entity_urn, soft=not hard, run_id=run_id)
        migration_report.on_entity_migrated(src_entity_urn, "status")  # type: ignore

    print(f"{migration_report}")

import datahub.emitter.mce_builder as builder
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DatahubRestEmitter
from datahub.metadata.com.linkedin.pegasus2avro.datajob import DataJobInfoClass
from datahub.metadata.schema_classes import ChangeTypeClass

# Construct the DataJobInfo aspect with the job -> flow lineage.
dataflow_urn = builder.make_data_flow_urn(
    orchestrator="airflow", flow_id="flow1", cluster="prod"
)

datajob_info = DataJobInfoClass(name="My Job 1", type="AIRFLOW", flowUrn=dataflow_urn)

# Construct a MetadataChangeProposalWrapper object with the DataJobInfo aspect.
# NOTE: This will overwrite all of the existing dataJobInfo aspect information associated with this job.
datajob_info_mcp = MetadataChangeProposalWrapper(
    entityType="dataJob",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn=builder.make_data_job_urn(
        orchestrator="airflow", flow_id="flow1", job_id="job1", cluster="prod"
    ),
    aspectName="dataJobInfo",
    aspect=datajob_info,
)

# Create an emitter to the GMS REST API.
emitter = DatahubRestEmitter("http://localhost:8080")

# Emit metadata!
emitter.emit_mcp(datajob_info_mcp)

def __init__(self, ctx: PipelineContext, config: DatahubRestSinkConfig):
    super().__init__(ctx)
    self.config = config
    self.report = SinkReport()
    self.emitter = DatahubRestEmitter(self.config.server, self.config.token)

def test_datahub_rest_emitter_construction():
    emitter = DatahubRestEmitter(MOCK_GMS_ENDPOINT)
    assert emitter._connect_timeout_sec == emitter.DEFAULT_CONNECT_TIMEOUT_SEC
    assert emitter._read_timeout_sec == emitter.DEFAULT_READ_TIMEOUT_SEC
    assert emitter._retry_status_codes == emitter.DEFAULT_RETRY_STATUS_CODES
    assert emitter._retry_max_times == emitter.DEFAULT_RETRY_MAX_TIMES

def test_datahub_rest_emitter_construction():
    emitter = DatahubRestEmitter(MOCK_GMS_ENDPOINT)
    assert emitter._connect_timeout_sec == emitter.DEFAULT_CONNECT_TIMEOUT_SEC
    assert emitter._read_timeout_sec == emitter.DEFAULT_READ_TIMEOUT_SEC

def make_emitter(self) -> DatahubRestEmitter:
    return DatahubRestEmitter(self._gms_endpoint())