def process_container_relationships(
    container_id_map: Dict[str, str],
    dry_run: bool,
    src_urn: str,
    dst_urn: str,
    migration_report: MigrationReport,
    rest_emitter: DatahubRestEmitter,
) -> None:
    relationships = migration_utils.get_incoming_relationships(urn=src_urn)
    for relationship in relationships:
        log.debug(f"Incoming Relationship: {relationship}")
        target_urn = relationship["entity"]

        # We should use the new id if we already migrated it.
        if target_urn in container_id_map:
            target_urn = container_id_map.get(target_urn)

        entity_type = _get_type_from_urn(target_urn)
        relationshipType = relationship["type"]
        aspect_name = migration_utils.get_aspect_name_from_relationship(
            relationshipType, entity_type
        )
        aspect_map = cli_utils.get_aspects_for_entity(
            target_urn, aspects=[aspect_name], typed=True
        )
        if aspect_name in aspect_map:
            aspect = aspect_map[aspect_name]
            assert isinstance(aspect, DictWrapper)
            aspect = migration_utils.modify_urn_list_for_aspect(
                aspect_name, aspect, relationshipType, src_urn, dst_urn
            )
            # use mcpw
            mcp = MetadataChangeProposalWrapper(
                entityType=entity_type,
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=target_urn,
                aspectName=aspect_name,
                aspect=aspect,
            )
            if not dry_run:
                rest_emitter.emit_mcp(mcp)
            migration_report.on_entity_affected(mcp.entityUrn, mcp.aspectName)  # type: ignore
        else:
            log.debug(f"Didn't find aspect {aspect_name} for urn {target_urn}")
emitter = DatahubRestEmitter("http://localhost:8080")

datasetProperties = DatasetProperties(
    name="bazTable",
)
# Construct a MetadataChangeProposalWrapper object for the dataset.
dataset_mcp = MetadataChangeProposalWrapper(
    entityType="dataset",
    changeType=ChangeType.UPSERT,
    entityUrn=datasetUrn("bazTable"),
    aspectName="datasetProperties",
    aspect=datasetProperties,
)

# Emit the dataset properties aspect! (Skip if the dataset is already present.)
emitter.emit_mcp(dataset_mcp)

# Construct an assertion object.
assertion_maxVal = AssertionInfo(
    type=AssertionType.DATASET,
    datasetAssertion=DatasetAssertionInfo(
        scope=DatasetAssertionScope.DATASET_COLUMN,
        operator=AssertionStdOperator.BETWEEN,
        nativeType="expect_column_max_to_be_between",
        aggregation=AssertionStdAggregation.MAX,
        fields=[fldUrn("bazTable", "col1")],
        dataset=datasetUrn("bazTable"),
        nativeParameters={"max_value": "99", "min_value": "89"},
        parameters=AssertionStdParameters(
            minValue=AssertionStdParameter(
                type=AssertionStdParameterType.NUMBER,
                value="89",
            ),
            maxValue=AssertionStdParameter(
                type=AssertionStdParameterType.NUMBER,
                value="99",
            ),
        ),
    ),
)
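To register the assertion, the same MCP pattern used for the dataset aspect above applies. A minimal sketch, assuming an assertion urn built with builder.make_assertion_urn; the assertion id string is an illustrative assumption, not part of the original snippet:

import datahub.emitter.mce_builder as builder

# Construct a MetadataChangeProposalWrapper for the assertion entity.
# The assertion id below is a hypothetical, illustrative value.
assertion_maxVal_mcp = MetadataChangeProposalWrapper(
    entityType="assertion",
    changeType=ChangeType.UPSERT,
    entityUrn=builder.make_assertion_urn("bazTable_col1_max_between"),
    aspectName="assertionInfo",
    aspect=assertion_maxVal,
)

# Emit the assertion definition.
emitter.emit_mcp(assertion_maxVal_mcp)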
from typing import List

import datahub.emitter.mce_builder as builder
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DatahubRestEmitter
from datahub.metadata.com.linkedin.pegasus2avro.dataset import UpstreamLineage
from datahub.metadata.schema_classes import (
    ChangeTypeClass,
    DatasetLineageTypeClass,
    UpstreamClass,
)

# Construct upstream tables.
upstream_tables: List[UpstreamClass] = []
upstream_table_1 = UpstreamClass(
    dataset=builder.make_dataset_urn("bigquery", "upstream_table_1", "PROD"),
    type=DatasetLineageTypeClass.TRANSFORMED,
)
upstream_tables.append(upstream_table_1)
upstream_table_2 = UpstreamClass(
    dataset=builder.make_dataset_urn("bigquery", "upstream_table_2", "PROD"),
    type=DatasetLineageTypeClass.TRANSFORMED,
)
upstream_tables.append(upstream_table_2)

# Construct a lineage object.
upstream_lineage = UpstreamLineage(upstreams=upstream_tables)

# Construct a MetadataChangeProposalWrapper object.
lineage_mcp = MetadataChangeProposalWrapper(
    entityType="dataset",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn=builder.make_dataset_urn("bigquery", "downstream"),
    aspectName="upstreamLineage",
    aspect=upstream_lineage,
)

# Create an emitter to the GMS REST API.
emitter = DatahubRestEmitter("http://localhost:8080")

# Emit metadata!
emitter.emit_mcp(lineage_mcp)
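For a plain table-to-table lineage edge like this one, mce_builder also offers a one-call convenience. A minimal sketch, assuming make_lineage_mce and emit_mce are available in the installed datahub version:

import datahub.emitter.mce_builder as builder
from datahub.emitter.rest_emitter import DatahubRestEmitter

# Build a MetadataChangeEvent carrying the same upstream edges in one call.
lineage_mce = builder.make_lineage_mce(
    [
        builder.make_dataset_urn("bigquery", "upstream_table_1", "PROD"),
        builder.make_dataset_urn("bigquery", "upstream_table_2", "PROD"),
    ],
    builder.make_dataset_urn("bigquery", "downstream"),
)

emitter = DatahubRestEmitter("http://localhost:8080")
emitter.emit_mce(lineage_mce)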
fldUrn("bar3", "c2"), fldUrn("bar4", "c1"), ], outputDatasetFields=[ fldUrn("bar", "c1"), fldUrn("bar", "c2"), fldUrn("bar", "c3"), fldUrn("bar", "c4"), fldUrn("bar", "c5"), fldUrn("bar", "c6"), fldUrn("bar", "c7"), fldUrn("bar", "c9"), fldUrn("bar2", "c9"), ], fineGrainedLineages=fineGrainedLineages, ) dataJobLineageMcp = MetadataChangeProposalWrapper( entityType="dataJob", changeType=ChangeTypeClass.UPSERT, entityUrn=builder.make_data_job_urn("spark", "Flow1", "Task1"), aspectName="dataJobInputOutput", aspect=dataJobInputOutput, ) # Create an emitter to the GMS REST API. emitter = DatahubRestEmitter("http://localhost:8080") # Emit metadata! emitter.emit_mcp(dataJobLineageMcp)
def _run(
    self,
    validation_result_suite: ExpectationSuiteValidationResult,
    validation_result_suite_identifier: Union[
        ValidationResultIdentifier, GeCloudIdentifier
    ],
    data_asset: Union[Validator, DataAsset, Batch],
    payload: Any = None,
    expectation_suite_identifier: Optional[ExpectationSuiteIdentifier] = None,
    checkpoint_identifier: Any = None,
) -> Dict:
    datasets = []
    try:
        emitter = DatahubRestEmitter(
            gms_server=self.server_url,
            token=self.token,
            read_timeout_sec=self.timeout_sec,
            connect_timeout_sec=self.timeout_sec,
            retry_status_codes=self.retry_status_codes,
            retry_max_times=self.retry_max_times,
            extra_headers=self.extra_headers,
        )

        expectation_suite_name = validation_result_suite.meta.get(
            "expectation_suite_name"
        )
        run_id = validation_result_suite.meta.get("run_id")
        if hasattr(data_asset, "active_batch_id"):
            batch_identifier = data_asset.active_batch_id
        else:
            batch_identifier = data_asset.batch_id

        if isinstance(
            validation_result_suite_identifier, ValidationResultIdentifier
        ):
            expectation_suite_name = (
                validation_result_suite_identifier.expectation_suite_identifier.expectation_suite_name
            )
            run_id = validation_result_suite_identifier.run_id
            batch_identifier = validation_result_suite_identifier.batch_identifier

        # Returns datasets and corresponding batch requests.
        datasets = self.get_dataset_partitions(batch_identifier, data_asset)

        if len(datasets) == 0 or datasets[0]["dataset_urn"] is None:
            logger.info("Metadata not sent to datahub. No datasets found.")
            return {"datahub_notification_result": "none required"}

        # Returns assertion info and assertion results.
        assertions = self.get_assertions_with_results(
            validation_result_suite,
            expectation_suite_name,
            run_id,
            payload,
            datasets,
        )

        for assertion in assertions:
            # Construct a MetadataChangeProposalWrapper object for the assertion info.
            assertion_info_mcp = MetadataChangeProposalWrapper(
                entityType="assertion",
                changeType=ChangeType.UPSERT,
                entityUrn=assertion["assertionUrn"],
                aspectName="assertionInfo",
                aspect=assertion["assertionInfo"],
            )
            emitter.emit_mcp(assertion_info_mcp)

            # Construct a MetadataChangeProposalWrapper object for the platform instance.
            assertion_platform_mcp = MetadataChangeProposalWrapper(
                entityType="assertion",
                changeType=ChangeType.UPSERT,
                entityUrn=assertion["assertionUrn"],
                aspectName="dataPlatformInstance",
                aspect=assertion["assertionPlatform"],
            )
            emitter.emit_mcp(assertion_platform_mcp)

            for assertionResult in assertion["assertionResults"]:
                dataset_assertionResult_mcp = MetadataChangeProposalWrapper(
                    entityType="assertion",
                    changeType=ChangeType.UPSERT,
                    entityUrn=assertionResult.assertionUrn,
                    aspectName="assertionRunEvent",
                    aspect=assertionResult,
                )
                # Emit the result! (timeseries aspect)
                emitter.emit_mcp(dataset_assertionResult_mcp)

        result = "DataHub notification succeeded"
    except Exception as e:
        result = "DataHub notification failed"
        if self.graceful_exceptions:
            logger.error(e)
            logger.info("Suppressing error because graceful_exceptions is set")
        else:
            raise

    return {"datahub_notification_result": result}
import datahub.emitter.mce_builder as builder
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DatahubRestEmitter
from datahub.metadata.com.linkedin.pegasus2avro.datajob import DataJobInfoClass
from datahub.metadata.schema_classes import ChangeTypeClass

# Construct the DataJobInfo aspect with the job -> flow lineage.
dataflow_urn = builder.make_data_flow_urn(
    orchestrator="airflow", flow_id="flow1", cluster="prod"
)

datajob_info = DataJobInfoClass(name="My Job 1", type="AIRFLOW", flowUrn=dataflow_urn)

# Construct a MetadataChangeProposalWrapper object with the DataJobInfo aspect.
# NOTE: This will overwrite all of the existing dataJobInfo aspect information associated with this job.
datajob_info_mcp = MetadataChangeProposalWrapper(
    entityType="dataJob",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn=builder.make_data_job_urn(
        orchestrator="airflow", flow_id="flow1", job_id="job1", cluster="prod"
    ),
    aspectName="dataJobInfo",
    aspect=datajob_info,
)

# Create an emitter to the GMS REST API.
emitter = DatahubRestEmitter("http://localhost:8080")

# Emit metadata!
emitter.emit_mcp(datajob_info_mcp)
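To sanity-check the write, the aspect can be read back through the graph client. A minimal sketch, assuming DataHubGraph and its get_aspect method are available in the installed datahub version:

from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph

# Connect to the same GMS instance and fetch the aspect we just emitted.
graph = DataHubGraph(DatahubClientConfig(server="http://localhost:8080"))
job_info = graph.get_aspect(
    entity_urn=builder.make_data_job_urn(
        orchestrator="airflow", flow_id="flow1", job_id="job1", cluster="prod"
    ),
    aspect_type=DataJobInfoClass,
)
print(job_info)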
def dataplatform2instance_func(
    instance: str,
    platform: str,
    dry_run: bool,
    env: str,
    force: bool,
    hard: bool,
    keep: bool,
) -> None:
    click.echo(
        f"Starting migration: platform:{platform}, instance={instance}, force={force}, dry-run={dry_run}"
    )
    run_id: str = f"migrate-{uuid.uuid4()}"
    migration_report = MigrationReport(run_id, dry_run, keep)
    system_metadata = SystemMetadataClass(runId=run_id)

    all_aspects = [
        "schemaMetadata",
        "datasetProperties",
        "viewProperties",
        "subTypes",
        "editableDatasetProperties",
        "ownership",
        "datasetDeprecation",
        "institutionalMemory",
        "editableSchemaMetadata",
        "globalTags",
        "glossaryTerms",
        "upstreamLineage",
        "datasetUpstreamLineage",
        "status",
    ]

    if not dry_run:
        rest_emitter = DatahubRestEmitter(
            gms_server=cli_utils.get_session_and_host()[1]
        )

    urns_to_migrate = []

    # We first calculate all the urns we will be migrating.
    for src_entity_urn in cli_utils.get_urns_by_filter(platform=platform, env=env):
        key = dataset_urn_to_key(src_entity_urn)
        assert key
        # Does this urn already have a platform instance associated with it?
        response = cli_utils.get_aspects_for_entity(
            entity_urn=src_entity_urn, aspects=["dataPlatformInstance"], typed=True
        )
        if "dataPlatformInstance" in response:
            assert isinstance(
                response["dataPlatformInstance"], DataPlatformInstanceClass
            )
            data_platform_instance: DataPlatformInstanceClass = response[
                "dataPlatformInstance"
            ]
            if data_platform_instance.instance:
                log.debug("This is already an instance-specific urn, will skip")
                continue
            else:
                log.debug(
                    f"{src_entity_urn} is not an instance-specific urn. {response}"
                )
                urns_to_migrate.append(src_entity_urn)

    if not force and not dry_run:
        # Get a confirmation from the operator before proceeding if this is not a dry run.
        sampled_urns_to_migrate = random.choices(
            urns_to_migrate, k=min(10, len(urns_to_migrate))
        )
        sampled_new_urns: List[str] = [
            make_dataset_urn_with_platform_instance(
                platform=key.platform,
                name=key.name,
                platform_instance=instance,
                env=str(key.origin),
            )
            for key in [dataset_urn_to_key(x) for x in sampled_urns_to_migrate]
            if key
        ]
        click.echo(
            f"Will migrate {len(urns_to_migrate)} urns such as {sampled_urns_to_migrate}"
        )
        click.echo(f"New urns will look like {sampled_new_urns}")
        click.confirm("Ok to proceed?", abort=True)

    for src_entity_urn in progressbar.progressbar(
        urns_to_migrate, redirect_stdout=True
    ):
        key = dataset_urn_to_key(src_entity_urn)
        assert key
        new_urn = make_dataset_urn_with_platform_instance(
            platform=key.platform,
            name=key.name,
            platform_instance=instance,
            env=str(key.origin),
        )
        log.debug(f"Will migrate {src_entity_urn} to {new_urn}")
        relationships = migration_utils.get_incoming_relationships_dataset(
            src_entity_urn
        )

        for mcp in migration_utils.clone_aspect(
            src_entity_urn,
            aspect_names=all_aspects,
            dst_urn=new_urn,
            dry_run=dry_run,
            run_id=run_id,
        ):
            if not dry_run:
                rest_emitter.emit_mcp(mcp)
            migration_report.on_entity_create(mcp.entityUrn, mcp.aspectName)  # type: ignore

        if not dry_run:
            rest_emitter.emit_mcp(
                MetadataChangeProposalWrapper(
                    entityType="dataset",
                    changeType=ChangeTypeClass.UPSERT,
                    entityUrn=new_urn,
                    aspectName="dataPlatformInstance",
                    aspect=DataPlatformInstanceClass(
                        platform=make_data_platform_urn(platform),
                        instance=make_dataplatform_instance_urn(platform, instance),
                    ),
                    systemMetadata=system_metadata,
                )
            )
        migration_report.on_entity_create(new_urn, "dataPlatformInstance")

        for relationship in relationships:
            target_urn = relationship["entity"]
            entity_type = _get_type_from_urn(target_urn)
            relationshipType = relationship["type"]
            aspect_name = (
                migration_utils.get_aspect_name_from_relationship_type_and_entity(
                    relationshipType, entity_type
                )
            )
            aspect_map = cli_utils.get_aspects_for_entity(
                target_urn, aspects=[aspect_name], typed=True
            )
            if aspect_name in aspect_map:
                aspect = aspect_map[aspect_name]
                assert isinstance(aspect, DictWrapper)
                aspect = migration_utils.modify_urn_list_for_aspect(
                    aspect_name, aspect, relationshipType, src_entity_urn, new_urn
                )
                # use mcpw
                mcp = MetadataChangeProposalWrapper(
                    entityType=entity_type,
                    changeType=ChangeTypeClass.UPSERT,
                    entityUrn=target_urn,
                    aspectName=aspect_name,
                    aspect=aspect,
                )
                if not dry_run:
                    rest_emitter.emit_mcp(mcp)
                migration_report.on_entity_affected(mcp.entityUrn, mcp.aspectName)  # type: ignore
            else:
                log.debug(f"Didn't find aspect {aspect_name} for urn {target_urn}")

        if not dry_run and not keep:
            log.info(f"will {'hard' if hard else 'soft'} delete {src_entity_urn}")
            delete_cli._delete_one_urn(src_entity_urn, soft=not hard, run_id=run_id)
        migration_report.on_entity_migrated(src_entity_urn, "status")  # type: ignore

    print(f"{migration_report}")
entityType="dataflow", changeType=ChangeTypeClass.UPSERT, entityUrn=dataflow_urn, aspectName="dataFlowInfo", aspect=dataflow_info, ) datajob_info = DataJobInfoClass(name="My Job 1", type="AIRFLOW", flowUrn=dataflow_urn) # Construct a MetadataChangeProposalWrapper object with the DataJobInfo aspect. # NOTE: This will overwrite all of the existing dataJobInfo aspect information associated with this job. datajob_info_mcp = MetadataChangeProposalWrapper( entityType="dataJob", changeType=ChangeTypeClass.UPSERT, entityUrn=builder.make_data_job_urn(orchestrator="airflow", flow_id="flow_old_api", job_id="job1", cluster="prod"), aspectName="dataJobInfo", aspect=datajob_info, ) # Create an emitter to the GMS REST API. emitter = DatahubRestEmitter("http://localhost:8080") # Emit metadata! emitter.emit_mcp(dataflow_info_mcp) emitter.emit_mcp(datajob_info_mcp)
def migrate_containers(
    dry_run: bool,
    env: str,
    platform: str,
    hard: bool,
    instance: str,
    keep: bool,
    rest_emitter: DatahubRestEmitter,
) -> None:
    run_id: str = f"container-migrate-{uuid.uuid4()}"
    migration_report = MigrationReport(run_id, dry_run, keep)

    # Find the container ids that need to be migrated.
    container_id_map: Dict[str, str] = {}

    # Get all the containers that need to be migrated.
    containers = get_containers_for_migration(env)
    for container in progressbar.progressbar(containers, redirect_stdout=True):
        # Generate a new container key.
        subType = container["aspects"]["subTypes"]["value"]["typeNames"][0]
        customProperties = container["aspects"]["containerProperties"]["value"][
            "customProperties"
        ]
        if (env is not None and customProperties["instance"] != env) or (
            platform is not None and customProperties["platform"] != platform
        ):
            log.debug(
                f"{container['urn']} does not match filter criteria, skipping.. {customProperties} {env} {platform}"
            )
            continue

        try:
            newKey: Union[SchemaKey, DatabaseKey, ProjectIdKey, BigQueryDatasetKey]
            if subType == "Schema":
                newKey = SchemaKey.parse_obj(customProperties)
            elif subType == "Database":
                newKey = DatabaseKey.parse_obj(customProperties)
            elif subType == "Project":
                newKey = ProjectIdKey.parse_obj(customProperties)
            elif subType == "Dataset":
                newKey = BigQueryDatasetKey.parse_obj(customProperties)
            else:
                log.warning(f"Invalid subtype {subType}. Skipping")
                continue
        except Exception as e:
            log.warning(f"Unable to map {customProperties} to key due to exception {e}")
            continue

        newKey.instance = instance

        log.debug(
            f"Container key migration: {container['urn']} -> urn:li:container:{newKey.guid()}"
        )

        src_urn = container["urn"]
        dst_urn = f"urn:li:container:{newKey.guid()}"
        container_id_map[src_urn] = dst_urn

        # Clone the aspects of the container onto the new urn.
        for mcp in migration_utils.clone_aspect(
            src_urn,
            aspect_names=migration_utils.all_aspects,
            entity_type="container",
            dst_urn=dst_urn,
            dry_run=dry_run,
            run_id=run_id,
        ):
            migration_report.on_entity_create(mcp.entityUrn, mcp.aspectName)  # type: ignore
            assert mcp.aspect

            # Update containerProperties to reflect the new key.
            if mcp.aspectName == "containerProperties":
                assert isinstance(mcp.aspect, ContainerPropertiesClass)
                containerProperties: ContainerPropertiesClass = mcp.aspect
                containerProperties.customProperties = newKey.dict(
                    by_alias=True, exclude_none=True
                )
                mcp.aspect = containerProperties
            elif mcp.aspectName == "containerKey":
                assert isinstance(mcp.aspect, ContainerKeyClass)
                containerKey: ContainerKeyClass = mcp.aspect
                containerKey.guid = newKey.guid()
                mcp.aspect = containerKey

            if not dry_run:
                rest_emitter.emit_mcp(mcp)
            migration_report.on_entity_affected(mcp.entityUrn, mcp.aspectName)  # type: ignore

        process_container_relationships(
            container_id_map=container_id_map,
            dry_run=dry_run,
            src_urn=src_urn,
            dst_urn=dst_urn,
            migration_report=migration_report,
            rest_emitter=rest_emitter,
        )

        if not dry_run and not keep:
            log.info(f"will {'hard' if hard else 'soft'} delete {src_urn}")
            delete_cli._delete_one_urn(
                src_urn, soft=not hard, run_id=run_id, entity_type="container"
            )
        migration_report.on_entity_migrated(src_urn, "status")  # type: ignore

    print(f"{migration_report}")
input_data_jobs = [
    builder.make_data_job_urn(
        orchestrator="airflow", flow_id="flow1", job_id="job0", cluster="PROD"
    )
]

datajob_input_output = DataJobInputOutputClass(
    inputDatasets=input_datasets,
    outputDatasets=output_datasets,
    inputDatajobs=input_data_jobs,
)

# Construct a MetadataChangeProposalWrapper object.
# NOTE: This will overwrite all of the existing lineage information associated with this job.
datajob_input_output_mcp = MetadataChangeProposalWrapper(
    entityType="dataJob",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn=builder.make_data_job_urn(
        orchestrator="airflow", flow_id="flow1", job_id="job1", cluster="PROD"
    ),
    aspectName="dataJobInputOutput",
    aspect=datajob_input_output,
)

# Create an emitter to the GMS REST API.
emitter = DatahubRestEmitter("http://localhost:8080")

# Emit metadata!
emitter.emit_mcp(datajob_input_output_mcp)
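The input_datasets and output_datasets lists are assumed to be defined before this snippet. A minimal sketch of what they might look like; the platform and table names are illustrative assumptions, not part of the original example:

# Hypothetical dataset urns for illustration only.
input_datasets = [
    builder.make_dataset_urn("postgres", "mydb.schema.tableA", "PROD"),
    builder.make_dataset_urn("postgres", "mydb.schema.tableB", "PROD"),
]
output_datasets = [
    builder.make_dataset_urn("postgres", "mydb.schema.tableC", "PROD"),
]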