Example #1
def process_container_relationships(
    container_id_map: Dict[str, str],
    dry_run: bool,
    src_urn: str,
    dst_urn: str,
    migration_report: MigrationReport,
    rest_emitter: DatahubRestEmitter,
) -> None:
    relationships = migration_utils.get_incoming_relationships(urn=src_urn)
    for relationship in relationships:
        log.debug(f"Incoming Relationship: {relationship}")
        target_urn = relationship["entity"]

        # We should use the new id if we already migrated it
        if target_urn in container_id_map:
            target_urn = container_id_map.get(target_urn)

        entity_type = _get_type_from_urn(target_urn)
        relationshipType = relationship["type"]
        aspect_name = migration_utils.get_aspect_name_from_relationship(
            relationshipType, entity_type)
        aspect_map = cli_utils.get_aspects_for_entity(target_urn,
                                                      aspects=[aspect_name],
                                                      typed=True)
        if aspect_name in aspect_map:
            aspect = aspect_map[aspect_name]
            assert isinstance(aspect, DictWrapper)
            aspect = migration_utils.modify_urn_list_for_aspect(
                aspect_name, aspect, relationshipType, src_urn, dst_urn)
            # use mcpw
            mcp = MetadataChangeProposalWrapper(
                entityType=entity_type,
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=target_urn,
                aspectName=aspect_name,
                aspect=aspect,
            )

            if not dry_run:
                rest_emitter.emit_mcp(mcp)
            migration_report.on_entity_affected(mcp.entityUrn,
                                                mcp.aspectName)  # type: ignore
        else:
            log.debug(f"Didn't find aspect {aspect_name} for urn {target_urn}")
Example #2
emitter = DatahubRestEmitter("http://localhost:8080")

datasetProperties = DatasetProperties(
    name="bazTable",
)
# Construct a MetadataChangeProposalWrapper object for dataset
dataset_mcp = MetadataChangeProposalWrapper(
    entityType="dataset",
    changeType=ChangeType.UPSERT,
    entityUrn=datasetUrn("bazTable"),
    aspectName="datasetProperties",
    aspect=datasetProperties,
)

# Emit Dataset entity properties aspect! (Skip if dataset is already present)
emitter.emit_mcp(dataset_mcp)

# Construct an assertion object.
assertion_maxVal = AssertionInfo(
    type=AssertionType.DATASET,
    datasetAssertion=DatasetAssertionInfo(
        scope=DatasetAssertionScope.DATASET_COLUMN,
        operator=AssertionStdOperator.BETWEEN,
        nativeType="expect_column_max_to_be_between",
        aggregation=AssertionStdAggregation.MAX,
        fields=[fldUrn("bazTable", "col1")],
        dataset=datasetUrn("bazTable"),
        nativeParameters={"max_value": "99", "min_value": "89"},
        parameters=AssertionStdParameters(
            minValue=AssertionStdParameter(
                type=AssertionStdParameterType.NUMBER, value="89"
            ),
            maxValue=AssertionStdParameter(
                type=AssertionStdParameterType.NUMBER, value="99"
            ),
        ),
    ),
)
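
# The assertion above still needs to be wrapped in an MCP and emitted; a
# minimal sketch of that step, assuming `import datahub.emitter.mce_builder as
# builder` and its make_assertion_urn helper (the assertion id
# "bazTable_col1_max" is purely illustrative):
import datahub.emitter.mce_builder as builder

assertion_mcp = MetadataChangeProposalWrapper(
    entityType="assertion",
    changeType=ChangeType.UPSERT,
    entityUrn=builder.make_assertion_urn("bazTable_col1_max"),
    aspectName="assertionInfo",
    aspect=assertion_maxVal,
)
emitter.emit_mcp(assertion_mcp)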
Example #3
# Construct upstream tables.
upstream_tables: List[UpstreamClass] = []
upstream_table_1 = UpstreamClass(
    dataset=builder.make_dataset_urn("bigquery", "upstream_table_1", "PROD"),
    type=DatasetLineageTypeClass.TRANSFORMED,
)
upstream_tables.append(upstream_table_1)
upstream_table_2 = UpstreamClass(
    dataset=builder.make_dataset_urn("bigquery", "upstream_table_2", "PROD"),
    type=DatasetLineageTypeClass.TRANSFORMED,
)
upstream_tables.append(upstream_table_2)

# Construct a lineage object.
upstream_lineage = UpstreamLineage(upstreams=upstream_tables)

# Construct a MetadataChangeProposalWrapper object.
lineage_mcp = MetadataChangeProposalWrapper(
    entityType="dataset",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn=builder.make_dataset_urn("bigquery", "downstream"),
    aspectName="upstreamLineage",
    aspect=upstream_lineage,
)

# Create an emitter to the GMS REST API.
emitter = DatahubRestEmitter("http://localhost:8080")

# Emit metadata!
emitter.emit_mcp(lineage_mcp)
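
# Note: recent SDK versions can infer entityType and aspectName from the aspect
# and URN, and changeType defaults to UPSERT, so the proposal above can usually
# be written more compactly (a sketch, assuming a reasonably new acryl-datahub
# release):
lineage_mcp_short = MetadataChangeProposalWrapper(
    entityUrn=builder.make_dataset_urn("bigquery", "downstream"),
    aspect=upstream_lineage,
)
emitter.emit_mcp(lineage_mcp_short)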
Example #4
        fldUrn("bar3", "c2"),
        fldUrn("bar4", "c1"),
    ],
    outputDatasetFields=[
        fldUrn("bar", "c1"),
        fldUrn("bar", "c2"),
        fldUrn("bar", "c3"),
        fldUrn("bar", "c4"),
        fldUrn("bar", "c5"),
        fldUrn("bar", "c6"),
        fldUrn("bar", "c7"),
        fldUrn("bar", "c9"),
        fldUrn("bar2", "c9"),
    ],
    fineGrainedLineages=fineGrainedLineages,
)

dataJobLineageMcp = MetadataChangeProposalWrapper(
    entityType="dataJob",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn=builder.make_data_job_urn("spark", "Flow1", "Task1"),
    aspectName="dataJobInputOutput",
    aspect=dataJobInputOutput,
)

# Create an emitter to the GMS REST API.
emitter = DatahubRestEmitter("http://localhost:8080")

# Emit metadata!
emitter.emit_mcp(dataJobLineageMcp)
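
# The fineGrainedLineages list referenced above is defined outside this excerpt.
# A minimal sketch of how such a list is typically built, assuming the
# FineGrainedLineage* classes from datahub.metadata.schema_classes and the same
# fldUrn helper used above (the column pairing is illustrative only):
from datahub.metadata.schema_classes import (
    FineGrainedLineageClass,
    FineGrainedLineageDownstreamTypeClass,
    FineGrainedLineageUpstreamTypeClass,
)

fineGrainedLineages = [
    FineGrainedLineageClass(
        upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
        upstreams=[fldUrn("bar3", "c2"), fldUrn("bar4", "c1")],
        downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
        downstreams=[fldUrn("bar", "c1")],
    ),
]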
Example #5
    def _run(
        self,
        validation_result_suite: ExpectationSuiteValidationResult,
        validation_result_suite_identifier: Union[
            ValidationResultIdentifier, GeCloudIdentifier
        ],
        data_asset: Union[Validator, DataAsset, Batch],
        payload: Any = None,
        expectation_suite_identifier: Optional[ExpectationSuiteIdentifier] = None,
        checkpoint_identifier: Any = None,
    ) -> Dict:
        datasets = []
        try:
            emitter = DatahubRestEmitter(
                gms_server=self.server_url,
                token=self.token,
                read_timeout_sec=self.timeout_sec,
                connect_timeout_sec=self.timeout_sec,
                retry_status_codes=self.retry_status_codes,
                retry_max_times=self.retry_max_times,
                extra_headers=self.extra_headers,
            )

            expectation_suite_name = validation_result_suite.meta.get(
                "expectation_suite_name"
            )
            run_id = validation_result_suite.meta.get("run_id")
            if hasattr(data_asset, "active_batch_id"):
                batch_identifier = data_asset.active_batch_id
            else:
                batch_identifier = data_asset.batch_id

            if isinstance(
                validation_result_suite_identifier, ValidationResultIdentifier
            ):
                expectation_suite_name = (
                    validation_result_suite_identifier.expectation_suite_identifier.expectation_suite_name
                )
                run_id = validation_result_suite_identifier.run_id
                batch_identifier = validation_result_suite_identifier.batch_identifier

            # Returns datasets and corresponding batch requests
            datasets = self.get_dataset_partitions(batch_identifier, data_asset)

            if len(datasets) == 0 or datasets[0]["dataset_urn"] is None:
                logger.info("Metadata not sent to datahub. No datasets found.")
                return {"datahub_notification_result": "none required"}

            # Returns assertion info and assertion results
            assertions = self.get_assertions_with_results(
                validation_result_suite,
                expectation_suite_name,
                run_id,
                payload,
                datasets,
            )

            for assertion in assertions:
                # Construct a MetadataChangeProposalWrapper object.
                assertion_info_mcp = MetadataChangeProposalWrapper(
                    entityType="assertion",
                    changeType=ChangeType.UPSERT,
                    entityUrn=assertion["assertionUrn"],
                    aspectName="assertionInfo",
                    aspect=assertion["assertionInfo"],
                )
                emitter.emit_mcp(assertion_info_mcp)

                # Construct a MetadataChangeProposalWrapper object.
                assertion_platform_mcp = MetadataChangeProposalWrapper(
                    entityType="assertion",
                    changeType=ChangeType.UPSERT,
                    entityUrn=assertion["assertionUrn"],
                    aspectName="dataPlatformInstance",
                    aspect=assertion["assertionPlatform"],
                )
                emitter.emit_mcp(assertion_platform_mcp)

                for assertionResult in assertion["assertionResults"]:
                    dataset_assertionResult_mcp = MetadataChangeProposalWrapper(
                        entityType="assertion",
                        changeType=ChangeType.UPSERT,
                        entityUrn=assertionResult.assertionUrn,
                        aspectName="assertionRunEvent",
                        aspect=assertionResult,
                    )

                    # Emit Result! (timeseries aspect)
                    emitter.emit_mcp(dataset_assertionResult_mcp)

            result = "DataHub notification succeeded"
        except Exception as e:
            result = "DataHub notification failed"
            if self.graceful_exceptions:
                logger.error(e)
                logger.info("Supressing error because graceful_exceptions is set")
            else:
                raise

        return {"datahub_notification_result": result}
Example #6
import datahub.emitter.mce_builder as builder
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DatahubRestEmitter
from datahub.metadata.com.linkedin.pegasus2avro.datajob import DataJobInfoClass
from datahub.metadata.schema_classes import ChangeTypeClass

# Construct the DataJobInfo aspect with the job -> flow lineage.
dataflow_urn = builder.make_data_flow_urn(
    orchestrator="airflow", flow_id="flow1", cluster="prod"
)

datajob_info = DataJobInfoClass(name="My Job 1", type="AIRFLOW", flowUrn=dataflow_urn)

# Construct a MetadataChangeProposalWrapper object with the DataJobInfo aspect.
# NOTE: This will overwrite all of the existing dataJobInfo aspect information associated with this job.
datajob_info_mcp = MetadataChangeProposalWrapper(
    entityType="dataJob",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn=builder.make_data_job_urn(
        orchestrator="airflow", flow_id="flow1", job_id="job1", cluster="prod"
    ),
    aspectName="dataJobInfo",
    aspect=datajob_info,
)

# Create an emitter to the GMS REST API.
emitter = DatahubRestEmitter("http://localhost:8080")

# Emit metadata!
emitter.emit_mcp(datajob_info_mcp)
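
# The flow referenced by dataflow_urn is not described above; a minimal sketch
# of emitting a DataFlowInfo aspect for it as well, assuming DataFlowInfoClass
# from datahub.metadata.schema_classes (the flow name is illustrative):
from datahub.metadata.schema_classes import DataFlowInfoClass

dataflow_info_mcp = MetadataChangeProposalWrapper(
    entityType="dataFlow",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn=dataflow_urn,
    aspectName="dataFlowInfo",
    aspect=DataFlowInfoClass(name="flow1"),
)
emitter.emit_mcp(dataflow_info_mcp)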
Example #7
def dataplatform2instance_func(
    instance: str,
    platform: str,
    dry_run: bool,
    env: str,
    force: bool,
    hard: bool,
    keep: bool,
) -> None:
    click.echo(
        f"Starting migration: platform:{platform}, instance={instance}, force={force}, dry-run={dry_run}"
    )
    run_id: str = f"migrate-{uuid.uuid4()}"
    migration_report = MigrationReport(run_id, dry_run, keep)
    system_metadata = SystemMetadataClass(runId=run_id)

    all_aspects = [
        "schemaMetadata",
        "datasetProperties",
        "viewProperties",
        "subTypes",
        "editableDatasetProperties",
        "ownership",
        "datasetDeprecation",
        "institutionalMemory",
        "editableSchemaMetadata",
        "globalTags",
        "glossaryTerms",
        "upstreamLineage",
        "datasetUpstreamLineage",
        "status",
    ]

    if not dry_run:
        rest_emitter = DatahubRestEmitter(
            gms_server=cli_utils.get_session_and_host()[1]
        )

    urns_to_migrate = []
    # we first calculate all the urns we will be migrating
    for src_entity_urn in cli_utils.get_urns_by_filter(platform=platform, env=env):
        key = dataset_urn_to_key(src_entity_urn)
        assert key
        # Does this urn already have a platform instance associated with it?
        response = cli_utils.get_aspects_for_entity(
            entity_urn=src_entity_urn, aspects=["dataPlatformInstance"], typed=True
        )
        if "dataPlatformInstance" in response:
            assert isinstance(
                response["dataPlatformInstance"], DataPlatformInstanceClass
            )
            data_platform_instance: DataPlatformInstanceClass = response[
                "dataPlatformInstance"
            ]
            if data_platform_instance.instance:
                log.debug("This is already an instance-specific urn, will skip")
                continue
            else:
                log.debug(
                    f"{src_entity_urn} is not an instance specific urn. {response}"
                )
                urns_to_migrate.append(src_entity_urn)

    if not force and not dry_run:
        # get a confirmation from the operator before proceeding if this is not a dry run
        sampled_urns_to_migrate = random.choices(
            urns_to_migrate, k=min(10, len(urns_to_migrate))
        )
        sampled_new_urns: List[str] = [
            make_dataset_urn_with_platform_instance(
                platform=key.platform,
                name=key.name,
                platform_instance=instance,
                env=str(key.origin),
            )
            for key in [dataset_urn_to_key(x) for x in sampled_urns_to_migrate]
            if key
        ]
        click.echo(
            f"Will migrate {len(urns_to_migrate)} urns such as {random.choices(urns_to_migrate, k=min(10, len(urns_to_migrate)))}"
        )
        click.echo(f"New urns will look like {sampled_new_urns}")
        click.confirm("Ok to proceed?", abort=True)

    for src_entity_urn in progressbar.progressbar(
        urns_to_migrate, redirect_stdout=True
    ):
        key = dataset_urn_to_key(src_entity_urn)
        assert key
        new_urn = make_dataset_urn_with_platform_instance(
            platform=key.platform,
            name=key.name,
            platform_instance=instance,
            env=str(key.origin),
        )
        log.debug(f"Will migrate {src_entity_urn} to {new_urn}")
        relationships = migration_utils.get_incoming_relationships_dataset(
            src_entity_urn
        )

        for mcp in migration_utils.clone_aspect(
            src_entity_urn,
            aspect_names=all_aspects,
            dst_urn=new_urn,
            dry_run=dry_run,
            run_id=run_id,
        ):
            if not dry_run:
                rest_emitter.emit_mcp(mcp)
            migration_report.on_entity_create(mcp.entityUrn, mcp.aspectName)  # type: ignore

        if not dry_run:
            rest_emitter.emit_mcp(
                MetadataChangeProposalWrapper(
                    entityType="dataset",
                    changeType=ChangeTypeClass.UPSERT,
                    entityUrn=new_urn,
                    aspectName="dataPlatformInstance",
                    aspect=DataPlatformInstanceClass(
                        platform=make_data_platform_urn(platform),
                        instance=make_dataplatform_instance_urn(platform, instance),
                    ),
                    systemMetadata=system_metadata,
                )
            )
        migration_report.on_entity_create(new_urn, "dataPlatformInstance")

        for relationship in relationships:
            target_urn = relationship["entity"]
            entity_type = _get_type_from_urn(target_urn)
            relationshipType = relationship["type"]
            aspect_name = (
                migration_utils.get_aspect_name_from_relationship_type_and_entity(
                    relationshipType, entity_type
                )
            )
            aspect_map = cli_utils.get_aspects_for_entity(
                target_urn, aspects=[aspect_name], typed=True
            )
            if aspect_name in aspect_map:
                aspect = aspect_map[aspect_name]
                assert isinstance(aspect, DictWrapper)
                aspect = migration_utils.modify_urn_list_for_aspect(
                    aspect_name, aspect, relationshipType, src_entity_urn, new_urn
                )
                # use mcpw
                mcp = MetadataChangeProposalWrapper(
                    entityType=entity_type,
                    changeType=ChangeTypeClass.UPSERT,
                    entityUrn=target_urn,
                    aspectName=aspect_name,
                    aspect=aspect,
                )
                if not dry_run:
                    rest_emitter.emit_mcp(mcp)
                migration_report.on_entity_affected(mcp.entityUrn, mcp.aspectName)  # type: ignore
            else:
                log.debug(f"Didn't find aspect {aspect_name} for urn {target_urn}")

        if not dry_run and not keep:
            log.info(f"will {'hard' if hard else 'soft'} delete {src_entity_urn}")
            delete_cli._delete_one_urn(src_entity_urn, soft=not hard, run_id=run_id)
        migration_report.on_entity_migrated(src_entity_urn, "status")  # type: ignore

    print(f"{migration_report}")
Example #8
    entityType="dataflow",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn=dataflow_urn,
    aspectName="dataFlowInfo",
    aspect=dataflow_info,
)

datajob_info = DataJobInfoClass(name="My Job 1",
                                type="AIRFLOW",
                                flowUrn=dataflow_urn)

# Construct a MetadataChangeProposalWrapper object with the DataJobInfo aspect.
# NOTE: This will overwrite all of the existing dataJobInfo aspect information associated with this job.
datajob_info_mcp = MetadataChangeProposalWrapper(
    entityType="dataJob",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn=builder.make_data_job_urn(orchestrator="airflow",
                                        flow_id="flow_old_api",
                                        job_id="job1",
                                        cluster="prod"),
    aspectName="dataJobInfo",
    aspect=datajob_info,
)

# Create an emitter to the GMS REST API.
emitter = DatahubRestEmitter("http://localhost:8080")

# Emit metadata!
emitter.emit_mcp(dataflow_info_mcp)
emitter.emit_mcp(datajob_info_mcp)
Example #9
def migrate_containers(
    dry_run: bool,
    env: str,
    platform: str,
    hard: bool,
    instance: str,
    keep: bool,
    rest_emitter: DatahubRestEmitter,
) -> None:
    run_id: str = f"container-migrate-{uuid.uuid4()}"
    migration_report = MigrationReport(run_id, dry_run, keep)

    # Find container ids that need to be migrated
    container_id_map: Dict[str, str] = {}
    # Get all the containers that need to be migrated
    containers = get_containers_for_migration(env)
    for container in progressbar.progressbar(containers, redirect_stdout=True):
        # Generate new container key
        subType = container["aspects"]["subTypes"]["value"]["typeNames"][0]
        customProperties = container["aspects"]["containerProperties"][
            "value"]["customProperties"]
        if (env is not None and customProperties["instance"] != env) or (
                platform is not None
                and customProperties["platform"] != platform):
            log.debug(
                f"{container['urn']} does not match filter criteria, skipping.. {customProperties} {env} {platform}"
            )
            continue

        try:
            newKey: Union[SchemaKey, DatabaseKey, ProjectIdKey,
                          BigQueryDatasetKey]
            if subType == "Schema":
                newKey = SchemaKey.parse_obj(customProperties)
            elif subType == "Database":
                newKey = DatabaseKey.parse_obj(customProperties)
            elif subType == "Project":
                newKey = ProjectIdKey.parse_obj(customProperties)
            elif subType == "Dataset":
                newKey = BigQueryDatasetKey.parse_obj(customProperties)
            else:
                log.warning(f"Invalid subtype {subType}. Skipping")
                continue
        except Exception as e:
            log.warning(
                f"Unable to map {customProperties} to key due to exception {e}"
            )
            continue

        newKey.instance = instance

        log.debug(
            f"Container key migration: {container['urn']} -> urn:li:container:{newKey.guid()}"
        )

        src_urn = container["urn"]
        dst_urn = f"urn:li:container:{newKey.guid()}"
        container_id_map[src_urn] = dst_urn

        # Clone aspects of container with the new urn
        for mcp in migration_utils.clone_aspect(
                src_urn,
                aspect_names=migration_utils.all_aspects,
                entity_type="container",
                dst_urn=dst_urn,
                dry_run=dry_run,
                run_id=run_id,
        ):
            migration_report.on_entity_create(mcp.entityUrn,
                                              mcp.aspectName)  # type: ignore
            assert mcp.aspect
            # Update containerProperties to reflect the new key
            if mcp.aspectName == "containerProperties":
                assert isinstance(mcp.aspect, ContainerPropertiesClass)
                containerProperties: ContainerPropertiesClass = mcp.aspect
                containerProperties.customProperties = newKey.dict(
                    by_alias=True, exclude_none=True)
                mcp.aspect = containerProperties
            elif mcp.aspectName == "containerKey":
                assert isinstance(mcp.aspect, ContainerKeyClass)
                containerKey: ContainerKeyClass = mcp.aspect
                containerKey.guid = newKey.guid()
                mcp.aspect = containerKey
            if not dry_run:
                rest_emitter.emit_mcp(mcp)
                migration_report.on_entity_affected(
                    mcp.entityUrn, mcp.aspectName)  # type: ignore

        process_container_relationships(
            container_id_map=container_id_map,
            dry_run=dry_run,
            src_urn=src_urn,
            dst_urn=dst_urn,
            migration_report=migration_report,
            rest_emitter=rest_emitter,
        )

        if not dry_run and not keep:
            log.info(f"will {'hard' if hard else 'soft'} delete {src_urn}")
            delete_cli._delete_one_urn(src_urn,
                                       soft=not hard,
                                       run_id=run_id,
                                       entity_type="container")
        migration_report.on_entity_migrated(src_urn, "status")  # type: ignore

    print(f"{migration_report}")
Example #10
    builder.make_data_job_urn(orchestrator="airflow",
                              flow_id="flow1",
                              job_id="job0",
                              cluster="PROD")
]

datajob_input_output = DataJobInputOutputClass(
    inputDatasets=input_datasets,
    outputDatasets=output_datasets,
    inputDatajobs=input_data_jobs,
)

# Construct a MetadataChangeProposalWrapper object.
# NOTE: This will overwrite all of the existing lineage information associated with this job.
datajob_input_output_mcp = MetadataChangeProposalWrapper(
    entityType="datajob",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn=builder.make_data_job_urn(orchestrator="airflow",
                                        flow_id="flow1",
                                        job_id="job1",
                                        cluster="PROD"),
    aspectName="dataJobInputOutput",
    aspect=datajob_input_output,
)

# Create an emitter to the GMS REST API.
emitter = DatahubRestEmitter("http://localhost:8080")

# Emit metadata!
emitter.emit_mcp(datajob_input_output_mcp)