Example #1
    def get_data_platform_instance(self) -> DataPlatformInstanceClass:
        return DataPlatformInstanceClass(
            platform=make_data_platform_urn(self.platform),
            instance=make_dataplatform_instance_urn(
                self.platform, self.source_config.platform_instance)
            if self.source_config.platform_instance else None,
        )
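
A minimal standalone sketch of emitting the same aspect outside a source class; the
platform "mysql", instance "prod_cluster", dataset name, and GMS address below are
assumptions for illustration, not values taken from the snippet above.

from datahub.emitter.mce_builder import (
    make_data_platform_urn,
    make_dataplatform_instance_urn,
    make_dataset_urn,
)
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DatahubRestEmitter
from datahub.metadata.schema_classes import (
    ChangeTypeClass,
    DataPlatformInstanceClass,
)

emitter = DatahubRestEmitter(gms_server="http://localhost:8080")  # assumed local GMS
emitter.emit_mcp(
    MetadataChangeProposalWrapper(
        entityType="dataset",
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=make_dataset_urn("mysql", "db.orders", "PROD"),  # assumed dataset
        aspectName="dataPlatformInstance",
        aspect=DataPlatformInstanceClass(
            platform=make_data_platform_urn("mysql"),
            instance=make_dataplatform_instance_urn("mysql", "prod_cluster"),
        ),
    )
)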
Example #2
    def construct_lineage_workunits(
            self, connector: ConnectorManifest) -> Iterable[MetadataWorkUnit]:

        lineages = connector.lineages
        if lineages:
            for lineage in lineages:
                source_dataset = lineage.source_dataset
                source_platform = lineage.source_platform
                target_dataset = lineage.target_dataset
                target_platform = lineage.target_platform

                mcp = MetadataChangeProposalWrapper(
                    entityType="dataset",
                    entityUrn=builder.make_dataset_urn(target_platform,
                                                       target_dataset,
                                                       self.config.env),
                    changeType=models.ChangeTypeClass.UPSERT,
                    aspectName="dataPlatformInstance",
                    aspect=models.DataPlatformInstanceClass(
                        platform=builder.make_data_platform_urn(
                            target_platform)),
                )

                wu = MetadataWorkUnit(id=target_dataset, mcp=mcp)
                self.report.report_workunit(wu)
                yield wu
                if source_dataset:
                    mcp = MetadataChangeProposalWrapper(
                        entityType="dataset",
                        entityUrn=builder.make_dataset_urn(
                            source_platform, source_dataset, self.config.env),
                        changeType=models.ChangeTypeClass.UPSERT,
                        aspectName="dataPlatformInstance",
                        aspect=models.DataPlatformInstanceClass(
                            platform=builder.make_data_platform_urn(
                                source_platform)),
                    )

                    wu = MetadataWorkUnit(id=source_dataset, mcp=mcp)
                    self.report.report_workunit(wu)
                    yield wu
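
Example #2 emits only dataPlatformInstance aspects for the two endpoints; for
context, a hedged sketch of the companion upstreamLineage aspect that lineage
workunits typically carry. The platform and dataset names are assumptions.

import datahub.emitter.mce_builder as builder
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import (
    ChangeTypeClass,
    DatasetLineageTypeClass,
    UpstreamClass,
    UpstreamLineageClass,
)

lineage_mcp = MetadataChangeProposalWrapper(
    entityType="dataset",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn=builder.make_dataset_urn("kafka", "target_topic", "PROD"),
    aspectName="upstreamLineage",
    aspect=UpstreamLineageClass(upstreams=[
        UpstreamClass(
            dataset=builder.make_dataset_urn("postgres", "db.source_table", "PROD"),
            type=DatasetLineageTypeClass.TRANSFORMED,
        )
    ]),
)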
Example #3
    def ingest_table(self,
                     table_data: TableData) -> Iterable[MetadataWorkUnit]:

        logger.info(
            f"Extracting table schema from file: {table_data.full_path}")
        browse_path: str = (strip_s3_prefix(table_data.table_path)
                            if table_data.is_s3 else
                            table_data.table_path.strip("/"))

        data_platform_urn = make_data_platform_urn(self.source_config.platform)
        logger.info(f"Creating dataset urn with name: {browse_path}")
        dataset_urn = make_dataset_urn_with_platform_instance(
            self.source_config.platform,
            browse_path,
            self.source_config.platform_instance,
            self.source_config.env,
        )

        dataset_snapshot = DatasetSnapshot(
            urn=dataset_urn,
            aspects=[],
        )

        dataset_properties = DatasetPropertiesClass(
            description="",
            name=table_data.display_name,
            customProperties={},
        )
        dataset_snapshot.aspects.append(dataset_properties)

        fields = self.get_fields(table_data)
        schema_metadata = SchemaMetadata(
            schemaName=table_data.display_name,
            platform=data_platform_urn,
            version=0,
            hash="",
            fields=fields,
            platformSchema=OtherSchemaClass(rawSchema=""),
        )
        dataset_snapshot.aspects.append(schema_metadata)

        mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
        wu = MetadataWorkUnit(id=table_data.table_path, mce=mce)
        self.report.report_workunit(wu)
        yield wu

        yield from self.create_container_hierarchy(table_data, dataset_urn)

        if self.source_config.profiling.enabled:
            yield from self.get_table_profile(table_data, dataset_urn)
Example #4
    def construct_dataset_workunits(
        self,
        dataset_platform: str,
        dataset_name: str,
        dataset_urn: Optional[str] = None,
        external_url: Optional[str] = None,
        datasetProperties: Optional[Dict[str, str]] = None,
    ) -> Iterable[MetadataWorkUnit]:

        if not dataset_urn:
            dataset_urn = builder.make_dataset_urn(
                dataset_platform, dataset_name, self.config.env
            )

        mcp = MetadataChangeProposalWrapper(
            entityType="dataset",
            entityUrn=dataset_urn,
            changeType=ChangeTypeClass.UPSERT,
            aspectName="dataPlatformInstance",
            aspect=DataPlatformInstanceClass(
                platform=builder.make_data_platform_urn(dataset_platform)
            ),
        )
        platform = (
            dataset_platform[dataset_platform.rindex(":") + 1 :]
            if dataset_platform.startswith("urn:")
            else dataset_platform
        )
        wu = MetadataWorkUnit(id=f"{platform}.{dataset_name}.{mcp.aspectName}", mcp=mcp)
        if wu.id not in self.report.workunit_ids:
            self.report.report_workunit(wu)
            yield wu

        mcp = MetadataChangeProposalWrapper(
            entityType="dataset",
            entityUrn=dataset_urn,
            changeType=ChangeTypeClass.UPSERT,
            aspectName="datasetProperties",
            aspect=DatasetPropertiesClass(
                externalUrl=external_url, customProperties=datasetProperties
            ),
        )

        wu = MetadataWorkUnit(id=f"{platform}.{dataset_name}.{mcp.aspectName}", mcp=mcp)
        if wu.id not in self.report.workunit_ids:
            self.report.report_workunit(wu)
            yield wu
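
The wu.id deduplication above assumes the report tracks every emitted workunit id;
a minimal hypothetical sketch of that bookkeeping (the real report class is richer):

from dataclasses import dataclass, field
from typing import List

@dataclass
class SimpleSourceReport:
    workunit_ids: List[str] = field(default_factory=list)

    def report_workunit(self, wu) -> None:
        # record the id so later duplicates can be skipped
        self.workunit_ids.append(wu.id)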
Example #5
    def get_dataplatform_instance_aspect(
            self, dataset_urn: str) -> Optional[SqlWorkUnit]:
        # If we are a platform instance based source, emit the instance aspect
        if self.config.platform_instance:
            mcp = MetadataChangeProposalWrapper(
                entityType="dataset",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=dataset_urn,
                aspectName="dataPlatformInstance",
                aspect=DataPlatformInstanceClass(
                    platform=make_data_platform_urn(self.platform),
                    instance=make_dataplatform_instance_urn(
                        self.platform, self.config.platform_instance),
                ),
            )
            wu = SqlWorkUnit(id=f"{dataset_urn}-dataPlatformInstance", mcp=mcp)
            self.report.report_workunit(wu)
            return wu
        else:
            return None
Example #6
def get_schema_metadata(
    sql_report: SQLSourceReport,
    dataset_name: str,
    platform: str,
    columns: List[dict],
    pk_constraints: Optional[dict] = None,
    foreign_keys: Optional[List[ForeignKeyConstraint]] = None,
    canonical_schema: List[SchemaField] = [],
) -> SchemaMetadata:
    schema_metadata = SchemaMetadata(
        schemaName=dataset_name,
        platform=make_data_platform_urn(platform),
        version=0,
        hash="",
        platformSchema=MySqlDDL(tableSchema=""),
        fields=canonical_schema,
    )
    if foreign_keys:
        schema_metadata.foreignKeys = foreign_keys

    return schema_metadata
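
A hedged usage sketch of get_schema_metadata; the report construction and the
single field definition below are assumptions for illustration.

from datahub.metadata.schema_classes import (
    NumberTypeClass,
    SchemaFieldClass,
    SchemaFieldDataTypeClass,
)

schema_metadata = get_schema_metadata(
    sql_report=SQLSourceReport(),  # assumed to be default-constructible
    dataset_name="db.table",
    platform="mysql",
    columns=[],
    canonical_schema=[
        SchemaFieldClass(
            fieldPath="id",
            type=SchemaFieldDataTypeClass(type=NumberTypeClass()),
            nativeDataType="BIGINT",
        )
    ],
)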
Example #7
    def get_table_schema(self, file_path: str, table_name: str,
                         is_aws: bool) -> Iterable[MetadataWorkUnit]:

        data_platform_urn = make_data_platform_urn(self.source_config.platform)
        dataset_urn = make_dataset_urn(self.source_config.platform, table_name,
                                       self.source_config.env)

        dataset_name = os.path.basename(file_path)

        dataset_snapshot = DatasetSnapshot(
            urn=dataset_urn,
            aspects=[],
        )

        dataset_properties = DatasetPropertiesClass(
            description="",
            customProperties={},
        )
        dataset_snapshot.aspects.append(dataset_properties)

        if is_aws:
            if self.source_config.aws_config is None:
                raise ValueError("AWS config is required for S3 file sources")

            s3_client = self.source_config.aws_config.get_s3_client()

            file = smart_open(f"s3://{file_path}",
                              "rb",
                              transport_params={"client": s3_client})

        else:

            file = open(file_path, "rb")

        fields = []

        try:
            if file_path.endswith(".parquet"):
                fields = parquet.ParquetInferrer().infer_schema(file)
            elif file_path.endswith(".csv"):
                fields = csv_tsv.CsvInferrer(
                    max_rows=self.source_config.max_rows).infer_schema(file)
            elif file_path.endswith(".tsv"):
                fields = csv_tsv.TsvInferrer(
                    max_rows=self.source_config.max_rows).infer_schema(file)
            elif file_path.endswith(".json"):
                fields = json.JsonInferrer().infer_schema(file)
            elif file_path.endswith(".avro"):
                fields = avro.AvroInferrer().infer_schema(file)
            else:
                self.report.report_warning(
                    file_path, f"file {file_path} has unsupported extension")
        except Exception as e:
            self.report.report_warning(
                file_path, f"could not infer schema for file {file_path}: {e}")
        finally:
            file.close()

        fields = sorted(fields, key=lambda f: f.fieldPath)
        schema_metadata = SchemaMetadata(
            schemaName=dataset_name,
            platform=data_platform_urn,
            version=0,
            hash="",
            fields=fields,
            platformSchema=OtherSchemaClass(rawSchema=""),
        )

        dataset_snapshot.aspects.append(schema_metadata)

        mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
        wu = MetadataWorkUnit(id=file_path, mce=mce)
        self.report.report_workunit(wu)
        yield wu
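
The extension dispatch above could equally be table-driven, which keeps the try
block short as formats are added; a sketch reusing the snippet's inferrer modules,
assuming they all expose the infer_schema(file) interface shown above. The helper
name is hypothetical.

import os

_INFERRERS = {
    ".parquet": lambda cfg: parquet.ParquetInferrer(),
    ".csv": lambda cfg: csv_tsv.CsvInferrer(max_rows=cfg.max_rows),
    ".tsv": lambda cfg: csv_tsv.TsvInferrer(max_rows=cfg.max_rows),
    ".json": lambda cfg: json.JsonInferrer(),
    ".avro": lambda cfg: avro.AvroInferrer(),
}

def infer_fields(file, file_path, source_config):  # hypothetical helper
    factory = _INFERRERS.get(os.path.splitext(file_path)[1])
    if factory is None:
        return []
    return factory(source_config).infer_schema(file)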
Example #8
# Construct a MetadataChangeProposalWrapper object.
assertion_maxVal_mcp = MetadataChangeProposalWrapper(
    entityType="assertion",
    changeType=ChangeType.UPSERT,
    entityUrn=assertionUrn(assertion_maxVal),
    aspectName="assertionInfo",
    aspect=assertion_maxVal,
)

# Emit Assertion entity info aspect!
emitter.emit_mcp(assertion_maxVal_mcp)

# Construct an assertion platform object.
assertion_dataPlatformInstance = DataPlatformInstance(
    platform=builder.make_data_platform_urn("great-expectations")
)

# Construct a MetadataChangeProposalWrapper object for assertion platform
assertion_dataPlatformInstance_mcp = MetadataChangeProposalWrapper(
    entityType="assertion",
    changeType=ChangeType.UPSERT,
    entityUrn=assertionUrn(assertion_maxVal),
    aspectName="dataPlatformInstance",
    aspect=assertion_dataPlatformInstance,
)
# Emit Assertion entity platform aspect!
emitter.emit(assertion_dataPlatformInstance_mcp)


Example #9
from datahub.emitter.mce_builder import make_data_platform_urn, make_dataset_urn
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import (
    ChangeTypeClass,
    OtherSchemaClass,
    SchemaFieldClass,
    SchemaFieldDataTypeClass,
    SchemaMetadataClass,
    StringTypeClass,
)

event: MetadataChangeProposalWrapper = MetadataChangeProposalWrapper(
    entityType="dataset",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn=make_dataset_urn(platform="hive",
                               name="realestate_db.sales",
                               env="PROD"),
    aspectName="schemaMetadata",
    aspect=SchemaMetadataClass(
        schemaName="customer",  # not used
        platform=make_data_platform_urn("hive"),  # important <- platform must be an urn
        version=0,  # when the source system has a notion of schema versioning, insert it here; otherwise leave as 0
        hash="",  # when the source system identifies schemas via a unique hash, include it; otherwise leave as an empty string
        platformSchema=OtherSchemaClass(rawSchema="__insert raw schema here__"),
        fields=[
            SchemaFieldClass(
                fieldPath="address.zipcode",
                type=SchemaFieldDataTypeClass(type=StringTypeClass()),
                nativeDataType="VARCHAR(50)",  # the field's type in the source system's vernacular
                description="This is the zipcode of the address. Specified using extended form and limited to addresses in the United States",
            ),
        ],
    ),
)
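
A short follow-on sketch: emitting the event constructed above through the REST
emitter (the server address is an assumption).

from datahub.emitter.rest_emitter import DatahubRestEmitter

emitter = DatahubRestEmitter(gms_server="http://localhost:8080")
emitter.emit(event)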
Example #10
    def ingest_table(self, table_data: TableData,
                     path_spec: PathSpec) -> Iterable[MetadataWorkUnit]:

        logger.info(
            f"Extracting table schema from file: {table_data.full_path}")
        browse_path: str = (strip_s3_prefix(table_data.table_path)
                            if table_data.is_s3 else
                            table_data.table_path.strip("/"))

        data_platform_urn = make_data_platform_urn(self.source_config.platform)
        logger.info(f"Creating dataset urn with name: {browse_path}")
        dataset_urn = make_dataset_urn_with_platform_instance(
            self.source_config.platform,
            browse_path,
            self.source_config.platform_instance,
            self.source_config.env,
        )

        dataset_snapshot = DatasetSnapshot(
            urn=dataset_urn,
            aspects=[],
        )

        dataset_properties = DatasetPropertiesClass(
            description="",
            name=table_data.display_name,
            customProperties={
                "number_of_files": str(table_data.number_of_files),
                "size_in_bytes": str(table_data.size_in_bytes),
            },
        )
        dataset_snapshot.aspects.append(dataset_properties)

        fields = self.get_fields(table_data, path_spec)
        schema_metadata = SchemaMetadata(
            schemaName=table_data.display_name,
            platform=data_platform_urn,
            version=0,
            hash="",
            fields=fields,
            platformSchema=OtherSchemaClass(rawSchema=""),
        )
        dataset_snapshot.aspects.append(schema_metadata)
        if (self.source_config.use_s3_bucket_tags
                or self.source_config.use_s3_object_tags):
            bucket = get_bucket_name(table_data.table_path)
            key_prefix = (get_key_prefix(table_data.table_path)
                          if table_data.full_path == table_data.table_path else
                          None)
            s3_tags = self.get_s3_tags(bucket, key_prefix, dataset_urn)
            if s3_tags is not None:
                dataset_snapshot.aspects.append(s3_tags)

        mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
        wu = MetadataWorkUnit(id=table_data.table_path, mce=mce)
        self.report.report_workunit(wu)
        yield wu

        yield from self.create_container_hierarchy(table_data, dataset_urn)

        if self.source_config.profiling.enabled:
            yield from self.get_table_profile(table_data, dataset_urn)
Example #11
    def get_assertions_with_results(
        self,
        validation_result_suite,
        expectation_suite_name,
        run_id,
        payload,
        datasets,
    ):

        dataPlatformInstance = DataPlatformInstance(
            platform=builder.make_data_platform_urn(GE_PLATFORM_NAME)
        )
        docs_link = None
        if payload:
            # process the payload
            for action_name, action in payload.items():
                if action["class"] == "UpdateDataDocsAction":
                    data_docs_pages = action
                    for docs_link_key, docs_link_val in data_docs_pages.items():
                        if "file://" not in docs_link_val and docs_link_key != "class":
                            docs_link = docs_link_val

        assertions_with_results = []
        for result in validation_result_suite.results:
            expectation_config = result["expectation_config"]
            expectation_type = expectation_config["expectation_type"]
            success = bool(result["success"])
            kwargs = {
                k: v for k, v in expectation_config["kwargs"].items() if k != "batch_id"
            }

            result = result["result"]
            assertion_datasets = [d["dataset_urn"] for d in datasets]
            if len(datasets) == 1 and "column" in kwargs:
                assertion_fields = [
                    builder.make_schema_field_urn(
                        datasets[0]["dataset_urn"], kwargs["column"]
                    )
                ]
            else:
                assertion_fields = None  # type:ignore

            # Be careful what fields to consider for creating assertion urn.
            # Any change in fields below would lead to a new assertion
            # FIXME - Currently, when using evaluation parameters, new assertion is
            # created when runtime resolved kwargs are different,
            # possibly for each validation run
            assertionUrn = builder.make_assertion_urn(
                builder.datahub_guid(
                    {
                        "platform": GE_PLATFORM_NAME,
                        "nativeType": expectation_type,
                        "nativeParameters": kwargs,
                        "dataset": assertion_datasets[0],
                        "fields": assertion_fields,
                    }
                )
            )
            assertionInfo: AssertionInfo = self.get_assertion_info(
                expectation_type,
                kwargs,
                assertion_datasets[0],
                assertion_fields,
                expectation_suite_name,
            )

            # TODO: Understand why their run time is incorrect.
            run_time = run_id.run_time.astimezone(timezone.utc)
            assertionResults = []

            evaluation_parameters = (
                {
                    k: convert_to_string(v)
                    for k, v in validation_result_suite.evaluation_parameters.items()
                }
                if validation_result_suite.evaluation_parameters
                else None
            )

            nativeResults = {
                k: convert_to_string(v)
                for k, v in result.items()
                if (
                    k
                    in [
                        "observed_value",
                        "partial_unexpected_list",
                        "partial_unexpected_counts",
                        "details",
                    ]
                    and v
                )
            }

            actualAggValue = (
                result.get("observed_value")
                if isinstance(result.get("observed_value"), (int, float))
                else None
            )

            ds = datasets[0]
            # https://docs.greatexpectations.io/docs/reference/expectations/result_format/
            assertionResult = AssertionRunEvent(
                timestampMillis=int(round(time.time() * 1000)),
                assertionUrn=assertionUrn,
                asserteeUrn=ds["dataset_urn"],
                runId=run_time.strftime("%Y-%m-%dT%H:%M:%SZ"),
                result=AssertionResult(
                    type=AssertionResultType.SUCCESS
                    if success
                    else AssertionResultType.FAILURE,
                    rowCount=result.get("element_count"),
                    missingCount=result.get("missing_count"),
                    unexpectedCount=result.get("unexpected_count"),
                    actualAggValue=actualAggValue,
                    externalUrl=docs_link,
                    nativeResults=nativeResults,
                ),
                batchSpec=ds["batchSpec"],
                status=AssertionRunStatus.COMPLETE,
                runtimeContext=evaluation_parameters,
            )
            if ds.get("partitionSpec") is not None:
                assertionResult.partitionSpec = ds.get("partitionSpec")
            assertionResults.append(assertionResult)

            assertions_with_results.append(
                {
                    "assertionUrn": assertionUrn,
                    "assertionInfo": assertionInfo,
                    "assertionPlatform": dataPlatformInstance,
                    "assertionResults": assertionResults,
                }
            )
        return assertions_with_results
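
convert_to_string above is a helper from the surrounding integration; a plausible
sketch of its behavior (an assumption, not the verified implementation):

import json
from typing import Any

def convert_to_string(var: Any) -> str:
    try:
        # keep scalars readable, JSON-encode anything structured
        return str(var) if isinstance(var, (str, int, float)) else json.dumps(var)
    except TypeError:
        return str(var)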
Example #12
class PowerBiDashboardSourceConfig(PowerBiAPIConfig):
    platform_name: str = "powerbi"
    platform_urn: str = builder.make_data_platform_urn(platform=platform_name)
    dashboard_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    chart_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
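
The platform_urn default works because class-level defaults are evaluated once at
class definition time, when platform_name is already bound in the class body; a
self-contained illustration with pydantic:

from pydantic import BaseModel

class ExampleConfig(BaseModel):
    platform_name: str = "powerbi"
    # evaluated at class creation, using the class-body binding above
    platform_urn: str = "urn:li:dataPlatform:" + platform_name

assert ExampleConfig().platform_urn == "urn:li:dataPlatform:powerbi"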
Example #13
    def _extract_record(
            self, topic: str) -> Iterable[MetadataWorkUnit]:  # noqa: C901
        logger.debug(f"topic = {topic}")

        # 1. Create the default dataset snapshot for the topic.
        dataset_name = topic
        platform_urn = make_data_platform_urn(self.platform)
        dataset_urn = make_dataset_urn_with_platform_instance(
            platform=self.platform,
            name=dataset_name,
            platform_instance=self.source_config.platform_instance,
            env=self.source_config.env,
        )
        dataset_snapshot = DatasetSnapshot(
            urn=dataset_urn,
            aspects=[Status(removed=False)],  # we append to this list later on
        )

        # 2. Attach schemaMetadata aspect (pass control to SchemaRegistry)
        schema_metadata = self.schema_registry_client.get_schema_metadata(
            topic, platform_urn)
        if schema_metadata is not None:
            dataset_snapshot.aspects.append(schema_metadata)

        # 3. Attach browsePaths aspect
        browse_path_suffix = (f"{self.source_config.platform_instance}/{topic}"
                              if self.source_config.platform_instance else
                              topic)
        browse_path = BrowsePathsClass([
            f"/{self.source_config.env.lower()}/{self.platform}/{browse_path_suffix}"
        ])
        dataset_snapshot.aspects.append(browse_path)

        # 4. Attach dataPlatformInstance aspect.
        if self.source_config.platform_instance:
            dataset_snapshot.aspects.append(
                DataPlatformInstanceClass(
                    platform=platform_urn,
                    instance=make_dataplatform_instance_urn(
                        self.platform, self.source_config.platform_instance),
                ))

        # 5. Emit the datasetSnapshot MCE
        mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
        wu = MetadataWorkUnit(id=f"kafka-{topic}", mce=mce)
        self.report.report_workunit(wu)
        yield wu

        # 6. Add the subtype aspect marking this as a "topic"
        subtype_wu = MetadataWorkUnit(
            id=f"{topic}-subtype",
            mcp=MetadataChangeProposalWrapper(
                entityType="dataset",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=dataset_urn,
                aspectName="subTypes",
                aspect=SubTypesClass(typeNames=["topic"]),
            ),
        )
        self.report.report_workunit(subtype_wu)
        yield subtype_wu

        domain_urn: Optional[str] = None

        # 7. Emit domains aspect MCPW
        for domain, pattern in self.source_config.domain.items():
            if pattern.allowed(dataset_name):
                domain_urn = make_domain_urn(domain)

        if domain_urn:
            wus = add_domain_to_entity_wu(
                entity_type="dataset",
                entity_urn=dataset_urn,
                domain_urn=domain_urn,
            )
            for wu in wus:
                self.report.report_workunit(wu)
                yield wu
Example #14
def dataplatform2instance_func(
    instance: str,
    platform: str,
    dry_run: bool,
    env: str,
    force: bool,
    hard: bool,
    keep: bool,
) -> None:
    click.echo(
        f"Starting migration: platform:{platform}, instance={instance}, force={force}, dry-run={dry_run}"
    )
    run_id: str = f"migrate-{uuid.uuid4()}"
    migration_report = MigrationReport(run_id, dry_run, keep)
    system_metadata = SystemMetadataClass(runId=run_id)

    all_aspects = [
        "schemaMetadata",
        "datasetProperties",
        "viewProperties",
        "subTypes",
        "editableDatasetProperties",
        "ownership",
        "datasetDeprecation",
        "institutionalMemory",
        "editableSchemaMetadata",
        "globalTags",
        "glossaryTerms",
        "upstreamLineage",
        "datasetUpstreamLineage",
        "status",
    ]

    if not dry_run:
        rest_emitter = DatahubRestEmitter(
            gms_server=cli_utils.get_session_and_host()[1]
        )

    urns_to_migrate = []
    # we first calculate all the urns we will be migrating
    for src_entity_urn in cli_utils.get_urns_by_filter(platform=platform, env=env):
        key = dataset_urn_to_key(src_entity_urn)
        assert key
        # Does this urn already have a platform instance associated with it?
        response = cli_utils.get_aspects_for_entity(
            entity_urn=src_entity_urn, aspects=["dataPlatformInstance"], typed=True
        )
        if "dataPlatformInstance" in response:
            assert isinstance(
                response["dataPlatformInstance"], DataPlatformInstanceClass
            )
            data_platform_instance: DataPlatformInstanceClass = response[
                "dataPlatformInstance"
            ]
            if data_platform_instance.instance:
                log.debug("This is already an instance-specific urn, will skip")
                continue
            else:
                log.debug(
                    f"{src_entity_urn} is not an instance specific urn. {response}"
                )
                urns_to_migrate.append(src_entity_urn)

    if not force and not dry_run:
        # get a confirmation from the operator before proceeding if this is not a dry run
        sampled_urns_to_migrate = random.sample(
            urns_to_migrate, k=min(10, len(urns_to_migrate))
        )
        sampled_new_urns: List[str] = [
            make_dataset_urn_with_platform_instance(
                platform=key.platform,
                name=key.name,
                platform_instance=instance,
                env=str(key.origin),
            )
            for key in [dataset_urn_to_key(x) for x in sampled_urns_to_migrate]
            if key
        ]
        click.echo(
            f"Will migrate {len(urns_to_migrate)} urns such as {sampled_urns_to_migrate}"
        )
        click.echo(f"New urns will look like {sampled_new_urns}")
        click.confirm("Ok to proceed?", abort=True)

    for src_entity_urn in progressbar.progressbar(
        urns_to_migrate, redirect_stdout=True
    ):
        key = dataset_urn_to_key(src_entity_urn)
        assert key
        new_urn = make_dataset_urn_with_platform_instance(
            platform=key.platform,
            name=key.name,
            platform_instance=instance,
            env=str(key.origin),
        )
        log.debug(f"Will migrate {src_entity_urn} to {new_urn}")
        relationships = migration_utils.get_incoming_relationships_dataset(
            src_entity_urn
        )

        for mcp in migration_utils.clone_aspect(
            src_entity_urn,
            aspect_names=all_aspects,
            dst_urn=new_urn,
            dry_run=dry_run,
            run_id=run_id,
        ):
            if not dry_run:
                rest_emitter.emit_mcp(mcp)
            migration_report.on_entity_create(mcp.entityUrn, mcp.aspectName)  # type: ignore

        if not dry_run:
            rest_emitter.emit_mcp(
                MetadataChangeProposalWrapper(
                    entityType="dataset",
                    changeType=ChangeTypeClass.UPSERT,
                    entityUrn=new_urn,
                    aspectName="dataPlatformInstance",
                    aspect=DataPlatformInstanceClass(
                        platform=make_data_platform_urn(platform),
                        instance=make_dataplatform_instance_urn(platform, instance),
                    ),
                    systemMetadata=system_metadata,
                )
            )
        migration_report.on_entity_create(new_urn, "dataPlatformInstance")

        for relationship in relationships:
            target_urn = relationship["entity"]
            entity_type = _get_type_from_urn(target_urn)
            relationshipType = relationship["type"]
            aspect_name = (
                migration_utils.get_aspect_name_from_relationship_type_and_entity(
                    relationshipType, entity_type
                )
            )
            aspect_map = cli_utils.get_aspects_for_entity(
                target_urn, aspects=[aspect_name], typed=True
            )
            if aspect_name in aspect_map:
                aspect = aspect_map[aspect_name]
                assert isinstance(aspect, DictWrapper)
                aspect = migration_utils.modify_urn_list_for_aspect(
                    aspect_name, aspect, relationshipType, src_entity_urn, new_urn
                )
                # use mcpw
                mcp = MetadataChangeProposalWrapper(
                    entityType=entity_type,
                    changeType=ChangeTypeClass.UPSERT,
                    entityUrn=target_urn,
                    aspectName=aspect_name,
                    aspect=aspect,
                )
                if not dry_run:
                    rest_emitter.emit_mcp(mcp)
                migration_report.on_entity_affected(mcp.entityUrn, mcp.aspectName)  # type: ignore
            else:
                log.debug(f"Didn't find aspect {aspect_name} for urn {target_urn}")

        if not dry_run and not keep:
            log.info(f"will {'hard' if hard else 'soft'} delete {src_entity_urn}")
            delete_cli._delete_one_urn(src_entity_urn, soft=not hard, run_id=run_id)
        migration_report.on_entity_migrated(src_entity_urn, "status")  # type: ignore

    print(f"{migration_report}")
Example #15
    def _extract_mcps(self,
                      index: str) -> Iterable[MetadataChangeProposalWrapper]:
        logger.debug(f"index = {index}")
        raw_index = self.client.indices.get(index=index)
        raw_index_metadata = raw_index[index]

        # 0. Dedup data_streams.
        data_stream = raw_index_metadata.get("data_stream")
        if data_stream:
            index = data_stream
            self.data_stream_partition_count[index] += 1
            if self.data_stream_partition_count[index] > 1:
                # This is a duplicate, skip processing it further.
                return

        # 1. Construct and emit the schemaMetadata aspect
        # 1.1 Generate the schema fields from ES mappings.
        index_mappings = raw_index_metadata["mappings"]
        index_mappings_json_str: str = json.dumps(index_mappings)
        md5_hash = md5(index_mappings_json_str.encode()).hexdigest()
        schema_fields = list(
            ElasticToSchemaFieldConverter.get_schema_fields(index_mappings))

        # 1.2 Generate the SchemaMetadata aspect
        schema_metadata = SchemaMetadata(
            schemaName=index,
            platform=make_data_platform_urn(self.platform),
            version=0,
            hash=md5_hash,
            platformSchema=OtherSchemaClass(rawSchema=index_mappings_json_str),
            fields=schema_fields,
        )

        # 1.3 Emit the mcp
        dataset_urn: str = make_dataset_urn(self.platform, index,
                                            self.source_config.env)
        yield MetadataChangeProposalWrapper(
            entityType="dataset",
            entityUrn=dataset_urn,
            aspectName="schemaMetadata",
            aspect=schema_metadata,
            changeType=ChangeTypeClass.UPSERT,
        )

        # 2. Construct and emit the status aspect.
        yield MetadataChangeProposalWrapper(
            entityType="dataset",
            entityUrn=dataset_urn,
            aspectName="status",
            aspect=StatusClass(removed=False),
            changeType=ChangeTypeClass.UPSERT,
        )

        # 3. Construct and emit subtype
        yield MetadataChangeProposalWrapper(
            entityType="dataset",
            entityUrn=dataset_urn,
            aspectName="subTypes",
            aspect=SubTypesClass(
                typeNames=["Index" if not data_stream else "DataStream"]),
            changeType=ChangeTypeClass.UPSERT,
        )

        # 4. Construct and emit properties if needed
        index_aliases = raw_index_metadata.get("aliases", {}).keys()
        if index_aliases:
            yield MetadataChangeProposalWrapper(
                entityType="dataset",
                entityUrn=dataset_urn,
                aspectName="datasetProperties",
                aspect=DatasetPropertiesClass(
                    customProperties={"aliases": ",".join(index_aliases)}),
                changeType=ChangeTypeClass.UPSERT,
            )
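
The md5-over-mappings pattern above yields a stable, content-derived schema hash,
so unchanged mappings hash identically across runs; a self-contained illustration
with an assumed example mapping:

import json
from hashlib import md5

mappings = {"properties": {"field1": {"type": "keyword"}}}  # assumed mapping
schema_hash = md5(json.dumps(mappings).encode()).hexdigest()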
Example #16
    def _extract_record(self, topic: str,
                        partitioned: bool) -> Iterable[MetadataWorkUnit]:
        logger.info(f"topic = {topic}")

        # 1. Create and emit the default dataset for the topic. Extract type, tenant, namespace
        # and topic name from full Pulsar topic name i.e. persistent://tenant/namespace/topic
        pulsar_topic = PulsarTopic(topic)

        platform_urn = make_data_platform_urn(self.platform)
        dataset_urn = make_dataset_urn_with_platform_instance(
            platform=self.platform,
            name=pulsar_topic.fullname,
            platform_instance=self.config.platform_instance,
            env=self.config.env,
        )

        status_wu = MetadataWorkUnit(
            id=f"{dataset_urn}-status",
            mcp=MetadataChangeProposalWrapper(
                entityType="dataset",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=dataset_urn,
                aspectName="status",
                aspect=StatusClass(removed=False),
            ),
        )
        self.report.report_workunit(status_wu)
        yield status_wu

        # 2. Emit schemaMetadata aspect
        schema, schema_metadata = self._get_schema_metadata(
            pulsar_topic, platform_urn)
        if schema_metadata is not None:
            schema_metadata_wu = MetadataWorkUnit(
                id=f"{dataset_urn}-schemaMetadata",
                mcp=MetadataChangeProposalWrapper(
                    entityType="dataset",
                    changeType=ChangeTypeClass.UPSERT,
                    entityUrn=dataset_urn,
                    aspectName="schemaMetadata",
                    aspect=schema_metadata,
                ),
            )
            self.report.report_workunit(schema_metadata_wu)
            yield schema_metadata_wu

        # TODO Add topic properties (Pulsar 2.10.0 feature)
        # 3. Construct and emit dataset properties aspect
        if schema is not None:
            schema_properties = {
                "schema_version": str(schema.schema_version),
                "schema_type": schema.schema_type,
                "partitioned": str(partitioned).lower(),
            }
            # Add some static properties to the schema properties
            schema.properties.update(schema_properties)

            dataset_properties_wu = MetadataWorkUnit(
                id=f"{dataset_urn}-datasetProperties",
                mcp=MetadataChangeProposalWrapper(
                    entityType="dataset",
                    changeType=ChangeTypeClass.UPSERT,
                    entityUrn=dataset_urn,
                    aspectName="datasetProperties",
                    aspect=DatasetPropertiesClass(
                        description=schema.schema_description,
                        customProperties=schema.properties,
                    ),
                ),
            )
            self.report.report_workunit(dataset_properties_wu)
            yield dataset_properties_wu

        # 4. Emit browsePaths aspect
        pulsar_path = (
            f"{pulsar_topic.tenant}/{pulsar_topic.namespace}/{pulsar_topic.topic}"
        )
        browse_path_suffix = (f"{self.config.platform_instance}/{pulsar_path}"
                              if self.config.platform_instance else
                              pulsar_path)

        browse_path_wu = MetadataWorkUnit(
            id=f"{dataset_urn}-browsePaths",
            mcp=MetadataChangeProposalWrapper(
                entityType="dataset",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=dataset_urn,
                aspectName="browsePaths",
                aspect=BrowsePathsClass([
                    f"/{self.config.env.lower()}/{self.platform}/{browse_path_suffix}"
                ]),
            ),
        )
        self.report.report_workunit(browse_path_wu)
        yield browse_path_wu

        # 5. Emit dataPlatformInstance aspect.
        if self.config.platform_instance:
            platform_instance_wu = MetadataWorkUnit(
                id=f"{dataset_urn}-dataPlatformInstance",
                mcp=MetadataChangeProposalWrapper(
                    entityType="dataset",
                    changeType=ChangeTypeClass.UPSERT,
                    entityUrn=dataset_urn,
                    aspectName="dataPlatformInstance",
                    aspect=DataPlatformInstanceClass(
                        platform=platform_urn,
                        instance=make_dataplatform_instance_urn(
                            self.platform, self.config.platform_instance),
                    ),
                ),
            )
            self.report.report_workunit(platform_instance_wu)
            yield platform_instance_wu

        # 6. Emit subtype aspect marking this as a "topic"
        subtype_wu = MetadataWorkUnit(
            id=f"{dataset_urn}-subTypes",
            mcp=MetadataChangeProposalWrapper(
                entityType="dataset",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=dataset_urn,
                aspectName="subTypes",
                aspect=SubTypesClass(typeNames=["topic"]),
            ),
        )
        self.report.report_workunit(subtype_wu)
        yield subtype_wu

        # 7. Emit domains aspect
        domain_urn: Optional[str] = None
        for domain, pattern in self.config.domain.items():
            if pattern.allowed(pulsar_topic.fullname):
                domain_urn = make_domain_urn(domain)

        if domain_urn:
            wus = add_domain_to_entity_wu(
                entity_type="dataset",
                entity_urn=dataset_urn,
                domain_urn=domain_urn,
            )
            for wu in wus:
                self.report.report_workunit(wu)
                yield wu
Example #17
def generate_stitched_record(relnships_graph: RelationshipGraph) -> List[Any]:
    def strip_types(field_path: str) -> str:
        final_path = field_path
        final_path = re.sub(r"(\[type=[a-zA-Z]+\]\.)", "", final_path)
        final_path = re.sub(r"^\[version=2.0\]\.", "", final_path)
        return final_path

    datasets: List[DatasetSnapshotClass] = []

    for entity_name, entity_def in entity_registry.items():
        entity_display_name = entity_def.display_name
        entity_fields = []
        for aspect_name in entity_def.aspects:
            if aspect_name not in aspect_registry:
                print(
                    f"Did not find aspect name: {aspect_name} in aspect_registry"
                )
                continue

            # all aspects should have a schema
            aspect_schema = aspect_registry[aspect_name].schema
            assert aspect_schema
            entity_fields.append({
                "type": aspect_schema.to_json(),
                "name": aspect_name,
            })

        if entity_fields:
            names = avro.schema.Names()
            field_objects = []
            for f in entity_fields:
                field = avro.schema.Field(
                    type=f["type"],
                    name=f["name"],
                    has_default=False,
                )
                field_objects.append(field)

            with unittest.mock.patch("avro.schema.Names.add_name", add_name):
                entity_avro_schema = avro.schema.RecordSchema(
                    name=entity_name,
                    namespace="datahub.metadata.model",
                    names=names,
                    fields=[],
                )
                entity_avro_schema.set_prop("fields", field_objects)
            rawSchema = json.dumps(entity_avro_schema.to_json())
            # always add the URN which is the primary key
            urn_field = SchemaField(
                fieldPath="urn",
                type=SchemaFieldDataTypeClass(type=StringTypeClass()),
                nativeDataType="string",
                nullable=False,
                isPartOfKey=True,
                description=f"The primary identifier for the {entity_name} entity. See the {entity_def.keyAspect} field to understand the structure of this urn.",
            )
            schema_fields: List[SchemaField] = [
                urn_field
            ] + avro_schema_to_mce_fields(rawSchema)
            foreign_keys: List[ForeignKeyConstraintClass] = []
            source_dataset_urn = make_dataset_urn(
                platform=make_data_platform_urn("datahub"),
                name=f"{entity_display_name}",
            )
            for f_field in schema_fields:
                if f_field.jsonProps:
                    json_dict = json.loads(f_field.jsonProps)
                    if "Aspect" in json_dict:
                        aspect_info = json_dict["Aspect"]
                        f_field.globalTags = f_field.globalTags or GlobalTagsClass(
                            tags=[])
                        f_field.globalTags.tags.append(
                            TagAssociationClass(tag="urn:li:tag:Aspect"))
                        # if this is the key aspect, also add primary-key
                        if entity_def.keyAspect == aspect_info.get("name"):
                            f_field.isPartOfKey = True

                        if "timeseries" == aspect_info.get("type", ""):
                            # f_field.globalTags = f_field.globalTags or GlobalTagsClass(
                            #    tags=[]
                            # )
                            f_field.globalTags.tags.append(
                                TagAssociationClass(tag="urn:li:tag:Temporal"))
                        import pdb

                        # breakpoint()
                    if "Searchable" in json_dict:
                        f_field.globalTags = f_field.globalTags or GlobalTagsClass(
                            tags=[])
                        f_field.globalTags.tags.append(
                            TagAssociationClass(tag="urn:li:tag:Searchable"))
                    if "Relationship" in json_dict:
                        relationship_info = json_dict["Relationship"]
                        # detect if we have relationship specified at leaf level or thru path specs
                        if "entityTypes" not in relationship_info:
                            # path spec
                            assert (
                                len(relationship_info.keys()) == 1
                            ), "We should never have more than one path spec assigned to a relationship annotation"
                            final_info = None
                            for k, v in relationship_info.items():
                                final_info = v
                            relationship_info = final_info

                        assert "entityTypes" in relationship_info

                        entity_types: List[str] = relationship_info.get(
                            "entityTypes", [])
                        relnship_name = relationship_info.get("name", None)
                        for entity_type in entity_types:
                            destination_entity_name = capitalize_first(
                                entity_type)

                            foreign_dataset_urn = make_dataset_urn(
                                platform=make_data_platform_urn("datahub"),
                                name=destination_entity_name,
                            )
                            fkey = ForeignKeyConstraintClass(
                                name=relnship_name,
                                foreignDataset=foreign_dataset_urn,
                                foreignFields=[
                                    f"urn:li:schemaField:({foreign_dataset_urn}, urn)"
                                ],
                                sourceFields=[
                                    f"urn:li:schemaField:({source_dataset_urn},{f_field.fieldPath})"
                                ],
                            )
                            foreign_keys.append(fkey)
                            relnships_graph.add_edge(
                                entity_display_name,
                                destination_entity_name,
                                fkey.name,
                                f" via `{strip_types(f_field.fieldPath)}`",
                                edge_id=f"{entity_display_name}:{fkey.name}:{destination_entity_name}:{strip_types(f_field.fieldPath)}",
                            )

            schemaMetadata = SchemaMetadataClass(
                schemaName=f"{entity_name}",
                platform=make_data_platform_urn("datahub"),
                platformSchema=OtherSchemaClass(rawSchema=rawSchema),
                fields=schema_fields,
                version=0,
                hash="",
                foreignKeys=foreign_keys if foreign_keys else None,
            )

            dataset = DatasetSnapshotClass(
                urn=make_dataset_urn(
                    platform=make_data_platform_urn("datahub"),
                    name=f"{entity_display_name}",
                ),
                aspects=[
                    schemaMetadata,
                    GlobalTagsClass(
                        tags=[TagAssociationClass(tag="urn:li:tag:Entity")]),
                    BrowsePathsClass(
                        [f"/prod/datahub/entities/{entity_display_name}"]),
                ],
            )
            datasets.append(dataset)

    events: List[Union[MetadataChangeEventClass,
                       MetadataChangeProposalWrapper]] = []

    for d in datasets:
        entity_name = d.urn.split(":")[-1].split(",")[1]
        d.aspects.append(
            DatasetPropertiesClass(
                description=make_entity_docs(entity_name, relnships_graph)))

        mce = MetadataChangeEventClass(proposedSnapshot=d)
        events.append(mce)

        mcp = MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=d.urn,
            aspectName="subTypes",
            aspect=SubTypesClass(typeNames=["entity"]),
        )
        events.append(mcp)
    return events
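
A follow-on sketch: emitting the stitched events; the relnships_graph value and
the server address are assumptions.

from datahub.emitter.rest_emitter import DatahubRestEmitter

emitter = DatahubRestEmitter(gms_server="http://localhost:8080")
for event in generate_stitched_record(relnships_graph):
    emitter.emit(event)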