def get_data_platform_instance(self) -> DataPlatformInstanceClass:
    return DataPlatformInstanceClass(
        platform=make_data_platform_urn(self.platform),
        instance=make_dataplatform_instance_urn(
            self.platform, self.source_config.platform_instance
        )
        if self.source_config.platform_instance
        else None,
    )
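# For context, a minimal standalone sketch of the same aspect being constructed
# outside a source class, assuming the standard datahub.emitter helpers; the
# platform and instance names here are hypothetical.
from datahub.emitter.mce_builder import (
    make_data_platform_urn,
    make_dataplatform_instance_urn,
)
from datahub.metadata.schema_classes import DataPlatformInstanceClass

platform = "postgres"          # hypothetical platform name
platform_instance = "core_db"  # hypothetical instance name

aspect = DataPlatformInstanceClass(
    platform=make_data_platform_urn(platform),
    instance=make_dataplatform_instance_urn(platform, platform_instance)
    if platform_instance
    else None,
)
print(aspect.platform)  # urn:li:dataPlatform:postgres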
def construct_lineage_workunits(
    self, connector: ConnectorManifest
) -> Iterable[MetadataWorkUnit]:
    lineages = connector.lineages
    if lineages:
        for lineage in lineages:
            source_dataset = lineage.source_dataset
            source_platform = lineage.source_platform
            target_dataset = lineage.target_dataset
            target_platform = lineage.target_platform

            mcp = MetadataChangeProposalWrapper(
                entityType="dataset",
                entityUrn=builder.make_dataset_urn(
                    target_platform, target_dataset, self.config.env
                ),
                changeType=models.ChangeTypeClass.UPSERT,
                aspectName="dataPlatformInstance",
                aspect=models.DataPlatformInstanceClass(
                    platform=builder.make_data_platform_urn(target_platform)
                ),
            )

            wu = MetadataWorkUnit(id=target_dataset, mcp=mcp)
            self.report.report_workunit(wu)
            yield wu

            if source_dataset:
                mcp = MetadataChangeProposalWrapper(
                    entityType="dataset",
                    entityUrn=builder.make_dataset_urn(
                        source_platform, source_dataset, self.config.env
                    ),
                    changeType=models.ChangeTypeClass.UPSERT,
                    aspectName="dataPlatformInstance",
                    aspect=models.DataPlatformInstanceClass(
                        platform=builder.make_data_platform_urn(source_platform)
                    ),
                )
                wu = MetadataWorkUnit(id=source_dataset, mcp=mcp)
                self.report.report_workunit(wu)
                yield wu
def ingest_table(self, table_data: TableData) -> Iterable[MetadataWorkUnit]:
    logger.info(f"Extracting table schema from file: {table_data.full_path}")
    browse_path: str = (
        strip_s3_prefix(table_data.table_path)
        if table_data.is_s3
        else table_data.table_path.strip("/")
    )

    data_platform_urn = make_data_platform_urn(self.source_config.platform)
    logger.info(f"Creating dataset urn with name: {browse_path}")
    dataset_urn = make_dataset_urn_with_platform_instance(
        self.source_config.platform,
        browse_path,
        self.source_config.platform_instance,
        self.source_config.env,
    )

    dataset_snapshot = DatasetSnapshot(
        urn=dataset_urn,
        aspects=[],
    )

    dataset_properties = DatasetPropertiesClass(
        description="",
        name=table_data.display_name,
        customProperties={},
    )
    dataset_snapshot.aspects.append(dataset_properties)

    fields = self.get_fields(table_data)
    schema_metadata = SchemaMetadata(
        schemaName=table_data.display_name,
        platform=data_platform_urn,
        version=0,
        hash="",
        fields=fields,
        platformSchema=OtherSchemaClass(rawSchema=""),
    )
    dataset_snapshot.aspects.append(schema_metadata)

    mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
    wu = MetadataWorkUnit(id=table_data.table_path, mce=mce)
    self.report.report_workunit(wu)
    yield wu

    yield from self.create_container_hierarchy(table_data, dataset_urn)

    if self.source_config.profiling.enabled:
        yield from self.get_table_profile(table_data, dataset_urn)
def construct_dataset_workunits(
    self,
    dataset_platform: str,
    dataset_name: str,
    dataset_urn: Optional[str] = None,
    external_url: Optional[str] = None,
    datasetProperties: Optional[Dict[str, str]] = None,
) -> Iterable[MetadataWorkUnit]:
    if not dataset_urn:
        dataset_urn = builder.make_dataset_urn(
            dataset_platform, dataset_name, self.config.env
        )

    mcp = MetadataChangeProposalWrapper(
        entityType="dataset",
        entityUrn=dataset_urn,
        changeType=ChangeTypeClass.UPSERT,
        aspectName="dataPlatformInstance",
        aspect=DataPlatformInstanceClass(
            platform=builder.make_data_platform_urn(dataset_platform)
        ),
    )

    platform = (
        dataset_platform[dataset_platform.rindex(":") + 1 :]
        if dataset_platform.startswith("urn:")
        else dataset_platform
    )
    wu = MetadataWorkUnit(id=f"{platform}.{dataset_name}.{mcp.aspectName}", mcp=mcp)
    if wu.id not in self.report.workunit_ids:
        self.report.report_workunit(wu)
        yield wu

    mcp = MetadataChangeProposalWrapper(
        entityType="dataset",
        entityUrn=dataset_urn,
        changeType=ChangeTypeClass.UPSERT,
        aspectName="datasetProperties",
        aspect=DatasetPropertiesClass(
            externalUrl=external_url, customProperties=datasetProperties
        ),
    )

    wu = MetadataWorkUnit(id=f"{platform}.{dataset_name}.{mcp.aspectName}", mcp=mcp)
    if wu.id not in self.report.workunit_ids:
        self.report.report_workunit(wu)
        yield wu
def get_dataplatform_instance_aspect(
    self, dataset_urn: str
) -> Optional[SqlWorkUnit]:
    # If we are a platform instance based source, emit the instance aspect
    if self.config.platform_instance:
        mcp = MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=dataset_urn,
            aspectName="dataPlatformInstance",
            aspect=DataPlatformInstanceClass(
                platform=make_data_platform_urn(self.platform),
                instance=make_dataplatform_instance_urn(
                    self.platform, self.config.platform_instance
                ),
            ),
        )
        wu = SqlWorkUnit(id=f"{dataset_urn}-dataPlatformInstance", mcp=mcp)
        self.report.report_workunit(wu)
        return wu
    else:
        return None
def get_schema_metadata(
    sql_report: SQLSourceReport,
    dataset_name: str,
    platform: str,
    columns: List[dict],
    pk_constraints: Optional[dict] = None,
    foreign_keys: Optional[List[ForeignKeyConstraint]] = None,
    canonical_schema: List[SchemaField] = [],
) -> SchemaMetadata:
    schema_metadata = SchemaMetadata(
        schemaName=dataset_name,
        platform=make_data_platform_urn(platform),
        version=0,
        hash="",
        platformSchema=MySqlDDL(tableSchema=""),
        fields=canonical_schema,
    )
    if foreign_keys is not None and foreign_keys != []:
        schema_metadata.foreignKeys = foreign_keys

    return schema_metadata
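# A hedged usage sketch for the helper above: build the aspect for a
# hypothetical "db.sales" table with a single string column. The SQLSourceReport
# import path is an assumption based on the datahub source tree.
from datahub.ingestion.source.sql.sql_common import SQLSourceReport
from datahub.metadata.com.linkedin.pegasus2avro.schema import (
    SchemaField,
    SchemaFieldDataType,
    StringType,
)

field = SchemaField(
    fieldPath="customer_name",
    type=SchemaFieldDataType(type=StringType()),
    nativeDataType="VARCHAR(100)",
)
schema_metadata = get_schema_metadata(
    sql_report=SQLSourceReport(),
    dataset_name="db.sales",
    platform="mysql",
    columns=[],
    canonical_schema=[field],
)
assert schema_metadata.platform == "urn:li:dataPlatform:mysql"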
def get_table_schema(
    self, file_path: str, table_name: str, is_aws: bool
) -> Iterable[MetadataWorkUnit]:
    data_platform_urn = make_data_platform_urn(self.source_config.platform)
    dataset_urn = make_dataset_urn(
        self.source_config.platform, table_name, self.source_config.env
    )

    dataset_name = os.path.basename(file_path)

    dataset_snapshot = DatasetSnapshot(
        urn=dataset_urn,
        aspects=[],
    )

    dataset_properties = DatasetPropertiesClass(
        description="",
        customProperties={},
    )
    dataset_snapshot.aspects.append(dataset_properties)

    if is_aws:
        if self.source_config.aws_config is None:
            raise ValueError("AWS config is required for S3 file sources")

        s3_client = self.source_config.aws_config.get_s3_client()

        file = smart_open(
            f"s3://{file_path}", "rb", transport_params={"client": s3_client}
        )
    else:
        file = open(file_path, "rb")

    fields = []

    try:
        if file_path.endswith(".parquet"):
            fields = parquet.ParquetInferrer().infer_schema(file)
        elif file_path.endswith(".csv"):
            fields = csv_tsv.CsvInferrer(
                max_rows=self.source_config.max_rows
            ).infer_schema(file)
        elif file_path.endswith(".tsv"):
            fields = csv_tsv.TsvInferrer(
                max_rows=self.source_config.max_rows
            ).infer_schema(file)
        elif file_path.endswith(".json"):
            fields = json.JsonInferrer().infer_schema(file)
        elif file_path.endswith(".avro"):
            fields = avro.AvroInferrer().infer_schema(file)
        else:
            self.report.report_warning(
                file_path, f"file {file_path} has unsupported extension"
            )
        file.close()
    except Exception as e:
        self.report.report_warning(
            file_path, f"could not infer schema for file {file_path}: {e}"
        )
        file.close()

    fields = sorted(fields, key=lambda f: f.fieldPath)
    schema_metadata = SchemaMetadata(
        schemaName=dataset_name,
        platform=data_platform_urn,
        version=0,
        hash="",
        fields=fields,
        platformSchema=OtherSchemaClass(rawSchema=""),
    )
    dataset_snapshot.aspects.append(schema_metadata)

    mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
    wu = MetadataWorkUnit(id=file_path, mce=mce)
    self.report.report_workunit(wu)
    yield wu
# Construct a MetadataChangeProposalWrapper object.
assertion_maxVal_mcp = MetadataChangeProposalWrapper(
    entityType="assertion",
    changeType=ChangeType.UPSERT,
    entityUrn=assertionUrn(assertion_maxVal),
    aspectName="assertionInfo",
    aspect=assertion_maxVal,
)

# Emit Assertion entity info aspect!
emitter.emit_mcp(assertion_maxVal_mcp)

# Construct an assertion platform object.
assertion_dataPlatformInstance = DataPlatformInstance(
    platform=builder.make_data_platform_urn("great-expectations")
)

# Construct a MetadataChangeProposalWrapper object for assertion platform
assertion_dataPlatformInstance_mcp = MetadataChangeProposalWrapper(
    entityType="assertion",
    changeType=ChangeType.UPSERT,
    entityUrn=assertionUrn(assertion_maxVal),
    aspectName="dataPlatformInstance",
    aspect=assertion_dataPlatformInstance,
)

# Emit Assertion entity platform aspect!
emitter.emit(assertion_dataPlatformInstance_mcp)

# Construct batch assertion result object for partition 1 batch
from datahub.emitter.mce_builder import make_data_platform_urn, make_dataset_urn
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import (
    ChangeTypeClass,
    OtherSchemaClass,
    SchemaFieldClass,
    SchemaFieldDataTypeClass,
    SchemaMetadataClass,
    StringTypeClass,
)

event: MetadataChangeProposalWrapper = MetadataChangeProposalWrapper(
    entityType="dataset",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn=make_dataset_urn(platform="hive", name="realestate_db.sales", env="PROD"),
    aspectName="schemaMetadata",
    aspect=SchemaMetadataClass(
        schemaName="customer",  # not used
        platform=make_data_platform_urn("hive"),  # important <- platform must be an urn
        version=0,  # when the source system has a notion of versioning of schemas, insert this in, otherwise leave as 0
        hash="",  # when the source system has a notion of unique schemas identified via hash, include a hash, else leave it as empty string
        platformSchema=OtherSchemaClass(rawSchema="__insert raw schema here__"),
        fields=[
            SchemaFieldClass(
                fieldPath="address.zipcode",
                type=SchemaFieldDataTypeClass(type=StringTypeClass()),
                nativeDataType="VARCHAR(50)",  # use this to provide the type of the field in the source system's vernacular
                description="This is the zipcode of the address. Specified using extended form and limited to addresses in the United States",
            ),
        ],
    ),
)
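# A possible follow-up, mirroring the usual docs pattern: send the `event`
# above through a REST emitter, assuming a GMS endpoint on localhost
# (hypothetical address).
from datahub.emitter.rest_emitter import DatahubRestEmitter

emitter = DatahubRestEmitter(gms_server="http://localhost:8080")
emitter.emit(event)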
def ingest_table(
    self, table_data: TableData, path_spec: PathSpec
) -> Iterable[MetadataWorkUnit]:
    logger.info(f"Extracting table schema from file: {table_data.full_path}")
    browse_path: str = (
        strip_s3_prefix(table_data.table_path)
        if table_data.is_s3
        else table_data.table_path.strip("/")
    )

    data_platform_urn = make_data_platform_urn(self.source_config.platform)
    logger.info(f"Creating dataset urn with name: {browse_path}")
    dataset_urn = make_dataset_urn_with_platform_instance(
        self.source_config.platform,
        browse_path,
        self.source_config.platform_instance,
        self.source_config.env,
    )

    dataset_snapshot = DatasetSnapshot(
        urn=dataset_urn,
        aspects=[],
    )

    dataset_properties = DatasetPropertiesClass(
        description="",
        name=table_data.display_name,
        customProperties={
            "number_of_files": str(table_data.number_of_files),
            "size_in_bytes": str(table_data.size_in_bytes),
        },
    )
    dataset_snapshot.aspects.append(dataset_properties)

    fields = self.get_fields(table_data, path_spec)
    schema_metadata = SchemaMetadata(
        schemaName=table_data.display_name,
        platform=data_platform_urn,
        version=0,
        hash="",
        fields=fields,
        platformSchema=OtherSchemaClass(rawSchema=""),
    )
    dataset_snapshot.aspects.append(schema_metadata)

    if self.source_config.use_s3_bucket_tags or self.source_config.use_s3_object_tags:
        bucket = get_bucket_name(table_data.table_path)
        key_prefix = (
            get_key_prefix(table_data.table_path)
            if table_data.full_path == table_data.table_path
            else None
        )
        s3_tags = self.get_s3_tags(bucket, key_prefix, dataset_urn)
        if s3_tags is not None:
            dataset_snapshot.aspects.append(s3_tags)

    mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
    wu = MetadataWorkUnit(id=table_data.table_path, mce=mce)
    self.report.report_workunit(wu)
    yield wu

    yield from self.create_container_hierarchy(table_data, dataset_urn)

    if self.source_config.profiling.enabled:
        yield from self.get_table_profile(table_data, dataset_urn)
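# A small sketch of the S3 path helpers used above; the module path
# datahub.ingestion.source.aws.s3_util and the expected outputs are assumptions
# based on the datahub source tree, and the path itself is hypothetical.
from datahub.ingestion.source.aws.s3_util import (
    get_bucket_name,
    get_key_prefix,
    strip_s3_prefix,
)

path = "s3://my-bucket/data/events"
print(get_bucket_name(path))   # expected: my-bucket
print(get_key_prefix(path))    # expected: data/events
print(strip_s3_prefix(path))   # expected: my-bucket/data/events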
def get_assertions_with_results(
    self,
    validation_result_suite,
    expectation_suite_name,
    run_id,
    payload,
    datasets,
):
    dataPlatformInstance = DataPlatformInstance(
        platform=builder.make_data_platform_urn(GE_PLATFORM_NAME)
    )
    docs_link = None
    if payload:
        # process the payload
        for action_names in payload.keys():
            if payload[action_names]["class"] == "UpdateDataDocsAction":
                data_docs_pages = payload[action_names]
                for docs_link_key, docs_link_val in data_docs_pages.items():
                    if "file://" not in docs_link_val and docs_link_key != "class":
                        docs_link = docs_link_val

    assertions_with_results = []
    for result in validation_result_suite.results:
        expectation_config = result["expectation_config"]
        expectation_type = expectation_config["expectation_type"]
        success = True if result["success"] else False
        kwargs = {
            k: v for k, v in expectation_config["kwargs"].items() if k != "batch_id"
        }

        result = result["result"]
        assertion_datasets = [d["dataset_urn"] for d in datasets]
        if len(datasets) == 1 and "column" in kwargs:
            assertion_fields = [
                builder.make_schema_field_urn(
                    datasets[0]["dataset_urn"], kwargs["column"]
                )
            ]
        else:
            assertion_fields = None  # type:ignore

        # Be careful what fields to consider for creating assertion urn.
        # Any change in fields below would lead to a new assertion
        # FIXME - Currently, when using evaluation parameters, new assertion is
        # created when runtime resolved kwargs are different,
        # possibly for each validation run
        assertionUrn = builder.make_assertion_urn(
            builder.datahub_guid(
                {
                    "platform": GE_PLATFORM_NAME,
                    "nativeType": expectation_type,
                    "nativeParameters": kwargs,
                    "dataset": assertion_datasets[0],
                    "fields": assertion_fields,
                }
            )
        )
        assertionInfo: AssertionInfo = self.get_assertion_info(
            expectation_type,
            kwargs,
            assertion_datasets[0],
            assertion_fields,
            expectation_suite_name,
        )

        # TODO: Understand why their run time is incorrect.
        run_time = run_id.run_time.astimezone(timezone.utc)
        assertionResults = []

        evaluation_parameters = (
            {
                k: convert_to_string(v)
                for k, v in validation_result_suite.evaluation_parameters.items()
            }
            if validation_result_suite.evaluation_parameters
            else None
        )

        nativeResults = {
            k: convert_to_string(v)
            for k, v in result.items()
            if (
                k
                in [
                    "observed_value",
                    "partial_unexpected_list",
                    "partial_unexpected_counts",
                    "details",
                ]
                and v
            )
        }

        actualAggValue = (
            result.get("observed_value")
            if isinstance(result.get("observed_value"), (int, float))
            else None
        )

        ds = datasets[0]
        # https://docs.greatexpectations.io/docs/reference/expectations/result_format/
        assertionResult = AssertionRunEvent(
            timestampMillis=int(round(time.time() * 1000)),
            assertionUrn=assertionUrn,
            asserteeUrn=ds["dataset_urn"],
            runId=run_time.strftime("%Y-%m-%dT%H:%M:%SZ"),
            result=AssertionResult(
                type=AssertionResultType.SUCCESS
                if success
                else AssertionResultType.FAILURE,
                rowCount=result.get("element_count"),
                missingCount=result.get("missing_count"),
                unexpectedCount=result.get("unexpected_count"),
                actualAggValue=actualAggValue,
                externalUrl=docs_link,
                nativeResults=nativeResults,
            ),
            batchSpec=ds["batchSpec"],
            status=AssertionRunStatus.COMPLETE,
            runtimeContext=evaluation_parameters,
        )
        if ds.get("partitionSpec") is not None:
            assertionResult.partitionSpec = ds.get("partitionSpec")
        assertionResults.append(assertionResult)

        assertions_with_results.append(
            {
                "assertionUrn": assertionUrn,
                "assertionInfo": assertionInfo,
                "assertionPlatform": dataPlatformInstance,
                "assertionResults": assertionResults,
            }
        )
    return assertions_with_results
class PowerBiDashboardSourceConfig(PowerBiAPIConfig):
    platform_name: str = "powerbi"
    platform_urn: str = builder.make_data_platform_urn(platform=platform_name)
    dashboard_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    chart_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
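# A small sketch of how the AllowDenyPattern fields above filter asset names;
# the regexes are hypothetical.
from datahub.configuration.common import AllowDenyPattern

dashboard_pattern = AllowDenyPattern(allow=["sales_.*"], deny=["sales_tmp"])
assert dashboard_pattern.allowed("sales_2023")
assert not dashboard_pattern.allowed("sales_tmp")
assert not dashboard_pattern.allowed("marketing_kpis")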
def _extract_record(self, topic: str) -> Iterable[MetadataWorkUnit]:  # noqa: C901
    logger.debug(f"topic = {topic}")

    # 1. Create the default dataset snapshot for the topic.
    dataset_name = topic
    platform_urn = make_data_platform_urn(self.platform)
    dataset_urn = make_dataset_urn_with_platform_instance(
        platform=self.platform,
        name=dataset_name,
        platform_instance=self.source_config.platform_instance,
        env=self.source_config.env,
    )
    dataset_snapshot = DatasetSnapshot(
        urn=dataset_urn,
        aspects=[Status(removed=False)],  # we append to this list later on
    )

    # 2. Attach schemaMetadata aspect (pass control to SchemaRegistry)
    schema_metadata = self.schema_registry_client.get_schema_metadata(
        topic, platform_urn
    )
    if schema_metadata is not None:
        dataset_snapshot.aspects.append(schema_metadata)

    # 3. Attach browsePaths aspect
    browse_path_suffix = (
        f"{self.source_config.platform_instance}/{topic}"
        if self.source_config.platform_instance
        else topic
    )
    browse_path = BrowsePathsClass(
        [f"/{self.source_config.env.lower()}/{self.platform}/{browse_path_suffix}"]
    )
    dataset_snapshot.aspects.append(browse_path)

    # 4. Attach dataPlatformInstance aspect.
    if self.source_config.platform_instance:
        dataset_snapshot.aspects.append(
            DataPlatformInstanceClass(
                platform=platform_urn,
                instance=make_dataplatform_instance_urn(
                    self.platform, self.source_config.platform_instance
                ),
            )
        )

    # 5. Emit the datasetSnapshot MCE
    mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
    wu = MetadataWorkUnit(id=f"kafka-{topic}", mce=mce)
    self.report.report_workunit(wu)
    yield wu

    # 6. Add the subtype aspect marking this as a "topic"
    subtype_wu = MetadataWorkUnit(
        id=f"{topic}-subtype",
        mcp=MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=dataset_urn,
            aspectName="subTypes",
            aspect=SubTypesClass(typeNames=["topic"]),
        ),
    )
    self.report.report_workunit(subtype_wu)
    yield subtype_wu

    domain_urn: Optional[str] = None

    # 7. Emit domains aspect MCPW
    for domain, pattern in self.source_config.domain.items():
        if pattern.allowed(dataset_name):
            domain_urn = make_domain_urn(domain)

    if domain_urn:
        wus = add_domain_to_entity_wu(
            entity_type="dataset",
            entity_urn=dataset_urn,
            domain_urn=domain_urn,
        )
        for wu in wus:
            self.report.report_workunit(wu)
            yield wu
def dataplatform2instance_func(
    instance: str,
    platform: str,
    dry_run: bool,
    env: str,
    force: bool,
    hard: bool,
    keep: bool,
) -> None:
    click.echo(
        f"Starting migration: platform:{platform}, instance={instance}, force={force}, dry-run={dry_run}"
    )
    run_id: str = f"migrate-{uuid.uuid4()}"
    migration_report = MigrationReport(run_id, dry_run, keep)
    system_metadata = SystemMetadataClass(runId=run_id)

    all_aspects = [
        "schemaMetadata",
        "datasetProperties",
        "viewProperties",
        "subTypes",
        "editableDatasetProperties",
        "ownership",
        "datasetDeprecation",
        "institutionalMemory",
        "editableSchemaMetadata",
        "globalTags",
        "glossaryTerms",
        "upstreamLineage",
        "datasetUpstreamLineage",
        "status",
    ]

    if not dry_run:
        rest_emitter = DatahubRestEmitter(
            gms_server=cli_utils.get_session_and_host()[1]
        )

    urns_to_migrate = []

    # we first calculate all the urns we will be migrating
    for src_entity_urn in cli_utils.get_urns_by_filter(platform=platform, env=env):
        key = dataset_urn_to_key(src_entity_urn)
        assert key
        # Does this urn already have a platform instance associated with it?
        response = cli_utils.get_aspects_for_entity(
            entity_urn=src_entity_urn, aspects=["dataPlatformInstance"], typed=True
        )
        if "dataPlatformInstance" in response:
            assert isinstance(
                response["dataPlatformInstance"], DataPlatformInstanceClass
            )
            data_platform_instance: DataPlatformInstanceClass = response[
                "dataPlatformInstance"
            ]
            if data_platform_instance.instance:
                log.debug("This is already an instance-specific urn, will skip")
                continue
            else:
                log.debug(
                    f"{src_entity_urn} is not an instance specific urn. {response}"
                )
                urns_to_migrate.append(src_entity_urn)

    if not force and not dry_run:
        # get a confirmation from the operator before proceeding if this is not a dry run
        sampled_urns_to_migrate = random.choices(
            urns_to_migrate, k=min(10, len(urns_to_migrate))
        )
        sampled_new_urns: List[str] = [
            make_dataset_urn_with_platform_instance(
                platform=key.platform,
                name=key.name,
                platform_instance=instance,
                env=str(key.origin),
            )
            for key in [dataset_urn_to_key(x) for x in sampled_urns_to_migrate]
            if key
        ]
        click.echo(
            f"Will migrate {len(urns_to_migrate)} urns such as {sampled_urns_to_migrate}"
        )
        click.echo(f"New urns will look like {sampled_new_urns}")
        click.confirm("Ok to proceed?", abort=True)

    for src_entity_urn in progressbar.progressbar(
        urns_to_migrate, redirect_stdout=True
    ):
        key = dataset_urn_to_key(src_entity_urn)
        assert key
        new_urn = make_dataset_urn_with_platform_instance(
            platform=key.platform,
            name=key.name,
            platform_instance=instance,
            env=str(key.origin),
        )
        log.debug(f"Will migrate {src_entity_urn} to {new_urn}")
        relationships = migration_utils.get_incoming_relationships_dataset(
            src_entity_urn
        )

        for mcp in migration_utils.clone_aspect(
            src_entity_urn,
            aspect_names=all_aspects,
            dst_urn=new_urn,
            dry_run=dry_run,
            run_id=run_id,
        ):
            if not dry_run:
                rest_emitter.emit_mcp(mcp)
            migration_report.on_entity_create(mcp.entityUrn, mcp.aspectName)  # type: ignore

        if not dry_run:
            rest_emitter.emit_mcp(
                MetadataChangeProposalWrapper(
                    entityType="dataset",
                    changeType=ChangeTypeClass.UPSERT,
                    entityUrn=new_urn,
                    aspectName="dataPlatformInstance",
                    aspect=DataPlatformInstanceClass(
                        platform=make_data_platform_urn(platform),
                        instance=make_dataplatform_instance_urn(platform, instance),
                    ),
                    systemMetadata=system_metadata,
                )
            )
        migration_report.on_entity_create(new_urn, "dataPlatformInstance")

        for relationship in relationships:
            target_urn = relationship["entity"]
            entity_type = _get_type_from_urn(target_urn)
            relationshipType = relationship["type"]
            aspect_name = (
                migration_utils.get_aspect_name_from_relationship_type_and_entity(
                    relationshipType, entity_type
                )
            )
            aspect_map = cli_utils.get_aspects_for_entity(
                target_urn, aspects=[aspect_name], typed=True
            )
            if aspect_name in aspect_map:
                aspect = aspect_map[aspect_name]
                assert isinstance(aspect, DictWrapper)
                aspect = migration_utils.modify_urn_list_for_aspect(
                    aspect_name, aspect, relationshipType, src_entity_urn, new_urn
                )
                # use mcpw
                mcp = MetadataChangeProposalWrapper(
                    entityType=entity_type,
                    changeType=ChangeTypeClass.UPSERT,
                    entityUrn=target_urn,
                    aspectName=aspect_name,
                    aspect=aspect,
                )
                if not dry_run:
                    rest_emitter.emit_mcp(mcp)
                migration_report.on_entity_affected(mcp.entityUrn, mcp.aspectName)  # type: ignore
            else:
                log.debug(f"Didn't find aspect {aspect_name} for urn {target_urn}")

        if not dry_run and not keep:
            log.info(f"will {'hard' if hard else 'soft'} delete {src_entity_urn}")
            delete_cli._delete_one_urn(src_entity_urn, soft=not hard, run_id=run_id)
        migration_report.on_entity_migrated(src_entity_urn, "status")  # type: ignore

    print(f"{migration_report}")
def _extract_mcps(self, index: str) -> Iterable[MetadataChangeProposalWrapper]:
    logger.debug(f"index = {index}")

    raw_index = self.client.indices.get(index=index)
    raw_index_metadata = raw_index[index]

    # 0. Dedup data_streams.
    data_stream = raw_index_metadata.get("data_stream")
    if data_stream:
        index = data_stream
        self.data_stream_partition_count[index] += 1
        if self.data_stream_partition_count[index] > 1:
            # This is a duplicate, skip processing it further.
            return

    # 1. Construct and emit the schemaMetadata aspect
    # 1.1 Generate the schema fields from ES mappings.
    index_mappings = raw_index_metadata["mappings"]
    index_mappings_json_str: str = json.dumps(index_mappings)
    md5_hash = md5(index_mappings_json_str.encode()).hexdigest()
    schema_fields = list(
        ElasticToSchemaFieldConverter.get_schema_fields(index_mappings)
    )

    # 1.2 Generate the SchemaMetadata aspect
    schema_metadata = SchemaMetadata(
        schemaName=index,
        platform=make_data_platform_urn(self.platform),
        version=0,
        hash=md5_hash,
        platformSchema=OtherSchemaClass(rawSchema=index_mappings_json_str),
        fields=schema_fields,
    )

    # 1.3 Emit the mcp
    dataset_urn: str = make_dataset_urn(self.platform, index, self.source_config.env)
    yield MetadataChangeProposalWrapper(
        entityType="dataset",
        entityUrn=dataset_urn,
        aspectName="schemaMetadata",
        aspect=schema_metadata,
        changeType=ChangeTypeClass.UPSERT,
    )

    # 2. Construct and emit the status aspect.
    yield MetadataChangeProposalWrapper(
        entityType="dataset",
        entityUrn=dataset_urn,
        aspectName="status",
        aspect=StatusClass(removed=False),
        changeType=ChangeTypeClass.UPSERT,
    )

    # 3. Construct and emit subtype
    yield MetadataChangeProposalWrapper(
        entityType="dataset",
        entityUrn=dataset_urn,
        aspectName="subTypes",
        aspect=SubTypesClass(typeNames=["Index" if not data_stream else "DataStream"]),
        changeType=ChangeTypeClass.UPSERT,
    )

    # 4. Construct and emit properties if needed
    index_aliases = raw_index_metadata.get("aliases", {}).keys()
    if index_aliases:
        yield MetadataChangeProposalWrapper(
            entityType="dataset",
            entityUrn=dataset_urn,
            aspectName="datasetProperties",
            aspect=DatasetPropertiesClass(
                customProperties={"aliases": ",".join(index_aliases)}
            ),
            changeType=ChangeTypeClass.UPSERT,
        )
def _extract_record(
    self, topic: str, partitioned: bool
) -> Iterable[MetadataWorkUnit]:
    logger.info(f"topic = {topic}")

    # 1. Create and emit the default dataset for the topic. Extract type, tenant,
    # namespace and topic name from the full Pulsar topic name, i.e.
    # persistent://tenant/namespace/topic
    pulsar_topic = PulsarTopic(topic)

    platform_urn = make_data_platform_urn(self.platform)
    dataset_urn = make_dataset_urn_with_platform_instance(
        platform=self.platform,
        name=pulsar_topic.fullname,
        platform_instance=self.config.platform_instance,
        env=self.config.env,
    )

    status_wu = MetadataWorkUnit(
        id=f"{dataset_urn}-status",
        mcp=MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=dataset_urn,
            aspectName="status",
            aspect=StatusClass(removed=False),
        ),
    )
    self.report.report_workunit(status_wu)
    yield status_wu

    # 2. Emit schemaMetadata aspect
    schema, schema_metadata = self._get_schema_metadata(pulsar_topic, platform_urn)
    if schema_metadata is not None:
        schema_metadata_wu = MetadataWorkUnit(
            id=f"{dataset_urn}-schemaMetadata",
            mcp=MetadataChangeProposalWrapper(
                entityType="dataset",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=dataset_urn,
                aspectName="schemaMetadata",
                aspect=schema_metadata,
            ),
        )
        self.report.report_workunit(schema_metadata_wu)
        yield schema_metadata_wu

    # TODO Add topic properties (Pulsar 2.10.0 feature)
    # 3. Construct and emit dataset properties aspect
    if schema is not None:
        schema_properties = {
            "schema_version": str(schema.schema_version),
            "schema_type": schema.schema_type,
            "partitioned": str(partitioned).lower(),
        }
        # Add some static properties to the schema properties
        schema.properties.update(schema_properties)

        dataset_properties_wu = MetadataWorkUnit(
            id=f"{dataset_urn}-datasetProperties",
            mcp=MetadataChangeProposalWrapper(
                entityType="dataset",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=dataset_urn,
                aspectName="datasetProperties",
                aspect=DatasetPropertiesClass(
                    description=schema.schema_description,
                    customProperties=schema.properties,
                ),
            ),
        )
        self.report.report_workunit(dataset_properties_wu)
        yield dataset_properties_wu

    # 4. Emit browsePaths aspect
    pulsar_path = (
        f"{pulsar_topic.tenant}/{pulsar_topic.namespace}/{pulsar_topic.topic}"
    )
    browse_path_suffix = (
        f"{self.config.platform_instance}/{pulsar_path}"
        if self.config.platform_instance
        else pulsar_path
    )

    browse_path_wu = MetadataWorkUnit(
        id=f"{dataset_urn}-browsePaths",
        mcp=MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=dataset_urn,
            aspectName="browsePaths",
            aspect=BrowsePathsClass(
                [f"/{self.config.env.lower()}/{self.platform}/{browse_path_suffix}"]
            ),
        ),
    )
    self.report.report_workunit(browse_path_wu)
    yield browse_path_wu

    # 5. Emit dataPlatformInstance aspect.
    if self.config.platform_instance:
        platform_instance_wu = MetadataWorkUnit(
            id=f"{dataset_urn}-dataPlatformInstance",
            mcp=MetadataChangeProposalWrapper(
                entityType="dataset",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=dataset_urn,
                aspectName="dataPlatformInstance",
                aspect=DataPlatformInstanceClass(
                    platform=platform_urn,
                    instance=make_dataplatform_instance_urn(
                        self.platform, self.config.platform_instance
                    ),
                ),
            ),
        )
        self.report.report_workunit(platform_instance_wu)
        yield platform_instance_wu

    # 6. Emit subtype aspect marking this as a "topic"
    subtype_wu = MetadataWorkUnit(
        id=f"{dataset_urn}-subTypes",
        mcp=MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=dataset_urn,
            aspectName="subTypes",
            aspect=SubTypesClass(typeNames=["topic"]),
        ),
    )
    self.report.report_workunit(subtype_wu)
    yield subtype_wu

    # 7. Emit domains aspect
    domain_urn: Optional[str] = None
    for domain, pattern in self.config.domain.items():
        if pattern.allowed(pulsar_topic.fullname):
            domain_urn = make_domain_urn(domain)

    if domain_urn:
        wus = add_domain_to_entity_wu(
            entity_type="dataset",
            entity_urn=dataset_urn,
            domain_urn=domain_urn,
        )
        for wu in wus:
            self.report.report_workunit(wu)
            yield wu
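# A hedged sketch of the domain-mapping loop in step 7: config.domain maps a
# domain name to an AllowDenyPattern, and a matching pattern yields a domain
# urn. The domain name and topic below are hypothetical.
from datahub.configuration.common import AllowDenyPattern
from datahub.emitter.mce_builder import make_domain_urn

domain_config = {"logistics": AllowDenyPattern(allow=["persistent://prod/.*"])}
topic_fullname = "persistent://prod/orders/shipments"

domain_urn = None
for domain, pattern in domain_config.items():
    if pattern.allowed(topic_fullname):
        domain_urn = make_domain_urn(domain)
print(domain_urn)  # urn:li:domain:logistics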
def generate_stitched_record(relnships_graph: RelationshipGraph) -> List[Any]:
    def strip_types(field_path: str) -> str:
        final_path = field_path
        final_path = re.sub(r"(\[type=[a-zA-Z]+\]\.)", "", final_path)
        final_path = re.sub(r"^\[version=2.0\]\.", "", final_path)
        return final_path

    datasets: List[DatasetSnapshotClass] = []

    for entity_name, entity_def in entity_registry.items():
        entity_display_name = entity_def.display_name
        entity_fields = []
        for aspect_name in entity_def.aspects:
            if aspect_name not in aspect_registry:
                print(f"Did not find aspect name: {aspect_name} in aspect_registry")
                continue

            # all aspects should have a schema
            aspect_schema = aspect_registry[aspect_name].schema
            assert aspect_schema
            entity_fields.append(
                {
                    "type": aspect_schema.to_json(),
                    "name": aspect_name,
                }
            )

        if entity_fields:
            names = avro.schema.Names()
            field_objects = []
            for f in entity_fields:
                field = avro.schema.Field(
                    type=f["type"],
                    name=f["name"],
                    has_default=False,
                )
                field_objects.append(field)

            with unittest.mock.patch("avro.schema.Names.add_name", add_name):
                entity_avro_schema = avro.schema.RecordSchema(
                    name=entity_name,
                    namespace="datahub.metadata.model",
                    names=names,
                    fields=[],
                )
                entity_avro_schema.set_prop("fields", field_objects)
            rawSchema = json.dumps(entity_avro_schema.to_json())
            # always add the URN which is the primary key
            urn_field = SchemaField(
                fieldPath="urn",
                type=SchemaFieldDataTypeClass(type=StringTypeClass()),
                nativeDataType="string",
                nullable=False,
                isPartOfKey=True,
                description=f"The primary identifier for the {entity_name} entity. See the {entity_def.keyAspect} field to understand the structure of this urn.",
            )
            schema_fields: List[SchemaField] = [urn_field] + avro_schema_to_mce_fields(
                rawSchema
            )
            foreign_keys: List[ForeignKeyConstraintClass] = []
            source_dataset_urn = make_dataset_urn(
                platform=make_data_platform_urn("datahub"),
                name=f"{entity_display_name}",
            )
            for f_field in schema_fields:
                if f_field.jsonProps:
                    json_dict = json.loads(f_field.jsonProps)
                    if "Aspect" in json_dict:
                        aspect_info = json_dict["Aspect"]
                        f_field.globalTags = f_field.globalTags or GlobalTagsClass(
                            tags=[]
                        )
                        f_field.globalTags.tags.append(
                            TagAssociationClass(tag="urn:li:tag:Aspect")
                        )
                        # if this is the key aspect, also add primary-key
                        if entity_def.keyAspect == aspect_info.get("name"):
                            f_field.isPartOfKey = True

                        if "timeseries" == aspect_info.get("type", ""):
                            f_field.globalTags.tags.append(
                                TagAssociationClass(tag="urn:li:tag:Temporal")
                            )
                    if "Searchable" in json_dict:
                        f_field.globalTags = f_field.globalTags or GlobalTagsClass(
                            tags=[]
                        )
                        f_field.globalTags.tags.append(
                            TagAssociationClass(tag="urn:li:tag:Searchable")
                        )
                    if "Relationship" in json_dict:
                        relationship_info = json_dict["Relationship"]
                        # detect if we have relationship specified at leaf level or thru path specs
                        if "entityTypes" not in relationship_info:
                            # path spec
                            assert (
                                len(relationship_info.keys()) == 1
                            ), "We should never have more than one path spec assigned to a relationship annotation"
                            final_info = None
                            for k, v in relationship_info.items():
                                final_info = v
                            relationship_info = final_info

                        assert "entityTypes" in relationship_info

                        entity_types: List[str] = relationship_info.get(
                            "entityTypes", []
                        )
                        relnship_name = relationship_info.get("name", None)
                        for entity_type in entity_types:
                            destination_entity_name = capitalize_first(entity_type)

                            foreign_dataset_urn = make_dataset_urn(
                                platform=make_data_platform_urn("datahub"),
                                name=destination_entity_name,
                            )
                            fkey = ForeignKeyConstraintClass(
                                name=relnship_name,
                                foreignDataset=foreign_dataset_urn,
                                foreignFields=[
                                    f"urn:li:schemaField:({foreign_dataset_urn}, urn)"
                                ],
                                sourceFields=[
                                    f"urn:li:schemaField:({source_dataset_urn},{f_field.fieldPath})"
                                ],
                            )
                            foreign_keys.append(fkey)
                            relnships_graph.add_edge(
                                entity_display_name,
                                destination_entity_name,
                                fkey.name,
                                f" via `{strip_types(f_field.fieldPath)}`",
                                edge_id=f"{entity_display_name}:{fkey.name}:{destination_entity_name}:{strip_types(f_field.fieldPath)}",
                            )

            schemaMetadata = SchemaMetadataClass(
                schemaName=f"{entity_name}",
                platform=make_data_platform_urn("datahub"),
                platformSchema=OtherSchemaClass(rawSchema=rawSchema),
                fields=schema_fields,
                version=0,
                hash="",
                foreignKeys=foreign_keys if foreign_keys else None,
            )

            dataset = DatasetSnapshotClass(
                urn=make_dataset_urn(
                    platform=make_data_platform_urn("datahub"),
                    name=f"{entity_display_name}",
                ),
                aspects=[
                    schemaMetadata,
                    GlobalTagsClass(
                        tags=[TagAssociationClass(tag="urn:li:tag:Entity")]
                    ),
                    BrowsePathsClass(
                        [f"/prod/datahub/entities/{entity_display_name}"]
                    ),
                ],
            )
            datasets.append(dataset)

    events: List[Union[MetadataChangeEventClass, MetadataChangeProposalWrapper]] = []

    for d in datasets:
        entity_name = d.urn.split(":")[-1].split(",")[1]
        d.aspects.append(
            DatasetPropertiesClass(
                description=make_entity_docs(entity_name, relnships_graph)
            )
        )
        mce = MetadataChangeEventClass(proposedSnapshot=d)
        events.append(mce)

        mcp = MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=d.urn,
            aspectName="subTypes",
            aspect=SubTypesClass(typeNames=["entity"]),
        )
        events.append(mcp)
    return events
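# A tiny self-contained sketch of what strip_types() above does to a v2 field
# path (the sample path is illustrative).
import re

def strip_types(field_path: str) -> str:
    final_path = re.sub(r"(\[type=[a-zA-Z]+\]\.)", "", field_path)
    final_path = re.sub(r"^\[version=2.0\]\.", "", final_path)
    return final_path

print(strip_types("[version=2.0].[type=Foo].owner.[type=string].name"))
# -> owner.name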