def __init__(self, config: PatternDatasetTagsConfig, ctx: PipelineContext):
    tag_pattern = config.tag_pattern
    generic_config = AddDatasetTagsConfig(
        # Resolve the pattern against each entity's urn to produce its tags.
        get_tags_to_add=lambda entity: [
            TagAssociationClass(tag=urn) for urn in tag_pattern.value(entity.urn)
        ],
    )
    super().__init__(generic_config, ctx)
def get_tags_from_params(params: Optional[List[str]] = None) -> GlobalTagsClass:
    # Avoid a mutable default argument; treat None the same as an empty list.
    params = params or []
    tags = [
        TagAssociationClass(tag=builder.make_tag_urn(tag.upper()))
        for tag in params
        if tag
    ]
    return GlobalTagsClass(tags=tags)
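# A minimal usage sketch (hypothetical values) for get_tags_from_params above:
# empty strings are filtered out, and names are upper-cased before being
# converted to tag URNs via builder.make_tag_urn.
example_aspect = get_tags_from_params(["pii", "", "finance"])
assert [assoc.tag for assoc in example_aspect.tags] == [
    "urn:li:tag:PII",
    "urn:li:tag:FINANCE",
]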
def test_mcp_add_tags_existing(mock_time):
    dataset_mcp = make_generic_dataset_mcp(
        aspect_name="globalTags",
        aspect=GlobalTagsClass(
            tags=[TagAssociationClass(tag=builder.make_tag_urn("Test"))]
        ),
    )
    transformer = SimpleAddDatasetTags.create(
        {
            "tag_urns": [
                builder.make_tag_urn("NeedsDocumentation"),
                builder.make_tag_urn("Legacy"),
            ]
        },
        PipelineContext(run_id="test-tags"),
    )
    input_stream: List[RecordEnvelope] = [
        RecordEnvelope(record, metadata={}) for record in [dataset_mcp]
    ]
    input_stream.append(RecordEnvelope(record=EndOfStream(), metadata={}))
    outputs = list(transformer.transform(input_stream))
    assert len(outputs) == 2
    # Check that tags were added; the transformed aspect is the first result,
    # followed by the EndOfStream marker.
    tags_aspect = outputs[0].record.aspect
    assert tags_aspect
    assert len(tags_aspect.tags) == 3
    assert tags_aspect.tags[0].tag == builder.make_tag_urn("Test")
    assert tags_aspect.tags[1].tag == builder.make_tag_urn("NeedsDocumentation")
    assert isinstance(outputs[-1].record, EndOfStream)
def get_s3_tags() -> Optional[GlobalTagsClass]:
    # Nested helper: relies on `self`, `table`, and `dataset_urn` from the
    # enclosing scope.
    bucket_name = s3_util.get_bucket_name(table["StorageDescriptor"]["Location"])
    tags_to_add = []
    if self.source_config.use_s3_bucket_tags:
        try:
            bucket_tags = self.s3_client.get_bucket_tagging(Bucket=bucket_name)
            tags_to_add.extend(
                make_tag_urn(f'{tag["Key"]}:{tag["Value"]}')
                for tag in bucket_tags["TagSet"]
            )
        except self.s3_client.exceptions.ClientError:
            logger.warning(f"No tags found for bucket={bucket_name}")
    if self.source_config.use_s3_object_tags:
        key_prefix = s3_util.get_key_prefix(table["StorageDescriptor"]["Location"])
        object_tagging = self.s3_client.get_object_tagging(
            Bucket=bucket_name, Key=key_prefix
        )
        tag_set = object_tagging["TagSet"]
        if tag_set:
            tags_to_add.extend(
                make_tag_urn(f'{tag["Key"]}:{tag["Value"]}') for tag in tag_set
            )
        else:
            # Unlike bucket tags, a missing object tag set comes back as an
            # empty array rather than raising an exception.
            logger.warning(f"No tags found for bucket={bucket_name} key={key_prefix}")
    if len(tags_to_add) == 0:
        return None
    if self.ctx.graph is not None:
        logger.debug("Connected to DatahubApi, grabbing current tags to maintain.")
        current_tags: Optional[GlobalTagsClass] = self.ctx.graph.get_aspect_v2(
            entity_urn=dataset_urn,
            aspect="globalTags",
            aspect_type=GlobalTagsClass,
        )
        if current_tags:
            tags_to_add.extend(current_tag.tag for current_tag in current_tags.tags)
    else:
        logger.warning("Could not connect to DatahubApi. No current tags to maintain")
    # Remove duplicate tags.
    tags_to_add = list(set(tags_to_add))
    return GlobalTagsClass(
        tags=[TagAssociationClass(tag_to_add) for tag_to_add in tags_to_add]
    )
def get_s3_tags(
    self, bucket_name: str, key_name: Optional[str], dataset_urn: str
) -> Optional[GlobalTagsClass]:
    if self.source_config.aws_config is None:
        raise ValueError("aws_config not set. Cannot browse s3")
    tags_to_add = []
    if self.source_config.use_s3_bucket_tags:
        s3 = self.source_config.aws_config.get_s3_resource()
        bucket = s3.Bucket(bucket_name)
        try:
            tags_to_add.extend(
                make_tag_urn(f'{tag["Key"]}:{tag["Value"]}')
                for tag in bucket.Tagging().tag_set
            )
        except s3.meta.client.exceptions.ClientError:
            logger.warning(f"No tags found for bucket={bucket_name}")
    if self.source_config.use_s3_object_tags and key_name is not None:
        s3_client = self.source_config.aws_config.get_s3_client()
        object_tagging = s3_client.get_object_tagging(Bucket=bucket_name, Key=key_name)
        tag_set = object_tagging["TagSet"]
        if tag_set:
            tags_to_add.extend(
                make_tag_urn(f'{tag["Key"]}:{tag["Value"]}') for tag in tag_set
            )
        else:
            # Unlike bucket tags, a missing object tag set comes back as an
            # empty array rather than raising an exception.
            logger.warning(f"No tags found for bucket={bucket_name} key={key_name}")
    if len(tags_to_add) == 0:
        return None
    if self.ctx.graph is not None:
        logger.debug("Connected to DatahubApi, grabbing current tags to maintain.")
        current_tags: Optional[GlobalTagsClass] = self.ctx.graph.get_aspect_v2(
            entity_urn=dataset_urn,
            aspect="globalTags",
            aspect_type=GlobalTagsClass,
        )
        if current_tags:
            tags_to_add.extend(current_tag.tag for current_tag in current_tags.tags)
    else:
        logger.warning("Could not connect to DatahubApi. No current tags to maintain")
    # Remove duplicate tags.
    tags_to_add = list(set(tags_to_add))
    return GlobalTagsClass(
        tags=[TagAssociationClass(tag_to_add) for tag_to_add in tags_to_add]
    )
def get_schema_metadata(
    report: SourceReport, node: DBTNode, platform: str
) -> SchemaMetadata:
    canonical_schema: List[SchemaField] = []
    for column in node.columns:
        description = None
        if column.comment and column.description and column.comment != column.description:
            description = (
                f"{platform} comment: {column.comment}\n\n"
                f"dbt model description: {column.description}"
            )
        elif column.comment:
            description = column.comment
        elif column.description:
            description = column.description
        globalTags = None
        if column.tags:
            globalTags = GlobalTagsClass(
                tags=[TagAssociationClass(f"urn:li:tag:{tag}") for tag in column.tags]
            )
        field = SchemaField(
            fieldPath=column.name,
            nativeDataType=column.data_type,
            type=get_column_type(report, node.dbt_name, column.data_type),
            description=description,
            nullable=False,  # TODO: actually autodetect this
            recursive=False,
            globalTags=globalTags,
        )
        canonical_schema.append(field)
    last_modified = None
    if node.max_loaded_at is not None:
        actor = "urn:li:corpuser:dbt_executor"
        last_modified = AuditStamp(
            time=int(dateutil.parser.parse(node.max_loaded_at).timestamp() * 1000),
            actor=actor,
        )
    return SchemaMetadata(
        schemaName=node.dbt_name,
        platform=f"urn:li:dataPlatform:{platform}",
        version=0,
        hash="",
        platformSchema=MySqlDDL(tableSchema=""),
        lastModified=last_modified,
        fields=canonical_schema,
    )
def add_tags_to_entity_wu(
    entity_type: str, entity_urn: str, tags: List[str]
) -> Iterable[MetadataWorkUnit]:
    mcp = MetadataChangeProposalWrapper(
        entityType=entity_type,
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=entity_urn,
        aspectName="globalTags",
        aspect=GlobalTagsClass(
            tags=[TagAssociationClass(f"urn:li:tag:{tag}") for tag in tags]
        ),
    )
    yield MetadataWorkUnit(id=f"tags-to-{entity_urn}", mcp=mcp)
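# A minimal usage sketch for add_tags_to_entity_wu above; the dataset URN and
# tag names are illustrative, not from the source.
example_urn = "urn:li:dataset:(urn:li:dataPlatform:hive,db.table,PROD)"
for wu in add_tags_to_entity_wu(
    entity_type="dataset",
    entity_urn=example_urn,
    tags=["pii", "finance"],
):
    assert wu.id == f"tags-to-{example_urn}"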
def _get_tags_from_field_type(
    field_type: ViewFieldType, reporter: SourceReport
) -> Optional[GlobalTagsClass]:
    if field_type in LookerUtil.type_to_tag_map:
        return GlobalTagsClass(
            tags=[
                TagAssociationClass(tag=tag_name)
                for tag_name in LookerUtil.type_to_tag_map[field_type]
            ]
        )
    else:
        reporter.report_warning(
            "lookml",
            f"Failed to map view field type {field_type}. Won't emit tags for it",
        )
        return None
def get_transformed_tags_by_prefix(
    self,
    new_tags: List[TagAssociationClass],
    entity_urn: str,
    tags_prefix_filter: str,
) -> List[TagAssociationClass]:
    tag_set = {new_tag.tag for new_tag in new_tags}
    if self.ctx.graph:
        existing_tags_class = self.ctx.graph.get_tags(entity_urn)
        if existing_tags_class and existing_tags_class.tags:
            for existing_tag in existing_tags_class.tags:
                # Keep existing tags only when they fall outside the managed prefix.
                if not existing_tag.tag.startswith(tags_prefix_filter):
                    tag_set.add(existing_tag.tag)
    return [TagAssociationClass(tag) for tag in sorted(tag_set)]
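# A standalone sketch of the merge rule that get_transformed_tags_by_prefix
# implements: new tags always win, and existing server-side tags survive only
# if they do not start with the managed prefix. The URNs here are illustrative.
new = {"urn:li:tag:glue:team_a"}
existing = {"urn:li:tag:glue:stale", "urn:li:tag:manual"}
prefix = "urn:li:tag:glue"
merged = sorted(new | {t for t in existing if not t.startswith(prefix)})
assert merged == ["urn:li:tag:glue:team_a", "urn:li:tag:manual"]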
def init_dataset(
    self, endpoint_k: str, endpoint_dets: dict
) -> Tuple[DatasetSnapshot, str]:
    config = self.config
    dataset_name = endpoint_k[1:].replace("/", ".")
    if len(dataset_name) > 0:
        if dataset_name[-1] == ".":
            dataset_name = dataset_name[:-1]
    else:
        dataset_name = "root"
    dataset_snapshot = DatasetSnapshot(
        urn=f"urn:li:dataset:(urn:li:dataPlatform:{self.platform},{config.name}.{dataset_name},PROD)",
        aspects=[],
    )
    # Add the description.
    dataset_properties = DatasetPropertiesClass(
        description=endpoint_dets["description"], customProperties={}
    )
    dataset_snapshot.aspects.append(dataset_properties)
    # Add the tags.
    tags_str = [make_tag_urn(t) for t in endpoint_dets["tags"]]
    tags_tac = [TagAssociationClass(t) for t in tags_str]
    gtc = GlobalTagsClass(tags_tac)
    dataset_snapshot.aspects.append(gtc)
    # The link will appear under "documentation".
    link_url = clean_url(config.url + self.url_basepath + endpoint_k)
    link_description = "Link to call for the dataset."
    creation = AuditStampClass(
        # AuditStamp times are expressed in milliseconds since the epoch.
        time=int(time.time() * 1000),
        actor="urn:li:corpuser:etl",
        impersonator=None,
    )
    link_metadata = InstitutionalMemoryMetadataClass(
        url=link_url, description=link_description, createStamp=creation
    )
    inst_memory = InstitutionalMemoryClass([link_metadata])
    dataset_snapshot.aspects.append(inst_memory)
    return dataset_snapshot, dataset_name
def get_workunits(self) -> Iterable[WorkUnit]:
    catalog = open_catalog(
        app_dir=Path(typer.get_app_dir("tokern")),
        secret=self.config.secret,
        path=Path(self.config.path) if self.config.path is not None else None,
        user=self.config.user,
        password=self.config.password,
        host=self.config.host,
        port=self.config.port,
        database=self.config.database,
    )
    with closing(catalog) as catalog:
        with catalog.managed_session:
            if self.config.source_names is not None and len(self.config.source_names) > 0:
                sources = [
                    catalog.get_source(source_name)
                    for source_name in self.config.source_names
                ]
            else:
                sources = catalog.get_sources()
            for source in sources:
                for schema, table in table_generator(
                    catalog=catalog,
                    source=source,
                    include_schema_regex_str=self.config.include_schema_regex,
                    exclude_schema_regex_str=self.config.exclude_schema_regex,
                    include_table_regex_str=self.config.include_table_regex,
                    exclude_table_regex_str=self.config.exclude_table_regex,
                ):
                    if self.config.include_source_name:
                        dataset_name = f"{source.name}.{schema.name}.{table.name}"
                    else:
                        dataset_name = f"{schema.name}.{table.name}"
                    self.report.report_entity_scanned(dataset_name)
                    dataset_urn = (
                        f"urn:li:dataset:(urn:li:dataPlatform:{source.source_type},"
                        f"{dataset_name},{self.config.env})"
                    )
                    dataset_snapshot = DatasetSnapshot(urn=dataset_urn, aspects=[])
                    schema_fields = []
                    for column in catalog.get_columns_for_table(table):
                        global_tags: Optional[GlobalTagsClass] = None
                        if column.pii_type is not None:
                            global_tags = GlobalTagsClass(
                                tags=[
                                    TagAssociationClass("urn:li:tag:pii"),
                                    TagAssociationClass(
                                        f"urn:li:tag:{column.pii_type.name.lower()}"
                                    ),
                                ]
                            )
                        schema_fields.append(
                            SchemaField(
                                fieldPath=column.name,
                                type=CatalogSource.get_column_type(column.data_type),
                                nativeDataType=column.data_type,
                                description=None,
                                nullable=True,
                                recursive=False,
                                globalTags=global_tags,
                            )
                        )
                    schema_metadata = get_schema_metadata(
                        sql_report=self.report,
                        dataset_name=dataset_name,
                        platform=source.source_type,
                        columns=[],
                        canonical_schema=schema_fields,
                    )
                    dataset_snapshot.aspects.append(schema_metadata)
                    mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
                    wu = SqlWorkUnit(id=dataset_name, mce=mce)
                    self.report.report_workunit(wu)
                    yield wu
def generate_stitched_record(relnships_graph: RelationshipGraph) -> List[Any]:
    def strip_types(field_path: str) -> str:
        final_path = field_path
        final_path = re.sub(r"(\[type=[a-zA-Z]+\]\.)", "", final_path)
        final_path = re.sub(r"^\[version=2.0\]\.", "", final_path)
        return final_path

    datasets: List[DatasetSnapshotClass] = []
    for entity_name, entity_def in entity_registry.items():
        entity_display_name = entity_def.display_name
        entity_fields = []
        for aspect_name in entity_def.aspects:
            if aspect_name not in aspect_registry:
                print(f"Did not find aspect name: {aspect_name} in aspect_registry")
                continue
            # All aspects should have a schema.
            aspect_schema = aspect_registry[aspect_name].schema
            assert aspect_schema
            entity_fields.append(
                {
                    "type": aspect_schema.to_json(),
                    "name": aspect_name,
                }
            )
        if entity_fields:
            names = avro.schema.Names()
            field_objects = []
            for f in entity_fields:
                field = avro.schema.Field(
                    type=f["type"],
                    name=f["name"],
                    has_default=False,
                )
                field_objects.append(field)
            with unittest.mock.patch("avro.schema.Names.add_name", add_name):
                entity_avro_schema = avro.schema.RecordSchema(
                    name=entity_name,
                    namespace="datahub.metadata.model",
                    names=names,
                    fields=[],
                )
                entity_avro_schema.set_prop("fields", field_objects)
            rawSchema = json.dumps(entity_avro_schema.to_json())
            # Always add the URN, which is the primary key.
            urn_field = SchemaField(
                fieldPath="urn",
                type=SchemaFieldDataTypeClass(type=StringTypeClass()),
                nativeDataType="string",
                nullable=False,
                isPartOfKey=True,
                description=(
                    f"The primary identifier for the {entity_name} entity. "
                    f"See the {entity_def.keyAspect} field to understand the structure of this urn."
                ),
            )
            schema_fields: List[SchemaField] = [urn_field] + avro_schema_to_mce_fields(
                rawSchema
            )
            foreign_keys: List[ForeignKeyConstraintClass] = []
            source_dataset_urn = make_dataset_urn(
                platform=make_data_platform_urn("datahub"),
                name=entity_display_name,
            )
            for f_field in schema_fields:
                if f_field.jsonProps:
                    json_dict = json.loads(f_field.jsonProps)
                    if "Aspect" in json_dict:
                        aspect_info = json_dict["Aspect"]
                        f_field.globalTags = f_field.globalTags or GlobalTagsClass(
                            tags=[]
                        )
                        f_field.globalTags.tags.append(
                            TagAssociationClass(tag="urn:li:tag:Aspect")
                        )
                        # If this is the key aspect, also mark the field as a
                        # primary key.
                        if entity_def.keyAspect == aspect_info.get("name"):
                            f_field.isPartOfKey = True
                        if "timeseries" == aspect_info.get("type", ""):
                            f_field.globalTags.tags.append(
                                TagAssociationClass(tag="urn:li:tag:Temporal")
                            )
                    if "Searchable" in json_dict:
                        f_field.globalTags = f_field.globalTags or GlobalTagsClass(
                            tags=[]
                        )
                        f_field.globalTags.tags.append(
                            TagAssociationClass(tag="urn:li:tag:Searchable")
                        )
                    if "Relationship" in json_dict:
                        relationship_info = json_dict["Relationship"]
                        # Detect whether the relationship is specified at the
                        # leaf level or through path specs.
                        if "entityTypes" not in relationship_info:
                            # Path spec.
                            assert (
                                len(relationship_info.keys()) == 1
                            ), "We should never have more than one path spec assigned to a relationship annotation"
                            final_info = None
                            for k, v in relationship_info.items():
                                final_info = v
                            relationship_info = final_info
                        assert "entityTypes" in relationship_info
                        entity_types: List[str] = relationship_info.get(
                            "entityTypes", []
                        )
                        relnship_name = relationship_info.get("name", None)
                        for entity_type in entity_types:
                            destination_entity_name = capitalize_first(entity_type)
                            foreign_dataset_urn = make_dataset_urn(
                                platform=make_data_platform_urn("datahub"),
                                name=destination_entity_name,
                            )
                            fkey = ForeignKeyConstraintClass(
                                name=relnship_name,
                                foreignDataset=foreign_dataset_urn,
                                foreignFields=[
                                    f"urn:li:schemaField:({foreign_dataset_urn}, urn)"
                                ],
                                sourceFields=[
                                    f"urn:li:schemaField:({source_dataset_urn},{f_field.fieldPath})"
                                ],
                            )
                            foreign_keys.append(fkey)
                            relnships_graph.add_edge(
                                entity_display_name,
                                destination_entity_name,
                                fkey.name,
                                f" via `{strip_types(f_field.fieldPath)}`",
                                edge_id=f"{entity_display_name}:{fkey.name}:{destination_entity_name}:{strip_types(f_field.fieldPath)}",
                            )
            schemaMetadata = SchemaMetadataClass(
                schemaName=entity_name,
                platform=make_data_platform_urn("datahub"),
                platformSchema=OtherSchemaClass(rawSchema=rawSchema),
                fields=schema_fields,
                version=0,
                hash="",
                foreignKeys=foreign_keys if foreign_keys else None,
            )
            dataset = DatasetSnapshotClass(
                urn=make_dataset_urn(
                    platform=make_data_platform_urn("datahub"),
                    name=entity_display_name,
                ),
                aspects=[
                    schemaMetadata,
                    GlobalTagsClass(
                        tags=[TagAssociationClass(tag="urn:li:tag:Entity")]
                    ),
                    BrowsePathsClass([f"/prod/datahub/entities/{entity_display_name}"]),
                ],
            )
            datasets.append(dataset)

    events: List[Union[MetadataChangeEventClass, MetadataChangeProposalWrapper]] = []
    for d in datasets:
        entity_name = d.urn.split(":")[-1].split(",")[1]
        d.aspects.append(
            DatasetPropertiesClass(
                description=make_entity_docs(entity_name, relnships_graph)
            )
        )
        mce = MetadataChangeEventClass(proposedSnapshot=d)
        events.append(mce)
        mcp = MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=d.urn,
            aspectName="subTypes",
            aspect=SubTypesClass(typeNames=["entity"]),
        )
        events.append(mcp)
    return events
def make_global_tag_aspect_with_tag_list(tags: List[str]) -> GlobalTagsClass:
    return GlobalTagsClass(
        tags=[TagAssociationClass(f"urn:li:tag:{tag}") for tag in tags]
    )
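# A minimal usage sketch for make_global_tag_aspect_with_tag_list above: each
# bare tag name is prefixed with "urn:li:tag:". The tag names are illustrative.
example_aspect = make_global_tag_aspect_with_tag_list(["Legacy", "NeedsDocumentation"])
assert [assoc.tag for assoc in example_aspect.tags] == [
    "urn:li:tag:Legacy",
    "urn:li:tag:NeedsDocumentation",
]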
def __init__(self, config: SimpleDatasetTagConfig, ctx: PipelineContext):
    tags = [TagAssociationClass(tag=tag) for tag in config.tag_urns]
    generic_config = AddDatasetTagsConfig(get_tags_to_add=lambda _: tags)
    super().__init__(generic_config, ctx)
def emit(self) -> Generator[SchemaField, None, None]:
    if (
        not isinstance(
            self._actual_schema,
            (
                avro.schema.ArraySchema,
                avro.schema.Field,
                avro.schema.MapSchema,
                avro.schema.RecordSchema,
            ),
        )
        and self._converter._fields_stack
    ):
        # We are in the context of a non-nested (simple) field or the
        # special-cased union.
        yield from self._converter._gen_from_last_field()
    else:
        # Just emit the SchemaField from the schema provided in the ctor.
        schema = self._schema
        actual_schema = self._actual_schema
        if isinstance(schema, avro.schema.Field):
            # A Field's schema is actually its type.
            schema = schema.type
            actual_schema = self._converter._get_underlying_type_if_option_as_union(
                schema, schema
            )

        description = self._description
        if description is None:
            description = schema.props.get("doc", None)

        native_data_type = self._converter._prefix_name_stack[-1]
        if isinstance(schema, (avro.schema.Field, avro.schema.UnionSchema)):
            native_data_type = self._converter._prefix_name_stack[-2]
        type_prefix = "[type="
        if native_data_type.startswith(type_prefix):
            # Strip the "[type=" prefix and the trailing "]".
            native_data_type = native_data_type[len(type_prefix):-1]
        native_data_type = actual_schema.props.get("native_data_type", native_data_type)

        field_path = self._converter._get_cur_field_path()
        merged_props = {}
        merged_props.update(self._schema.other_props)
        merged_props.update(schema.other_props)

        tags = None
        if "deprecated" in merged_props:
            description = (
                f'<span style="color:red">DEPRECATED: {merged_props["deprecated"]}</span>\n'
                + description
            )
            tags = GlobalTagsClass(
                tags=[TagAssociationClass(tag="urn:li:tag:Deprecated")]
            )

        field = SchemaField(
            fieldPath=field_path,
            # Populate it with the simple native type for now.
            nativeDataType=native_data_type,
            type=self._converter._get_column_type(
                actual_schema.type, actual_schema.props.get("logicalType")
            ),
            description=description,
            recursive=False,
            nullable=self._converter._is_nullable(schema),
            isPartOfKey=self._converter._is_key_schema,
            globalTags=tags,
            jsonProps=json.dumps(merged_props) if merged_props else None,
        )
        yield field
def get_workunits(self) -> Iterable[MetadataWorkUnit]:
    (
        nodes,
        manifest_schema,
        manifest_version,
        catalog_schema,
        catalog_version,
    ) = loadManifestAndCatalog(
        self.config.manifest_path,
        self.config.catalog_path,
        self.config.sources_path,
        self.config.load_schemas,
        self.config.use_identifiers,
        self.config.tag_prefix,
        self.config.target_platform,
        self.config.env,
        self.config.node_type_pattern,
        self.report,
    )
    additional_custom_props = {
        "manifest_schema": manifest_schema,
        "manifest_version": manifest_version,
        "catalog_schema": catalog_schema,
        "catalog_version": catalog_version,
    }
    additional_custom_props_filtered = {
        key: value
        for key, value in additional_custom_props.items()
        if value is not None
    }
    for node in nodes:
        dataset_snapshot = DatasetSnapshot(urn=node.datahub_urn, aspects=[])
        description = None
        if node.comment and node.description and node.comment != node.description:
            description = (
                f"{self.config.target_platform} comment: {node.comment}\n\n"
                f"dbt model description: {node.description}"
            )
        elif node.comment:
            description = node.comment
        elif node.description:
            description = node.description
        custom_props = {
            **get_custom_properties(node),
            **additional_custom_props_filtered,
        }
        dbt_properties = DatasetPropertiesClass(
            description=description,
            customProperties=custom_props,
            tags=node.tags,
        )
        dataset_snapshot.aspects.append(dbt_properties)
        if node.owner:
            owners = [
                OwnerClass(
                    owner=f"urn:li:corpuser:{node.owner}",
                    type=OwnershipTypeClass.DATAOWNER,
                )
            ]
            dataset_snapshot.aspects.append(OwnershipClass(owners=owners))
        if node.tags:
            dataset_snapshot.aspects.append(
                GlobalTagsClass(
                    tags=[
                        TagAssociationClass(f"urn:li:tag:{tag}") for tag in node.tags
                    ]
                )
            )
        upstreams = get_upstream_lineage(node.upstream_urns)
        if upstreams is not None:
            dataset_snapshot.aspects.append(upstreams)
        if self.config.load_schemas:
            schema_metadata = get_schema_metadata(
                self.report, node, self.config.target_platform
            )
            dataset_snapshot.aspects.append(schema_metadata)
        mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
        wu = MetadataWorkUnit(id=dataset_snapshot.urn, mce=mce)
        self.report.report_workunit(wu)
        yield wu
        rawSchema="__insert raw schema here__"),
    fields=[
        SchemaFieldClass(
            fieldPath="address.zipcode",
            type=SchemaFieldDataTypeClass(type=StringTypeClass()),
            # Use this to provide the type of the field in the source
            # system's vernacular.
            nativeDataType="VARCHAR(100)",
            jsonPath="",  # Unused field, can omit
            nullable=True,
            description=(
                "This is the zipcode of the address. Specified using extended "
                "form and limited to addresses in the United States"
            ),
            recursive=False,  # Unused field, can omit
            # It is rare to attach tags to fields as part of the technical
            # schema unless you are purely reflecting state that exists in the
            # source system. For an editable (in-UI) version of this, use the
            # editableSchemaMetadata aspect.
            globalTags=GlobalTagsClass(
                tags=[TagAssociationClass(tag=make_tag_urn("location"))]
            ),
            # The same caveat applies to glossary terms: prefer
            # editableSchemaMetadata unless you are reflecting source state.
            glossaryTerms=GlossaryTermsClass(
                terms=[
                    GlossaryTermAssociationClass(
                        urn=make_term_urn("Classification.PII")
                    )
                ],
                # Represents the time when this term was attached to this field.
                auditStamp=AuditStampClass(
                    # Time in milliseconds; leave as 0 if the time of
                    # association is unknown.
                    time=0,
                    # If this is a system-provided tag, use a bot user id
                    # like "ingestion".
                    actor="urn:li:corpuser:ingestion",
                ),
            ),
# First we get the current editable schema metadata
gms_endpoint = "http://localhost:8080"
graph = DataHubGraph(DatahubClientConfig(server=gms_endpoint))

current_editable_schema_metadata = graph.get_aspect_v2(
    entity_urn=dataset_urn,
    aspect="editableSchemaMetadata",
    aspect_type=EditableSchemaMetadataClass,
)

# Some pre-built objects to help all the conditional pathways
tag_association_to_add = TagAssociationClass(tag=tag_to_add)
tags_aspect_to_set = GlobalTagsClass(tags=[tag_association_to_add])
field_info_to_set = EditableSchemaFieldInfoClass(
    fieldPath=column, globalTags=tags_aspect_to_set
)

need_write = False
field_match = False
if current_editable_schema_metadata:
    for fieldInfo in current_editable_schema_metadata.editableSchemaFieldInfo:
        if get_simple_field_path_from_v2_field_path(fieldInfo.fieldPath) == column:
            # We have some editable schema metadata for this field.
            field_match = True
            if fieldInfo.globalTags:
                if tag_to_add not in [x.tag for x in fieldInfo.globalTags.tags]:
                    # The tag is not present on the field yet, so add it.
                    fieldInfo.globalTags.tags.append(tag_association_to_add)
                    need_write = True
# First we get the current tags
gms_endpoint = "http://localhost:8080"
graph = DataHubGraph(DatahubClientConfig(server=gms_endpoint))

dataset_urn = make_dataset_urn(platform="hive", name="realestate_db.sales", env="PROD")

current_tags: Optional[GlobalTagsClass] = graph.get_aspect_v2(
    entity_urn=dataset_urn,
    aspect="globalTags",
    aspect_type=GlobalTagsClass,
)

tag_to_add = make_tag_urn("purchase")
tag_association_to_add = TagAssociationClass(tag=tag_to_add)

need_write = False
if current_tags:
    if tag_to_add not in [x.tag for x in current_tags.tags]:
        # Tags exist, but this tag is not present in the current tags.
        current_tags.tags.append(TagAssociationClass(tag_to_add))
        need_write = True
else:
    # Create a brand new tags aspect.
    current_tags = GlobalTagsClass(tags=[tag_association_to_add])
    need_write = True

if need_write:
    event: MetadataChangeProposalWrapper = MetadataChangeProposalWrapper(
        entityType="dataset",
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=dataset_urn,
        aspectName="globalTags",
        aspect=current_tags,
    )
def test_mark_status_dataset(tmp_path):
    dataset = make_generic_dataset()
    transformer = MarkDatasetStatus.create(
        {"removed": True},
        PipelineContext(run_id="test"),
    )
    removed = list(
        transformer.transform(
            [
                RecordEnvelope(dataset, metadata={}),
            ]
        )
    )
    assert len(removed) == 1
    status_aspect = builder.get_aspect_if_available(
        removed[0].record, models.StatusClass
    )
    assert status_aspect
    assert status_aspect.removed is True

    transformer = MarkDatasetStatus.create(
        {"removed": False},
        PipelineContext(run_id="test"),
    )
    not_removed = list(
        transformer.transform(
            [
                RecordEnvelope(dataset, metadata={}),
            ]
        )
    )
    assert len(not_removed) == 1
    status_aspect = builder.get_aspect_if_available(
        not_removed[0].record, models.StatusClass
    )
    assert status_aspect
    assert status_aspect.removed is False

    mcp = make_generic_dataset_mcp(
        aspect_name="datasetProperties",
        aspect=DatasetPropertiesClass(description="Test dataset"),
    )
    events_file = create_and_run_test_pipeline(
        events=[mcp],
        transformers=[{"type": "mark_dataset_status", "config": {"removed": True}}],
        path=tmp_path,
    )

    # Assert the datasetProperties aspect was preserved.
    assert (
        tests.test_helpers.mce_helpers.assert_for_each_entity(
            entity_type="dataset",
            aspect_name="datasetProperties",
            aspect_field_matcher={"description": "Test dataset"},
            file=events_file,
        )
        == 1
    )

    # Assert the status aspect was generated.
    assert (
        tests.test_helpers.mce_helpers.assert_for_each_entity(
            entity_type="dataset",
            aspect_name="status",
            aspect_field_matcher={"removed": True},
            file=events_file,
        )
        == 1
    )

    # MCE only
    test_aspect = DatasetPropertiesClass(description="Test dataset")
    events_file = create_and_run_test_pipeline(
        events=[make_generic_dataset(aspects=[test_aspect])],
        transformers=[{"type": "mark_dataset_status", "config": {"removed": True}}],
        path=tmp_path,
    )

    # Assert the datasetProperties aspect was preserved.
    assert (
        tests.test_helpers.mce_helpers.assert_entity_mce_aspect(
            entity_urn=mcp.entityUrn or "",
            aspect=test_aspect,
            aspect_type=DatasetPropertiesClass,
            file=events_file,
        )
        == 1
    )

    # Assert the status aspect was generated.
    assert (
        tests.test_helpers.mce_helpers.assert_for_each_entity(
            entity_type="dataset",
            aspect_name="status",
            aspect_field_matcher={"removed": True},
            file=events_file,
        )
        == 1
    )

    # MCE (non-matching) + MCP (matching)
    test_aspect = DatasetPropertiesClass(description="Test dataset")
    events_file = create_and_run_test_pipeline(
        events=[
            make_generic_dataset(aspects=[test_aspect]),
            make_generic_dataset_mcp(),
        ],
        transformers=[{"type": "mark_dataset_status", "config": {"removed": True}}],
        path=tmp_path,
    )

    # Assert the datasetProperties aspect was preserved.
    assert (
        tests.test_helpers.mce_helpers.assert_entity_mce_aspect(
            entity_urn=mcp.entityUrn or "",
            aspect=test_aspect,
            aspect_type=DatasetPropertiesClass,
            file=events_file,
        )
        == 1
    )

    # Assert the status aspect was generated.
    assert (
        tests.test_helpers.mce_helpers.assert_for_each_entity(
            entity_type="dataset",
            aspect_name="status",
            aspect_field_matcher={"removed": True},
            file=events_file,
        )
        == 1
    )

    # MCE (matching) + MCP (non-matching)
    test_status_aspect = StatusClass(removed=False)
    events_file = create_and_run_test_pipeline(
        events=[
            make_generic_dataset(aspects=[test_status_aspect]),
            make_generic_dataset_mcp(
                aspect_name="datasetProperties",
                aspect=DatasetPropertiesClass(description="test dataset"),
            ),
        ],
        transformers=[{"type": "mark_dataset_status", "config": {"removed": True}}],
        path=tmp_path,
    )

    # Assert the MCE was transformed.
    assert (
        tests.test_helpers.mce_helpers.assert_entity_mce_aspect(
            entity_urn=mcp.entityUrn or "",
            aspect=StatusClass(removed=True),
            aspect_type=StatusClass,
            file=events_file,
        )
        == 1
    )

    # Assert the MCP aspect was preserved.
    assert (
        tests.test_helpers.mce_helpers.assert_for_each_entity(
            entity_type="dataset",
            aspect_name="datasetProperties",
            aspect_field_matcher={"description": "test dataset"},
            file=events_file,
        )
        == 1
    )

    # MCE (non-matching) + MCP (non-matching)
    test_mcp_aspect = GlobalTagsClass(tags=[TagAssociationClass(tag="urn:li:tag:test")])
    test_dataset_props_aspect = DatasetPropertiesClass(description="Test dataset")
    events_file = create_and_run_test_pipeline(
        events=[
            make_generic_dataset(aspects=[test_dataset_props_aspect]),
            make_generic_dataset_mcp(aspect_name="globalTags", aspect=test_mcp_aspect),
        ],
        transformers=[{"type": "mark_dataset_status", "config": {"removed": True}}],
        path=tmp_path,
    )

    # Assert the MCE was preserved.
    assert (
        tests.test_helpers.mce_helpers.assert_entity_mce_aspect(
            entity_urn=mcp.entityUrn or "",
            aspect=test_dataset_props_aspect,
            aspect_type=DatasetPropertiesClass,
            file=events_file,
        )
        == 1
    )

    # Assert the MCP globalTags aspect was preserved.
    assert (
        tests.test_helpers.mce_helpers.assert_for_each_entity(
            entity_type="dataset",
            aspect_name="globalTags",
            aspect_field_matcher={"tags": [{"tag": "urn:li:tag:test"}]},
            file=events_file,
        )
        == 1
    )

    # Assert the MCP status aspect was generated.
    assert (
        tests.test_helpers.mce_helpers.assert_for_each_entity(
            entity_type="dataset",
            aspect_name="status",
            aspect_field_matcher={"removed": True},
            file=events_file,
        )
        == 1
    )
def test_mcp_multiple_transformers_replace(mock_time, tmp_path):
    mcps: MutableSequence[
        Union[MetadataChangeEventClass, MetadataChangeProposalWrapper]
    ] = [
        MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=str(
                DatasetUrn.create_from_ids(
                    platform_id="elasticsearch",
                    table_name=f"fooBarIndex{i}",
                    env="PROD",
                )
            ),
            aspectName="globalTags",
            aspect=GlobalTagsClass(tags=[TagAssociationClass(tag="urn:li:tag:Test")]),
        )
        for i in range(0, 10)
    ]
    mcps.extend(
        [
            MetadataChangeProposalWrapper(
                entityType="dataset",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=str(
                    DatasetUrn.create_from_ids(
                        platform_id="elasticsearch",
                        table_name=f"fooBarIndex{i}",
                        env="PROD",
                    )
                ),
                aspectName="datasetProperties",
                aspect=DatasetPropertiesClass(description="test dataset"),
            )
            for i in range(0, 10)
        ]
    )

    # Shuffle the MCPs so the result does not depend on input ordering.
    import random

    random.shuffle(mcps)

    events_file = create_and_run_test_pipeline(
        events=list(mcps),
        transformers=[
            {
                "type": "set_dataset_browse_path",
                "config": {
                    "path_templates": ["/ENV/PLATFORM/EsComments/DATASET_PARTS"]
                },
            },
            {
                "type": "simple_add_dataset_tags",
                "config": {"tag_urns": ["urn:li:tag:EsComments"]},
            },
        ],
        path=tmp_path,
    )

    urn_pattern = "^" + re.escape(
        "urn:li:dataset:(urn:li:dataPlatform:elasticsearch,fooBarIndex"
    )

    # There should be 30 MCPs in total.
    assert (
        tests.test_helpers.mce_helpers.assert_mcp_entity_urn(
            filter="ALL",
            entity_type="dataset",
            regex_pattern=urn_pattern,
            file=events_file,
        )
        == 30
    )

    # 10 globalTags aspects, each with the new tag attached.
    assert (
        tests.test_helpers.mce_helpers.assert_for_each_entity(
            entity_type="dataset",
            aspect_name="globalTags",
            aspect_field_matcher={
                "tags": [{"tag": "urn:li:tag:Test"}, {"tag": "urn:li:tag:EsComments"}]
            },
            file=events_file,
        )
        == 10
    )

    # Check the browsePaths aspect for each dataset. Note: the original
    # comparison discarded its result; it needs an assert to actually check.
    for i in range(0, 10):
        assert (
            tests.test_helpers.mce_helpers.assert_entity_mcp_aspect(
                entity_urn=str(
                    DatasetUrn.create_from_ids(
                        platform_id="elasticsearch",
                        table_name=f"fooBarIndex{i}",
                        env="PROD",
                    )
                ),
                aspect_name="browsePaths",
                aspect_field_matcher={
                    "paths": [f"/prod/elasticsearch/EsComments/fooBarIndex{i}"]
                },
                file=events_file,
            )
            == 1
        )
def generate_tags_aspect(self) -> Iterable[GlobalTagsClass]:
    tags = GlobalTagsClass(
        tags=[
            TagAssociationClass(tag=builder.make_tag_urn(tag))
            for tag in (self.tags or [])
        ]
    )
    return [tags]
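import types

# A usage sketch for generate_tags_aspect above, binding it to a throwaway
# namespace object (hypothetical; in the source it is a method on an entity
# class): a None tag list degrades to an empty GlobalTagsClass rather than
# raising.
holder = types.SimpleNamespace(tags=None)
assert generate_tags_aspect(holder)[0].tags == []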
import logging

from datahub.emitter.mce_builder import make_dataset_urn, make_tag_urn
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DatahubRestEmitter

# Imports for metadata model classes
from datahub.metadata.schema_classes import (
    ChangeTypeClass,
    GlobalTagsClass,
    TagAssociationClass,
)

log = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

dataset_urn = make_dataset_urn(platform="hive", name="realestate_db.sales", env="PROD")
tag_urn = make_tag_urn("purchase")

event: MetadataChangeProposalWrapper = MetadataChangeProposalWrapper(
    entityType="dataset",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn=dataset_urn,
    aspectName="globalTags",
    aspect=GlobalTagsClass(tags=[TagAssociationClass(tag=tag_urn)]),
)

# Create the REST emitter and send the event.
rest_emitter = DatahubRestEmitter(gms_server="http://localhost:8080")
rest_emitter.emit(event)
log.info(f"Set tags to {tag_urn} for dataset {dataset_urn}")