def test_pattern_dataset_terms_transformation(mock_time):
    """Check that PatternAddDatasetTerms attaches the expected glossary terms.

    A transformer is configured with two regex rules and run over a single
    generic dataset record. The test expects exactly two terms on the
    resulting GlossaryTerms aspect, the first being "AccountBalance", and no
    term coming from the ``.*example2.*`` rule.
    """
    dataset_mce = make_generic_dataset()

    transformer = PatternAddDatasetTerms.create(
        {
            "term_pattern": {
                "rules": {
                    ".*example1.*": [
                        builder.make_term_urn("AccountBalance"),
                        builder.make_term_urn("Email"),
                    ],
                    ".*example2.*": [builder.make_term_urn("Address")],
                }
            },
        },
        PipelineContext(run_id="test-terms"),
    )

    outputs = list(
        transformer.transform(
            [RecordEnvelope(record, metadata={}) for record in [dataset_mce]]
        )
    )
    assert len(outputs) == 1

    # Check that glossary terms were added.
    terms_aspect = builder.get_aspect_if_available(
        outputs[0].record, models.GlossaryTermsClass
    )
    assert terms_aspect
    assert len(terms_aspect.terms) == 2
    assert terms_aspect.terms[0].urn == builder.make_term_urn("AccountBalance")

    # The aspect holds association objects (each exposing ``.urn``), so the
    # negative check has to be made against the URNs themselves; comparing a
    # URN string with the association objects would pass unconditionally.
    attached_urns = [association.urn for association in terms_aspect.terms]
    assert builder.make_term_urn("Address") not in attached_urns
def test_simple_dataset_terms_transformation(mock_time):
    """Check that SimpleAddDatasetTerms emits the configured glossary terms.

    The transformer is fed one generic dataset record followed by an
    EndOfStream marker. The test expects three outputs and reads the terms
    aspect from the second one.
    """
    dataset_mce = make_generic_dataset()

    term_urns = [
        builder.make_term_urn("Test"),
        builder.make_term_urn("Needs Review"),
    ]
    transformer = SimpleAddDatasetTerms.create(
        {"term_urns": term_urns},
        PipelineContext(run_id="test-terms"),
    )

    stream = [dataset_mce, EndOfStream()]
    envelopes = [RecordEnvelope(item, metadata={}) for item in stream]
    outputs = list(transformer.transform(envelopes))
    assert len(outputs) == 3

    # Check that glossary terms were added.
    terms_aspect = outputs[1].record.aspect
    assert terms_aspect
    assert len(terms_aspect.terms) == 2
    first_term = terms_aspect.terms[0]
    assert first_term.urn == builder.make_term_urn("Test")
def test_pattern_dataset_tags_transformation(mock_time):
    """Check that PatternAddDatasetTags attaches the expected tags.

    A transformer is configured with two regex rules and run over a single
    generic dataset record. The test expects exactly two tags on the
    resulting GlobalTags aspect, the first being "Private", and no tag coming
    from the ``.*example2.*`` rule.
    """
    dataset_mce = make_generic_dataset()

    transformer = PatternAddDatasetTags.create(
        {
            "tag_pattern": {
                "rules": {
                    ".*example1.*": [
                        builder.make_tag_urn("Private"),
                        builder.make_tag_urn("Legacy"),
                    ],
                    # A tag rule must carry tag URNs; a term URN here would
                    # make the negative assertion below unable to ever fail.
                    ".*example2.*": [builder.make_tag_urn("Needs Documentation")],
                }
            },
        },
        PipelineContext(run_id="test-tags"),
    )

    outputs = list(
        transformer.transform(
            [RecordEnvelope(record, metadata={}) for record in [dataset_mce]]
        )
    )
    assert len(outputs) == 1

    # Check that tags were added.
    tags_aspect = builder.get_aspect_if_available(
        outputs[0].record, models.GlobalTagsClass
    )
    assert tags_aspect
    assert len(tags_aspect.tags) == 2
    assert tags_aspect.tags[0].tag == builder.make_tag_urn("Private")

    # The aspect holds association objects (each exposing ``.tag``), so the
    # negative check is made against the tag URNs rather than the objects.
    attached_tags = [association.tag for association in tags_aspect.tags]
    assert builder.make_tag_urn("Needs Documentation") not in attached_tags
def get_operation_value(
    self,
    operation_key: str,
    operation_type: str,
    operation_config: Dict,
    raw_props: Dict,
) -> Optional[str]:
    """Resolve the value produced by one configured operation.

    Args:
        operation_key: Key into ``raw_props`` for the property being handled.
        operation_type: One of the ``Constants.ADD_*_OPERATION`` values.
        operation_config: Configuration of the operation; the entry read
            depends on the operation type (tag, owner type or term).
        raw_props: Raw properties; ``raw_props[operation_key]`` supplies the
            owner id for owner operations.

    Returns:
        The (optionally prefixed) tag, an owner URN, a term URN, or ``None``
        when no branch applies or the owner type is not recognised.
    """
    # Tag operation: the configured tag, with the optional prefix applied.
    if (
        operation_type == Constants.ADD_TAG_OPERATION
        and operation_config[Constants.TAG]
    ):
        configured_tag = operation_config[Constants.TAG]
        if not self.tag_prefix:
            return configured_tag
        return self.tag_prefix + configured_tag

    # Owner operation: the owner id comes from the raw property value.
    if (
        operation_type == Constants.ADD_OWNER_OPERATION
        and operation_config[Constants.OWNER_TYPE]
    ):
        owner = raw_props[operation_key]
        if self.strip_owner_email_id:
            owner = self.sanitize_owner_ids(owner)

        owner_kind = operation_config[Constants.OWNER_TYPE]
        if owner_kind == Constants.USER_OWNER:
            return mce_builder.make_owner_urn(owner, OwnerType.USER)
        if owner_kind == Constants.GROUP_OWNER:
            return mce_builder.make_owner_urn(owner, OwnerType.GROUP)
        # Unrecognised owner type: nothing to emit.
        return None

    # Term operation: the configured term turned into a URN.
    if (
        operation_type == Constants.ADD_TERM_OPERATION
        and operation_config[Constants.TERM]
    ):
        return mce_builder.make_term_urn(operation_config[Constants.TERM])

    return None
def test_pattern_dataset_tags_transformation(mock_time):
    """Check that PatternAddDatasetTags emits the expected tags aspect.

    The transformer is fed one generic dataset record followed by an
    EndOfStream marker. The test expects three outputs, reads the tags aspect
    from the second one, and expects exactly two tags on it, the first being
    "Private" and none coming from the ``.*example2.*`` rule.
    """
    dataset_mce = make_generic_dataset()

    transformer = PatternAddDatasetTags.create(
        {
            "tag_pattern": {
                "rules": {
                    ".*example1.*": [
                        builder.make_tag_urn("Private"),
                        builder.make_tag_urn("Legacy"),
                    ],
                    # A tag rule must carry tag URNs; a term URN here would
                    # make the negative assertion below unable to ever fail.
                    ".*example2.*": [builder.make_tag_urn("Needs Documentation")],
                }
            },
        },
        PipelineContext(run_id="test-tags"),
    )

    outputs = list(
        transformer.transform(
            [
                RecordEnvelope(record, metadata={})
                for record in [dataset_mce, EndOfStream()]
            ]
        )
    )
    assert len(outputs) == 3

    # Check that tags were added.
    tags_aspect = outputs[1].record.aspect
    assert tags_aspect
    assert len(tags_aspect.tags) == 2
    assert tags_aspect.tags[0].tag == builder.make_tag_urn("Private")

    # The aspect holds association objects (each exposing ``.tag``), so the
    # negative check is made against the tag URNs rather than the objects.
    attached_tags = [association.tag for association in tags_aspect.tags]
    assert builder.make_tag_urn("Needs Documentation") not in attached_tags
def get_operation_value(
    self,
    operation_key: str,
    operation_type: str,
    operation_config: Dict,
    raw_props: Dict,
) -> Optional[str]:
    """Resolve the value produced by one configured operation.

    For tag and term operations, every ``{{ $match }}`` placeholder in the
    configured value is replaced by ``raw_props[operation_key]`` when that
    property value is a string.

    Args:
        operation_key: Key into ``raw_props`` for the property being handled.
        operation_type: One of the ``Constants.ADD_*_OPERATION`` values.
        operation_config: Configuration of the operation; the entry read
            depends on the operation type (tag, owner type or term).
        raw_props: Raw properties; ``raw_props[operation_key]`` supplies the
            placeholder value (tag/term) or the owner id (owner).

    Returns:
        The (optionally prefixed) tag, an owner URN, a term URN, or ``None``
        when no branch applies or the owner type is not recognised.
    """
    match_regexp = r"{{\s*\$match\s*}}"

    def fill_match_placeholder(template: str) -> str:
        """Insert the raw property value for each ``{{ $match }}`` placeholder."""
        matched_value = raw_props[operation_key]
        if not isinstance(matched_value, str):
            return template
        # A callable replacement inserts the value verbatim. Passing the value
        # as a replacement *string* would make re.sub interpret backslashes
        # and group references contained in the data (raising re.error or
        # corrupting the result). ``flags`` is passed by keyword because
        # positional count/flags are deprecated since Python 3.13.
        return re.sub(
            match_regexp,
            lambda _match: matched_value,
            template,
            flags=re.MULTILINE,
        )

    if (
        operation_type == Constants.ADD_TAG_OPERATION
        and operation_config[Constants.TAG]
    ):
        tag = fill_match_placeholder(operation_config[Constants.TAG])
        if self.tag_prefix:
            tag = self.tag_prefix + tag
        return tag
    elif (
        operation_type == Constants.ADD_OWNER_OPERATION
        and operation_config[Constants.OWNER_TYPE]
    ):
        owner_id = raw_props[operation_key]
        if self.strip_owner_email_id:
            owner_id = self.sanitize_owner_ids(owner_id)
        if operation_config[Constants.OWNER_TYPE] == Constants.USER_OWNER:
            return mce_builder.make_owner_urn(owner_id, OwnerType.USER)
        elif operation_config[Constants.OWNER_TYPE] == Constants.GROUP_OWNER:
            return mce_builder.make_owner_urn(owner_id, OwnerType.GROUP)
        # Any other owner type falls through to the final ``return None``.
    elif (
        operation_type == Constants.ADD_TERM_OPERATION
        and operation_config[Constants.TERM]
    ):
        term = fill_match_placeholder(operation_config[Constants.TERM])
        return mce_builder.make_term_urn(term)
    return None
AuditStampClass, ChangeTypeClass, GlossaryTermAssociationClass, GlossaryTermsClass, ) log = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) # First we get the current terms gms_endpoint = "http://localhost:8080" rest_emitter = DatahubRestEmitter(gms_server=gms_endpoint) dataset_urn = make_dataset_urn(platform="hive", name="realestate_db.sales", env="PROD") term_to_add = make_term_urn("Classification.HighlyConfidential") term_association_to_add = GlossaryTermAssociationClass(urn=term_to_add) # an audit stamp that basically says we have no idea when these terms were added to this dataset # change the time value to (time.time() * 1000) if you want to specify the current time of running this code as the time of the application unknown_audit_stamp = AuditStampClass(time=0, actor="urn:li:corpuser:ingestion") # create a brand new terms aspect terms_aspect = GlossaryTermsClass( terms=[term_association_to_add], auditStamp=unknown_audit_stamp, ) event: MetadataChangeProposalWrapper = MetadataChangeProposalWrapper( entityType="dataset", changeType=ChangeTypeClass.UPSERT, entityUrn=dataset_urn,
"VARCHAR(100)", # use this to provide the type of the field in the source system's vernacular jsonPath="", # Unused field, can omit nullable=True, description= "This is the zipcode of the address. Specified using extended form and limited to addresses in the United States", recursive=False, # Unused field, can omit # It is rare to attach tags to fields as part of the technical schema unless you are purely reflecting state that exists in the source system. # For an editable (in UI) version of this, use the editableSchemaMetadata aspect globalTags=GlobalTagsClass( tags=[TagAssociationClass(tag=make_tag_urn("location"))]), # It is rare to attach glossary terms to fields as part of the technical schema unless you are purely reflecting state that exists in the source system. # For an editable (in UI) version of this, use the editableSchemaMetadata aspect glossaryTerms=GlossaryTermsClass( terms=[ GlossaryTermAssociationClass( urn=make_term_urn("Classification.PII")) ], auditStamp= AuditStampClass( # represents the time when this term was attached to this field? time= 0, # time in milliseconds, leave as 0 if no time of association is known actor= "urn:li:corpuser:ingestion", # if this is a system provided tag, use a bot user id like ingestion ), ), ) ], ), ) # Create rest emitter