def test_pattern_dataset_terms_transformation(mock_time):
    """Check that PatternAddDatasetTerms attaches only the matching rule's terms.

    Two regex rules are configured.  The assertions expect the generic
    dataset to receive exactly the two terms of the ``.*example1.*`` rule,
    and none of the terms of the ``.*example2.*`` rule.
    """
    dataset_mce = make_generic_dataset()

    transformer = PatternAddDatasetTerms.create(
        {
            "term_pattern": {
                "rules": {
                    ".*example1.*": [
                        builder.make_term_urn("AccountBalance"),
                        builder.make_term_urn("Email"),
                    ],
                    ".*example2.*": [builder.make_term_urn("Address")],
                }
            },
        },
        PipelineContext(run_id="test-terms"),
    )

    envelopes = [RecordEnvelope(record, metadata={}) for record in [dataset_mce]]
    outputs = list(transformer.transform(envelopes))

    assert len(outputs) == 1
    # Check that glossary terms were added.
    terms_aspect = builder.get_aspect_if_available(outputs[0].record,
                                                   models.GlossaryTermsClass)
    assert terms_aspect
    assert len(terms_aspect.terms) == 2
    assert terms_aspect.terms[0].urn == builder.make_term_urn("AccountBalance")
    # Compare against the URNs of the associations (not the association
    # objects themselves) so the negative check can actually fail: the term
    # from the non-matching rule must not have been attached.
    attached_urns = [association.urn for association in terms_aspect.terms]
    assert builder.make_term_urn("Address") not in attached_urns
def test_simple_dataset_terms_transformation(mock_time):
    """Check that SimpleAddDatasetTerms attaches the configured term URNs.

    The transformer is fed one generic dataset followed by an end-of-stream
    marker; three envelopes are expected back, with the glossary-terms aspect
    carried by the second one.
    """
    config = {
        "term_urns": [
            builder.make_term_urn("Test"),
            builder.make_term_urn("Needs Review"),
        ]
    }
    transformer = SimpleAddDatasetTerms.create(
        config,
        PipelineContext(run_id="test-terms"),
    )

    dataset_mce = make_generic_dataset()
    envelopes = [
        RecordEnvelope(record, metadata={})
        for record in [dataset_mce, EndOfStream()]
    ]
    outputs = list(transformer.transform(envelopes))
    assert len(outputs) == 3

    # The glossary terms arrive as the aspect of the second output record.
    terms_aspect = outputs[1].record.aspect
    assert terms_aspect
    assert len(terms_aspect.terms) == 2
    first_term = terms_aspect.terms[0]
    assert first_term.urn == builder.make_term_urn("Test")
def test_pattern_dataset_tags_transformation(mock_time):
    """Check that PatternAddDatasetTags attaches only the matching rule's tags.

    Two regex rules are configured.  The assertions expect the generic
    dataset to receive exactly the two tags of the ``.*example1.*`` rule,
    and not the tag of the ``.*example2.*`` rule.
    """
    dataset_mce = make_generic_dataset()

    transformer = PatternAddDatasetTags.create(
        {
            "tag_pattern": {
                "rules": {
                    ".*example1.*": [
                        builder.make_tag_urn("Private"),
                        builder.make_tag_urn("Legacy"),
                    ],
                    # Tag rules must carry tag URNs (this was previously
                    # built with make_term_urn by mistake).
                    ".*example2.*":
                    [builder.make_tag_urn("Needs Documentation")],
                }
            },
        },
        PipelineContext(run_id="test-tags"),
    )

    envelopes = [RecordEnvelope(record, metadata={}) for record in [dataset_mce]]
    outputs = list(transformer.transform(envelopes))

    assert len(outputs) == 1
    # Check that tags were added.
    tags_aspect = builder.get_aspect_if_available(outputs[0].record,
                                                  models.GlobalTagsClass)
    assert tags_aspect
    assert len(tags_aspect.tags) == 2
    assert tags_aspect.tags[0].tag == builder.make_tag_urn("Private")
    # Compare against the tag URNs of the associations (not the association
    # objects themselves) so the negative check can actually fail.
    attached_tags = [association.tag for association in tags_aspect.tags]
    assert builder.make_tag_urn("Needs Documentation") not in attached_tags
Exemple #4
0
 def get_operation_value(
     self,
     operation_key: str,
     operation_type: str,
     operation_config: Dict,
     raw_props: Dict,
 ) -> Optional[str]:
     """Resolve the value produced by a single mapping operation.

     Args:
         operation_key: Key used to look up the source value in ``raw_props``
             (only read for owner operations).
         operation_type: One of the ``Constants.ADD_*_OPERATION`` values.
         operation_config: Configuration of the operation; the tag, owner
             type or term entry is read depending on ``operation_type``.
         raw_props: Raw properties the owner id is taken from.

     Returns:
         The tag (with ``self.tag_prefix`` prepended when set), an owner URN,
         or a term URN; ``None`` when the operation type is not handled, its
         config entry is falsy, or the owner type is neither user nor group.
     """
     # Tag operation: the configured tag, optionally prefixed.
     if (operation_type == Constants.ADD_TAG_OPERATION
             and operation_config[Constants.TAG]):
         configured_tag = operation_config[Constants.TAG]
         if not self.tag_prefix:
             return configured_tag
         return self.tag_prefix + configured_tag

     # Owner operation: the owner id comes from the raw properties.
     if (operation_type == Constants.ADD_OWNER_OPERATION
             and operation_config[Constants.OWNER_TYPE]):
         owner = raw_props[operation_key]
         if self.strip_owner_email_id:
             owner = self.sanitize_owner_ids(owner)
         owner_kind = operation_config[Constants.OWNER_TYPE]
         if owner_kind == Constants.USER_OWNER:
             return mce_builder.make_owner_urn(owner, OwnerType.USER)
         if owner_kind == Constants.GROUP_OWNER:
             return mce_builder.make_owner_urn(owner, OwnerType.GROUP)
         # Any other owner type yields no value.
         return None

     # Term operation: wrap the configured term in a term URN.
     if (operation_type == Constants.ADD_TERM_OPERATION
             and operation_config[Constants.TERM]):
         return mce_builder.make_term_urn(operation_config[Constants.TERM])

     return None
def test_pattern_dataset_tags_transformation(mock_time):
    """Check that PatternAddDatasetTags attaches only the matching rule's tags.

    The transformer is fed one generic dataset followed by an end-of-stream
    marker; three envelopes are expected back, with the tags aspect carried
    by the second one.  That aspect must hold exactly the two tags of the
    ``.*example1.*`` rule and not the tag of the ``.*example2.*`` rule.
    """
    dataset_mce = make_generic_dataset()

    transformer = PatternAddDatasetTags.create(
        {
            "tag_pattern": {
                "rules": {
                    ".*example1.*": [
                        builder.make_tag_urn("Private"),
                        builder.make_tag_urn("Legacy"),
                    ],
                    # Tag rules must carry tag URNs (this was previously
                    # built with make_term_urn by mistake).
                    ".*example2.*":
                    [builder.make_tag_urn("Needs Documentation")],
                }
            },
        },
        PipelineContext(run_id="test-tags"),
    )

    envelopes = [
        RecordEnvelope(record, metadata={})
        for record in [dataset_mce, EndOfStream()]
    ]
    outputs = list(transformer.transform(envelopes))

    assert len(outputs) == 3
    tags_aspect = outputs[1].record.aspect
    assert tags_aspect
    assert len(tags_aspect.tags) == 2
    assert tags_aspect.tags[0].tag == builder.make_tag_urn("Private")
    # Compare against the tag URNs of the associations (not the association
    # objects themselves) so the negative check can actually fail.
    attached_tags = [association.tag for association in tags_aspect.tags]
    assert builder.make_tag_urn("Needs Documentation") not in attached_tags
Exemple #6
0
    def get_operation_value(
        self,
        operation_key: str,
        operation_type: str,
        operation_config: Dict,
        raw_props: Dict,
    ) -> Optional[str]:
        """Resolve the value produced by a single mapping operation.

        For tag and term operations, every ``{{ $match }}`` placeholder in the
        configured tag/term is replaced by ``raw_props[operation_key]`` when
        that value is a string.

        Args:
            operation_key: Key used to look up the source value in
                ``raw_props``.
            operation_type: One of the ``Constants.ADD_*_OPERATION`` values.
            operation_config: Configuration of the operation; the tag, owner
                type or term entry is read depending on ``operation_type``.
            raw_props: Raw properties the matched value / owner id is taken
                from.

        Returns:
            The tag (with ``self.tag_prefix`` prepended when set), an owner
            URN, or a term URN; ``None`` when the operation type is not
            handled, its config entry is falsy, or the owner type is neither
            user nor group.
        """
        match_regexp = r"{{\s*\$match\s*}}"

        def expand_match(template: str) -> str:
            """Substitute the raw property value for each ``{{ $match }}``."""
            matched = raw_props[operation_key]
            # Only string values can be spliced into the template.
            if not isinstance(matched, str):
                return template
            # Use a callable replacement so the value is inserted verbatim.
            # A plain replacement string is treated as a template by re.sub,
            # so a backslash in the property value (e.g. "DOMAIN\user") would
            # raise re.error or be interpreted as an escape/group reference.
            return re.sub(
                match_regexp, lambda _m: matched, template, flags=re.MULTILINE
            )

        if (
            operation_type == Constants.ADD_TAG_OPERATION
            and operation_config[Constants.TAG]
        ):
            tag = expand_match(operation_config[Constants.TAG])
            if self.tag_prefix:
                tag = self.tag_prefix + tag
            return tag
        elif (
            operation_type == Constants.ADD_OWNER_OPERATION
            and operation_config[Constants.OWNER_TYPE]
        ):
            owner_id = raw_props[operation_key]
            if self.strip_owner_email_id:
                owner_id = self.sanitize_owner_ids(owner_id)
            if operation_config[Constants.OWNER_TYPE] == Constants.USER_OWNER:
                return mce_builder.make_owner_urn(owner_id, OwnerType.USER)
            elif operation_config[Constants.OWNER_TYPE] == Constants.GROUP_OWNER:
                return mce_builder.make_owner_urn(owner_id, OwnerType.GROUP)
        elif (
            operation_type == Constants.ADD_TERM_OPERATION
            and operation_config[Constants.TERM]
        ):
            term = expand_match(operation_config[Constants.TERM])
            return mce_builder.make_term_urn(term)
        return None
    AuditStampClass,
    ChangeTypeClass,
    GlossaryTermAssociationClass,
    GlossaryTermsClass,
)

# Module-level logger; INFO-level logging is enabled for the script run.
log = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Create the REST emitter, pointed at the server given by gms_endpoint
gms_endpoint = "http://localhost:8080"
rest_emitter = DatahubRestEmitter(gms_server=gms_endpoint)

# URN of the dataset the term is attached to
dataset_urn = make_dataset_urn(platform="hive", name="realestate_db.sales", env="PROD")

# The glossary term to attach, wrapped in an association object
term_to_add = make_term_urn("Classification.HighlyConfidential")
term_association_to_add = GlossaryTermAssociationClass(urn=term_to_add)
# An audit stamp that basically says we have no idea when these terms were added to this dataset.
# Change the time value to (time.time() * 1000) if you want to record the time this code runs as the time the term was applied.
unknown_audit_stamp = AuditStampClass(time=0, actor="urn:li:corpuser:ingestion")

# Create a brand new terms aspect holding only the term above
terms_aspect = GlossaryTermsClass(
    terms=[term_association_to_add],
    auditStamp=unknown_audit_stamp,
)

event: MetadataChangeProposalWrapper = MetadataChangeProposalWrapper(
    entityType="dataset",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn=dataset_urn,
                "VARCHAR(100)",  # use this to provide the type of the field in the source system's vernacular
                jsonPath="",  # Unused field, can omit
                nullable=True,
                description=
                "This is the zipcode of the address. Specified using extended form and limited to addresses in the United States",
                recursive=False,  # Unused field, can omit
                # It is rare to attach tags to fields as part of the technical schema unless you are purely reflecting state that exists in the source system.
                # For an editable (in UI) version of this, use the editableSchemaMetadata aspect
                globalTags=GlobalTagsClass(
                    tags=[TagAssociationClass(tag=make_tag_urn("location"))]),
                # It is rare to attach glossary terms to fields as part of the technical schema unless you are purely reflecting state that exists in the source system.
                # For an editable (in UI) version of this, use the editableSchemaMetadata aspect
                glossaryTerms=GlossaryTermsClass(
                    terms=[
                        GlossaryTermAssociationClass(
                            urn=make_term_urn("Classification.PII"))
                    ],
                    auditStamp=
                    AuditStampClass(  # represents the time when this term was attached to this field?
                        time=
                        0,  # time in milliseconds, leave as 0 if no time of association is known
                        actor=
                        "urn:li:corpuser:ingestion",  # if this is a system provided tag, use a bot user id like ingestion
                    ),
                ),
            )
        ],
    ),
)

# Create rest emitter