Example #1
0
def test_add_dataset_browse_paths():
    dataset = make_generic_dataset()

    transformer = AddDatasetBrowsePathTransformer.create(
        {"path_templates": ["/abc"]},
        PipelineContext(run_id="test"),
    )
    transformed = list(transformer.transform([RecordEnvelope(dataset, metadata={})]))
    browse_path_aspect = builder.get_aspect_if_available(
        transformed[0].record, models.BrowsePathsClass
    )
    assert browse_path_aspect
    assert browse_path_aspect.paths == ["/abc"]

    transformer = AddDatasetBrowsePathTransformer.create(
        {
            "path_templates": [
                "/PLATFORM/foo/DATASET_PARTS/ENV",
                "/ENV/PLATFORM/bar/DATASET_PARTS/",
            ]
        },
        PipelineContext(run_id="test"),
    )
    transformed = list(transformer.transform([RecordEnvelope(dataset, metadata={})]))
    browse_path_aspect = builder.get_aspect_if_available(
        transformed[0].record, models.BrowsePathsClass
    )
    assert browse_path_aspect
    assert browse_path_aspect.paths == [
        "/abc",
        "/bigquery/foo/example1/prod",
        "/prod/bigquery/bar/example1/",
    ]
Example #2
0
def test_simple_dataset_ownership_tranformation(mock_time):
    no_owner_aspect = make_generic_dataset()

    with_owner_aspect = make_dataset_with_owner()

    not_a_dataset = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataJobSnapshotClass(
            urn="urn:li:dataJob:(urn:li:dataFlow:(airflow,dag_abc,PROD),task_456)",
            aspects=[
                models.DataJobInfoClass(
                    name="User Deletions",
                    description="Constructs the fct_users_deleted from logging_events",
                    type=models.AzkabanJobTypeClass.SQL,
                )
            ],
        )
    )

    inputs = [
        no_owner_aspect,
        with_owner_aspect,
        not_a_dataset,
    ]

    transformer = SimpleAddDatasetOwnership.create(
        {
            "owner_urns": [
                builder.make_user_urn("person1"),
                builder.make_user_urn("person2"),
            ]
        },
        PipelineContext(run_id="test"),
    )

    outputs = list(
        transformer.transform([RecordEnvelope(input, metadata={}) for input in inputs])
    )

    assert len(outputs) == len(inputs)

    # Check the first entry.
    first_ownership_aspect = builder.get_aspect_if_available(
        outputs[0].record, models.OwnershipClass
    )
    assert first_ownership_aspect
    assert len(first_ownership_aspect.owners) == 2

    # Check the second entry.
    second_ownership_aspect = builder.get_aspect_if_available(
        outputs[1].record, models.OwnershipClass
    )
    assert second_ownership_aspect
    assert len(second_ownership_aspect.owners) == 3

    # Verify that the third entry is unchanged.
    assert inputs[2] == outputs[2].record
Example #3
0
def test_simple_dataset_tags_transformation(mock_time):
    dataset_mce = make_generic_dataset()

    transformer = SimpleAddDatasetTags.create(
        {
            "tag_urns": [
                builder.make_tag_urn("NeedsDocumentation"),
                builder.make_tag_urn("Legacy"),
            ]
        },
        PipelineContext(run_id="test-tags"),
    )

    outputs = list(
        transformer.transform(
            [RecordEnvelope(input, metadata={}) for input in [dataset_mce]]
        )
    )
    assert len(outputs) == 1

    # Check that tags were added.
    tags_aspect = builder.get_aspect_if_available(
        outputs[0].record, models.GlobalTagsClass
    )
    assert tags_aspect
    assert len(tags_aspect.tags) == 2
    assert tags_aspect.tags[0].tag == builder.make_tag_urn("NeedsDocumentation")
def test_pattern_dataset_tags_transformation(mock_time):
    dataset_mce = make_generic_dataset()

    transformer = PatternAddDatasetTags.create(
        {
            "tag_pattern": {
                "rules": {
                    ".*example1.*": [
                        builder.make_tag_urn("Private"),
                        builder.make_tag_urn("Legacy"),
                    ],
                    ".*example2.*":
                    [builder.make_term_urn("Needs Documentation")],
                }
            },
        },
        PipelineContext(run_id="test-tags"),
    )

    outputs = list(
        transformer.transform(
            [RecordEnvelope(input, metadata={}) for input in [dataset_mce]]))

    assert len(outputs) == 1
    # Check that glossary terms were added.
    tags_aspect = builder.get_aspect_if_available(outputs[0].record,
                                                  models.GlobalTagsClass)
    assert tags_aspect
    assert len(tags_aspect.tags) == 2
    assert tags_aspect.tags[0].tag == builder.make_tag_urn("Private")
    assert builder.make_tag_urn("Needs Documentation") not in tags_aspect.tags
def test_pattern_dataset_terms_transformation(mock_time):
    dataset_mce = make_generic_dataset()

    transformer = PatternAddDatasetTerms.create(
        {
            "term_pattern": {
                "rules": {
                    ".*example1.*": [
                        builder.make_term_urn("AccountBalance"),
                        builder.make_term_urn("Email"),
                    ],
                    ".*example2.*": [builder.make_term_urn("Address")],
                }
            },
        },
        PipelineContext(run_id="test-terms"),
    )

    outputs = list(
        transformer.transform(
            [RecordEnvelope(input, metadata={}) for input in [dataset_mce]]))

    assert len(outputs) == 1
    # Check that glossary terms were added.
    terms_aspect = builder.get_aspect_if_available(outputs[0].record,
                                                   models.GlossaryTermsClass)
    assert terms_aspect
    assert len(terms_aspect.terms) == 2
    assert terms_aspect.terms[0].urn == builder.make_term_urn("AccountBalance")
    assert builder.make_term_urn("AccountBalance") not in terms_aspect.terms
def test_simple_add_dataset_properties(mock_time):
    dataset_mce = make_dataset_with_properties()

    new_properties = {"new-simple-property": "new-value"}
    transformer = SimpleAddDatasetProperties.create(
        {
            "properties": new_properties,
        },
        PipelineContext(run_id="test-simple-properties"),
    )

    outputs = list(
        transformer.transform(
            [RecordEnvelope(input, metadata={}) for input in [dataset_mce]]))
    assert len(outputs) == 1

    custom_properties = builder.get_aspect_if_available(
        outputs[0].record, models.DatasetPropertiesClass)

    print(str(custom_properties))
    assert custom_properties is not None
    assert custom_properties.customProperties == {
        **EXISTING_PROPERTIES,
        **new_properties,
    }
def test_pattern_dataset_ownership_with_type_transformation(mock_time):
    input = make_generic_dataset()

    transformer = PatternAddDatasetOwnership.create(
        {
            "owner_pattern": {
                "rules": {
                    ".*example1.*": [builder.make_user_urn("person1")],
                }
            },
            "ownership_type": "PRODUCER",
        },
        PipelineContext(run_id="test"),
    )

    output = list(transformer.transform([RecordEnvelope(input, metadata={})]))

    assert len(output) == 1

    ownership_aspect = builder.get_aspect_if_available(output[0].record,
                                                       models.OwnershipClass)
    assert ownership_aspect
    assert len(ownership_aspect.owners) == 1
    assert ownership_aspect.owners[
        0].type == models.OwnershipTypeClass.PRODUCER
def test_old_transformers_working_as_before(mock_time):

    dataset_mce = make_generic_dataset()
    dataset_mcp = make_generic_dataset_mcp()
    transformer = OldMCETransformer.create(
        {},
        PipelineContext(run_id="test-old-transformer"),
    )

    outputs = list(
        transformer.transform([
            RecordEnvelope(input, metadata={})
            for input in [dataset_mce, dataset_mcp,
                          EndOfStream()]
        ]))

    assert len(outputs) == 3  # MCP will come back untouched

    assert outputs[0].record == dataset_mce
    # Check that glossary terms were added.
    props_aspect = builder.get_aspect_if_available(outputs[0].record,
                                                   DatasetPropertiesClass)
    assert props_aspect
    assert props_aspect.description == "Old Transformer was here"

    assert outputs[1].record == dataset_mcp

    assert isinstance(outputs[-1].record, EndOfStream)

    # MCP only stream
    dataset_mcps = [
        make_generic_dataset_mcp(),
        make_generic_dataset_mcp(aspect=DatasetPropertiesClass(
            description="Another test MCP")),
        EndOfStream(),
    ]
    transformer = OldMCETransformer.create(
        {},
        PipelineContext(run_id="test-old-transformer"),
    )

    outputs = list(
        transformer.transform(
            [RecordEnvelope(input, metadata={}) for input in dataset_mcps]))

    assert len(outputs) == 3  # MCP-s will come back untouched

    assert outputs[0].record == dataset_mcps[0]
    assert outputs[1].record == dataset_mcps[1]
    assert isinstance(outputs[-1].record, EndOfStream)
Example #9
0
def test_mark_status_dataset():
    dataset = make_generic_dataset()

    transformer = MarkDatasetStatus.create(
        {"removed": True},
        PipelineContext(run_id="test"),
    )
    removed = list(transformer.transform([RecordEnvelope(dataset, metadata={})]))
    status_aspect = builder.get_aspect_if_available(
        removed[0].record, models.StatusClass
    )
    assert status_aspect
    assert status_aspect.removed is True

    transformer = MarkDatasetStatus.create(
        {"removed": False},
        PipelineContext(run_id="test"),
    )
    not_removed = list(transformer.transform([RecordEnvelope(dataset, metadata={})]))
    status_aspect = builder.get_aspect_if_available(
        not_removed[0].record, models.StatusClass
    )
    assert status_aspect
    assert status_aspect.removed is False
def test_simple_remove_dataset_ownership():
    with_owner_aspect = make_dataset_with_owner()

    transformer = SimpleRemoveDatasetOwnership.create(
        {},
        PipelineContext(run_id="test"),
    )
    outputs = list(
        transformer.transform([RecordEnvelope(with_owner_aspect,
                                              metadata={})]))

    ownership_aspect = builder.get_aspect_if_available(outputs[0].record,
                                                       models.OwnershipClass)
    assert ownership_aspect
    assert len(ownership_aspect.owners) == 0
Example #11
0
 def get_lineage_if_enabled(
     self, mce: MetadataChangeEventClass
 ) -> Optional[MetadataChangeProposalWrapper]:
     if self.source_config.emit_s3_lineage:
         # extract dataset properties aspect
         dataset_properties: Optional[
             DatasetPropertiesClass] = mce_builder.get_aspect_if_available(
                 mce, DatasetPropertiesClass)
         if dataset_properties and "Location" in dataset_properties.customProperties:
             location = dataset_properties.customProperties["Location"]
             if location.startswith("s3://"):
                 s3_dataset_urn = make_s3_urn(location,
                                              self.source_config.env)
                 if self.source_config.glue_s3_lineage_direction == "upstream":
                     upstream_lineage = UpstreamLineageClass(upstreams=[
                         UpstreamClass(
                             dataset=s3_dataset_urn,
                             type=DatasetLineageTypeClass.COPY,
                         )
                     ])
                     mcp = MetadataChangeProposalWrapper(
                         entityType="dataset",
                         entityUrn=mce.proposedSnapshot.urn,
                         changeType=ChangeTypeClass.UPSERT,
                         aspectName="upstreamLineage",
                         aspect=upstream_lineage,
                     )
                     return mcp
                 else:
                     # Need to mint the s3 dataset with upstream lineage from it to glue
                     upstream_lineage = UpstreamLineageClass(upstreams=[
                         UpstreamClass(
                             dataset=mce.proposedSnapshot.urn,
                             type=DatasetLineageTypeClass.COPY,
                         )
                     ])
                     mcp = MetadataChangeProposalWrapper(
                         entityType="dataset",
                         entityUrn=s3_dataset_urn,
                         changeType=ChangeTypeClass.UPSERT,
                         aspectName="upstreamLineage",
                         aspect=upstream_lineage,
                     )
                     return mcp
     return None
def test_simple_dataset_terms_transformation(mock_time):
    dataset_mce = make_generic_dataset()

    transformer = SimpleAddDatasetTerms.create(
        {
            "term_urns": [
                builder.make_term_urn("Test"),
                builder.make_term_urn("Needs Review"),
            ]
        },
        PipelineContext(run_id="test-terms"),
    )

    outputs = list(
        transformer.transform(
            [RecordEnvelope(input, metadata={}) for input in [dataset_mce]]))
    assert len(outputs) == 1

    # Check that glossary terms were added.
    terms_aspect = builder.get_aspect_if_available(outputs[0].record,
                                                   models.GlossaryTermsClass)
    assert terms_aspect
    assert len(terms_aspect.terms) == 2
    assert terms_aspect.terms[0].urn == builder.make_term_urn("Test")
def test_add_dataset_properties(mock_time):
    dataset_mce = make_dataset_with_properties()

    transformer = AddDatasetProperties.create(
        {
            "add_properties_resolver_class":
            "tests.unit.test_transform_dataset.DummyPropertiesResolverClass"
        },
        PipelineContext(run_id="test-properties"),
    )

    outputs = list(
        transformer.transform(
            [RecordEnvelope(input, metadata={}) for input in [dataset_mce]]))
    assert len(outputs) == 1

    custom_properties = builder.get_aspect_if_available(
        outputs[0].record, models.DatasetPropertiesClass)

    assert custom_properties is not None
    assert custom_properties.customProperties == {
        **EXISTING_PROPERTIES,
        **PROPERTIES_TO_ADD,
    }
Example #14
0
def test_pattern_dataset_ownership_tranformation(mock_time):
    no_owner_aspect = make_generic_dataset()

    with_owner_aspect = models.MetadataChangeEventClass(
        proposedSnapshot=models.DatasetSnapshotClass(
            urn="urn:li:dataset:(urn:li:dataPlatform:bigquery,example2,PROD)",
            aspects=[
                models.OwnershipClass(
                    owners=[
                        models.OwnerClass(
                            owner=builder.make_user_urn("fake_owner"),
                            type=models.OwnershipTypeClass.DATAOWNER,
                        ),
                    ],
                    lastModified=models.AuditStampClass(
                        time=1625266033123, actor="urn:li:corpuser:datahub"
                    ),
                )
            ],
        ),
    )

    not_a_dataset = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataJobSnapshotClass(
            urn="urn:li:dataJob:(urn:li:dataFlow:(airflow,dag_abc,PROD),task_456)",
            aspects=[
                models.DataJobInfoClass(
                    name="User Deletions",
                    description="Constructs the fct_users_deleted from logging_events",
                    type=models.AzkabanJobTypeClass.SQL,
                )
            ],
        )
    )

    inputs = [
        no_owner_aspect,
        with_owner_aspect,
        not_a_dataset,
    ]

    transformer = PatternAddDatasetOwnership.create(
        {
            "owner_pattern": {
                "rules": {
                    ".*example1.*": [builder.make_user_urn("person1")],
                    ".*example2.*": [builder.make_user_urn("person2")],
                }
            },
        },
        PipelineContext(run_id="test"),
    )

    outputs = list(
        transformer.transform([RecordEnvelope(input, metadata={}) for input in inputs])
    )

    assert len(outputs) == len(inputs)

    # Check the first entry.
    first_ownership_aspect = builder.get_aspect_if_available(
        outputs[0].record, models.OwnershipClass
    )
    assert first_ownership_aspect
    assert len(first_ownership_aspect.owners) == 1

    # Check the second entry.
    second_ownership_aspect = builder.get_aspect_if_available(
        outputs[1].record, models.OwnershipClass
    )
    assert second_ownership_aspect
    assert len(second_ownership_aspect.owners) == 2

    # Verify that the third entry is unchanged.
    assert inputs[2] == outputs[2].record
def test_add_dataset_browse_paths():
    dataset = make_generic_dataset()

    transformer = AddDatasetBrowsePathTransformer.create(
        {"path_templates": ["/abc"]},
        PipelineContext(run_id="test"),
    )
    transformed = list(
        transformer.transform([
            RecordEnvelope(dataset, metadata={}),
            RecordEnvelope(EndOfStream(), metadata={}),
        ]))
    browse_path_aspect = transformed[1].record.aspect
    assert browse_path_aspect
    assert browse_path_aspect.paths == ["/abc"]

    # use an mce with a pre-existing browse path
    dataset_mce = make_generic_dataset(
        aspects=[StatusClass(removed=False), browse_path_aspect])

    transformer = AddDatasetBrowsePathTransformer.create(
        {
            "path_templates": [
                "/PLATFORM/foo/DATASET_PARTS/ENV",
                "/ENV/PLATFORM/bar/DATASET_PARTS/",
            ]
        },
        PipelineContext(run_id="test"),
    )
    transformed = list(
        transformer.transform([
            RecordEnvelope(dataset_mce, metadata={}),
            RecordEnvelope(EndOfStream(), metadata={}),
        ]))
    assert len(transformed) == 2
    browse_path_aspect = builder.get_aspect_if_available(
        transformed[0].record, BrowsePathsClass)
    assert browse_path_aspect
    assert browse_path_aspect.paths == [
        "/abc",
        "/bigquery/foo/example1/prod",
        "/prod/bigquery/bar/example1/",
    ]

    transformer = AddDatasetBrowsePathTransformer.create(
        {
            "path_templates": [
                "/xyz",
            ],
            "replace_existing": True,
        },
        PipelineContext(run_id="test"),
    )
    transformed = list(
        transformer.transform([
            RecordEnvelope(dataset_mce, metadata={}),
            RecordEnvelope(EndOfStream(), metadata={}),
        ]))
    assert len(transformed) == 2
    browse_path_aspect = builder.get_aspect_if_available(
        transformed[0].record, BrowsePathsClass)
    assert browse_path_aspect
    assert browse_path_aspect.paths == [
        "/xyz",
    ]
def test_mark_status_dataset(tmp_path):
    dataset = make_generic_dataset()

    transformer = MarkDatasetStatus.create(
        {"removed": True},
        PipelineContext(run_id="test"),
    )
    removed = list(
        transformer.transform([
            RecordEnvelope(dataset, metadata={}),
        ]))
    assert len(removed) == 1
    status_aspect = builder.get_aspect_if_available(removed[0].record,
                                                    models.StatusClass)
    assert status_aspect
    assert status_aspect.removed is True

    transformer = MarkDatasetStatus.create(
        {"removed": False},
        PipelineContext(run_id="test"),
    )
    not_removed = list(
        transformer.transform([
            RecordEnvelope(dataset, metadata={}),
        ]))
    assert len(not_removed) == 1
    status_aspect = builder.get_aspect_if_available(not_removed[0].record,
                                                    models.StatusClass)
    assert status_aspect
    assert status_aspect.removed is False

    mcp = make_generic_dataset_mcp(
        aspect_name="datasetProperties",
        aspect=DatasetPropertiesClass(description="Test dataset"),
    )
    events_file = create_and_run_test_pipeline(
        events=[mcp],
        transformers=[{
            "type": "mark_dataset_status",
            "config": {
                "removed": True
            }
        }],
        path=tmp_path,
    )

    # assert dataset properties aspect was preserved
    assert (tests.test_helpers.mce_helpers.assert_for_each_entity(
        entity_type="dataset",
        aspect_name="datasetProperties",
        aspect_field_matcher={"description": "Test dataset"},
        file=events_file,
    ) == 1)

    # assert Status aspect was generated
    assert (tests.test_helpers.mce_helpers.assert_for_each_entity(
        entity_type="dataset",
        aspect_name="status",
        aspect_field_matcher={"removed": True},
        file=events_file,
    ) == 1)

    # MCE only
    test_aspect = DatasetPropertiesClass(description="Test dataset")
    events_file = create_and_run_test_pipeline(
        events=[make_generic_dataset(aspects=[test_aspect])],
        transformers=[{
            "type": "mark_dataset_status",
            "config": {
                "removed": True
            }
        }],
        path=tmp_path,
    )

    # assert dataset properties aspect was preserved
    assert (tests.test_helpers.mce_helpers.assert_entity_mce_aspect(
        entity_urn=mcp.entityUrn or "",
        aspect=test_aspect,
        aspect_type=DatasetPropertiesClass,
        file=events_file,
    ) == 1)

    # assert Status aspect was generated
    assert (tests.test_helpers.mce_helpers.assert_for_each_entity(
        entity_type="dataset",
        aspect_name="status",
        aspect_field_matcher={"removed": True},
        file=events_file,
    ) == 1)

    # MCE (non-matching) + MCP (matching)
    test_aspect = DatasetPropertiesClass(description="Test dataset")
    events_file = create_and_run_test_pipeline(
        events=[
            make_generic_dataset(aspects=[test_aspect]),
            make_generic_dataset_mcp(),
        ],
        transformers=[{
            "type": "mark_dataset_status",
            "config": {
                "removed": True
            }
        }],
        path=tmp_path,
    )

    # assert dataset properties aspect was preserved
    assert (tests.test_helpers.mce_helpers.assert_entity_mce_aspect(
        entity_urn=mcp.entityUrn or "",
        aspect=test_aspect,
        aspect_type=DatasetPropertiesClass,
        file=events_file,
    ) == 1)

    # assert Status aspect was generated
    assert (tests.test_helpers.mce_helpers.assert_for_each_entity(
        entity_type="dataset",
        aspect_name="status",
        aspect_field_matcher={"removed": True},
        file=events_file,
    ) == 1)

    # MCE (matching) + MCP (non-matching)
    test_status_aspect = StatusClass(removed=False)
    events_file = create_and_run_test_pipeline(
        events=[
            make_generic_dataset(aspects=[test_status_aspect]),
            make_generic_dataset_mcp(
                aspect_name="datasetProperties",
                aspect=DatasetPropertiesClass(description="test dataset"),
            ),
        ],
        transformers=[{
            "type": "mark_dataset_status",
            "config": {
                "removed": True
            }
        }],
        path=tmp_path,
    )

    # assert MCE was transformed
    assert (tests.test_helpers.mce_helpers.assert_entity_mce_aspect(
        entity_urn=mcp.entityUrn or "",
        aspect=StatusClass(removed=True),
        aspect_type=StatusClass,
        file=events_file,
    ) == 1)

    # assert MCP aspect was preserved
    assert (tests.test_helpers.mce_helpers.assert_for_each_entity(
        entity_type="dataset",
        aspect_name="datasetProperties",
        aspect_field_matcher={"description": "test dataset"},
        file=events_file,
    ) == 1)

    # MCE (non-matching) + MCP (non-matching)
    test_mcp_aspect = GlobalTagsClass(
        tags=[TagAssociationClass(tag="urn:li:tag:test")])
    test_dataset_props_aspect = DatasetPropertiesClass(
        description="Test dataset")
    events_file = create_and_run_test_pipeline(
        events=[
            make_generic_dataset(aspects=[test_dataset_props_aspect]),
            make_generic_dataset_mcp(aspect_name="globalTags",
                                     aspect=test_mcp_aspect),
        ],
        transformers=[{
            "type": "mark_dataset_status",
            "config": {
                "removed": True
            }
        }],
        path=tmp_path,
    )

    # assert MCE was preserved
    assert (tests.test_helpers.mce_helpers.assert_entity_mce_aspect(
        entity_urn=mcp.entityUrn or "",
        aspect=test_dataset_props_aspect,
        aspect_type=DatasetPropertiesClass,
        file=events_file,
    ) == 1)

    # assert MCP aspect was preserved
    assert (tests.test_helpers.mce_helpers.assert_for_each_entity(
        entity_type="dataset",
        aspect_name="globalTags",
        aspect_field_matcher={"tags": [{
            "tag": "urn:li:tag:test"
        }]},
        file=events_file,
    ) == 1)

    # assert MCP Status aspect was generated
    assert (tests.test_helpers.mce_helpers.assert_for_each_entity(
        entity_type="dataset",
        aspect_name="status",
        aspect_field_matcher={"removed": True},
        file=events_file,
    ) == 1)