def test_old_transformers_working_as_before(mock_time):

    dataset_mce = make_generic_dataset()
    dataset_mcp = make_generic_dataset_mcp()
    transformer = OldMCETransformer.create(
        {},
        PipelineContext(run_id="test-old-transformer"),
    )

    outputs = list(
        transformer.transform([
            RecordEnvelope(input, metadata={})
            for input in [dataset_mce, dataset_mcp,
                          EndOfStream()]
        ]))

    assert len(outputs) == 3  # MCP will come back untouched

    assert outputs[0].record == dataset_mce
    # Check that the old transformer set the dataset description.
    props_aspect = builder.get_aspect_if_available(outputs[0].record,
                                                   DatasetPropertiesClass)
    assert props_aspect
    assert props_aspect.description == "Old Transformer was here"

    assert outputs[1].record == dataset_mcp

    assert isinstance(outputs[-1].record, EndOfStream)

    # MCP only stream
    dataset_mcps = [
        make_generic_dataset_mcp(),
        make_generic_dataset_mcp(aspect=DatasetPropertiesClass(
            description="Another test MCP")),
        EndOfStream(),
    ]
    transformer = OldMCETransformer.create(
        {},
        PipelineContext(run_id="test-old-transformer"),
    )

    outputs = list(
        transformer.transform(
            [RecordEnvelope(input, metadata={}) for input in dataset_mcps]))

    assert len(outputs) == 3  # MCPs will come back untouched

    assert outputs[0].record == dataset_mcps[0]
    assert outputs[1].record == dataset_mcps[1]
    assert isinstance(outputs[-1].record, EndOfStream)
def test_mcp_add_tags_missing(mock_time):

    dataset_mcp = make_generic_dataset_mcp()

    transformer = SimpleAddDatasetTags.create(
        {
            "tag_urns": [
                builder.make_tag_urn("NeedsDocumentation"),
                builder.make_tag_urn("Legacy"),
            ]
        },
        PipelineContext(run_id="test-tags"),
    )
    input_stream: List[RecordEnvelope] = [
        RecordEnvelope(input, metadata={}) for input in [dataset_mcp]
    ]
    input_stream.append(RecordEnvelope(record=EndOfStream(), metadata={}))
    outputs = list(transformer.transform(input_stream))
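    # Expect three outputs: the original MCP, a tags MCP generated because no
    # tags aspect was present, and the EndOfStream marker.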
    assert len(outputs) == 3
    assert outputs[0].record == dataset_mcp
    # Check that tags were added; the new tags aspect is the second output.
    tags_aspect = outputs[1].record.aspect
    assert tags_aspect
    assert len(tags_aspect.tags) == 2
    assert tags_aspect.tags[0].tag == builder.make_tag_urn(
        "NeedsDocumentation")
    assert isinstance(outputs[-1].record, EndOfStream)
def test_pattern_dataset_terms_transformation(mock_time):
    dataset_mce = make_generic_dataset()

    transformer = PatternAddDatasetTerms.create(
        {
            "term_pattern": {
                "rules": {
                    ".*example1.*": [
                        builder.make_term_urn("AccountBalance"),
                        builder.make_term_urn("Email"),
                    ],
                    ".*example2.*": [builder.make_term_urn("Address")],
                }
            },
        },
        PipelineContext(run_id="test-terms"),
    )

    outputs = list(
        transformer.transform([
            RecordEnvelope(input, metadata={})
            for input in [dataset_mce, EndOfStream()]
        ]))

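    # Expect the original MCE, a glossaryTerms MCP generated for the matching
    # ".*example1.*" rule, and the EndOfStream marker.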
    assert len(outputs) == 3
    # Check that glossary terms were added.
    terms_aspect = outputs[1].record.aspect
    assert terms_aspect
    assert len(terms_aspect.terms) == 2
    assert terms_aspect.terms[0].urn == builder.make_term_urn("AccountBalance")
    assert builder.make_term_urn("AccountBalance") not in terms_aspect.terms
def test_simple_dataset_terms_transformation(mock_time):
    dataset_mce = make_generic_dataset()

    transformer = SimpleAddDatasetTerms.create(
        {
            "term_urns": [
                builder.make_term_urn("Test"),
                builder.make_term_urn("Needs Review"),
            ]
        },
        PipelineContext(run_id="test-terms"),
    )

    outputs = list(
        transformer.transform([
            RecordEnvelope(input, metadata={})
            for input in [dataset_mce, EndOfStream()]
        ]))
    assert len(outputs) == 3

    # Check that glossary terms were added.
    terms_aspect = outputs[1].record.aspect
    assert terms_aspect
    assert len(terms_aspect.terms) == 2
    assert terms_aspect.terms[0].urn == builder.make_term_urn("Test")
def test_pattern_dataset_ownership_with_type_transformation(mock_time):
    input = make_generic_dataset()

    transformer = PatternAddDatasetOwnership.create(
        {
            "owner_pattern": {
                "rules": {
                    ".*example1.*": [builder.make_user_urn("person1")],
                }
            },
            "ownership_type": "PRODUCER",
        },
        PipelineContext(run_id="test"),
    )

    output = list(
        transformer.transform([
            RecordEnvelope(input, metadata={}),
            RecordEnvelope(EndOfStream(), metadata={}),
        ]))

    assert len(output) == 3

    ownership_aspect = output[1].record.aspect
    assert ownership_aspect
    assert len(ownership_aspect.owners) == 1
    assert ownership_aspect.owners[
        0].type == models.OwnershipTypeClass.PRODUCER
def test_pattern_dataset_tags_transformation(mock_time):
    dataset_mce = make_generic_dataset()

    transformer = PatternAddDatasetTags.create(
        {
            "tag_pattern": {
                "rules": {
                    ".*example1.*": [
                        builder.make_tag_urn("Private"),
                        builder.make_tag_urn("Legacy"),
                    ],
                    ".*example2.*":
                    [builder.make_term_urn("Needs Documentation")],
                }
            },
        },
        PipelineContext(run_id="test-tags"),
    )

    outputs = list(
        transformer.transform([
            RecordEnvelope(input, metadata={})
            for input in [dataset_mce, EndOfStream()]
        ]))

    assert len(outputs) == 3
    tags_aspect = outputs[1].record.aspect
    assert tags_aspect
    assert len(tags_aspect.tags) == 2
    assert tags_aspect.tags[0].tag == builder.make_tag_urn("Private")
    assert builder.make_tag_urn("Needs Documentation") not in tags_aspect.tags
def test_simple_dataset_tags_transformation(mock_time):
    dataset_mce = make_generic_dataset()

    transformer = SimpleAddDatasetTags.create(
        {
            "tag_urns": [
                builder.make_tag_urn("NeedsDocumentation"),
                builder.make_tag_urn("Legacy"),
            ]
        },
        PipelineContext(run_id="test-tags"),
    )

    outputs = list(
        transformer.transform([
            RecordEnvelope(input, metadata={})
            for input in [dataset_mce, EndOfStream()]
        ]))
    assert len(outputs) == 3

    # Check that tags were added.
    tags_aspect = outputs[1].record.aspect
    assert tags_aspect
    assert len(tags_aspect.tags) == 2
    assert tags_aspect.tags[0].tag == builder.make_tag_urn(
        "NeedsDocumentation")
def test_simple_dataset_ownership_with_type_transformation(mock_time):
    input = make_generic_dataset()

    transformer = SimpleAddDatasetOwnership.create(
        {
            "owner_urns": [
                builder.make_user_urn("person1"),
            ],
            "ownership_type": "PRODUCER",
        },
        PipelineContext(run_id="test"),
    )

    output = list(
        transformer.transform([
            RecordEnvelope(input, metadata={}),
            RecordEnvelope(EndOfStream(), metadata={}),
        ]))

    assert len(output) == 3

    # original MCE is unchanged
    assert input == output[0].record

    ownership_aspect = output[1].record.aspect

    assert isinstance(ownership_aspect, OwnershipClass)
    assert len(ownership_aspect.owners) == 1
    assert ownership_aspect.owners[
        0].type == models.OwnershipTypeClass.PRODUCER
Example #9
    def run(self) -> None:

        callback = LoggingCallback()
        extractor: Extractor = self.extractor_class()
        for wu in itertools.islice(self.source.get_workunits(),
                                   10 if self.preview_mode else None):
            # TODO: change extractor interface
            extractor.configure({}, self.ctx)

            if not self.dry_run:
                self.sink.handle_work_unit_start(wu)
            record_envelopes = extractor.get_records(wu)
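            # Route each extracted record through the transformer chain; records
            # are only written to the sink when this is not a dry run.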
            for record_envelope in self.transform(record_envelopes):
                if not self.dry_run:
                    self.sink.write_record_async(record_envelope, callback)

            extractor.close()
            if not self.dry_run:
                self.sink.handle_work_unit_end(wu)
        self.source.close()
        # no more data is coming, we need to let the transformers produce any additional records if they are holding on to state
        for record_envelope in self.transform([
                RecordEnvelope(record=EndOfStream(),
                               metadata={"workunit_id": "end-of-stream"})
        ]):
            if not self.dry_run and not isinstance(record_envelope.record,
                                                   EndOfStream):
                # TODO: propagate EndOfStream and other control events to sinks, to allow them to flush etc.
                self.sink.write_record_async(record_envelope, callback)

        self.sink.close()
        self.process_commits()
def test_suppression_works():
    dataset_mce = make_generic_dataset()
    dataset_mcp = make_generic_dataset_mcp(
        aspect_name="datasetProperties",
        aspect=DatasetPropertiesClass(description="supressable description"),
    )
    transformer = SuppressingTransformer.create(
        {},
        PipelineContext(run_id="test-suppress-transformer"),
    )

    outputs = list(
        transformer.transform([
            RecordEnvelope(input, metadata={})
            for input in [dataset_mce, dataset_mcp,
                          EndOfStream()]
        ]))

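    # The SuppressingTransformer drops the datasetProperties MCP, so only the
    # MCE and the EndOfStream marker remain.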
    assert len(outputs) == 2  # MCP will be dropped
def test_pattern_dataset_ownership_transformation(mock_time):
    no_owner_aspect = make_generic_dataset()

    with_owner_aspect = models.MetadataChangeEventClass(
        proposedSnapshot=models.DatasetSnapshotClass(
            urn="urn:li:dataset:(urn:li:dataPlatform:bigquery,example2,PROD)",
            aspects=[
                models.OwnershipClass(
                    owners=[
                        models.OwnerClass(
                            owner=builder.make_user_urn("fake_owner"),
                            type=models.OwnershipTypeClass.DATAOWNER,
                        ),
                    ],
                    lastModified=models.AuditStampClass(
                        time=1625266033123, actor="urn:li:corpuser:datahub"),
                )
            ],
        ), )

    not_a_dataset = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataJobSnapshotClass(
            urn=
            "urn:li:dataJob:(urn:li:dataFlow:(airflow,dag_abc,PROD),task_456)",
            aspects=[
                models.DataJobInfoClass(
                    name="User Deletions",
                    description=
                    "Constructs the fct_users_deleted from logging_events",
                    type=models.AzkabanJobTypeClass.SQL,
                )
            ],
        ))

    inputs = [no_owner_aspect, with_owner_aspect, not_a_dataset, EndOfStream()]

    transformer = PatternAddDatasetOwnership.create(
        {
            "owner_pattern": {
                "rules": {
                    ".*example1.*": [builder.make_user_urn("person1")],
                    ".*example2.*": [builder.make_user_urn("person2")],
                }
            },
        },
        PipelineContext(run_id="test"),
    )

    outputs = list(
        transformer.transform(
            [RecordEnvelope(input, metadata={}) for input in inputs]))

    assert len(
        outputs) == len(inputs) + 1  # additional MCP due to the no-owner MCE

    # Check the first entry.
    assert inputs[0] == outputs[0].record

    first_ownership_aspect = outputs[3].record.aspect
    assert first_ownership_aspect
    assert len(first_ownership_aspect.owners) == 1
    assert all([
        owner.type == models.OwnershipTypeClass.DATAOWNER
        for owner in first_ownership_aspect.owners
    ])

    # Check the second entry.
    second_ownership_aspect = builder.get_aspect_if_available(
        outputs[1].record, models.OwnershipClass)
    assert second_ownership_aspect
    assert len(second_ownership_aspect.owners) == 2
    assert all([
        owner.type == models.OwnershipTypeClass.DATAOWNER
        for owner in second_ownership_aspect.owners
    ])

    # Verify that the third entry is unchanged.
    assert inputs[2] == outputs[2].record

    # Verify that the last entry is unchanged (EOS)
    assert inputs[-1] == outputs[-1].record
def test_add_dataset_browse_paths():
    dataset = make_generic_dataset()

    transformer = AddDatasetBrowsePathTransformer.create(
        {"path_templates": ["/abc"]},
        PipelineContext(run_id="test"),
    )
    transformed = list(
        transformer.transform([
            RecordEnvelope(dataset, metadata={}),
            RecordEnvelope(EndOfStream(), metadata={}),
        ]))
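    # With no pre-existing browsePaths aspect on the dataset, the transformer
    # emits a new MCP carrying the templated path.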
    browse_path_aspect = transformed[1].record.aspect
    assert browse_path_aspect
    assert browse_path_aspect.paths == ["/abc"]

    # use an mce with a pre-existing browse path
    dataset_mce = make_generic_dataset(
        aspects=[StatusClass(removed=False), browse_path_aspect])

    transformer = AddDatasetBrowsePathTransformer.create(
        {
            "path_templates": [
                "/PLATFORM/foo/DATASET_PARTS/ENV",
                "/ENV/PLATFORM/bar/DATASET_PARTS/",
            ]
        },
        PipelineContext(run_id="test"),
    )
    transformed = list(
        transformer.transform([
            RecordEnvelope(dataset_mce, metadata={}),
            RecordEnvelope(EndOfStream(), metadata={}),
        ]))
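    # This MCE already carries a browsePaths aspect, so the paths are appended
    # in place and no extra MCP is emitted.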
    assert len(transformed) == 2
    browse_path_aspect = builder.get_aspect_if_available(
        transformed[0].record, BrowsePathsClass)
    assert browse_path_aspect
    assert browse_path_aspect.paths == [
        "/abc",
        "/bigquery/foo/example1/prod",
        "/prod/bigquery/bar/example1/",
    ]

    transformer = AddDatasetBrowsePathTransformer.create(
        {
            "path_templates": [
                "/xyz",
            ],
            "replace_existing": True,
        },
        PipelineContext(run_id="test"),
    )
    transformed = list(
        transformer.transform([
            RecordEnvelope(dataset_mce, metadata={}),
            RecordEnvelope(EndOfStream(), metadata={}),
        ]))
    assert len(transformed) == 2
    browse_path_aspect = builder.get_aspect_if_available(
        transformed[0].record, BrowsePathsClass)
    assert browse_path_aspect
    assert browse_path_aspect.paths == [
        "/xyz",
    ]
def test_simple_dataset_ownership_transformation(mock_time):
    no_owner_aspect = make_generic_dataset()

    with_owner_aspect = make_dataset_with_owner()

    not_a_dataset = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataJobSnapshotClass(
            urn=
            "urn:li:dataJob:(urn:li:dataFlow:(airflow,dag_abc,PROD),task_456)",
            aspects=[
                models.DataJobInfoClass(
                    name="User Deletions",
                    description=
                    "Constructs the fct_users_deleted from logging_events",
                    type=models.AzkabanJobTypeClass.SQL,
                )
            ],
        ))

    inputs = [no_owner_aspect, with_owner_aspect, not_a_dataset, EndOfStream()]

    transformer = SimpleAddDatasetOwnership.create(
        {
            "owner_urns": [
                builder.make_user_urn("person1"),
                builder.make_user_urn("person2"),
            ]
        },
        PipelineContext(run_id="test"),
    )

    outputs = list(
        transformer.transform(
            [RecordEnvelope(input, metadata={}) for input in inputs]))

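    # One extra output: the no-owner MCE receives its owners through a separate
    # MCP emitted at end-of-stream.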
    assert len(outputs) == len(inputs) + 1

    # Check the first entry.
    first_ownership_aspect = builder.get_aspect_if_available(
        outputs[0].record, models.OwnershipClass)
    assert first_ownership_aspect is None

    last_event = outputs[3].record
    assert isinstance(last_event, MetadataChangeProposalWrapper)
    assert isinstance(last_event.aspect, OwnershipClass)
    assert len(last_event.aspect.owners) == 2
    assert last_event.entityUrn == outputs[0].record.proposedSnapshot.urn
    assert all([
        owner.type == models.OwnershipTypeClass.DATAOWNER
        for owner in last_event.aspect.owners
    ])

    # Check the second entry.
    second_ownership_aspect = builder.get_aspect_if_available(
        outputs[1].record, models.OwnershipClass)
    assert second_ownership_aspect
    assert len(second_ownership_aspect.owners) == 3
    assert all([
        owner.type == models.OwnershipTypeClass.DATAOWNER
        for owner in second_ownership_aspect.owners
    ])

    # Verify that the third entry is unchanged.
    assert inputs[2] == outputs[2].record

    # Verify that the last entry is EndOfStream
    assert inputs[3] == outputs[4].record
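

# The fixture helpers used above (make_generic_dataset, make_generic_dataset_mcp,
# make_dataset_with_owner) are not shown in this excerpt. The sketches below are
# assumptions reconstructed from the assertions in these tests, not the actual
# DataHub fixtures; `builder` is assumed to be datahub.emitter.mce_builder,
# `models` to be datahub.metadata.schema_classes, and MetadataChangeProposalWrapper
# to come from datahub.emitter.mcp.
def make_generic_dataset(aspects=None):
    # An MCE for a BigQuery dataset named "example1" -- the urn that the
    # ".*example1.*" pattern rules and the "/bigquery/foo/example1/prod"
    # browse-path assertions above refer to.
    return models.MetadataChangeEventClass(
        proposedSnapshot=models.DatasetSnapshotClass(
            urn="urn:li:dataset:(urn:li:dataPlatform:bigquery,example1,PROD)",
            aspects=aspects if aspects is not None else
            [models.StatusClass(removed=False)],
        ))


def make_generic_dataset_mcp(aspect_name="status", aspect=None):
    # An MCP wrapper for the same dataset urn; the default aspect and the exact
    # wrapper keyword arguments are assumptions and may differ between versions.
    return MetadataChangeProposalWrapper(
        entityType="dataset",
        entityUrn="urn:li:dataset:(urn:li:dataPlatform:bigquery,example1,PROD)",
        changeType="UPSERT",
        aspectName=aspect_name,
        aspect=aspect if aspect is not None else models.StatusClass(removed=False),
    )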