Example #1
def test_add_dataset_browse_paths():
    dataset = make_generic_dataset()

    transformer = AddDatasetBrowsePathTransformer.create(
        {"path_templates": ["/abc"]},
        PipelineContext(run_id="test"),
    )
    transformed = list(transformer.transform([RecordEnvelope(dataset, metadata={})]))
    browse_path_aspect = builder.get_aspect_if_available(
        transformed[0].record, models.BrowsePathsClass
    )
    assert browse_path_aspect
    assert browse_path_aspect.paths == ["/abc"]

    transformer = AddDatasetBrowsePathTransformer.create(
        {
            "path_templates": [
                "/PLATFORM/foo/DATASET_PARTS/ENV",
                "/ENV/PLATFORM/bar/DATASET_PARTS/",
            ]
        },
        PipelineContext(run_id="test"),
    )
    transformed = list(transformer.transform([RecordEnvelope(dataset, metadata={})]))
    browse_path_aspect = builder.get_aspect_if_available(
        transformed[0].record, models.BrowsePathsClass
    )
    assert browse_path_aspect
    assert browse_path_aspect.paths == [
        "/abc",
        "/bigquery/foo/example1/prod",
        "/prod/bigquery/bar/example1/",
    ]
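All of these snippets rely on a couple of shared fixtures from the test module, make_generic_dataset and make_generic_dataset_mcp, along with the standard DataHub ingestion imports. Below is a minimal sketch of what those helpers might look like; the bigquery/example1 URN and the default Status aspect are assumptions inferred from the assertions in the examples, not the project's exact definitions.

from typing import Any, List

import datahub.metadata.schema_classes as models
from datahub.emitter.mcp import MetadataChangeProposalWrapper

# Assumed URN; it matches the "/bigquery/foo/example1/prod" browse path asserted above.
EXAMPLE_DATASET_URN = "urn:li:dataset:(urn:li:dataPlatform:bigquery,example1,PROD)"


def make_generic_dataset(
    aspects: List[Any] = [models.StatusClass(removed=False)],
) -> models.MetadataChangeEventClass:
    # A bare dataset MCE carrying only the given aspects.
    return models.MetadataChangeEventClass(
        proposedSnapshot=models.DatasetSnapshotClass(
            urn=EXAMPLE_DATASET_URN,
            aspects=aspects,
        )
    )


def make_generic_dataset_mcp(
    aspect_name: str = "status",
    aspect: Any = models.StatusClass(removed=False),
) -> MetadataChangeProposalWrapper:
    # The MCP equivalent: a single aspect proposal against the same dataset URN.
    return MetadataChangeProposalWrapper(
        entityType="dataset",
        entityUrn=EXAMPLE_DATASET_URN,
        changeType=models.ChangeTypeClass.UPSERT,
        aspectName=aspect_name,
        aspect=aspect,
    )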
Example #2
def test_mcp_add_tags_missing(mock_time):

    dataset_mcp = make_generic_dataset_mcp()

    transformer = SimpleAddDatasetTags.create(
        {
            "tag_urns": [
                builder.make_tag_urn("NeedsDocumentation"),
                builder.make_tag_urn("Legacy"),
            ]
        },
        PipelineContext(run_id="test-tags"),
    )
    input_stream: List[RecordEnvelope] = [
        RecordEnvelope(input, metadata={}) for input in [dataset_mcp]
    ]
    input_stream.append(RecordEnvelope(record=EndOfStream(), metadata={}))
    outputs = list(transformer.transform(input_stream))
    assert len(outputs) == 3
    assert outputs[0].record == dataset_mcp
    # Check that tags were added; the tags aspect will be the second output record
    tags_aspect = outputs[1].record.aspect
    assert tags_aspect
    assert len(tags_aspect.tags) == 2
    assert tags_aspect.tags[0].tag == builder.make_tag_urn(
        "NeedsDocumentation")
    assert isinstance(outputs[-1].record, EndOfStream)
Example #3
def test_pattern_dataset_ownership_with_type_transformation(mock_time):
    input = make_generic_dataset()

    transformer = PatternAddDatasetOwnership.create(
        {
            "owner_pattern": {
                "rules": {
                    ".*example1.*": [builder.make_user_urn("person1")],
                }
            },
            "ownership_type": "PRODUCER",
        },
        PipelineContext(run_id="test"),
    )

    output = list(
        transformer.transform([
            RecordEnvelope(input, metadata={}),
            RecordEnvelope(EndOfStream(), metadata={}),
        ]))

    assert len(output) == 3

    ownership_aspect = output[1].record.aspect
    assert ownership_aspect
    assert len(ownership_aspect.owners) == 1
    assert ownership_aspect.owners[
        0].type == models.OwnershipTypeClass.PRODUCER
Example #4
def test_simple_dataset_ownership_with_type_transformation(mock_time):
    input = make_generic_dataset()

    transformer = SimpleAddDatasetOwnership.create(
        {
            "owner_urns": [
                builder.make_user_urn("person1"),
            ],
            "ownership_type": "PRODUCER",
        },
        PipelineContext(run_id="test"),
    )

    output = list(
        transformer.transform([
            RecordEnvelope(input, metadata={}),
            RecordEnvelope(EndOfStream(), metadata={}),
        ]))

    assert len(output) == 3

    # original MCE is unchanged
    assert input == output[0].record

    ownership_aspect = output[1].record.aspect

    assert isinstance(ownership_aspect, OwnershipClass)
    assert len(ownership_aspect.owners) == 1
    assert ownership_aspect.owners[
        0].type == models.OwnershipTypeClass.PRODUCER
Example #5
def test_old_transformers_working_as_before(mock_time):

    dataset_mce = make_generic_dataset()
    dataset_mcp = make_generic_dataset_mcp()
    transformer = OldMCETransformer.create(
        {},
        PipelineContext(run_id="test-old-transformer"),
    )

    outputs = list(
        transformer.transform([
            RecordEnvelope(input, metadata={})
            for input in [dataset_mce, dataset_mcp,
                          EndOfStream()]
        ]))

    assert len(outputs) == 3  # MCP will come back untouched

    assert outputs[0].record == dataset_mce
    # Check that the old transformer updated the dataset properties.
    props_aspect = builder.get_aspect_if_available(outputs[0].record,
                                                   DatasetPropertiesClass)
    assert props_aspect
    assert props_aspect.description == "Old Transformer was here"

    assert outputs[1].record == dataset_mcp

    assert isinstance(outputs[-1].record, EndOfStream)

    # MCP only stream
    dataset_mcps = [
        make_generic_dataset_mcp(),
        make_generic_dataset_mcp(aspect=DatasetPropertiesClass(
            description="Another test MCP")),
        EndOfStream(),
    ]
    transformer = OldMCETransformer.create(
        {},
        PipelineContext(run_id="test-old-transformer"),
    )

    outputs = list(
        transformer.transform(
            [RecordEnvelope(input, metadata={}) for input in dataset_mcps]))

    assert len(outputs) == 3  # MCP-s will come back untouched

    assert outputs[0].record == dataset_mcps[0]
    assert outputs[1].record == dataset_mcps[1]
    assert isinstance(outputs[-1].record, EndOfStream)
Example #6
def test_pattern_dataset_tags_transformation(mock_time):
    dataset_mce = make_generic_dataset()

    transformer = PatternAddDatasetTags.create(
        {
            "tag_pattern": {
                "rules": {
                    ".*example1.*": [
                        builder.make_tag_urn("Private"),
                        builder.make_tag_urn("Legacy"),
                    ],
                    ".*example2.*":
                    [builder.make_term_urn("Needs Documentation")],
                }
            },
        },
        PipelineContext(run_id="test-tags"),
    )

    outputs = list(
        transformer.transform(
            [RecordEnvelope(input, metadata={}) for input in [dataset_mce]]))

    assert len(outputs) == 1
    # Check that tags were added.
    tags_aspect = builder.get_aspect_if_available(outputs[0].record,
                                                  models.GlobalTagsClass)
    assert tags_aspect
    assert len(tags_aspect.tags) == 2
    assert tags_aspect.tags[0].tag == builder.make_tag_urn("Private")
    assert builder.make_tag_urn("Needs Documentation") not in tags_aspect.tags
Example #7
def test_simple_dataset_terms_transformation(mock_time):
    dataset_mce = make_generic_dataset()

    transformer = SimpleAddDatasetTerms.create(
        {
            "term_urns": [
                builder.make_term_urn("Test"),
                builder.make_term_urn("Needs Review"),
            ]
        },
        PipelineContext(run_id="test-terms"),
    )

    outputs = list(
        transformer.transform([
            RecordEnvelope(input, metadata={})
            for input in [dataset_mce, EndOfStream()]
        ]))
    assert len(outputs) == 3

    # Check that glossary terms were added.
    terms_aspect = outputs[1].record.aspect
    assert terms_aspect
    assert len(terms_aspect.terms) == 2
    assert terms_aspect.terms[0].urn == builder.make_term_urn("Test")
Example #8
def test_pattern_dataset_tags_transformation(mock_time):
    dataset_mce = make_generic_dataset()

    transformer = PatternAddDatasetTags.create(
        {
            "tag_pattern": {
                "rules": {
                    ".*example1.*": [
                        builder.make_tag_urn("Private"),
                        builder.make_tag_urn("Legacy"),
                    ],
                    ".*example2.*":
                    [builder.make_term_urn("Needs Documentation")],
                }
            },
        },
        PipelineContext(run_id="test-tags"),
    )

    outputs = list(
        transformer.transform([
            RecordEnvelope(input, metadata={})
            for input in [dataset_mce, EndOfStream()]
        ]))

    assert len(outputs) == 3
    tags_aspect = outputs[1].record.aspect
    assert tags_aspect
    assert len(tags_aspect.tags) == 2
    assert tags_aspect.tags[0].tag == builder.make_tag_urn("Private")
    assert builder.make_tag_urn("Needs Documentation") not in tags_aspect.tags
Example #9
def test_simple_add_dataset_properties(mock_time):
    dataset_mce = make_dataset_with_properties()

    new_properties = {"new-simple-property": "new-value"}
    transformer = SimpleAddDatasetProperties.create(
        {
            "properties": new_properties,
        },
        PipelineContext(run_id="test-simple-properties"),
    )

    outputs = list(
        transformer.transform(
            [RecordEnvelope(input, metadata={}) for input in [dataset_mce]]))
    assert len(outputs) == 1

    custom_properties = builder.get_aspect_if_available(
        outputs[0].record, models.DatasetPropertiesClass)

    print(str(custom_properties))
    assert custom_properties is not None
    assert custom_properties.customProperties == {
        **EXISTING_PROPERTIES,
        **new_properties,
    }
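test_simple_add_dataset_properties above (and test_add_dataset_properties further down) assumes a dataset MCE that already carries custom properties. A plausible sketch of that fixture follows, reusing the models import and EXAMPLE_DATASET_URN from the sketch under Example #1; the EXISTING_PROPERTIES values are placeholders, and the tests only require that they survive the merge.

# Assumed pre-existing properties.
EXISTING_PROPERTIES = {"my_existing_property": "existing value"}


def make_dataset_with_properties() -> models.MetadataChangeEventClass:
    # The same dataset URN as the other helpers, seeded with a DatasetProperties aspect.
    return models.MetadataChangeEventClass(
        proposedSnapshot=models.DatasetSnapshotClass(
            urn=EXAMPLE_DATASET_URN,
            aspects=[
                models.DatasetPropertiesClass(
                    customProperties=dict(EXISTING_PROPERTIES)
                )
            ],
        )
    )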
Example #10
def test_pattern_dataset_terms_transformation(mock_time):
    dataset_mce = make_generic_dataset()

    transformer = PatternAddDatasetTerms.create(
        {
            "term_pattern": {
                "rules": {
                    ".*example1.*": [
                        builder.make_term_urn("AccountBalance"),
                        builder.make_term_urn("Email"),
                    ],
                    ".*example2.*": [builder.make_term_urn("Address")],
                }
            },
        },
        PipelineContext(run_id="test-terms"),
    )

    outputs = list(
        transformer.transform(
            [RecordEnvelope(input, metadata={}) for input in [dataset_mce]]))

    assert len(outputs) == 1
    # Check that glossary terms were added.
    terms_aspect = builder.get_aspect_if_available(outputs[0].record,
                                                   models.GlossaryTermsClass)
    assert terms_aspect
    assert len(terms_aspect.terms) == 2
    assert terms_aspect.terms[0].urn == builder.make_term_urn("AccountBalance")
    assert builder.make_term_urn("AccountBalance") not in terms_aspect.terms
Example #11
    def test_kafka_sink_write(self, mock_k_callback, mock_producer,
                              mock_context):
        mock_producer_instance = mock_producer.return_value
        mock_k_callback_instance = mock_k_callback.return_value
        callback = MagicMock(spec=WriteCallback)
        kafka_sink = DatahubKafkaSink.create(
            {"connection": {
                "bootstrap": "foobar:9092"
            }}, mock_context)
        mce = builder.make_lineage_mce(
            [
                builder.make_dataset_urn("bigquery", "upstream1"),
                builder.make_dataset_urn("bigquery", "upstream2"),
            ],
            builder.make_dataset_urn("bigquery", "downstream1"),
        )

        re = RecordEnvelope(record=mce, metadata={})
        kafka_sink.write_record_async(re, callback)

        mock_producer_instance.poll.assert_called_once(
        )  # producer should call poll() first
        self.validate_kafka_callback(
            mock_k_callback, re,
            callback)  # validate kafka callback was constructed appropriately

        # validate that confluent_kafka.Producer.produce was called with the right arguments
        mock_producer_instance.produce.assert_called_once()
        args, kwargs = mock_producer_instance.produce.call_args
        assert kwargs["value"] == mce
        assert kwargs["key"]  # produce call should include a Kafka key
        created_callback = kwargs["on_delivery"]
        assert created_callback == mock_k_callback_instance.kafka_callback
Example #12
def test_simple_dataset_tags_transformation(mock_time):
    dataset_mce = make_generic_dataset()

    transformer = SimpleAddDatasetTags.create(
        {
            "tag_urns": [
                builder.make_tag_urn("NeedsDocumentation"),
                builder.make_tag_urn("Legacy"),
            ]
        },
        PipelineContext(run_id="test-tags"),
    )

    outputs = list(
        transformer.transform(
            [RecordEnvelope(input, metadata={}) for input in [dataset_mce]]
        )
    )
    assert len(outputs) == 1

    # Check that tags were added.
    tags_aspect = builder.get_aspect_if_available(
        outputs[0].record, models.GlobalTagsClass
    )
    assert tags_aspect
    assert len(tags_aspect.tags) == 2
    assert tags_aspect.tags[0].tag == builder.make_tag_urn("NeedsDocumentation")
Example #13
    def run(self) -> None:

        callback = LoggingCallback()
        extractor: Extractor = self.extractor_class()
        for wu in itertools.islice(self.source.get_workunits(),
                                   10 if self.preview_mode else None):
            # TODO: change extractor interface
            extractor.configure({}, self.ctx)

            if not self.dry_run:
                self.sink.handle_work_unit_start(wu)
            record_envelopes = extractor.get_records(wu)
            for record_envelope in self.transform(record_envelopes):
                if not self.dry_run:
                    self.sink.write_record_async(record_envelope, callback)

            extractor.close()
            if not self.dry_run:
                self.sink.handle_work_unit_end(wu)
        self.source.close()
        # No more data is coming; let the transformers flush any records they are still holding in state.
        for record_envelope in self.transform([
                RecordEnvelope(record=EndOfStream(),
                               metadata={"workunit_id": "end-of-stream"})
        ]):
            if not self.dry_run and not isinstance(record_envelope.record,
                                                   EndOfStream):
                # TODO: propagate EndOfStream and other control events to sinks, to allow them to flush etc.
                self.sink.write_record_async(record_envelope, callback)

        self.sink.close()
        self.process_commits()
Example #14
    def transform(
        self, record_envelopes: Iterable[RecordEnvelope]
    ) -> Iterable[RecordEnvelope]:
        for envelope in record_envelopes:
            if not self._should_process(envelope.record):
                # early exit
                pass
            elif isinstance(envelope.record, MetadataChangeEventClass):
                envelope = self._transform_or_record_mce(envelope)
            elif isinstance(envelope.record,
                            MetadataChangeProposalWrapper) and isinstance(
                                self, SingleAspectTransformer):
                return_envelope = self._transform_or_record_mcp(envelope)
                if return_envelope is None:
                    continue
                else:
                    envelope = return_envelope
            elif isinstance(envelope.record, EndOfStream) and isinstance(
                    self, SingleAspectTransformer):
                # walk through state and call transform for any unprocessed entities
                for urn, state in self.entity_map.items():
                    if "seen" in state:
                        # call transform on this entity_urn
                        last_seen_mcp = state["seen"].get("mcp")
                        last_seen_mce_system_metadata = state["seen"].get(
                            "mce")

                        transformed_aspect = self.transform_aspect(
                            entity_urn=urn,
                            aspect_name=self.aspect_name(),
                            aspect=last_seen_mcp.aspect if last_seen_mcp
                            and last_seen_mcp.aspectName == self.aspect_name()
                            else None,
                        )
                        if transformed_aspect:
                            # for end of stream records, we modify the workunit-id
                            structured_urn = Urn.create_from_string(urn)
                            simple_name = "-".join(
                                structured_urn.get_entity_id())
                            record_metadata = envelope.metadata.copy()
                            record_metadata.update({
                                "workunit_id":
                                f"txform-{simple_name}-{self.aspect_name()}"
                            })
                            yield RecordEnvelope(
                                record=MetadataChangeProposalWrapper(
                                    entityUrn=urn,
                                    entityType=structured_urn.get_type(),
                                    changeType=ChangeTypeClass.UPSERT,
                                    systemMetadata=last_seen_mcp.systemMetadata
                                    if last_seen_mcp else
                                    last_seen_mce_system_metadata,
                                    aspectName=self.aspect_name(),
                                    aspect=transformed_aspect,
                                ),
                                metadata=record_metadata,
                            )
                    self._mark_processed(urn)
            yield envelope
Example #15
def test_simple_dataset_ownership_tranformation(mock_time):
    no_owner_aspect = make_generic_dataset()

    with_owner_aspect = make_dataset_with_owner()

    not_a_dataset = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataJobSnapshotClass(
            urn="urn:li:dataJob:(urn:li:dataFlow:(airflow,dag_abc,PROD),task_456)",
            aspects=[
                models.DataJobInfoClass(
                    name="User Deletions",
                    description="Constructs the fct_users_deleted from logging_events",
                    type=models.AzkabanJobTypeClass.SQL,
                )
            ],
        )
    )

    inputs = [
        no_owner_aspect,
        with_owner_aspect,
        not_a_dataset,
    ]

    transformer = SimpleAddDatasetOwnership.create(
        {
            "owner_urns": [
                builder.make_user_urn("person1"),
                builder.make_user_urn("person2"),
            ]
        },
        PipelineContext(run_id="test"),
    )

    outputs = list(
        transformer.transform([RecordEnvelope(input, metadata={}) for input in inputs])
    )

    assert len(outputs) == len(inputs)

    # Check the first entry.
    first_ownership_aspect = builder.get_aspect_if_available(
        outputs[0].record, models.OwnershipClass
    )
    assert first_ownership_aspect
    assert len(first_ownership_aspect.owners) == 2

    # Check the second entry.
    second_ownership_aspect = builder.get_aspect_if_available(
        outputs[1].record, models.OwnershipClass
    )
    assert second_ownership_aspect
    assert len(second_ownership_aspect.owners) == 3

    # Verify that the third entry is unchanged.
    assert inputs[2] == outputs[2].record
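test_simple_dataset_ownership_tranformation expects the second input to already have exactly one owner, so that adding two more yields three. A hedged sketch of that make_dataset_with_owner fixture is below, reusing the models import from the earlier sketches plus the builder module; the dataset URN, owner name, and timestamp are assumptions.

import datahub.emitter.mce_builder as builder


def make_dataset_with_owner() -> models.MetadataChangeEventClass:
    # A dataset MCE seeded with one pre-existing owner.
    return models.MetadataChangeEventClass(
        proposedSnapshot=models.DatasetSnapshotClass(
            urn="urn:li:dataset:(urn:li:dataPlatform:bigquery,example2,PROD)",  # assumed
            aspects=[
                models.OwnershipClass(
                    owners=[
                        models.OwnerClass(
                            owner=builder.make_user_urn("existing_owner"),  # assumed
                            type=models.OwnershipTypeClass.DATAOWNER,
                        )
                    ],
                    lastModified=models.AuditStampClass(
                        time=1625266033123, actor="urn:li:corpuser:datahub"
                    ),
                )
            ],
        )
    )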
Example #16
def test_import_resolver():
    transformer = AddDatasetTags.create(
        {
            "get_tags_to_add": "tests.unit.test_transform_dataset.dummy_tag_resolver_method"
        },
        PipelineContext(run_id="test-tags"),
    )
    output = list(
        transformer.transform(
            [RecordEnvelope(input, metadata={}) for input in [make_generic_dataset()]]
        )
    )
    assert output
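test_import_resolver only verifies that the dotted path resolves and that the transformer produces output, so the referenced resolver can be a stub. A minimal sketch of dummy_tag_resolver_method; the exact argument the transformer passes to it is an assumption here.

def dummy_tag_resolver_method(dataset_snapshot):
    # Return no extra tags; the test above only asserts that output was produced.
    return []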
Example #17
def test_simple_remove_dataset_ownership():
    with_owner_aspect = make_dataset_with_owner()

    transformer = SimpleRemoveDatasetOwnership.create(
        {},
        PipelineContext(run_id="test"),
    )
    outputs = list(
        transformer.transform([RecordEnvelope(with_owner_aspect,
                                              metadata={})]))

    ownership_aspect = builder.get_aspect_if_available(outputs[0].record,
                                                       models.OwnershipClass)
    assert ownership_aspect
    assert len(ownership_aspect.owners) == 0
Example #18
def test_mark_status_dataset():
    dataset = make_generic_dataset()

    transformer = MarkDatasetStatus.create(
        {"removed": True},
        PipelineContext(run_id="test"),
    )
    removed = list(transformer.transform([RecordEnvelope(dataset, metadata={})]))
    status_aspect = builder.get_aspect_if_available(
        removed[0].record, models.StatusClass
    )
    assert status_aspect
    assert status_aspect.removed is True

    transformer = MarkDatasetStatus.create(
        {"removed": False},
        PipelineContext(run_id="test"),
    )
    not_removed = list(transformer.transform([RecordEnvelope(dataset, metadata={})]))
    status_aspect = builder.get_aspect_if_available(
        not_removed[0].record, models.StatusClass
    )
    assert status_aspect
    assert status_aspect.removed is False
Example #19
    def test_kafka_sink_write(self, mock_k_callback, mock_producer, mock_context):
        mock_producer_instance = mock_producer.return_value
        mock_k_callback_instance = mock_k_callback.return_value
        callback = MagicMock(spec=WriteCallback)
        kafka_sink = DatahubKafkaSink.create(
            {"connection": {"bootstrap": "foobar:9092"}}, mock_context
        )
        re = RecordEnvelope(record=sentinel, metadata={})
        kafka_sink.write_record_async(re, callback)
        assert mock_producer_instance.poll.call_count == 1  # poll() called once
        self.validate_kafka_callback(
            mock_k_callback, re, callback
        )  # validate kafka callback was constructed appropriately

        # validate that confluent_kafka.Producer.produce was called with the right arguments
        args, kwargs = mock_producer_instance.produce.call_args
        created_callback = kwargs["on_delivery"]
        assert created_callback == mock_k_callback_instance.kafka_callback
Example #20
def test_supression_works():
    dataset_mce = make_generic_dataset()
    dataset_mcp = make_generic_dataset_mcp(
        aspect_name="datasetProperties",
        aspect=DatasetPropertiesClass(description="supressable description"),
    )
    transformer = SuppressingTransformer.create(
        {},
        PipelineContext(run_id="test-suppress-transformer"),
    )

    outputs = list(
        transformer.transform([
            RecordEnvelope(input, metadata={})
            for input in [dataset_mce, dataset_mcp,
                          EndOfStream()]
        ]))

    assert len(outputs) == 2  # MCP will be dropped
Example #21
def test_simple_dataset_ownership_with_type_transformation(mock_time):
    input = make_generic_dataset()

    transformer = SimpleAddDatasetOwnership.create(
        {
            "owner_urns": [
                builder.make_user_urn("person1"),
            ],
            "ownership_type": "PRODUCER",
        },
        PipelineContext(run_id="test"),
    )

    output = list(transformer.transform([RecordEnvelope(input, metadata={})]))

    assert len(output) == 1

    ownership_aspect = builder.get_aspect_if_available(output[0].record,
                                                       models.OwnershipClass)
    assert ownership_aspect
    assert len(ownership_aspect.owners) == 1
    assert ownership_aspect.owners[
        0].type == models.OwnershipTypeClass.PRODUCER
Example #22
    def test_kafka_sink_mcp(self, mock_producer, mock_callback):
        from datahub.emitter.mcp import MetadataChangeProposalWrapper

        mcp = MetadataChangeProposalWrapper(
            entityType="dataset",
            entityUrn="urn:li:dataset:(urn:li:dataPlatform:mysql,User.UserAccount,PROD)",
            changeType=models.ChangeTypeClass.UPSERT,
            aspectName="datasetProfile",
            aspect=models.DatasetProfileClass(
                rowCount=2000,
                columnCount=15,
                timestampMillis=1626995099686,
            ),
        )
        kafka_sink = DatahubKafkaSink.create(
            {"connection": {"bootstrap": "localhost:9092"}},
            PipelineContext(run_id="test"),
        )
        kafka_sink.write_record_async(
            RecordEnvelope(record=mcp, metadata={}), mock_callback
        )
        kafka_sink.close()
        assert mock_producer.call_count == 2  # constructor should be called
Example #23
def test_add_dataset_properties(mock_time):
    dataset_mce = make_dataset_with_properties()

    transformer = AddDatasetProperties.create(
        {
            "add_properties_resolver_class":
            "tests.unit.test_transform_dataset.DummyPropertiesResolverClass"
        },
        PipelineContext(run_id="test-properties"),
    )

    outputs = list(
        transformer.transform(
            [RecordEnvelope(input, metadata={}) for input in [dataset_mce]]))
    assert len(outputs) == 1

    custom_properties = builder.get_aspect_if_available(
        outputs[0].record, models.DatasetPropertiesClass)

    assert custom_properties is not None
    assert custom_properties.customProperties == {
        **EXISTING_PROPERTIES,
        **PROPERTIES_TO_ADD,
    }
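test_add_dataset_properties points add_properties_resolver_class at a resolver defined in the test module. A hedged sketch of what DummyPropertiesResolverClass could look like; the get_properties_to_add hook name and signature are assumptions about the resolver base class, and PROPERTIES_TO_ADD is a placeholder value.

from typing import Dict

from datahub.ingestion.transformer.add_dataset_properties import (
    AddDatasetPropertiesResolverBase,
)

# Assumed values; the test only checks that these end up merged on top of EXISTING_PROPERTIES.
PROPERTIES_TO_ADD = {"my_new_property": "new value"}


class DummyPropertiesResolverClass(AddDatasetPropertiesResolverBase):
    # Hook name and signature are an assumption.
    def get_properties_to_add(self, current) -> Dict[str, str]:
        return PROPERTIES_TO_ADD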
Example #24
def create_test_data(test_file):
    assertion_urn = "urn:li:assertion:2d3b06a6e77e1f24adc9860a05ea089b"
    dataset_urn = make_dataset_urn(platform="postgres", name="foo")
    assertion_info = AssertionInfoClass(
        type=AssertionTypeClass.DATASET,
        customProperties={"suite_name": "demo_suite"},
        datasetAssertion=DatasetAssertionInfoClass(
            fields=[make_schema_field_urn(dataset_urn, "col1")],
            dataset=dataset_urn,
            scope=DatasetAssertionScopeClass.DATASET_COLUMN,
            operator=AssertionStdOperatorClass.LESS_THAN,
            nativeType="column_value_is_less_than",
            aggregation=AssertionStdAggregation.IDENTITY,
            nativeParameters={"max_value": "99"},
        ),
    )
    # The assertion definition
    mcp1 = MetadataChangeProposalWrapper(
        entityType="assertion",
        changeType="UPSERT",
        entityUrn=assertion_urn,
        aspectName="assertionInfo",
        aspect=assertion_info,
    )
    timestamps = [
        1643794280350,
        1643794280352,
        1643794280354,
        1643880726872,
        1643880726874,
        1643880726875,
    ]
    msg_ids = []
    # The assertion run event attached to the dataset
    mcp2 = MetadataChangeProposalWrapper(
        entityType="assertion",
        entityUrn=assertion_urn,
        changeType="UPSERT",
        aspectName="assertionRunEvent",
        aspect=AssertionRunEventClass(
            timestampMillis=timestamps[0],
            partitionSpec=PartitionSpecClass(
                partition="[{'country': 'IN'}]",
                type=PartitionTypeClass.PARTITION,
            ),
            messageId=str(timestamps[0]),
            assertionUrn=assertion_urn,
            asserteeUrn=dataset_urn,
            result=AssertionResultClass(
                type=AssertionResultTypeClass.SUCCESS,
                actualAggValue=90,
                externalUrl="http://example.com/uuid1",
            ),
            runId="uuid1",
            status=AssertionRunStatusClass.COMPLETE,
        ),
    )

    mcp3 = MetadataChangeProposalWrapper(
        entityType="assertion",
        entityUrn=assertion_urn,
        changeType="UPSERT",
        aspectName="assertionRunEvent",
        aspect=AssertionRunEventClass(
            timestampMillis=timestamps[1],
            partitionSpec=PartitionSpecClass(
                partition="[{'country': 'US'}]",
                type=PartitionTypeClass.PARTITION,
            ),
            messageId=str(timestamps[1]),
            assertionUrn=assertion_urn,
            asserteeUrn=dataset_urn,
            result=AssertionResultClass(
                type=AssertionResultTypeClass.FAILURE,
                actualAggValue=101,
                externalUrl="http://example.com/uuid1",
            ),
            runId="uuid1",
            status=AssertionRunStatusClass.COMPLETE,
        ),
    )
    # Result of evaluating this assertion on the whole dataset
    mcp4 = MetadataChangeProposalWrapper(
        entityType="assertion",
        entityUrn=assertion_urn,
        changeType="UPSERT",
        aspectName="assertionRunEvent",
        aspect=AssertionRunEventClass(
            timestampMillis=timestamps[2],
            partitionSpec=PartitionSpecClass(
                partition="FULL_TABLE_SNAPSHOT",
                type=PartitionTypeClass.FULL_TABLE,
            ),
            messageId=str(timestamps[2]),
            assertionUrn=assertion_urn,
            asserteeUrn=dataset_urn,
            result=AssertionResultClass(
                type=AssertionResultTypeClass.SUCCESS,
                actualAggValue=93,
                externalUrl="http://example.com/uuid1",
            ),
            runId="uuid1",
            status=AssertionRunStatusClass.COMPLETE,
        ),
    )

    mcp5 = MetadataChangeProposalWrapper(
        entityType="assertion",
        entityUrn=assertion_urn,
        changeType="UPSERT",
        aspectName="assertionRunEvent",
        aspect=AssertionRunEventClass(
            timestampMillis=timestamps[3],
            partitionSpec=PartitionSpecClass(
                partition="[{'country': 'IN'}]",
                type=PartitionTypeClass.PARTITION,
            ),
            messageId=str(timestamps[3]),
            assertionUrn=assertion_urn,
            asserteeUrn=dataset_urn,
            result=AssertionResultClass(
                type=AssertionResultTypeClass.SUCCESS,
                actualAggValue=90,
                externalUrl="http://example.com/uuid1",
            ),
            runId="uuid1",
            status=AssertionRunStatusClass.COMPLETE,
        ),
    )
    mcp6 = MetadataChangeProposalWrapper(
        entityType="assertion",
        entityUrn=assertion_urn,
        changeType="UPSERT",
        aspectName="assertionRunEvent",
        aspect=AssertionRunEventClass(
            timestampMillis=timestamps[4],
            partitionSpec=PartitionSpecClass(
                partition="[{'country': 'US'}]",
                type=PartitionTypeClass.PARTITION,
            ),
            messageId=str(timestamps[4]),
            assertionUrn=assertion_urn,
            asserteeUrn=dataset_urn,
            result=AssertionResultClass(
                type=AssertionResultTypeClass.FAILURE,
                actualAggValue=101,
                externalUrl="http://example.com/uuid1",
            ),
            runId="uuid1",
            status=AssertionRunStatusClass.COMPLETE,
        ),
    )

    # Result of evaluating this assertion on the whole dataset
    mcp7 = MetadataChangeProposalWrapper(
        entityType="assertion",
        entityUrn=assertion_urn,
        changeType="UPSERT",
        aspectName="assertionRunEvent",
        aspect=AssertionRunEventClass(
            timestampMillis=timestamps[5],
            partitionSpec=PartitionSpecClass(
                partition="FULL_TABLE_SNAPSHOT",
                type=PartitionTypeClass.FULL_TABLE,
            ),
            messageId=str(timestamps[5]),
            assertionUrn=assertion_urn,
            asserteeUrn=dataset_urn,
            result=AssertionResultClass(
                type=AssertionResultTypeClass.SUCCESS,
                actualAggValue=93,
                externalUrl="http://example.com/uuid1",
            ),
            runId="uuid1",
            status=AssertionRunStatusClass.COMPLETE,
        ),
    )

    fileSink: FileSink = FileSink.create(
        FileSinkConfig(filename=test_file),
        ctx=PipelineContext(run_id="test-file"))
    for mcp in [mcp1, mcp2, mcp3, mcp4, mcp5, mcp6, mcp7]:
        fileSink.write_record_async(RecordEnvelope(record=mcp, metadata={}),
                                    write_callback=NoopWriteCallback())
    fileSink.close()
Example #25
def test_pattern_dataset_ownership_tranformation(mock_time):
    no_owner_aspect = make_generic_dataset()

    with_owner_aspect = models.MetadataChangeEventClass(
        proposedSnapshot=models.DatasetSnapshotClass(
            urn="urn:li:dataset:(urn:li:dataPlatform:bigquery,example2,PROD)",
            aspects=[
                models.OwnershipClass(
                    owners=[
                        models.OwnerClass(
                            owner=builder.make_user_urn("fake_owner"),
                            type=models.OwnershipTypeClass.DATAOWNER,
                        ),
                    ],
                    lastModified=models.AuditStampClass(
                        time=1625266033123, actor="urn:li:corpuser:datahub"
                    ),
                )
            ],
        ),
    )

    not_a_dataset = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataJobSnapshotClass(
            urn="urn:li:dataJob:(urn:li:dataFlow:(airflow,dag_abc,PROD),task_456)",
            aspects=[
                models.DataJobInfoClass(
                    name="User Deletions",
                    description="Constructs the fct_users_deleted from logging_events",
                    type=models.AzkabanJobTypeClass.SQL,
                )
            ],
        )
    )

    inputs = [
        no_owner_aspect,
        with_owner_aspect,
        not_a_dataset,
    ]

    transformer = PatternAddDatasetOwnership.create(
        {
            "owner_pattern": {
                "rules": {
                    ".*example1.*": [builder.make_user_urn("person1")],
                    ".*example2.*": [builder.make_user_urn("person2")],
                }
            },
        },
        PipelineContext(run_id="test"),
    )

    outputs = list(
        transformer.transform([RecordEnvelope(input, metadata={}) for input in inputs])
    )

    assert len(outputs) == len(inputs)

    # Check the first entry.
    first_ownership_aspect = builder.get_aspect_if_available(
        outputs[0].record, models.OwnershipClass
    )
    assert first_ownership_aspect
    assert len(first_ownership_aspect.owners) == 1

    # Check the second entry.
    second_ownership_aspect = builder.get_aspect_if_available(
        outputs[1].record, models.OwnershipClass
    )
    assert second_ownership_aspect
    assert len(second_ownership_aspect.owners) == 2

    # Verify that the third entry is unchanged.
    assert inputs[2] == outputs[2].record
Example #26
def generate(
    schema_files: List[str],
    server: Optional[str],
    file: Optional[str],
    dot: Optional[str],
    png: Optional[str],
    extra_docs: Optional[str],
) -> None:
    logger.info(f"server = {server}")
    logger.info(f"file = {file}")
    logger.info(f"dot = {dot}")
    logger.info(f"png = {png}")

    entity_extra_docs = {}
    if extra_docs:
        for path in glob.glob(f"{extra_docs}/**/*.md", recursive=True):
            m = re.search("/docs/entities/(.*)/*.md", path)
            if m:
                entity_name = m.group(1)
                with open(path, "r") as doc_file:
                    file_contents = doc_file.read()
                    final_markdown = preprocess_markdown(file_contents)
                    entity_extra_docs[entity_name] = final_markdown

    for schema_file in schema_files:
        if schema_file.endswith(".yml") or schema_file.endswith(".yaml"):
            # registry file
            load_registry_file(schema_file)
        else:
            # schema file
            load_schema_file(schema_file)

    if entity_extra_docs:
        for entity_name in entity_extra_docs:

            entity_registry.get(
                entity_name).doc_file_contents = entity_extra_docs[entity_name]

    relationship_graph = RelationshipGraph()
    events = generate_stitched_record(relationship_graph)

    generated_docs_dir = "../docs/generated/metamodel"
    import shutil

    shutil.rmtree(f"{generated_docs_dir}/entities", ignore_errors=True)
    entity_names = [(x, entity_registry.get(x))
                    for x in generated_documentation]

    sorted_entity_names = get_sorted_entity_names(entity_names)

    index = 0
    for category, sorted_entities in sorted_entity_names:
        for entity_name in sorted_entities:
            entity_def = entity_registry.get(entity_name)

            entity_category = entity_def.category
            entity_dir = f"{generated_docs_dir}/entities/"
            import os

            os.makedirs(entity_dir, exist_ok=True)

            with open(f"{entity_dir}/{entity_name}.md", "w") as fp:
                fp.write("---\n")
                fp.write(f"sidebar_position: {index}\n")
                fp.write("---\n")
                fp.write(generated_documentation[entity_name])
                index += 1

    if file:
        logger.info(f"Will write events to {file}")
        Path(file).parent.mkdir(parents=True, exist_ok=True)
        fileSink = FileSink(
            PipelineContext(run_id="generated-metaModel"),
            FileSinkConfig(filename=file),
        )
        for e in events:
            fileSink.write_record_async(RecordEnvelope(e, metadata={}),
                                        write_callback=NoopWriteCallback())
        fileSink.close()
        pipeline_config = {
            "source": {
                "type": "file",
                "config": {
                    "filename": file
                },
            },
            "sink": {
                "type": "datahub-rest",
                "config": {
                    "server": "${DATAHUB_SERVER:-http://localhost:8080}",
                    "token": "${DATAHUB_TOKEN:-}",
                },
            },
            "run_id": "modeldoc-generated",
        }
        pipeline_file = Path(file).parent.absolute() / "pipeline.yml"
        with open(pipeline_file, "w") as f:
            json.dump(pipeline_config, f, indent=2)
            logger.info(f"Wrote pipeline to {pipeline_file}")

    if server:
        logger.info(f"Will send events to {server}")
        assert server.startswith(
            "http://"), "server address must start with http://"
        emitter = DatahubRestEmitter(gms_server=server)
        emitter.test_connection()
        for e in events:
            emitter.emit(e)

    if dot:
        logger.info(f"Will write dot file to {dot}")

        import pydot

        graph = pydot.Dot("my_graph", graph_type="graph")
        for node, adjacency in relationship_graph.map.items():
            my_node = pydot.Node(
                node,
                label=node,
                shape="box",
            )
            graph.add_node(my_node)
            if adjacency.self_loop:
                for relnship in adjacency.self_loop:
                    graph.add_edge(
                        pydot.Edge(src=relnship.src,
                                   dst=relnship.dst,
                                   label=relnship.name))
            if adjacency.outgoing:
                for relnship in adjacency.outgoing:
                    graph.add_edge(
                        pydot.Edge(src=relnship.src,
                                   dst=relnship.dst,
                                   label=relnship.name))
        Path(dot).parent.mkdir(parents=True, exist_ok=True)
        graph.write_raw(dot)
        if png:
            try:
                graph.write_png(png)
            except Exception as e:
                logger.error(
                    "Failed to create png file. Do you have graphviz installed?"
                )
                raise e
Example #27
def test_mark_status_dataset(tmp_path):
    dataset = make_generic_dataset()

    transformer = MarkDatasetStatus.create(
        {"removed": True},
        PipelineContext(run_id="test"),
    )
    removed = list(
        transformer.transform([
            RecordEnvelope(dataset, metadata={}),
        ]))
    assert len(removed) == 1
    status_aspect = builder.get_aspect_if_available(removed[0].record,
                                                    models.StatusClass)
    assert status_aspect
    assert status_aspect.removed is True

    transformer = MarkDatasetStatus.create(
        {"removed": False},
        PipelineContext(run_id="test"),
    )
    not_removed = list(
        transformer.transform([
            RecordEnvelope(dataset, metadata={}),
        ]))
    assert len(not_removed) == 1
    status_aspect = builder.get_aspect_if_available(not_removed[0].record,
                                                    models.StatusClass)
    assert status_aspect
    assert status_aspect.removed is False

    mcp = make_generic_dataset_mcp(
        aspect_name="datasetProperties",
        aspect=DatasetPropertiesClass(description="Test dataset"),
    )
    events_file = create_and_run_test_pipeline(
        events=[mcp],
        transformers=[{
            "type": "mark_dataset_status",
            "config": {
                "removed": True
            }
        }],
        path=tmp_path,
    )

    # assert dataset properties aspect was preserved
    assert (tests.test_helpers.mce_helpers.assert_for_each_entity(
        entity_type="dataset",
        aspect_name="datasetProperties",
        aspect_field_matcher={"description": "Test dataset"},
        file=events_file,
    ) == 1)

    # assert Status aspect was generated
    assert (tests.test_helpers.mce_helpers.assert_for_each_entity(
        entity_type="dataset",
        aspect_name="status",
        aspect_field_matcher={"removed": True},
        file=events_file,
    ) == 1)

    # MCE only
    test_aspect = DatasetPropertiesClass(description="Test dataset")
    events_file = create_and_run_test_pipeline(
        events=[make_generic_dataset(aspects=[test_aspect])],
        transformers=[{
            "type": "mark_dataset_status",
            "config": {
                "removed": True
            }
        }],
        path=tmp_path,
    )

    # assert dataset properties aspect was preserved
    assert (tests.test_helpers.mce_helpers.assert_entity_mce_aspect(
        entity_urn=mcp.entityUrn or "",
        aspect=test_aspect,
        aspect_type=DatasetPropertiesClass,
        file=events_file,
    ) == 1)

    # assert Status aspect was generated
    assert (tests.test_helpers.mce_helpers.assert_for_each_entity(
        entity_type="dataset",
        aspect_name="status",
        aspect_field_matcher={"removed": True},
        file=events_file,
    ) == 1)

    # MCE (non-matching) + MCP (matching)
    test_aspect = DatasetPropertiesClass(description="Test dataset")
    events_file = create_and_run_test_pipeline(
        events=[
            make_generic_dataset(aspects=[test_aspect]),
            make_generic_dataset_mcp(),
        ],
        transformers=[{
            "type": "mark_dataset_status",
            "config": {
                "removed": True
            }
        }],
        path=tmp_path,
    )

    # assert dataset properties aspect was preserved
    assert (tests.test_helpers.mce_helpers.assert_entity_mce_aspect(
        entity_urn=mcp.entityUrn or "",
        aspect=test_aspect,
        aspect_type=DatasetPropertiesClass,
        file=events_file,
    ) == 1)

    # assert Status aspect was generated
    assert (tests.test_helpers.mce_helpers.assert_for_each_entity(
        entity_type="dataset",
        aspect_name="status",
        aspect_field_matcher={"removed": True},
        file=events_file,
    ) == 1)

    # MCE (matching) + MCP (non-matching)
    test_status_aspect = StatusClass(removed=False)
    events_file = create_and_run_test_pipeline(
        events=[
            make_generic_dataset(aspects=[test_status_aspect]),
            make_generic_dataset_mcp(
                aspect_name="datasetProperties",
                aspect=DatasetPropertiesClass(description="test dataset"),
            ),
        ],
        transformers=[{
            "type": "mark_dataset_status",
            "config": {
                "removed": True
            }
        }],
        path=tmp_path,
    )

    # assert MCE was transformed
    assert (tests.test_helpers.mce_helpers.assert_entity_mce_aspect(
        entity_urn=mcp.entityUrn or "",
        aspect=StatusClass(removed=True),
        aspect_type=StatusClass,
        file=events_file,
    ) == 1)

    # assert MCP aspect was preserved
    assert (tests.test_helpers.mce_helpers.assert_for_each_entity(
        entity_type="dataset",
        aspect_name="datasetProperties",
        aspect_field_matcher={"description": "test dataset"},
        file=events_file,
    ) == 1)

    # MCE (non-matching) + MCP (non-matching)
    test_mcp_aspect = GlobalTagsClass(
        tags=[TagAssociationClass(tag="urn:li:tag:test")])
    test_dataset_props_aspect = DatasetPropertiesClass(
        description="Test dataset")
    events_file = create_and_run_test_pipeline(
        events=[
            make_generic_dataset(aspects=[test_dataset_props_aspect]),
            make_generic_dataset_mcp(aspect_name="globalTags",
                                     aspect=test_mcp_aspect),
        ],
        transformers=[{
            "type": "mark_dataset_status",
            "config": {
                "removed": True
            }
        }],
        path=tmp_path,
    )

    # assert MCE was preserved
    assert (tests.test_helpers.mce_helpers.assert_entity_mce_aspect(
        entity_urn=mcp.entityUrn or "",
        aspect=test_dataset_props_aspect,
        aspect_type=DatasetPropertiesClass,
        file=events_file,
    ) == 1)

    # assert MCP aspect was preserved
    assert (tests.test_helpers.mce_helpers.assert_for_each_entity(
        entity_type="dataset",
        aspect_name="globalTags",
        aspect_field_matcher={"tags": [{
            "tag": "urn:li:tag:test"
        }]},
        file=events_file,
    ) == 1)

    # assert MCP Status aspect was generated
    assert (tests.test_helpers.mce_helpers.assert_for_each_entity(
        entity_type="dataset",
        aspect_name="status",
        aspect_field_matcher={"removed": True},
        file=events_file,
    ) == 1)
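test_mark_status_dataset relies on a create_and_run_test_pipeline helper that is not shown in these examples. One plausible way to build such a helper is to round-trip the events through a file source and file sink with Pipeline.create, as sketched below; the real helper may well be implemented differently (for example with a mocked in-memory source), so treat this as an illustration only.

from pathlib import Path
from typing import Any, Dict, List

from datahub.ingestion.api.common import PipelineContext, RecordEnvelope
from datahub.ingestion.api.sink import NoopWriteCallback
from datahub.ingestion.run.pipeline import Pipeline
from datahub.ingestion.sink.file import FileSink, FileSinkConfig


def create_and_run_test_pipeline(
    events: List[Any], transformers: List[Dict[str, Any]], path: Path
) -> str:
    # Serialize the input events to a file that a file source can read back.
    input_file = str(path / "input_events.json")
    sink = FileSink(
        PipelineContext(run_id="test-input"), FileSinkConfig(filename=input_file)
    )
    for event in events:
        sink.write_record_async(
            RecordEnvelope(record=event, metadata={}),
            write_callback=NoopWriteCallback(),
        )
    sink.close()

    # Run file source -> configured transformers -> file sink and return the output path.
    output_file = str(path / "output_events.json")
    pipeline = Pipeline.create(
        {
            "run_id": "test-pipeline",
            "source": {"type": "file", "config": {"filename": input_file}},
            "transformers": transformers,
            "sink": {"type": "file", "config": {"filename": output_file}},
        }
    )
    pipeline.run()
    pipeline.raise_from_status()
    return output_file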
Example #28
def test_add_dataset_browse_paths():
    dataset = make_generic_dataset()

    transformer = AddDatasetBrowsePathTransformer.create(
        {"path_templates": ["/abc"]},
        PipelineContext(run_id="test"),
    )
    transformed = list(
        transformer.transform([
            RecordEnvelope(dataset, metadata={}),
            RecordEnvelope(EndOfStream(), metadata={}),
        ]))
    browse_path_aspect = transformed[1].record.aspect
    assert browse_path_aspect
    assert browse_path_aspect.paths == ["/abc"]

    # use an mce with a pre-existing browse path
    dataset_mce = make_generic_dataset(
        aspects=[StatusClass(removed=False), browse_path_aspect])

    transformer = AddDatasetBrowsePathTransformer.create(
        {
            "path_templates": [
                "/PLATFORM/foo/DATASET_PARTS/ENV",
                "/ENV/PLATFORM/bar/DATASET_PARTS/",
            ]
        },
        PipelineContext(run_id="test"),
    )
    transformed = list(
        transformer.transform([
            RecordEnvelope(dataset_mce, metadata={}),
            RecordEnvelope(EndOfStream(), metadata={}),
        ]))
    assert len(transformed) == 2
    browse_path_aspect = builder.get_aspect_if_available(
        transformed[0].record, BrowsePathsClass)
    assert browse_path_aspect
    assert browse_path_aspect.paths == [
        "/abc",
        "/bigquery/foo/example1/prod",
        "/prod/bigquery/bar/example1/",
    ]

    transformer = AddDatasetBrowsePathTransformer.create(
        {
            "path_templates": [
                "/xyz",
            ],
            "replace_existing": True,
        },
        PipelineContext(run_id="test"),
    )
    transformed = list(
        transformer.transform([
            RecordEnvelope(dataset_mce, metadata={}),
            RecordEnvelope(EndOfStream(), metadata={}),
        ]))
    assert len(transformed) == 2
    browse_path_aspect = builder.get_aspect_if_available(
        transformed[0].record, BrowsePathsClass)
    assert browse_path_aspect
    assert browse_path_aspect.paths == [
        "/xyz",
    ]
Example #29
def generate(schema_files: List[str], server: Optional[str],
             file: Optional[str], dot: Optional[str],
             png: Optional[str]) -> None:
    logger.info(f"server = {server}")
    logger.info(f"file = {file}")
    logger.info(f"dot = {dot}")
    logger.info(f"png = {png}")

    for schema_file in schema_files:
        if schema_file.endswith(".yml") or schema_file.endswith(".yaml"):
            # registry file
            load_registry_file(schema_file)
        else:
            # schema file
            load_schema_file(schema_file)

    relationship_graph = RelationshipGraph()
    events = generate_stitched_record(relationship_graph)

    if file:
        logger.info(f"Will write events to {file}")
        Path(file).parent.mkdir(parents=True, exist_ok=True)
        fileSink = FileSink(
            PipelineContext(run_id="generated-metaModel"),
            FileSinkConfig(filename=file),
        )
        for e in events:
            fileSink.write_record_async(RecordEnvelope(e, metadata={}),
                                        write_callback=NoopWriteCallback())
        fileSink.close()
        pipeline_config = {
            "source": {
                "type": "file",
                "config": {
                    "filename": file
                },
            },
            "sink": {
                "type": "datahub-rest",
                "config": {
                    "server": "${DATAHUB_SERVER:-http://localhost:8080}",
                    "token": "${DATAHUB_TOKEN:-}",
                },
            },
            "run_id": "modeldoc-generated",
        }
        pipeline_file = Path(file).parent.absolute() / "pipeline.yml"
        with open(pipeline_file, "w") as f:
            json.dump(pipeline_config, f, indent=2)
            logger.info(f"Wrote pipeline to {pipeline_file}")

    if server:
        logger.info(f"Will send events to {server}")
        assert server.startswith(
            "http://"), "server address must start with http://"
        emitter = DatahubRestEmitter(gms_server=server)
        emitter.test_connection()
        for e in events:
            emitter.emit(e)

    if dot:
        logger.info(f"Will write dot file to {dot}")

        import pydot

        graph = pydot.Dot("my_graph", graph_type="graph")
        for node, adjacency in relationship_graph.map.items():
            my_node = pydot.Node(
                node,
                label=node,
                shape="box",
            )
            graph.add_node(my_node)
            if adjacency.self_loop:
                for relnship in adjacency.self_loop:
                    graph.add_edge(
                        pydot.Edge(src=relnship.src,
                                   dst=relnship.dst,
                                   label=relnship.name))
            if adjacency.outgoing:
                for relnship in adjacency.outgoing:
                    graph.add_edge(
                        pydot.Edge(src=relnship.src,
                                   dst=relnship.dst,
                                   label=relnship.name))
        Path(dot).parent.mkdir(parents=True, exist_ok=True)
        graph.write_raw(dot)
        if png:
            try:
                graph.write_png(png)
            except Exception as e:
                logger.error(
                    "Failed to create png file. Do you have graphviz installed?"
                )
                raise e