def test_add_dataset_browse_paths():
    """Check that configured path templates show up in the BrowsePaths aspect.

    The same dataset object is pushed through two differently configured
    transformers, one after the other.
    """
    dataset = make_generic_dataset()

    def browse_paths_after(config):
        # Build a transformer from `config`, run the shared dataset through it
        # and return the BrowsePaths aspect of the first output record.
        transformer = AddDatasetBrowsePathTransformer.create(
            config,
            PipelineContext(run_id="test"),
        )
        envelopes = list(
            transformer.transform([RecordEnvelope(dataset, metadata={})])
        )
        return builder.get_aspect_if_available(
            envelopes[0].record, models.BrowsePathsClass
        )

    # A literal template is expected verbatim.
    literal_aspect = browse_paths_after({"path_templates": ["/abc"]})
    assert literal_aspect
    assert literal_aspect.paths == ["/abc"]

    # Templates with PLATFORM / DATASET_PARTS / ENV placeholders.
    # NOTE(review): "/abc" is still expected first, presumably because the
    # first pass left it on the shared dataset object -- confirm.
    templated_aspect = browse_paths_after(
        {
            "path_templates": [
                "/PLATFORM/foo/DATASET_PARTS/ENV",
                "/ENV/PLATFORM/bar/DATASET_PARTS/",
            ]
        }
    )
    assert templated_aspect
    assert templated_aspect.paths == [
        "/abc",
        "/bigquery/foo/example1/prod",
        "/prod/bigquery/bar/example1/",
    ]
def test_simple_dataset_ownership_tranformation(mock_time):
    """Run SimpleAddDatasetOwnership over two datasets and one data job.

    Expects two owners on the dataset built by ``make_generic_dataset``, three
    on the one built by ``make_dataset_with_owner``, and the data job record
    to compare equal to its input.
    """
    dataset_without_owner = make_generic_dataset()
    dataset_with_owner = make_dataset_with_owner()

    job_info = models.DataJobInfoClass(
        name="User Deletions",
        description="Constructs the fct_users_deleted from logging_events",
        type=models.AzkabanJobTypeClass.SQL,
    )
    job_snapshot = models.DataJobSnapshotClass(
        urn="urn:li:dataJob:(urn:li:dataFlow:(airflow,dag_abc,PROD),task_456)",
        aspects=[job_info],
    )
    data_job_mce = models.MetadataChangeEventClass(proposedSnapshot=job_snapshot)

    records = [dataset_without_owner, dataset_with_owner, data_job_mce]

    owner_urns = [
        builder.make_user_urn("person1"),
        builder.make_user_urn("person2"),
    ]
    transformer = SimpleAddDatasetOwnership.create(
        {"owner_urns": owner_urns},
        PipelineContext(run_id="test"),
    )

    envelopes = [RecordEnvelope(record, metadata={}) for record in records]
    results = list(transformer.transform(envelopes))
    assert len(results) == len(records)

    # First record: exactly the two configured owners are expected.
    ownership_first = builder.get_aspect_if_available(
        results[0].record, models.OwnershipClass
    )
    assert ownership_first
    assert len(ownership_first.owners) == 2

    # Second record: three owners are expected in total.
    ownership_second = builder.get_aspect_if_available(
        results[1].record, models.OwnershipClass
    )
    assert ownership_second
    assert len(ownership_second.owners) == 3

    # Third record (the data job) must compare equal to what went in.
    assert records[2] == results[2].record
def test_simple_dataset_tags_transformation(mock_time):
    """Run SimpleAddDatasetTags on one dataset and check the GlobalTags aspect."""
    dataset_mce = make_generic_dataset()

    tag_urns = [
        builder.make_tag_urn("NeedsDocumentation"),
        builder.make_tag_urn("Legacy"),
    ]
    transformer = SimpleAddDatasetTags.create(
        {"tag_urns": tag_urns},
        PipelineContext(run_id="test-tags"),
    )

    envelopes = [RecordEnvelope(record, metadata={}) for record in [dataset_mce]]
    results = list(transformer.transform(envelopes))
    assert len(results) == 1

    # Both configured tags are expected, in configuration order.
    global_tags = builder.get_aspect_if_available(
        results[0].record, models.GlobalTagsClass
    )
    assert global_tags
    assert len(global_tags.tags) == 2
    assert global_tags.tags[0].tag == builder.make_tag_urn("NeedsDocumentation")
def test_pattern_dataset_tags_transformation(mock_time):
    """Run PatternAddDatasetTags and check only the matching rule's tags apply.

    Two rules are configured: one keyed on ``.*example1.*`` and one keyed on
    ``.*example2.*``. The test expects exactly the two ``example1`` tags on the
    generic dataset and expects the ``example2`` tag to be absent.
    """
    dataset_mce = make_generic_dataset()

    transformer = PatternAddDatasetTags.create(
        {
            "tag_pattern": {
                "rules": {
                    ".*example1.*": [
                        builder.make_tag_urn("Private"),
                        builder.make_tag_urn("Legacy"),
                    ],
                    # This is a tag rule, so it must carry a tag URN (it used
                    # to be built with make_term_urn by mistake).
                    ".*example2.*": [builder.make_tag_urn("Needs Documentation")],
                }
            },
        },
        PipelineContext(run_id="test-tags"),
    )

    outputs = list(
        transformer.transform(
            [RecordEnvelope(record, metadata={}) for record in [dataset_mce]]
        )
    )
    assert len(outputs) == 1

    # Check that tags were added.
    tags_aspect = builder.get_aspect_if_available(
        outputs[0].record, models.GlobalTagsClass
    )
    assert tags_aspect
    assert len(tags_aspect.tags) == 2
    assert tags_aspect.tags[0].tag == builder.make_tag_urn("Private")

    # Compare URN strings with URN strings: ``tags_aspect.tags`` holds
    # association objects, so testing a plain string against that list could
    # never fail.
    applied_tag_urns = [association.tag for association in tags_aspect.tags]
    assert builder.make_tag_urn("Needs Documentation") not in applied_tag_urns
def test_pattern_dataset_terms_transformation(mock_time):
    """Run PatternAddDatasetTerms and check only the matching rule's terms apply.

    Two rules are configured: one keyed on ``.*example1.*`` and one keyed on
    ``.*example2.*``. The test expects exactly the two ``example1`` terms on
    the generic dataset and expects the ``example2`` term to be absent.
    """
    dataset_mce = make_generic_dataset()

    transformer = PatternAddDatasetTerms.create(
        {
            "term_pattern": {
                "rules": {
                    ".*example1.*": [
                        builder.make_term_urn("AccountBalance"),
                        builder.make_term_urn("Email"),
                    ],
                    ".*example2.*": [builder.make_term_urn("Address")],
                }
            },
        },
        PipelineContext(run_id="test-terms"),
    )

    outputs = list(
        transformer.transform(
            [RecordEnvelope(record, metadata={}) for record in [dataset_mce]]
        )
    )
    assert len(outputs) == 1

    # Check that glossary terms were added.
    terms_aspect = builder.get_aspect_if_available(
        outputs[0].record, models.GlossaryTermsClass
    )
    assert terms_aspect
    assert len(terms_aspect.terms) == 2
    assert terms_aspect.terms[0].urn == builder.make_term_urn("AccountBalance")

    # The term from the non-matching rule must be absent. Compare URN strings
    # with URN strings: ``terms_aspect.terms`` holds association objects, so
    # the previous string-vs-object membership test was vacuous (and named a
    # term that the line above asserts is present).
    applied_term_urns = [association.urn for association in terms_aspect.terms]
    assert builder.make_term_urn("Address") not in applied_term_urns
def test_simple_add_dataset_properties(mock_time):
    """Run SimpleAddDatasetProperties and check the merged custom properties.

    The resulting ``customProperties`` are expected to equal
    ``EXISTING_PROPERTIES`` overlaid with the newly configured property.
    """
    dataset_mce = make_dataset_with_properties()
    extra_properties = {"new-simple-property": "new-value"}

    transformer = SimpleAddDatasetProperties.create(
        {"properties": extra_properties},
        PipelineContext(run_id="test-simple-properties"),
    )

    envelopes = [RecordEnvelope(record, metadata={}) for record in [dataset_mce]]
    results = list(transformer.transform(envelopes))
    assert len(results) == 1

    properties_aspect = builder.get_aspect_if_available(
        results[0].record, models.DatasetPropertiesClass
    )
    # Emits the aspect to stdout; shown by pytest when the test fails.
    print(str(properties_aspect))
    assert properties_aspect is not None

    expected_properties = {**EXISTING_PROPERTIES, **extra_properties}
    assert properties_aspect.customProperties == expected_properties
def test_pattern_dataset_ownership_with_type_transformation(mock_time):
    """Run PatternAddDatasetOwnership with an explicit ``ownership_type``.

    Expects a single owner whose type is ``PRODUCER``.
    """
    dataset_mce = make_generic_dataset()

    config = {
        "owner_pattern": {
            "rules": {
                ".*example1.*": [builder.make_user_urn("person1")],
            }
        },
        "ownership_type": "PRODUCER",
    }
    transformer = PatternAddDatasetOwnership.create(
        config,
        PipelineContext(run_id="test"),
    )

    results = list(transformer.transform([RecordEnvelope(dataset_mce, metadata={})]))
    assert len(results) == 1

    ownership = builder.get_aspect_if_available(
        results[0].record, models.OwnershipClass
    )
    assert ownership
    assert len(ownership.owners) == 1

    # The configured ownership type must be carried onto the added owner.
    sole_owner = ownership.owners[0]
    assert sole_owner.type == models.OwnershipTypeClass.PRODUCER
def test_old_transformers_working_as_before(mock_time):
    """Run OldMCETransformer over a mixed MCE/MCP stream and an MCP-only stream.

    In both cases three envelopes are expected back, the last one carrying an
    ``EndOfStream`` record.
    """

    def run_old_transformer(records):
        # A fresh transformer per stream, each record wrapped in an envelope.
        transformer = OldMCETransformer.create(
            {},
            PipelineContext(run_id="test-old-transformer"),
        )
        envelopes = [RecordEnvelope(record, metadata={}) for record in records]
        return list(transformer.transform(envelopes))

    # --- Mixed stream: one MCE, one MCP, then end-of-stream ---
    dataset_mce = make_generic_dataset()
    dataset_mcp = make_generic_dataset_mcp()
    mixed_results = run_old_transformer([dataset_mce, dataset_mcp, EndOfStream()])
    assert len(mixed_results) == 3

    # The first output record compares equal to the MCE that was sent in.
    assert mixed_results[0].record == dataset_mce

    # The MCE must now carry the description set by the old transformer.
    properties_aspect = builder.get_aspect_if_available(
        mixed_results[0].record, DatasetPropertiesClass
    )
    assert properties_aspect
    assert properties_aspect.description == "Old Transformer was here"

    # The MCP comes back equal to its input, followed by end-of-stream.
    assert mixed_results[1].record == dataset_mcp
    assert isinstance(mixed_results[-1].record, EndOfStream)

    # --- MCP-only stream ---
    mcp_stream = [
        make_generic_dataset_mcp(),
        make_generic_dataset_mcp(
            aspect=DatasetPropertiesClass(description="Another test MCP")
        ),
        EndOfStream(),
    ]
    mcp_results = run_old_transformer(mcp_stream)
    assert len(mcp_results) == 3

    # Both MCPs come back equal to their inputs.
    assert mcp_results[0].record == mcp_stream[0]
    assert mcp_results[1].record == mcp_stream[1]
    assert isinstance(mcp_results[-1].record, EndOfStream)
def test_mark_status_dataset():
    """Run MarkDatasetStatus with ``removed`` True, then False, on one dataset.

    After each pass the Status aspect of the output record must carry the
    configured flag.
    """
    dataset = make_generic_dataset()

    # Same dataset object for both passes, first removed=True then False.
    for removed_flag in (True, False):
        transformer = MarkDatasetStatus.create(
            {"removed": removed_flag},
            PipelineContext(run_id="test"),
        )
        results = list(
            transformer.transform([RecordEnvelope(dataset, metadata={})])
        )

        status = builder.get_aspect_if_available(
            results[0].record, models.StatusClass
        )
        assert status
        assert status.removed is removed_flag
def test_simple_remove_dataset_ownership():
    """Run SimpleRemoveDatasetOwnership and expect an empty owners list.

    The Ownership aspect itself must still be present on the output record.
    """
    owned_dataset = make_dataset_with_owner()

    transformer = SimpleRemoveDatasetOwnership.create(
        {},
        PipelineContext(run_id="test"),
    )
    envelopes = [RecordEnvelope(owned_dataset, metadata={})]
    results = list(transformer.transform(envelopes))

    ownership = builder.get_aspect_if_available(
        results[0].record, models.OwnershipClass
    )
    assert ownership
    assert len(ownership.owners) == 0
def get_lineage_if_enabled(
    self, mce: MetadataChangeEventClass
) -> Optional[MetadataChangeProposalWrapper]:
    """Build an upstream-lineage proposal linking a dataset to its S3 location.

    A proposal is produced only when ``source_config.emit_s3_lineage`` is set,
    the MCE has a dataset-properties aspect with a ``"Location"`` custom
    property, and that location starts with ``s3://``.

    Args:
        mce: Event whose dataset-properties aspect and snapshot URN are read.

    Returns:
        An ``upstreamLineage`` UPSERT proposal, or ``None`` when any of the
        conditions above is not met. When
        ``source_config.glue_s3_lineage_direction`` is ``"upstream"`` the
        proposal targets the MCE's URN with the S3 dataset as upstream;
        otherwise it targets the S3 dataset URN with the MCE's URN as upstream.
    """
    if not self.source_config.emit_s3_lineage:
        return None

    # Extract the dataset properties aspect, if the MCE has one.
    properties: Optional[
        DatasetPropertiesClass
    ] = mce_builder.get_aspect_if_available(mce, DatasetPropertiesClass)
    if not properties or "Location" not in properties.customProperties:
        return None

    location = properties.customProperties["Location"]
    if not location.startswith("s3://"):
        return None

    s3_dataset_urn = make_s3_urn(location, self.source_config.env)

    if self.source_config.glue_s3_lineage_direction == "upstream":
        # S3 feeds the dataset described by the MCE.
        upstream_urn = s3_dataset_urn
        target_urn = mce.proposedSnapshot.urn
    else:
        # Reverse direction: the S3 dataset gets the MCE's dataset as upstream.
        upstream_urn = mce.proposedSnapshot.urn
        target_urn = s3_dataset_urn

    lineage_aspect = UpstreamLineageClass(
        upstreams=[
            UpstreamClass(
                dataset=upstream_urn,
                type=DatasetLineageTypeClass.COPY,
            )
        ]
    )
    return MetadataChangeProposalWrapper(
        entityType="dataset",
        entityUrn=target_urn,
        changeType=ChangeTypeClass.UPSERT,
        aspectName="upstreamLineage",
        aspect=lineage_aspect,
    )
def test_simple_dataset_terms_transformation(mock_time):
    """Run SimpleAddDatasetTerms on one dataset and check the GlossaryTerms aspect."""
    dataset_mce = make_generic_dataset()

    term_urns = [
        builder.make_term_urn("Test"),
        builder.make_term_urn("Needs Review"),
    ]
    transformer = SimpleAddDatasetTerms.create(
        {"term_urns": term_urns},
        PipelineContext(run_id="test-terms"),
    )

    envelopes = [RecordEnvelope(record, metadata={}) for record in [dataset_mce]]
    results = list(transformer.transform(envelopes))
    assert len(results) == 1

    # Both configured glossary terms are expected, in configuration order.
    glossary_terms = builder.get_aspect_if_available(
        results[0].record, models.GlossaryTermsClass
    )
    assert glossary_terms
    assert len(glossary_terms.terms) == 2
    assert glossary_terms.terms[0].urn == builder.make_term_urn("Test")
def test_add_dataset_properties(mock_time):
    """Run AddDatasetProperties configured with a resolver class path.

    The resulting ``customProperties`` are expected to equal
    ``EXISTING_PROPERTIES`` overlaid with ``PROPERTIES_TO_ADD``.
    """
    dataset_mce = make_dataset_with_properties()

    config = {
        "add_properties_resolver_class": "tests.unit.test_transform_dataset.DummyPropertiesResolverClass"
    }
    transformer = AddDatasetProperties.create(
        config,
        PipelineContext(run_id="test-properties"),
    )

    envelopes = [RecordEnvelope(record, metadata={}) for record in [dataset_mce]]
    results = list(transformer.transform(envelopes))
    assert len(results) == 1

    properties_aspect = builder.get_aspect_if_available(
        results[0].record, models.DatasetPropertiesClass
    )
    assert properties_aspect is not None

    expected_properties = {**EXISTING_PROPERTIES, **PROPERTIES_TO_ADD}
    assert properties_aspect.customProperties == expected_properties
def test_pattern_dataset_ownership_tranformation(mock_time):
    """Run PatternAddDatasetOwnership over two datasets and one data job.

    Expects one owner on the generic dataset, two on the ``example2`` dataset
    that already had an owner, and the data job record to compare equal to
    its input.
    """
    dataset_without_owner = make_generic_dataset()

    # A dataset named "example2" that already has one owner.
    existing_ownership = models.OwnershipClass(
        owners=[
            models.OwnerClass(
                owner=builder.make_user_urn("fake_owner"),
                type=models.OwnershipTypeClass.DATAOWNER,
            ),
        ],
        lastModified=models.AuditStampClass(
            time=1625266033123, actor="urn:li:corpuser:datahub"
        ),
    )
    dataset_with_owner = models.MetadataChangeEventClass(
        proposedSnapshot=models.DatasetSnapshotClass(
            urn="urn:li:dataset:(urn:li:dataPlatform:bigquery,example2,PROD)",
            aspects=[existing_ownership],
        ),
    )

    # A data job, i.e. a record that is not a dataset.
    job_info = models.DataJobInfoClass(
        name="User Deletions",
        description="Constructs the fct_users_deleted from logging_events",
        type=models.AzkabanJobTypeClass.SQL,
    )
    data_job_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataJobSnapshotClass(
            urn="urn:li:dataJob:(urn:li:dataFlow:(airflow,dag_abc,PROD),task_456)",
            aspects=[job_info],
        )
    )

    records = [dataset_without_owner, dataset_with_owner, data_job_mce]

    owner_rules = {
        ".*example1.*": [builder.make_user_urn("person1")],
        ".*example2.*": [builder.make_user_urn("person2")],
    }
    transformer = PatternAddDatasetOwnership.create(
        {
            "owner_pattern": {"rules": owner_rules},
        },
        PipelineContext(run_id="test"),
    )

    envelopes = [RecordEnvelope(record, metadata={}) for record in records]
    results = list(transformer.transform(envelopes))
    assert len(results) == len(records)

    # First record: a single owner is expected.
    ownership_first = builder.get_aspect_if_available(
        results[0].record, models.OwnershipClass
    )
    assert ownership_first
    assert len(ownership_first.owners) == 1

    # Second record: two owners are expected in total.
    ownership_second = builder.get_aspect_if_available(
        results[1].record, models.OwnershipClass
    )
    assert ownership_second
    assert len(ownership_second.owners) == 2

    # Third record (the data job) must compare equal to what went in.
    assert records[2] == results[2].record
def test_add_dataset_browse_paths():
    """Exercise AddDatasetBrowsePathTransformer in three configurations.

    1. A literal template on a plain dataset; the aspect is read from the
       second output record.
    2. Placeholder templates on an MCE that already carries a browse-path
       aspect; the expected list contains the old and new paths.
    3. ``replace_existing`` set to True; only the new path is expected.
    """

    def run_transformer(config, record):
        # Each run sends the record followed by an end-of-stream marker.
        transformer = AddDatasetBrowsePathTransformer.create(
            config,
            PipelineContext(run_id="test"),
        )
        envelopes = [
            RecordEnvelope(record, metadata={}),
            RecordEnvelope(EndOfStream(), metadata={}),
        ]
        return list(transformer.transform(envelopes))

    # Case 1: literal template on a dataset built with default aspects.
    dataset = make_generic_dataset()
    results = run_transformer({"path_templates": ["/abc"]}, dataset)
    initial_paths_aspect = results[1].record.aspect
    assert initial_paths_aspect
    assert initial_paths_aspect.paths == ["/abc"]

    # Use an MCE with a pre-existing browse path for the remaining cases.
    dataset_mce = make_generic_dataset(
        aspects=[StatusClass(removed=False), initial_paths_aspect]
    )

    # Case 2: placeholder templates on top of the existing path.
    results = run_transformer(
        {
            "path_templates": [
                "/PLATFORM/foo/DATASET_PARTS/ENV",
                "/ENV/PLATFORM/bar/DATASET_PARTS/",
            ]
        },
        dataset_mce,
    )
    assert len(results) == 2
    combined_paths_aspect = builder.get_aspect_if_available(
        results[0].record, BrowsePathsClass
    )
    assert combined_paths_aspect
    assert combined_paths_aspect.paths == [
        "/abc",
        "/bigquery/foo/example1/prod",
        "/prod/bigquery/bar/example1/",
    ]

    # Case 3: replace_existing drops whatever paths were there before.
    results = run_transformer(
        {
            "path_templates": [
                "/xyz",
            ],
            "replace_existing": True,
        },
        dataset_mce,
    )
    assert len(results) == 2
    replaced_paths_aspect = builder.get_aspect_if_available(
        results[0].record, BrowsePathsClass
    )
    assert replaced_paths_aspect
    assert replaced_paths_aspect.paths == [
        "/xyz",
    ]
def test_mark_status_dataset(tmp_path):
    """Exercise MarkDatasetStatus directly and through test pipelines.

    The first part calls the transformer directly with ``removed`` True and
    False. The rest runs pipelines with a ``mark_dataset_status`` transformer
    (``removed: True``) over different MCE/MCP combinations and inspects the
    resulting events file. Every helper check is expected to return 1.
    """
    mce_helpers = tests.test_helpers.mce_helpers

    def run_pipeline(events):
        # A fresh transformer config is built for every pipeline run.
        return create_and_run_test_pipeline(
            events=events,
            transformers=[
                {"type": "mark_dataset_status", "config": {"removed": True}}
            ],
            path=tmp_path,
        )

    def check_each_entity(aspect_name, field_matcher, events_file):
        return mce_helpers.assert_for_each_entity(
            entity_type="dataset",
            aspect_name=aspect_name,
            aspect_field_matcher=field_matcher,
            file=events_file,
        )

    def check_mce_aspect(entity_urn, aspect, aspect_type, events_file):
        return mce_helpers.assert_entity_mce_aspect(
            entity_urn=entity_urn,
            aspect=aspect,
            aspect_type=aspect_type,
            file=events_file,
        )

    # --- Direct transformer calls on one shared dataset object ---
    dataset = make_generic_dataset()
    for removed_flag in (True, False):
        transformer = MarkDatasetStatus.create(
            {"removed": removed_flag},
            PipelineContext(run_id="test"),
        )
        results = list(
            transformer.transform(
                [
                    RecordEnvelope(dataset, metadata={}),
                ]
            )
        )
        assert len(results) == 1
        status = builder.get_aspect_if_available(
            results[0].record, models.StatusClass
        )
        assert status
        assert status.removed is removed_flag

    # --- MCP only ---
    mcp = make_generic_dataset_mcp(
        aspect_name="datasetProperties",
        aspect=DatasetPropertiesClass(description="Test dataset"),
    )
    events_file = run_pipeline([mcp])

    # The dataset properties aspect was preserved.
    assert (
        check_each_entity(
            "datasetProperties", {"description": "Test dataset"}, events_file
        )
        == 1
    )
    # A Status aspect was generated.
    assert check_each_entity("status", {"removed": True}, events_file) == 1

    # --- MCE only ---
    mce_properties = DatasetPropertiesClass(description="Test dataset")
    events_file = run_pipeline([make_generic_dataset(aspects=[mce_properties])])

    # The dataset properties aspect was preserved.
    assert (
        check_mce_aspect(
            mcp.entityUrn or "", mce_properties, DatasetPropertiesClass, events_file
        )
        == 1
    )
    # A Status aspect was generated.
    assert check_each_entity("status", {"removed": True}, events_file) == 1

    # --- MCE (non-matching) + MCP (matching) ---
    mce_properties = DatasetPropertiesClass(description="Test dataset")
    events_file = run_pipeline(
        [
            make_generic_dataset(aspects=[mce_properties]),
            make_generic_dataset_mcp(),
        ]
    )

    # The dataset properties aspect was preserved.
    assert (
        check_mce_aspect(
            mcp.entityUrn or "", mce_properties, DatasetPropertiesClass, events_file
        )
        == 1
    )
    # A Status aspect was generated.
    assert check_each_entity("status", {"removed": True}, events_file) == 1

    # --- MCE (matching) + MCP (non-matching) ---
    mce_status = StatusClass(removed=False)
    events_file = run_pipeline(
        [
            make_generic_dataset(aspects=[mce_status]),
            make_generic_dataset_mcp(
                aspect_name="datasetProperties",
                aspect=DatasetPropertiesClass(description="test dataset"),
            ),
        ]
    )

    # The MCE was transformed: its Status now reads removed=True.
    assert (
        check_mce_aspect(
            mcp.entityUrn or "", StatusClass(removed=True), StatusClass, events_file
        )
        == 1
    )
    # The MCP aspect was preserved.
    assert (
        check_each_entity(
            "datasetProperties", {"description": "test dataset"}, events_file
        )
        == 1
    )

    # --- MCE (non-matching) + MCP (non-matching) ---
    mcp_tags = GlobalTagsClass(tags=[TagAssociationClass(tag="urn:li:tag:test")])
    mce_properties = DatasetPropertiesClass(description="Test dataset")
    events_file = run_pipeline(
        [
            make_generic_dataset(aspects=[mce_properties]),
            make_generic_dataset_mcp(aspect_name="globalTags", aspect=mcp_tags),
        ]
    )

    # The MCE was preserved.
    assert (
        check_mce_aspect(
            mcp.entityUrn or "", mce_properties, DatasetPropertiesClass, events_file
        )
        == 1
    )
    # The MCP aspect was preserved.
    assert (
        check_each_entity(
            "globalTags", {"tags": [{"tag": "urn:li:tag:test"}]}, events_file
        )
        == 1
    )
    # A Status aspect was generated as an MCP.
    assert check_each_entity("status", {"removed": True}, events_file) == 1