def test_old_transformers_working_as_before(mock_time):
    dataset_mce = make_generic_dataset()
    dataset_mcp = make_generic_dataset_mcp()

    transformer = OldMCETransformer.create(
        {},
        PipelineContext(run_id="test-old-transformer"),
    )

    outputs = list(
        transformer.transform(
            [
                RecordEnvelope(input, metadata={})
                for input in [dataset_mce, dataset_mcp, EndOfStream()]
            ]
        )
    )
    assert len(outputs) == 3  # MCP will come back untouched

    assert outputs[0].record == dataset_mce
    # Check that the description was updated by the transformer.
    props_aspect = builder.get_aspect_if_available(
        outputs[0].record, DatasetPropertiesClass
    )
    assert props_aspect
    assert props_aspect.description == "Old Transformer was here"

    assert outputs[1].record == dataset_mcp

    assert isinstance(outputs[-1].record, EndOfStream)

    # MCP-only stream
    dataset_mcps = [
        make_generic_dataset_mcp(),
        make_generic_dataset_mcp(
            aspect=DatasetPropertiesClass(description="Another test MCP")
        ),
        EndOfStream(),
    ]

    transformer = OldMCETransformer.create(
        {},
        PipelineContext(run_id="test-old-transformer"),
    )

    outputs = list(
        transformer.transform(
            [RecordEnvelope(input, metadata={}) for input in dataset_mcps]
        )
    )
    assert len(outputs) == 3  # MCPs will come back untouched

    assert outputs[0].record == dataset_mcps[0]
    assert outputs[1].record == dataset_mcps[1]
    assert isinstance(outputs[-1].record, EndOfStream)
def test_mcp_add_tags_missing(mock_time):
    dataset_mcp = make_generic_dataset_mcp()

    transformer = SimpleAddDatasetTags.create(
        {
            "tag_urns": [
                builder.make_tag_urn("NeedsDocumentation"),
                builder.make_tag_urn("Legacy"),
            ]
        },
        PipelineContext(run_id="test-tags"),
    )
    input_stream: List[RecordEnvelope] = [
        RecordEnvelope(input, metadata={}) for input in [dataset_mcp]
    ]
    input_stream.append(RecordEnvelope(record=EndOfStream(), metadata={}))
    outputs = list(transformer.transform(input_stream))
    assert len(outputs) == 3

    assert outputs[0].record == dataset_mcp
    # Check that tags were added; they arrive as the second result.
    tags_aspect = outputs[1].record.aspect
    assert tags_aspect
    assert len(tags_aspect.tags) == 2
    assert tags_aspect.tags[0].tag == builder.make_tag_urn("NeedsDocumentation")
    assert isinstance(outputs[-1].record, EndOfStream)
def test_pattern_dataset_terms_transformation(mock_time):
    dataset_mce = make_generic_dataset()

    transformer = PatternAddDatasetTerms.create(
        {
            "term_pattern": {
                "rules": {
                    ".*example1.*": [
                        builder.make_term_urn("AccountBalance"),
                        builder.make_term_urn("Email"),
                    ],
                    ".*example2.*": [builder.make_term_urn("Address")],
                }
            },
        },
        PipelineContext(run_id="test-terms"),
    )

    outputs = list(
        transformer.transform(
            [
                RecordEnvelope(input, metadata={})
                for input in [dataset_mce, EndOfStream()]
            ]
        )
    )
    assert len(outputs) == 3

    # Check that only the glossary terms for the matching rule (example1) were added.
    terms_aspect = outputs[1].record.aspect
    assert terms_aspect
    assert len(terms_aspect.terms) == 2
    assert terms_aspect.terms[0].urn == builder.make_term_urn("AccountBalance")
    assert builder.make_term_urn("Address") not in [
        term.urn for term in terms_aspect.terms
    ]
def test_simple_dataset_terms_transformation(mock_time):
    dataset_mce = make_generic_dataset()

    transformer = SimpleAddDatasetTerms.create(
        {
            "term_urns": [
                builder.make_term_urn("Test"),
                builder.make_term_urn("Needs Review"),
            ]
        },
        PipelineContext(run_id="test-terms"),
    )

    outputs = list(
        transformer.transform(
            [
                RecordEnvelope(input, metadata={})
                for input in [dataset_mce, EndOfStream()]
            ]
        )
    )
    assert len(outputs) == 3

    # Check that glossary terms were added.
    terms_aspect = outputs[1].record.aspect
    assert terms_aspect
    assert len(terms_aspect.terms) == 2
    assert terms_aspect.terms[0].urn == builder.make_term_urn("Test")
def test_pattern_dataset_ownership_with_type_transformation(mock_time):
    input = make_generic_dataset()

    transformer = PatternAddDatasetOwnership.create(
        {
            "owner_pattern": {
                "rules": {
                    ".*example1.*": [builder.make_user_urn("person1")],
                }
            },
            "ownership_type": "PRODUCER",
        },
        PipelineContext(run_id="test"),
    )

    output = list(
        transformer.transform(
            [
                RecordEnvelope(input, metadata={}),
                RecordEnvelope(EndOfStream(), metadata={}),
            ]
        )
    )
    assert len(output) == 3

    ownership_aspect = output[1].record.aspect
    assert ownership_aspect
    assert len(ownership_aspect.owners) == 1
    assert ownership_aspect.owners[0].type == models.OwnershipTypeClass.PRODUCER
def test_pattern_dataset_tags_transformation(mock_time):
    dataset_mce = make_generic_dataset()

    transformer = PatternAddDatasetTags.create(
        {
            "tag_pattern": {
                "rules": {
                    ".*example1.*": [
                        builder.make_tag_urn("Private"),
                        builder.make_tag_urn("Legacy"),
                    ],
                    ".*example2.*": [builder.make_tag_urn("Needs Documentation")],
                }
            },
        },
        PipelineContext(run_id="test-tags"),
    )

    outputs = list(
        transformer.transform(
            [
                RecordEnvelope(input, metadata={})
                for input in [dataset_mce, EndOfStream()]
            ]
        )
    )
    assert len(outputs) == 3

    # Only the tags for the matching rule (example1) should be added.
    tags_aspect = outputs[1].record.aspect
    assert tags_aspect
    assert len(tags_aspect.tags) == 2
    assert tags_aspect.tags[0].tag == builder.make_tag_urn("Private")
    assert builder.make_tag_urn("Needs Documentation") not in [
        tag.tag for tag in tags_aspect.tags
    ]
def test_simple_dataset_tags_transformation(mock_time):
    dataset_mce = make_generic_dataset()

    transformer = SimpleAddDatasetTags.create(
        {
            "tag_urns": [
                builder.make_tag_urn("NeedsDocumentation"),
                builder.make_tag_urn("Legacy"),
            ]
        },
        PipelineContext(run_id="test-tags"),
    )

    outputs = list(
        transformer.transform(
            [
                RecordEnvelope(input, metadata={})
                for input in [dataset_mce, EndOfStream()]
            ]
        )
    )
    assert len(outputs) == 3

    # Check that tags were added.
    tags_aspect = outputs[1].record.aspect
    assert tags_aspect
    assert len(tags_aspect.tags) == 2
    assert tags_aspect.tags[0].tag == builder.make_tag_urn("NeedsDocumentation")
def test_simple_dataset_ownership_with_type_transformation(mock_time):
    input = make_generic_dataset()

    transformer = SimpleAddDatasetOwnership.create(
        {
            "owner_urns": [
                builder.make_user_urn("person1"),
            ],
            "ownership_type": "PRODUCER",
        },
        PipelineContext(run_id="test"),
    )

    output = list(
        transformer.transform(
            [
                RecordEnvelope(input, metadata={}),
                RecordEnvelope(EndOfStream(), metadata={}),
            ]
        )
    )
    assert len(output) == 3

    # The original MCE is unchanged.
    assert input == output[0].record

    ownership_aspect = output[1].record.aspect
    assert isinstance(ownership_aspect, OwnershipClass)
    assert len(ownership_aspect.owners) == 1
    assert ownership_aspect.owners[0].type == models.OwnershipTypeClass.PRODUCER
def run(self) -> None:
    callback = LoggingCallback()
    extractor: Extractor = self.extractor_class()
    # In preview mode, only process the first 10 workunits.
    for wu in itertools.islice(
        self.source.get_workunits(), 10 if self.preview_mode else None
    ):
        # TODO: change extractor interface
        extractor.configure({}, self.ctx)

        if not self.dry_run:
            self.sink.handle_work_unit_start(wu)
        record_envelopes = extractor.get_records(wu)
        for record_envelope in self.transform(record_envelopes):
            if not self.dry_run:
                self.sink.write_record_async(record_envelope, callback)
        extractor.close()
        if not self.dry_run:
            self.sink.handle_work_unit_end(wu)
    self.source.close()

    # No more data is coming, so let the transformers that are holding on to state
    # produce any additional records.
    for record_envelope in self.transform(
        [
            RecordEnvelope(
                record=EndOfStream(), metadata={"workunit_id": "end-of-stream"}
            )
        ]
    ):
        if not self.dry_run and not isinstance(record_envelope.record, EndOfStream):
            # TODO: propagate EndOfStream and other control events to sinks,
            # to allow them to flush etc.
            self.sink.write_record_async(record_envelope, callback)

    self.sink.close()
    self.process_commits()
def test_suppression_works():
    dataset_mce = make_generic_dataset()
    dataset_mcp = make_generic_dataset_mcp(
        aspect_name="datasetProperties",
        aspect=DatasetPropertiesClass(description="suppressable description"),
    )

    transformer = SuppressingTransformer.create(
        {},
        PipelineContext(run_id="test-suppress-transformer"),
    )

    outputs = list(
        transformer.transform(
            [
                RecordEnvelope(input, metadata={})
                for input in [dataset_mce, dataset_mcp, EndOfStream()]
            ]
        )
    )

    assert len(outputs) == 2  # the MCP will be dropped
def test_pattern_dataset_ownership_transformation(mock_time):
    no_owner_aspect = make_generic_dataset()

    with_owner_aspect = models.MetadataChangeEventClass(
        proposedSnapshot=models.DatasetSnapshotClass(
            urn="urn:li:dataset:(urn:li:dataPlatform:bigquery,example2,PROD)",
            aspects=[
                models.OwnershipClass(
                    owners=[
                        models.OwnerClass(
                            owner=builder.make_user_urn("fake_owner"),
                            type=models.OwnershipTypeClass.DATAOWNER,
                        ),
                    ],
                    lastModified=models.AuditStampClass(
                        time=1625266033123, actor="urn:li:corpuser:datahub"
                    ),
                )
            ],
        ),
    )

    not_a_dataset = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataJobSnapshotClass(
            urn="urn:li:dataJob:(urn:li:dataFlow:(airflow,dag_abc,PROD),task_456)",
            aspects=[
                models.DataJobInfoClass(
                    name="User Deletions",
                    description="Constructs the fct_users_deleted from logging_events",
                    type=models.AzkabanJobTypeClass.SQL,
                )
            ],
        )
    )

    inputs = [no_owner_aspect, with_owner_aspect, not_a_dataset, EndOfStream()]

    transformer = PatternAddDatasetOwnership.create(
        {
            "owner_pattern": {
                "rules": {
                    ".*example1.*": [builder.make_user_urn("person1")],
                    ".*example2.*": [builder.make_user_urn("person2")],
                }
            },
        },
        PipelineContext(run_id="test"),
    )

    outputs = list(
        transformer.transform(
            [RecordEnvelope(input, metadata={}) for input in inputs]
        )
    )
    assert len(outputs) == len(inputs) + 1  # additional MCP due to the no-owner MCE

    # Check the first entry.
    assert inputs[0] == outputs[0].record
    first_ownership_aspect = outputs[3].record.aspect
    assert first_ownership_aspect
    assert len(first_ownership_aspect.owners) == 1
    assert all(
        owner.type == models.OwnershipTypeClass.DATAOWNER
        for owner in first_ownership_aspect.owners
    )

    # Check the second entry.
    second_ownership_aspect = builder.get_aspect_if_available(
        outputs[1].record, models.OwnershipClass
    )
    assert second_ownership_aspect
    assert len(second_ownership_aspect.owners) == 2
    assert all(
        owner.type == models.OwnershipTypeClass.DATAOWNER
        for owner in second_ownership_aspect.owners
    )

    # Verify that the third entry is unchanged.
    assert inputs[2] == outputs[2].record

    # Verify that the last entry is unchanged (EndOfStream).
    assert inputs[-1] == outputs[-1].record
def test_add_dataset_browse_paths():
    dataset = make_generic_dataset()

    transformer = AddDatasetBrowsePathTransformer.create(
        {"path_templates": ["/abc"]},
        PipelineContext(run_id="test"),
    )
    transformed = list(
        transformer.transform(
            [
                RecordEnvelope(dataset, metadata={}),
                RecordEnvelope(EndOfStream(), metadata={}),
            ]
        )
    )
    browse_path_aspect = transformed[1].record.aspect
    assert browse_path_aspect
    assert browse_path_aspect.paths == ["/abc"]

    # Use an MCE with a pre-existing browse path.
    dataset_mce = make_generic_dataset(
        aspects=[StatusClass(removed=False), browse_path_aspect]
    )

    transformer = AddDatasetBrowsePathTransformer.create(
        {
            "path_templates": [
                "/PLATFORM/foo/DATASET_PARTS/ENV",
                "/ENV/PLATFORM/bar/DATASET_PARTS/",
            ]
        },
        PipelineContext(run_id="test"),
    )
    transformed = list(
        transformer.transform(
            [
                RecordEnvelope(dataset_mce, metadata={}),
                RecordEnvelope(EndOfStream(), metadata={}),
            ]
        )
    )
    assert len(transformed) == 2
    browse_path_aspect = builder.get_aspect_if_available(
        transformed[0].record, BrowsePathsClass
    )
    assert browse_path_aspect
    assert browse_path_aspect.paths == [
        "/abc",
        "/bigquery/foo/example1/prod",
        "/prod/bigquery/bar/example1/",
    ]

    transformer = AddDatasetBrowsePathTransformer.create(
        {
            "path_templates": [
                "/xyz",
            ],
            "replace_existing": True,
        },
        PipelineContext(run_id="test"),
    )
    transformed = list(
        transformer.transform(
            [
                RecordEnvelope(dataset_mce, metadata={}),
                RecordEnvelope(EndOfStream(), metadata={}),
            ]
        )
    )
    assert len(transformed) == 2
    browse_path_aspect = builder.get_aspect_if_available(
        transformed[0].record, BrowsePathsClass
    )
    assert browse_path_aspect
    assert browse_path_aspect.paths == [
        "/xyz",
    ]
def test_simple_dataset_ownership_transformation(mock_time):
    no_owner_aspect = make_generic_dataset()
    with_owner_aspect = make_dataset_with_owner()

    not_a_dataset = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataJobSnapshotClass(
            urn="urn:li:dataJob:(urn:li:dataFlow:(airflow,dag_abc,PROD),task_456)",
            aspects=[
                models.DataJobInfoClass(
                    name="User Deletions",
                    description="Constructs the fct_users_deleted from logging_events",
                    type=models.AzkabanJobTypeClass.SQL,
                )
            ],
        )
    )

    inputs = [no_owner_aspect, with_owner_aspect, not_a_dataset, EndOfStream()]

    transformer = SimpleAddDatasetOwnership.create(
        {
            "owner_urns": [
                builder.make_user_urn("person1"),
                builder.make_user_urn("person2"),
            ]
        },
        PipelineContext(run_id="test"),
    )

    outputs = list(
        transformer.transform(
            [RecordEnvelope(input, metadata={}) for input in inputs]
        )
    )
    assert len(outputs) == len(inputs) + 1

    # Check the first entry.
    first_ownership_aspect = builder.get_aspect_if_available(
        outputs[0].record, models.OwnershipClass
    )
    assert first_ownership_aspect is None

    last_event = outputs[3].record
    assert isinstance(last_event, MetadataChangeProposalWrapper)
    assert isinstance(last_event.aspect, OwnershipClass)
    assert len(last_event.aspect.owners) == 2
    assert last_event.entityUrn == outputs[0].record.proposedSnapshot.urn
    assert all(
        owner.type == models.OwnershipTypeClass.DATAOWNER
        for owner in last_event.aspect.owners
    )

    # Check the second entry.
    second_ownership_aspect = builder.get_aspect_if_available(
        outputs[1].record, models.OwnershipClass
    )
    assert second_ownership_aspect
    assert len(second_ownership_aspect.owners) == 3
    assert all(
        owner.type == models.OwnershipTypeClass.DATAOWNER
        for owner in second_ownership_aspect.owners
    )

    # Verify that the third entry is unchanged.
    assert inputs[2] == outputs[2].record

    # Verify that the last entry is EndOfStream.
    assert inputs[3] == outputs[4].record