def create_global_tags_aspect_mce(directive: Directive) -> MetadataChangeEventClass:
    return MetadataChangeEventClass(
        proposedSnapshot=DatasetSnapshotClass(
            urn=dataset_name_to_urn(directive.table),
            aspects=[GlobalTagsClass(tags=[])],
        )
    )
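Several helpers in this set (create_global_tags_aspect_mce, create_lineage_aspect_mce, create_ownership_aspect_mce, create_editable_schema_info_aspect_mce) take a Directive plus naming utilities defined elsewhere. Below is a minimal sketch of the shapes their call sites imply; the dataclass and helper body are hypothetical reconstructions, with only the field names table, depends_on, and owners taken from the snippets themselves:

from dataclasses import dataclass, field
from typing import List


@dataclass
class Directive:  # hypothetical: the real class may carry more fields
    table: str  # dataset name, e.g. "db.table"
    depends_on: List[str] = field(default_factory=list)  # upstream dataset names
    owners: List[str] = field(default_factory=list)  # raw owner names


def dataset_name_to_urn(name: str) -> str:
    # assumed helper: the platform and env here are illustrative choices
    return f"urn:li:dataset:(urn:li:dataPlatform:hive,{name},PROD)"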
def get_dataflow_wu(self, flow_urn: str, job: Dict[str, Any]) -> MetadataWorkUnit:
    """
    Generate a DataFlow workunit for a Glue job.

    Parameters
    ----------
    flow_urn:
        URN for the flow
    job:
        Job object from get_all_jobs()
    """

    mce = MetadataChangeEventClass(
        proposedSnapshot=DataFlowSnapshotClass(
            urn=flow_urn,
            aspects=[
                DataFlowInfoClass(
                    name=job["Name"],
                    description=job["Description"],
                    # specify a few Glue-specific properties
                    customProperties={
                        "role": job["Role"],
                        "created": str(job["CreatedOn"]),
                        "modified": str(job["LastModifiedOn"]),
                        "command": job["Command"]["ScriptLocation"],
                    },
                ),
            ],
        )
    )

    return MetadataWorkUnit(id=job["Name"], mce=mce)
def get_datajob_wu(self, node: Dict[str, Any], job: Dict[str, Any]) -> MetadataWorkUnit:
    """
    Generate a DataJob workunit for a component (node) in a Glue job.

    Parameters
    ----------
    node:
        Node from process_dataflow_graph()
    job:
        Job object from get_all_jobs()
    """

    mce = MetadataChangeEventClass(
        proposedSnapshot=DataJobSnapshotClass(
            urn=node["urn"],
            aspects=[
                DataJobInfoClass(
                    name=f"{job['Name']}:{node['NodeType']}-{node['Id']}",
                    type="GLUE",
                    customProperties={
                        **{x["Name"]: x["Value"] for x in node["Args"]},
                        "transformType": node["NodeType"],
                        "nodeId": node["Id"],
                    },
                ),
                DataJobInputOutputClass(
                    inputDatasets=node["inputDatasets"],
                    outputDatasets=node["outputDatasets"],
                    inputDatajobs=node["inputDatajobs"],
                ),
            ],
        )
    )

    return MetadataWorkUnit(id=f'{job["Name"]}-{node["Id"]}', mce=mce)
def make_lineage_mce(
    upstream_urns: List[str],
    downstream_urn: str,
    actor: str = make_user_urn("datahub"),
    lineage_type: str = DatasetLineageTypeClass.TRANSFORMED,
) -> MetadataChangeEventClass:
    sys_time = get_sys_time()
    mce = MetadataChangeEventClass(
        proposedSnapshot=DatasetSnapshotClass(
            urn=downstream_urn,
            aspects=[
                UpstreamLineageClass(
                    upstreams=[
                        UpstreamClass(
                            auditStamp=AuditStampClass(
                                time=sys_time,
                                actor=actor,
                            ),
                            dataset=upstream_urn,
                            type=lineage_type,
                        )
                        for upstream_urn in upstream_urns
                    ]
                )
            ],
        )
    )
    return mce
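A usage sketch for make_lineage_mce. The dataset names and GMS endpoint are illustrative; the emitter import and emit_mce call come from the datahub client library:

from datahub.emitter.mce_builder import make_dataset_urn
from datahub.emitter.rest_emitter import DatahubRestEmitter

# Declare that db.downstream is derived from db.upstream (names are made up).
lineage_mce = make_lineage_mce(
    upstream_urns=[make_dataset_urn("hive", "db.upstream")],
    downstream_urn=make_dataset_urn("hive", "db.downstream"),
)
DatahubRestEmitter("http://localhost:8080").emit_mce(lineage_mce)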
def test_serde_to_avro(pytestconfig, json_filename):
    # In this test, we want to read in from JSON -> MCE object.
    # Next we serialize from MCE to Avro and then deserialize back to MCE.
    # Finally, we want to compare the two MCE objects.
    json_path = pytestconfig.rootpath / json_filename
    mces = list(iterate_mce_file(str(json_path)))

    # Serialize to Avro.
    parsed_schema = fastavro.parse_schema(json.loads(SCHEMA_JSON_STR))
    fo = io.BytesIO()
    out_records = [mce.to_obj(tuples=True) for mce in mces]
    fastavro.writer(fo, parsed_schema, out_records)

    # Deserialize from Avro.
    fo.seek(0)
    in_records = list(fastavro.reader(fo))
    in_mces = [
        MetadataChangeEventClass.from_obj(record, tuples=True)
        for record in in_records
    ]

    # Check that the round-tripped MCEs match the originals.
    assert len(mces) == len(in_mces)
    for i in range(len(mces)):
        assert str(mces[i]) == str(in_mces[i])
def get_initial_mce() -> MetadataChangeEventClass:
    return MetadataChangeEventClass(
        proposedSnapshot=DatasetSnapshotClass(
            urn="urn:li:dataset:(urn:li:dataPlatform:test_platform,test,PROD)",
            aspects=[DatasetPropertiesClass(description="test.description")],
        ),
        systemMetadata=SystemMetadata(lastObserved=1586847600000, runId="pipeline_test"),
    )
def create_owner_entity_mce(owner: str) -> MetadataChangeEventClass:
    clean_name = clean_owner_name(owner)
    return MetadataChangeEventClass(
        proposedSnapshot=CorpUserSnapshotClass(
            urn=owner_name_to_urn(clean_name),
            aspects=[
                CorpUserInfoClass(
                    active=True,
                    displayName=owner,
                    fullName=owner,
                    # placeholder domain; the original address was obfuscated
                    email=f"{clean_name}@example.com",
                )
            ],
        )
    )
def get_initial_mce() -> MetadataChangeEventClass:
    return MetadataChangeEventClass(
        proposedSnapshot=DatasetSnapshotClass(
            urn="urn:li:dataset:(urn:li:dataPlatform:test_platform,test,PROD)",
            aspects=[
                DatasetPropertiesClass(
                    description="test.description",
                    customProperties={},
                    uri=None,
                    tags=[],
                )
            ],
        )
    )
def transform_aspect(
    # not marked as @abstractmethod to avoid impacting transformers that extend this class
    self, entity_urn: str, aspect_name: str, aspect: Optional[Aspect]
) -> Optional[Aspect]:
    """A default implementation for transform_aspect that calls `transform_one`
    with a fake MCE to preserve compatibility with previous transformers coded
    against MCE."""
    fake_mce: MetadataChangeEventClass = MetadataChangeEventClass(
        proposedSnapshot=DatasetSnapshotClass(
            urn=entity_urn,
            aspects=[aspect] if aspect else [],  # type: ignore
        )
    )
    transformed_mce = self.transform_one(fake_mce)
    assert transformed_mce.proposedSnapshot
    assert (
        len(transformed_mce.proposedSnapshot.aspects) <= 1
    ), "This implementation assumes that transformers will return at most 1 aspect value back"
    return (
        transformed_mce.proposedSnapshot.aspects[0]  # type: ignore
        if len(transformed_mce.proposedSnapshot.aspects)
        else None
    )
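For context, transform_aspect lets the framework drive aspect-level transforms through a transformer that only implements the older MCE-level hook. A hypothetical subclass is sketched below; the base-class name and the description suffix are illustrative assumptions, not taken from the source:

class SuffixDescriptionTransformer(LegacyMCETransformer):  # assumed base class
    def transform_one(self, mce: MetadataChangeEventClass) -> MetadataChangeEventClass:
        # Mutate dataset descriptions; transform_aspect above will hand this
        # method a fake single-aspect MCE and unwrap the result.
        for aspect in mce.proposedSnapshot.aspects:
            if isinstance(aspect, DatasetPropertiesClass):
                aspect.description = (aspect.description or "") + " (curated)"
        return mce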
def generate_mce(self) -> MetadataChangeEventClass:
    flow_mce = MetadataChangeEventClass(
        proposedSnapshot=DataFlowSnapshotClass(
            urn=str(self.urn),
            aspects=[
                DataFlowInfoClass(
                    name=self.id,
                    description=self.description,
                    customProperties=self.properties,
                    externalUrl=self.url,
                ),
                *self.generate_ownership_aspect(),
                *self.generate_tags_aspect(),
            ],
        )
    )
    return flow_mce
def make_lineage_mce(
    upstream_urns: List[str],
    downstream_urn: str,
    lineage_type: str = DatasetLineageTypeClass.TRANSFORMED,
) -> MetadataChangeEventClass:
    mce = MetadataChangeEventClass(
        proposedSnapshot=DatasetSnapshotClass(
            urn=downstream_urn,
            aspects=[
                UpstreamLineageClass(
                    upstreams=[
                        UpstreamClass(
                            dataset=upstream_urn,
                            type=lineage_type,
                        )
                        for upstream_urn in upstream_urns
                    ]
                )
            ],
        )
    )
    return mce
def create_lineage_aspect_mce(directive: Directive) -> MetadataChangeEventClass:
    return MetadataChangeEventClass(
        proposedSnapshot=DatasetSnapshotClass(
            urn=dataset_name_to_urn(directive.table),
            aspects=[
                UpstreamLineageClass(
                    upstreams=[
                        UpstreamClass(
                            dataset=dataset_name_to_urn(upstream),
                            type=DatasetLineageTypeClass.TRANSFORMED,
                            auditStamp=AuditStampClass(
                                time=int(time.time() * 1000),
                                actor="urn:li:corpuser:datahub",
                            ),
                        )
                        for upstream in directive.depends_on
                    ]
                )
            ],
        )
    )
def assert_entity_mce_aspect(
    entity_urn: str, aspect: Any, aspect_type: Type, file: str
) -> int:
    test_output = load_json_file(file)
    entity_type = Urn.create_from_string(entity_urn).get_type()
    assert isinstance(test_output, list)
    # keep only the MCEs whose snapshot URN matches the requested entity
    mces: List[MetadataChangeEventClass] = [
        MetadataChangeEventClass.from_obj(x)
        for x in test_output
        if _get_filter(mce=True, entity_type=entity_type)(x)
        and _get_element(x, _get_mce_urn_path_spec(entity_type)) == entity_urn
    ]
    matches = 0
    for mce in mces:
        for a in mce.proposedSnapshot.aspects:
            if isinstance(a, aspect_type):
                assert a == aspect
                matches += 1
    return matches
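A usage sketch for assert_entity_mce_aspect; the URN, aspect, and golden-file path are hypothetical:

from datahub.metadata.schema_classes import StatusClass

matches = assert_entity_mce_aspect(
    entity_urn="urn:li:dataset:(urn:li:dataPlatform:hive,db.table,PROD)",
    aspect=StatusClass(removed=False),
    aspect_type=StatusClass,
    file="tests/golden/mces_golden.json",
)
assert matches == 1  # exactly one matching MCE carried the expected aspect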
def create_editable_schema_info_aspect_mce(
    directive: Directive,
) -> MetadataChangeEventClass:
    return MetadataChangeEventClass(
        proposedSnapshot=DatasetSnapshotClass(
            urn=dataset_name_to_urn(directive.table),
            aspects=[
                EditableSchemaMetadataClass(
                    created=AuditStampClass(
                        time=int(time.time() * 1000),
                        actor="urn:li:corpuser:datahub",
                    ),
                    lastModified=AuditStampClass(
                        time=int(time.time() * 1000),
                        actor="urn:li:corpuser:datahub",
                    ),
                    editableSchemaFieldInfo=[],
                )
            ],
        )
    )
def get_dataflow_wu(self, flow_urn: str, job: Dict[str, Any]) -> MetadataWorkUnit:
    """
    Generate a DataFlow workunit for a Glue job.

    Parameters
    ----------
    flow_urn:
        URN for the flow
    job:
        Job object from get_all_jobs()
    """

    region = self.source_config.aws_region

    custom_props = {
        "role": job["Role"],
    }
    if job.get("CreatedOn") is not None:
        custom_props["created"] = str(job["CreatedOn"])
    if job.get("LastModifiedOn") is not None:
        custom_props["modified"] = str(job["LastModifiedOn"])
    command = job.get("Command", {}).get("ScriptLocation")
    if command is not None:
        custom_props["command"] = command

    mce = MetadataChangeEventClass(
        proposedSnapshot=DataFlowSnapshotClass(
            urn=flow_urn,
            aspects=[
                DataFlowInfoClass(
                    name=job["Name"],
                    description=job.get("Description"),
                    externalUrl=f"https://{region}.console.aws.amazon.com/gluestudio/home?region={region}#/editor/job/{job['Name']}/graph",
                    # specify a few Glue-specific properties
                    customProperties=custom_props,
                ),
            ],
        )
    )

    return MetadataWorkUnit(id=job["Name"], mce=mce)
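For reference, a minimal example of the job dict this function consumes, mirroring the fields it reads from the boto3 Glue get_jobs() response; all values are illustrative:

import datetime

example_job = {
    "Name": "nightly-etl",
    "Description": "Joins raw events with dimension tables",
    "Role": "arn:aws:iam::123456789012:role/glue-etl",  # made-up account/role
    "CreatedOn": datetime.datetime(2021, 1, 1),
    "LastModifiedOn": datetime.datetime(2021, 6, 1),
    "Command": {"ScriptLocation": "s3://example-bucket/scripts/nightly_etl.py"},
}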
def create_ownership_aspect_mce(directive: Directive) -> MetadataChangeEventClass:
    return MetadataChangeEventClass(
        proposedSnapshot=DatasetSnapshotClass(
            urn=dataset_name_to_urn(directive.table),
            aspects=[
                OwnershipClass(
                    owners=[
                        OwnerClass(
                            owner=owner_name_to_urn(clean_owner_name(owner)),
                            type=OwnershipTypeClass.DATAOWNER,
                        )
                        for owner in directive.owners
                    ],
                    lastModified=AuditStampClass(
                        time=int(time.time() * 1000),
                        actor="urn:li:corpuser:datahub",
                    ),
                )
            ],
        )
    )
def get_datajob_wu(
    self, node: Dict[str, Any], job: Dict[str, Any]
) -> MetadataWorkUnit:
    """
    Generate a DataJob workunit for a component (node) in a Glue job.

    Parameters
    ----------
    node:
        Node from process_dataflow_graph()
    job:
        Job object from get_all_jobs()
    """

    region = self.source_config.aws_region

    mce = MetadataChangeEventClass(
        proposedSnapshot=DataJobSnapshotClass(
            urn=node["urn"],
            aspects=[
                DataJobInfoClass(
                    name=f"{job['Name']}:{node['NodeType']}-{node['Id']}",
                    type="GLUE",
                    # there's no way to view an individual job node by link, so just show the graph
                    externalUrl=f"https://{region}.console.aws.amazon.com/gluestudio/home?region={region}#/editor/job/{job['Name']}/graph",
                    customProperties={
                        **{x["Name"]: x["Value"] for x in node["Args"]},
                        "transformType": node["NodeType"],
                        "nodeId": node["Id"],
                    },
                ),
                DataJobInputOutputClass(
                    inputDatasets=node["inputDatasets"],
                    outputDatasets=node["outputDatasets"],
                    inputDatajobs=node["inputDatajobs"],
                ),
            ],
        )
    )

    return MetadataWorkUnit(id=f'{job["Name"]}-{node["Id"]}', mce=mce)
def generate_mce(self) -> MetadataChangeEventClass:
    job_mce = MetadataChangeEventClass(
        proposedSnapshot=DataJobSnapshotClass(
            urn=str(self.urn),
            aspects=[
                DataJobInfoClass(
                    name=self.name if self.name is not None else self.id,
                    type=AzkabanJobTypeClass.COMMAND,
                    description=self.description,
                    customProperties=self.properties,
                    externalUrl=self.url,
                ),
                DataJobInputOutputClass(
                    inputDatasets=[str(urn) for urn in self.inlets],
                    outputDatasets=[str(urn) for urn in self.outlets],
                    inputDatajobs=[str(urn) for urn in self.upstream_urns],
                ),
                *self.generate_ownership_aspect(),
                *self.generate_tags_aspect(),
            ],
        )
    )
    return job_mce
def create_editable_schema_info_aspect_mce(
    directive: Directive,
) -> MetadataChangeEventClass:
    return MetadataChangeEventClass(
        proposedSnapshot=DatasetSnapshotClass(
            urn=dataset_name_to_urn(directive.table),
            aspects=[EditableSchemaMetadataClass(editableSchemaFieldInfo=[])],
        )
    )
def read_mces(path: os.PathLike) -> List[MetadataChangeEventClass]:
    with open(path) as f:
        objs = json.load(f)
    mces = [MetadataChangeEventClass.from_obj(obj) for obj in objs]
    return mces
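A usage sketch for read_mces (the path is hypothetical): load a golden file back into typed MCE objects, e.g. for comparisons in tests:

from pathlib import Path

golden = read_mces(Path("tests/golden/mces_golden.json"))
assert all(isinstance(m, MetadataChangeEventClass) for m in golden)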
def generate_stitched_record(relnships_graph: RelationshipGraph) -> List[Any]:
    def strip_types(field_path: str) -> str:
        final_path = field_path
        final_path = re.sub(r"(\[type=[a-zA-Z]+\]\.)", "", final_path)
        final_path = re.sub(r"^\[version=2.0\]\.", "", final_path)
        return final_path

    datasets: List[DatasetSnapshotClass] = []

    for entity_name, entity_def in entity_registry.items():
        entity_display_name = entity_def.display_name
        entity_fields = []
        for aspect_name in entity_def.aspects:
            if aspect_name not in aspect_registry:
                print(f"Did not find aspect name: {aspect_name} in aspect_registry")
                continue

            # all aspects should have a schema
            aspect_schema = aspect_registry[aspect_name].schema
            assert aspect_schema
            entity_fields.append(
                {
                    "type": aspect_schema.to_json(),
                    "name": aspect_name,
                }
            )

        if entity_fields:
            names = avro.schema.Names()
            field_objects = []
            for f in entity_fields:
                field = avro.schema.Field(
                    type=f["type"],
                    name=f["name"],
                    has_default=False,
                )
                field_objects.append(field)

            with unittest.mock.patch("avro.schema.Names.add_name", add_name):
                entity_avro_schema = avro.schema.RecordSchema(
                    name=entity_name,
                    namespace="datahub.metadata.model",
                    names=names,
                    fields=[],
                )
                entity_avro_schema.set_prop("fields", field_objects)
            rawSchema = json.dumps(entity_avro_schema.to_json())

            # always add the URN which is the primary key
            urn_field = SchemaField(
                fieldPath="urn",
                type=SchemaFieldDataTypeClass(type=StringTypeClass()),
                nativeDataType="string",
                nullable=False,
                isPartOfKey=True,
                description=f"The primary identifier for the {entity_name} entity. See the {entity_def.keyAspect} field to understand the structure of this urn.",
            )
            schema_fields: List[SchemaField] = [urn_field] + avro_schema_to_mce_fields(rawSchema)

            foreign_keys: List[ForeignKeyConstraintClass] = []
            source_dataset_urn = make_dataset_urn(
                platform=make_data_platform_urn("datahub"),
                name=f"{entity_display_name}",
            )
            for f_field in schema_fields:
                if f_field.jsonProps:
                    json_dict = json.loads(f_field.jsonProps)
                    if "Aspect" in json_dict:
                        aspect_info = json_dict["Aspect"]
                        f_field.globalTags = f_field.globalTags or GlobalTagsClass(tags=[])
                        f_field.globalTags.tags.append(
                            TagAssociationClass(tag="urn:li:tag:Aspect")
                        )
                        # if this is the key aspect, also add primary-key
                        if entity_def.keyAspect == aspect_info.get("name"):
                            f_field.isPartOfKey = True
                        if "timeseries" == aspect_info.get("type", ""):
                            f_field.globalTags.tags.append(
                                TagAssociationClass(tag="urn:li:tag:Temporal")
                            )
                    if "Searchable" in json_dict:
                        f_field.globalTags = f_field.globalTags or GlobalTagsClass(tags=[])
                        f_field.globalTags.tags.append(
                            TagAssociationClass(tag="urn:li:tag:Searchable")
                        )
                    if "Relationship" in json_dict:
                        relationship_info = json_dict["Relationship"]
                        # detect if we have relationship specified at leaf level or thru path specs
                        if "entityTypes" not in relationship_info:
                            # path spec
                            assert (
                                len(relationship_info.keys()) == 1
                            ), "We should never have more than one path spec assigned to a relationship annotation"
                            final_info = None
                            for k, v in relationship_info.items():
                                final_info = v
                            relationship_info = final_info

                        assert "entityTypes" in relationship_info
                        entity_types: List[str] = relationship_info.get("entityTypes", [])
                        relnship_name = relationship_info.get("name", None)
                        for entity_type in entity_types:
                            destination_entity_name = capitalize_first(entity_type)
                            foreign_dataset_urn = make_dataset_urn(
                                platform=make_data_platform_urn("datahub"),
                                name=destination_entity_name,
                            )
                            fkey = ForeignKeyConstraintClass(
                                name=relnship_name,
                                foreignDataset=foreign_dataset_urn,
                                foreignFields=[
                                    f"urn:li:schemaField:({foreign_dataset_urn}, urn)"
                                ],
                                sourceFields=[
                                    f"urn:li:schemaField:({source_dataset_urn},{f_field.fieldPath})"
                                ],
                            )
                            foreign_keys.append(fkey)
                            relnships_graph.add_edge(
                                entity_display_name,
                                destination_entity_name,
                                fkey.name,
                                f" via `{strip_types(f_field.fieldPath)}`",
                                edge_id=f"{entity_display_name}:{fkey.name}:{destination_entity_name}:{strip_types(f_field.fieldPath)}",
                            )

            schemaMetadata = SchemaMetadataClass(
                schemaName=f"{entity_name}",
                platform=make_data_platform_urn("datahub"),
                platformSchema=OtherSchemaClass(rawSchema=rawSchema),
                fields=schema_fields,
                version=0,
                hash="",
                foreignKeys=foreign_keys if foreign_keys else None,
            )

            dataset = DatasetSnapshotClass(
                urn=make_dataset_urn(
                    platform=make_data_platform_urn("datahub"),
                    name=f"{entity_display_name}",
                ),
                aspects=[
                    schemaMetadata,
                    GlobalTagsClass(tags=[TagAssociationClass(tag="urn:li:tag:Entity")]),
                    BrowsePathsClass([f"/prod/datahub/entities/{entity_display_name}"]),
                ],
            )
            datasets.append(dataset)

    events: List[Union[MetadataChangeEventClass, MetadataChangeProposalWrapper]] = []

    for d in datasets:
        entity_name = d.urn.split(":")[-1].split(",")[1]
        d.aspects.append(
            DatasetPropertiesClass(
                description=make_entity_docs(entity_name, relnships_graph)
            )
        )
        mce = MetadataChangeEventClass(proposedSnapshot=d)
        events.append(mce)

        mcp = MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=d.urn,
            aspectName="subTypes",
            aspect=SubTypesClass(typeNames=["entity"]),
        )
        events.append(mcp)

    return events