def get_datajob_wu(self, node: Dict[str, Any], job: Dict[str, Any]) -> MetadataWorkUnit:
    """
    Generate a DataJob workunit for a component (node) in a Glue job.

    Parameters
    ----------
    node:
        Node from process_dataflow_graph()
    job:
        Job object from get_all_jobs()
    """
    # Node arguments become custom properties; the node's type and id are
    # recorded explicitly (and win over same-named keys in Args).
    properties = {arg["Name"]: arg["Value"] for arg in node["Args"]}
    properties["transformType"] = node["NodeType"]
    properties["nodeId"] = node["Id"]

    info_aspect = DataJobInfoClass(
        name=f"{job['Name']}:{node['NodeType']}-{node['Id']}",
        type="GLUE",
        customProperties=properties,
    )
    io_aspect = DataJobInputOutputClass(
        inputDatasets=node["inputDatasets"],
        outputDatasets=node["outputDatasets"],
        inputDatajobs=node["inputDatajobs"],
    )
    snapshot = DataJobSnapshotClass(
        urn=node["urn"],
        aspects=[info_aspect, io_aspect],
    )
    mce = MetadataChangeEventClass(proposedSnapshot=snapshot)
    return MetadataWorkUnit(id=f'{job["Name"]}-{node["Id"]}', mce=mce)
def construct_job_workunits(
    self,
    job_urn: str,
    job_name: str,
    external_url: str,
    job_type: str,
    description: Optional[str],
    job_properties: Optional[Dict[str, str]] = None,
    inlets: Optional[List[str]] = None,
    outlets: Optional[List[str]] = None,
    inputJobs: Optional[List[str]] = None,
    status: Optional[str] = None,
) -> Iterable[MetadataWorkUnit]:
    """
    Emit the dataJobInfo and dataJobInputOutput workunits for a NiFi job.

    Parameters
    ----------
    job_urn:
        DataJob urn targeted by both aspects.
    job_name:
        Display name; also used to build workunit ids.
    external_url / job_type / description / status:
        Forwarded into the dataJobInfo aspect.
    job_properties:
        Optional custom properties; entries with a None value are dropped.
    inlets / outlets / inputJobs:
        Dataset/datajob urns for the dataJobInputOutput aspect. They are
        copied and sorted for deterministic output; the caller's lists are
        never mutated. (The previous version used mutable [] defaults and
        sorted the caller's lists in place.)
    """
    # Fix: default to None instead of a shared mutable [] default.
    inlets = sorted(inlets) if inlets else []
    outlets = sorted(outlets) if outlets else []
    inputJobs = sorted(inputJobs) if inputJobs else []

    if job_properties:
        # The aspect expects str -> str; silently drop None values.
        job_properties = {k: v for k, v in job_properties.items() if v is not None}

    mcp = MetadataChangeProposalWrapper(
        entityType="dataJob",
        entityUrn=job_urn,
        changeType=ChangeTypeClass.UPSERT,
        aspectName="dataJobInfo",
        aspect=DataJobInfoClass(
            name=job_name,
            type=job_type,
            description=description,
            customProperties=job_properties,
            externalUrl=external_url,
            status=status,
        ),
    )
    wu = MetadataWorkUnit(
        id=f"{NIFI}.{job_name}.{mcp.aspectName}",
        mcp=mcp,
    )
    self.report.report_workunit(wu)
    yield wu

    mcp = MetadataChangeProposalWrapper(
        entityType="dataJob",
        entityUrn=job_urn,
        changeType=ChangeTypeClass.UPSERT,
        aspectName="dataJobInputOutput",
        aspect=DataJobInputOutputClass(
            inputDatasets=inlets, outputDatasets=outlets, inputDatajobs=inputJobs
        ),
    )
    wu = MetadataWorkUnit(
        id=f"{NIFI}.{job_name}.{mcp.aspectName}",
        mcp=mcp,
    )
    self.report.report_workunit(wu)
    yield wu
def create_common_job_snapshot(
    self,
    job: Dict[str, Any],
    job_type: JobType,
    job_url: Optional[str] = None,
) -> Tuple[DataJobSnapshotClass, str, str]:
    """
    General function for generating a job snapshot.

    Returns the snapshot together with the job's name and ARN, both pulled
    from the describe-call payload via the per-job-type key mapping.
    """
    info = job_type_to_info[job_type]
    name = job[info.describe_name_key]
    arn = job[info.describe_arn_key]

    raw_status = job[info.describe_status_key]
    status = info.status_map.get(raw_status)
    if status is None:
        # Unmapped provider status: record a warning but keep the job,
        # marking it UNKNOWN rather than dropping it.
        status = JobStatusClass.UNKNOWN
        self.report.report_warning(
            name,
            f"Unknown status for {name} ({arn}): {raw_status}",
        )

    job_urn = make_sagemaker_job_urn(job_type.value, name, arn, self.env)

    # Every describe field is stringified into custom properties; the
    # jobType marker is written last so it always wins.
    properties = {key: str(value) for key, value in job.items()}
    properties["jobType"] = job_type.value

    snapshot = DataJobSnapshotClass(
        urn=job_urn,
        aspects=[
            DataJobInfoClass(
                name=name,
                type="SAGEMAKER",
                status=status,
                externalUrl=job_url,
                customProperties=properties,
            ),
            BrowsePathsClass(paths=[f"/{job_type.value}/{name}"]),
        ],
    )
    return snapshot, name, arn
def get_datajob_wu(
    self, node: Dict[str, Any], job: Dict[str, Any]
) -> MetadataWorkUnit:
    """
    Generate a DataJob workunit for a component (node) in a Glue job.

    Parameters
    ----------
    node:
        Node from process_dataflow_graph()
    job:
        Job object from get_all_jobs()
    """
    region = self.source_config.aws_region
    # there's no way to view an individual job node by link, so just show the graph
    graph_url = (
        f"https://{region}.console.aws.amazon.com/gluestudio/home"
        f"?region={region}#/editor/job/{job['Name']}/graph"
    )

    # Node arguments become custom properties; type and id are recorded
    # explicitly (and win over same-named keys in Args).
    properties = {arg["Name"]: arg["Value"] for arg in node["Args"]}
    properties["transformType"] = node["NodeType"]
    properties["nodeId"] = node["Id"]

    info_aspect = DataJobInfoClass(
        name=f"{job['Name']}:{node['NodeType']}-{node['Id']}",
        type="GLUE",
        externalUrl=graph_url,
        customProperties=properties,
    )
    io_aspect = DataJobInputOutputClass(
        inputDatasets=node["inputDatasets"],
        outputDatasets=node["outputDatasets"],
        inputDatajobs=node["inputDatajobs"],
    )
    snapshot = DataJobSnapshotClass(
        urn=node["urn"],
        aspects=[info_aspect, io_aspect],
    )
    mce = MetadataChangeEventClass(proposedSnapshot=snapshot)
    return MetadataWorkUnit(id=f'{job["Name"]}-{node["Id"]}', mce=mce)
def generate_mce(self) -> MetadataChangeEventClass:
    """Build the legacy MCE snapshot for this job: info, lineage, ownership, tags."""
    display_name = self.id if self.name is None else self.name

    info_aspect = DataJobInfoClass(
        name=display_name,
        type=AzkabanJobTypeClass.COMMAND,
        description=self.description,
        customProperties=self.properties,
        externalUrl=self.url,
    )
    io_aspect = DataJobInputOutputClass(
        inputDatasets=[str(u) for u in self.inlets],
        outputDatasets=[str(u) for u in self.outlets],
        inputDatajobs=[str(u) for u in self.upstream_urns],
    )
    aspects = [
        info_aspect,
        io_aspect,
        *self.generate_ownership_aspect(),
        *self.generate_tags_aspect(),
    ]

    snapshot = DataJobSnapshotClass(urn=str(self.urn), aspects=aspects)
    return MetadataChangeEventClass(proposedSnapshot=snapshot)
def generate_mcp(self) -> Iterable[MetadataChangeProposalWrapper]:
    """Yield MCPs for this job: info, input/output lineage, ownership, tags."""
    urn = str(self.urn)

    def _wrap(aspect_name: str, aspect: Any) -> MetadataChangeProposalWrapper:
        # Every proposal targets the same datajob urn as an UPSERT.
        return MetadataChangeProposalWrapper(
            entityType="datajob",
            entityUrn=urn,
            aspectName=aspect_name,
            aspect=aspect,
            changeType=ChangeTypeClass.UPSERT,
        )

    yield _wrap(
        "dataJobInfo",
        DataJobInfoClass(
            name=self.name if self.name is not None else self.id,
            type=AzkabanJobTypeClass.COMMAND,
            description=self.description,
            customProperties=self.properties,
            externalUrl=self.url,
        ),
    )

    yield from self.generate_data_input_output_mcp()

    for ownership in self.generate_ownership_aspect():
        yield _wrap("ownership", ownership)

    for tag_aspect in self.generate_tags_aspect():
        yield _wrap("globalTags", tag_aspect)