Example #1: building a DataFlow workunit for a Glue job
    def get_dataflow_wu(self, flow_urn: str, job: Dict[str, Any]) -> MetadataWorkUnit:
        """
        Generate a DataFlow workunit for a Glue job.

        Parameters
        ----------
            flow_urn:
                URN for the flow
            job:
                Job object from get_all_jobs()
        """
        mce = MetadataChangeEventClass(
            proposedSnapshot=DataFlowSnapshotClass(
                urn=flow_urn,
                aspects=[
                    DataFlowInfoClass(
                        name=job["Name"],
                        description=job["Description"],
                        # specify a few Glue-specific properties
                        customProperties={
                            "role": job["Role"],
                            "created": str(job["CreatedOn"]),
                            "modified": str(job["LastModifiedOn"]),
                            "command": job["Command"]["ScriptLocation"],
                        },
                    ),
                ],
            )
        )

        return MetadataWorkUnit(id=job["Name"], mce=mce)
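
A minimal usage sketch for the method above. `get_all_jobs()` is taken from the docstring; the URN layout and the PROD environment are assumptions based on DataHub's dataFlow URN convention, not something this example confirms:

    # Hypothetical caller inside the same source class.
    # (requires: from typing import Iterable)
    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
        for job in self.get_all_jobs():
            # assumed URN shape: urn:li:dataFlow:(<orchestrator>,<flow_id>,<env>)
            flow_urn = f"urn:li:dataFlow:(glue,{job['Name']},PROD)"
            yield self.get_dataflow_wu(flow_urn, job)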
Example #2: a DataFlow MCE with ownership and tag aspects
    def generate_mce(self) -> MetadataChangeEventClass:
        flow_mce = MetadataChangeEventClass(
            proposedSnapshot=DataFlowSnapshotClass(
                urn=str(self.urn),
                aspects=[
                    DataFlowInfoClass(
                        name=self.id,
                        description=self.description,
                        customProperties=self.properties,
                        externalUrl=self.url,
                    ),
                    *self.generate_ownership_aspect(),
                    *self.generate_tags_aspect(),
                ],
            ))

        return flow_mce
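
Note the splat-unpacking (`*`) on the last two aspects: each helper returns a list, so a flow with no owners or tags contributes no aspect at all. A hypothetical shape for one such helper, assuming a `self.owners` attribute and DataHub's ownership aspect classes:

    # Hypothetical helper: returns zero or one OwnershipClass aspects.
    def generate_ownership_aspect(self) -> List[OwnershipClass]:
        if not self.owners:  # assumed attribute holding owner URNs
            return []
        return [
            OwnershipClass(
                owners=[
                    OwnerClass(owner=owner, type=OwnershipTypeClass.DATAOWNER)
                    for owner in self.owners
                ]
            )
        ]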
Example #3: a Glue DataFlow workunit with optional fields guarded
    def get_dataflow_wu(self, flow_urn: str,
                        job: Dict[str, Any]) -> MetadataWorkUnit:
        """
        Generate a DataFlow workunit for a Glue job.

        Parameters
        ----------
            flow_urn:
                URN for the flow
            job:
                Job object from get_all_jobs()
        """

        region = self.source_config.aws_region

        custom_props = {
            "role": job["Role"],
        }

        if job.get("CreatedOn") is not None:
            custom_props["created"] = str(job["CreatedOn"])

        if job.get("LastModifiedOn") is not None:
            custom_props["modified"] = str(job["LastModifiedOn"])

        command = job.get("Command", {}).get("ScriptLocation")
        if command is not None:
            custom_props["command"] = command

        mce = MetadataChangeEventClass(proposedSnapshot=DataFlowSnapshotClass(
            urn=flow_urn,
            aspects=[
                DataFlowInfoClass(
                    name=job["Name"],
                    description=job.get("Description"),
                    externalUrl=f"https://{region}.console.aws.amazon.com/gluestudio/home?region={region}#/editor/job/{job['Name']}/graph",
                    # specify a few Glue-specific properties
                    customProperties=custom_props,
                ),
            ],
        ))

        return MetadataWorkUnit(id=job["Name"], mce=mce)
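
Compared with Example #1, this version guards every optional Glue field with `.get()` before adding it to customProperties, so jobs missing `CreatedOn`, `LastModifiedOn`, or a script location no longer raise a KeyError. The repeated guard blocks could be collapsed into a helper; a hypothetical sketch:

    # Hypothetical refactor of the guards above: keep only fields present
    # on the job object, stringified for customProperties.
    def _optional_glue_props(self, job: Dict[str, Any]) -> Dict[str, str]:
        candidates = {
            "created": job.get("CreatedOn"),
            "modified": job.get("LastModifiedOn"),
            "command": job.get("Command", {}).get("ScriptLocation"),
        }
        return {k: str(v) for k, v in candidates.items() if v is not None}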
Example #4: SageMaker get_workunits() with multi-pass lineage resolution
    def get_workunits(self) -> Iterable[MetadataWorkUnit]:

        jobs = self.get_all_jobs()

        processed_jobs: Dict[str, SageMakerJob] = {}

        # first pass: process jobs and collect datasets used
        for job in jobs:

            job_type = SAGEMAKER_JOB_TYPES[job["type"]]
            job_name = job[job_type.list_name_key]

            job_details = self.get_job_details(job_name, job["type"])

            processed_job = getattr(self, job_type.processor)(job_details)
            processed_jobs[processed_job.job_snapshot.urn] = processed_job

        all_datasets = {}

        # second pass:
        #   - move output jobs to inputs
        #   - aggregate i/o datasets
        for job_urn in sorted(processed_jobs):
            processed_job = processed_jobs[job_urn]

            for output_job_urn in processed_job.output_jobs:
                # the current job is an input of each of its output jobs
                processed_jobs[output_job_urn].input_jobs.add(job_urn)

            all_datasets.update(processed_job.input_datasets)
            all_datasets.update(processed_job.output_datasets)

        # yield datasets
        for dataset_urn, dataset in all_datasets.items():

            dataset_snapshot = DatasetSnapshot(
                urn=dataset_urn,
                aspects=[],
            )
            dataset_snapshot.aspects.append(
                DatasetPropertiesClass(
                    customProperties={k: str(v) for k, v in dataset.items()},
                    tags=[],
                )
            )
            dataset_mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
            dataset_wu = MetadataWorkUnit(
                id=dataset_urn,
                mce=dataset_mce,
            )
            self.report.report_dataset_scanned()
            self.report.report_workunit(dataset_wu)
            yield dataset_wu

        # third pass: construct and yield MCEs
        for job_urn in sorted(processed_jobs):

            processed_job = processed_jobs[job_urn]
            job_snapshot = processed_job.job_snapshot

            flow_urn = make_sagemaker_flow_urn(
                processed_job.job_type, processed_job.job_name, self.env
            )

            # create flow for each job
            flow_mce = MetadataChangeEvent(
                proposedSnapshot=DataFlowSnapshotClass(
                    urn=flow_urn,
                    aspects=[
                        DataFlowInfoClass(
                            name=processed_job.job_name,
                        ),
                    ],
                )
            )
            flow_wu = MetadataWorkUnit(
                id=flow_urn,
                mce=flow_mce,
            )
            self.report.report_workunit(flow_wu)
            yield flow_wu

            job_snapshot.aspects.append(
                DataJobInputOutputClass(
                    inputDatasets=sorted(processed_job.input_datasets.keys()),
                    outputDatasets=sorted(processed_job.output_datasets.keys()),
                    inputDatajobs=sorted(processed_job.input_jobs),
                )
            )

            job_mce = MetadataChangeEvent(proposedSnapshot=job_snapshot)
            job_wu = MetadataWorkUnit(
                id=job_urn,
                mce=job_mce,
            )
            self.report.report_job_scanned()
            self.report.report_workunit(job_wu)
            yield job_wu
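
The method makes three passes so the job graph is complete before anything is emitted: the first fetches and processes every job, the second resolves cross-job lineage and aggregates datasets, and the third constructs and yields the MCEs. A hypothetical driver for the generator, using DataHub's REST emitter (the server URL and the `source` instance are placeholders):

    from datahub.emitter.rest_emitter import DatahubRestEmitter

    emitter = DatahubRestEmitter("http://localhost:8080")  # placeholder URL
    for wu in source.get_workunits():  # `source`: an assumed, configured instance
        emitter.emit_mce(wu.mce)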