    def commit_checkpoints(
        self, job_checkpoints: Dict[JobId, DatahubIngestionCheckpointClass]
    ) -> None:
        for job_name, checkpoint in job_checkpoints.items():
            # Emit the ingestion state for each job
            logger.info(
                f"Committing ingestion checkpoint for pipeline:'{checkpoint.pipelineName}', "
                f"instance:'{checkpoint.platformInstanceId}', job:'{job_name}'"
            )

            datajob_urn = builder.make_data_job_urn(
                self.orchestrator_name,
                checkpoint.pipelineName,
                job_name,
            )

            self.graph.emit_mcp(
                MetadataChangeProposalWrapper(
                    entityType="dataJob",
                    entityUrn=datajob_urn,
                    aspectName="datahubIngestionCheckpoint",
                    aspect=checkpoint,
                    changeType=ChangeTypeClass.UPSERT,
                ))

            logger.info(
                f"Committed ingestion checkpoint for pipeline:'{checkpoint.pipelineName}', "
                f"instance:'{checkpoint.platformInstanceId}', job:'{job_name}'"
            )
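A minimal usage sketch for the committer above, assuming `provider` is an already-constructed instance of this checkpointing provider and `checkpoint` is a DatahubIngestionCheckpointClass built elsewhere; the job name is made up:

job_checkpoints = {
    JobId("ingest_from_my_source"): checkpoint,  # hypothetical job name
}
provider.commit_checkpoints(job_checkpoints)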
Example #2
def get_data_job_urn(
    orchestrator: str,
    pipeline_name: str,
    job_name: JobId,
    platform_instance_id: str,
) -> str:
    """
    Standardizes datajob urn minting for all ingestion job state providers.
    """
    return builder.make_data_job_urn(
        orchestrator, f"{pipeline_name}_{platform_instance_id}", job_name
    )
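For illustration, a call like the following (all names hypothetical) mints a job URN whose flow id embeds the platform instance; the trailing "prod" segment assumes the builder's default cluster:

urn = get_data_job_urn(
    orchestrator="datahub",
    pipeline_name="my_snowflake_pipeline",
    job_name=JobId("common_ingest_from_sql_source"),
    platform_instance_id="my_instance",
)
# Roughly: urn:li:dataJob:(urn:li:dataFlow:(datahub,my_snowflake_pipeline_my_instance,prod),common_ingest_from_sql_source)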
    def get_latest_checkpoint(
        self,
        pipeline_name: str,
        platform_instance_id: str,
        job_name: JobId,
    ) -> Optional[DatahubIngestionCheckpointClass]:

        logger.info(
            f"Querying for the latest ingestion checkpoint for pipelineName:'{pipeline_name}',"
            f" platformInstanceId:'{platform_instance_id}', job_name:'{job_name}'"
        )

        data_job_urn = builder.make_data_job_urn(
            self.orchestrator_name, pipeline_name, job_name
        )
        latest_checkpoint: Optional[DatahubIngestionCheckpointClass] = (
            self.graph.get_latest_timeseries_value(
                entity_urn=data_job_urn,
                aspect_name="datahubIngestionCheckpoint",
                filter_criteria_map={
                    "pipelineName": pipeline_name,
                    "platformInstanceId": platform_instance_id,
                },
                aspect_type=DatahubIngestionCheckpointClass,
            )
        )
        if latest_checkpoint:
            logger.info(
                f"The last committed ingestion checkpoint for pipelineName:'{pipeline_name}',"
                f" platformInstanceId:'{platform_instance_id}', job_name:'{job_name}' found with start_time:"
                f" {datetime.fromtimestamp(latest_checkpoint.timestampMillis/1000, tz=timezone.utc)} and a"
                f" bucket duration of {latest_checkpoint.eventGranularity}.")
            return latest_checkpoint
        else:
            logger.info(
                f"No committed ingestion checkpoint for pipelineName:'{pipeline_name}',"
                f" platformInstanceId:'{platform_instance_id}', job_name:'{job_name}' found"
            )

        return None
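A matching read-side sketch, again assuming `provider` is an instance of the state provider shown above and the names are placeholders:

last_checkpoint = provider.get_latest_checkpoint(
    pipeline_name="my_snowflake_pipeline",
    platform_instance_id="my_instance",
    job_name=JobId("common_ingest_from_sql_source"),
)
if last_checkpoint is not None:
    # The checkpoint's `state` field carries the serde-encoded, source-specific payload.
    print(last_checkpoint.state)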
Example #4
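This snippet is truncated at the top: it relies on a fldUrn helper and a fineGrainedLineages list defined earlier in the file. A sketch of what such a helper typically looks like, built from DataHub's URN builders (the "postgres" platform and "PROD" env are assumptions for illustration):

def datasetUrn(tbl: str) -> str:
    # Dataset URN for a table; platform and env are illustrative assumptions.
    return builder.make_dataset_urn(platform="postgres", name=tbl, env="PROD")


def fldUrn(tbl: str, fld: str) -> str:
    # schemaField URN for column `fld` of dataset `tbl`.
    return builder.make_schema_field_urn(datasetUrn(tbl), fld)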
        fldUrn("bar3", "c2"),
        fldUrn("bar4", "c1"),
    ],
    outputDatasetFields=[
        fldUrn("bar", "c1"),
        fldUrn("bar", "c2"),
        fldUrn("bar", "c3"),
        fldUrn("bar", "c4"),
        fldUrn("bar", "c5"),
        fldUrn("bar", "c6"),
        fldUrn("bar", "c7"),
        fldUrn("bar", "c9"),
        fldUrn("bar2", "c9"),
    ],
    fineGrainedLineages=fineGrainedLineages,
)

dataJobLineageMcp = MetadataChangeProposalWrapper(
    entityType="dataJob",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn=builder.make_data_job_urn("spark", "Flow1", "Task1"),
    aspectName="dataJobInputOutput",
    aspect=dataJobInputOutput,
)

# Create an emitter to the GMS REST API.
emitter = DatahubRestEmitter("http://localhost:8080")

# Emit metadata!
emitter.emit_mcp(dataJobLineageMcp)

import datahub.emitter.mce_builder as builder
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DatahubRestEmitter
from datahub.metadata.com.linkedin.pegasus2avro.datajob import DataJobInfoClass
from datahub.metadata.schema_classes import ChangeTypeClass

# Construct the DataJobInfo aspect with the job -> flow lineage.
dataflow_urn = builder.make_data_flow_urn(
    orchestrator="airflow", flow_id="flow1", cluster="prod"
)

datajob_info = DataJobInfoClass(name="My Job 1", type="AIRFLOW", flowUrn=dataflow_urn)

# Construct a MetadataChangeProposalWrapper object with the DataJobInfo aspect.
# NOTE: This will overwrite all of the existing dataJobInfo aspect information associated with this job.
datajob_info_mcp = MetadataChangeProposalWrapper(
    entityType="dataJob",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn=builder.make_data_job_urn(
        orchestrator="airflow", flow_id="flow1", job_id="job1", cluster="prod"
    ),
    aspectName="dataJobInfo",
    aspect=datajob_info,
)

# Create an emitter to the GMS REST API.
emitter = DatahubRestEmitter("http://localhost:8080")

# Emit metadata!
emitter.emit_mcp(datajob_info_mcp)
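As an optional companion sketch (not part of the original example; the flow display name is made up), the parent DataFlow can be described the same way so that "flow1" carries its own metadata before jobs attach to it:

from datahub.metadata.com.linkedin.pegasus2avro.datajob import DataFlowInfoClass

dataflow_info_mcp = MetadataChangeProposalWrapper(
    entityType="dataFlow",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn=dataflow_urn,
    aspectName="dataFlowInfo",
    aspect=DataFlowInfoClass(name="My Flow 1"),  # hypothetical display name
)
emitter.emit_mcp(dataflow_info_mcp)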