Ejemplo n.º 1
0
def emit_event():
    """Emit one COMPLETE run event through an environment-configured client.

    The event carries a freshly generated random run id, the current naive
    local time (ISO formatted) as its event time, and a job named
    ``emit_event.wait-for-me`` whose namespace is read from the
    ``OPENLINEAGE_NAMESPACE`` environment variable (``None`` when unset).
    Both the input and output dataset lists are empty.
    """
    client = OpenLineageClient.from_environment()

    event = RunEvent(
        eventType=RunState.COMPLETE,
        eventTime=datetime.datetime.now().isoformat(),
        run=Run(runId=str(uuid.uuid4())),
        job=Job(
            namespace=os.getenv('OPENLINEAGE_NAMESPACE'),
            name='emit_event.wait-for-me',
        ),
        producer=_PRODUCER,
        inputs=[],
        outputs=[],
    )
    client.emit(event)
Ejemplo n.º 2
0
    def _run(self,
             validation_result_suite: ExpectationSuiteValidationResult,
             validation_result_suite_identifier: ValidationResultIdentifier,
             data_asset: GEDataset,
             expectation_suite_identifier=None,
             checkpoint_identifier=None,
             payload=None):
        """Build a COMPLETE run event for a validation result and optionally emit it.

        :param validation_result_suite: validation result whose ``meta`` mapping
            feeds the ``great_expectations_meta`` run facet and, when no job
            name is configured, the default job name
        :param validation_result_suite_identifier: identifier whose
            ``batch_identifier`` is appended to the default job name
        :param data_asset: validated asset; input datasets are only extracted
            for ``SqlAlchemyDataset`` and ``PandasDataset`` instances
        :param expectation_suite_identifier: not used by this method
        :param checkpoint_identifier: not used by this method
        :param payload: not used by this method
        :return: the run event converted to a dict with ``Serde.to_dict``
        """
        # Initialize logger here so that the action is serializable until it actually runs
        action_cls = self.__class__
        self.log = logging.getLogger(action_cls.__module__ + '.' +
                                     action_cls.__name__)

        # Any other asset type yields an event without input datasets.
        if isinstance(data_asset, SqlAlchemyDataset):
            input_datasets = self._fetch_datasets_from_sql_source(
                data_asset, validation_result_suite)
        elif isinstance(data_asset, PandasDataset):
            input_datasets = self._fetch_datasets_from_pandas_source(
                data_asset, validation_result_suite)
        else:
            input_datasets = []

        run_facets = {}
        if self.parent_run_id is not None:
            run_facets["parentRun"] = ParentRunFacet.create(
                self.parent_run_id,
                self.parent_job_namespace,
                self.parent_job_name)
        run_facets["great_expectations_meta"] = GreatExpectationsRunFacet(
            **validation_result_suite.meta)

        job_facets = {}
        if self.job_description:
            job_facets["documentation"] = DocumentationJobFacet(
                self.job_description)
        if self.code_location:
            job_facets["sourceCodeLocation"] = SourceCodeLocationJobFacet(
                "", self.code_location)

        # Fall back to "<suite name>.<batch identifier>" when no name is configured.
        if self.job_name is None:
            job_name = (validation_result_suite.meta["expectation_suite_name"]
                        + '.'
                        + validation_result_suite_identifier.batch_identifier)
        else:
            job_name = self.job_name

        producer_uri = "https://github.com/OpenLineage/OpenLineage/tree/$VERSION/integration/common/openlineage/provider/great_expectations"  # noqa
        run_event = RunEvent(
            eventType=RunState.COMPLETE,
            eventTime=datetime.now().isoformat(),
            run=Run(runId=str(self.run_id), facets=run_facets),
            job=Job(self.namespace, job_name, facets=job_facets),
            inputs=input_datasets,
            outputs=[],
            producer=producer_uri,
        )

        if self.do_publish:
            self.openlineage_client.emit(run_event)

        # Great expectations tries to append stuff here, so we need to make it a dict
        return Serde.to_dict(run_event)
Ejemplo n.º 3
0
 def fail_task(self, run_id: str, job_name: str, end_time: str,
               task: TaskMetadata):
     """
     Build an openlineage event of type FAIL and emit it
     :param run_id: globally unique identifier of task in dag run
     :param job_name: globally unique identifier of task between dags
     :param end_time: time of task completion, used as the event time
     :param task: metadata container whose inputs and outputs are attached
         to the event
     """
     failed_run = self._build_run(run_id)
     failed_job = self._build_job(job_name)
     event = RunEvent(
         eventType=RunState.FAIL,
         eventTime=end_time,
         run=failed_run,
         job=failed_job,
         inputs=task.inputs,
         outputs=task.outputs,
         producer=_PRODUCER,
     )
     # The client is only looked up once the event has been fully built.
     client = self.get_or_create_openlineage_client()
     client.emit(event)
Ejemplo n.º 4
0
    def start_task(
            self,
            run_id: str,
            job_name: str,
            job_description: str,
            event_time: str,
            parent_run_id: Optional[str],
            code_location: Optional[str],
            nominal_start_time: str,
            nominal_end_time: str,
            task: Optional[TaskMetadata],
            run_facets: Optional[Dict[
                str, Type[BaseFacet]]] = None,  # Custom run facets
    ) -> str:
        """
        Emits openlineage event of type START
        :param run_id: globally unique identifier of task in dag run
        :param job_name: globally unique identifier of task in dag
        :param job_description: user provided description of job
        :param event_time: value used as the event's eventTime
        :param parent_run_id: identifier of job spawning this task
        :param code_location: file path or URL of DAG file
        :param nominal_start_time: scheduled time of dag run
        :param nominal_end_time: following schedule of dag run
        :param task: metadata container with information extracted from
            operator; may be None, in which case no task-derived job facets,
            inputs or outputs are attached
        :param run_facets: optional custom run facets forwarded to the run
            builder
        :return: run id of the emitted event
        """
        # `task` is Optional, so every attribute read must be guarded.
        # Previously `task.job_facets` was dereferenced unconditionally and
        # raised AttributeError when no task metadata was available.
        # NOTE(review): assumes _build_job accepts None for its job facets
        # argument - confirm against its definition.
        task_job_facets = task.job_facets if task else None
        task_inputs = task.inputs if task else None
        task_outputs = task.outputs if task else None

        event = RunEvent(eventType=RunState.START,
                         eventTime=event_time,
                         run=self._build_run(run_id, parent_run_id, job_name,
                                             nominal_start_time,
                                             nominal_end_time, run_facets),
                         job=self._build_job(job_name, job_description,
                                             code_location, task_job_facets),
                         inputs=task_inputs,
                         outputs=task_outputs,
                         producer=_PRODUCER)
        self.get_or_create_openlineage_client().emit(event)
        return event.run.runId
Ejemplo n.º 5
0
def test_client_sends_proper_json_with_minimal_event():
    """Check the request made when emitting an event with no facets or datasets.

    The event is built from only a state, a time, a run, a job and a producer.
    The client is expected to post the serialized JSON, with empty ``inputs``,
    ``outputs`` and ``facets``, to ``/api/v1/lineage`` on the configured URL
    using ``timeout=5.0`` and ``verify=True``.
    """
    # Arrange: a client backed by a mocked HTTP session.
    http_session = MagicMock()
    client = OpenLineageClient(url="http://example.com", session=http_session)
    minimal_event = RunEvent(
        RunState.START,
        "2020-01-01",
        Run("69f4acab-b87d-4fc0-b27b-8ea950370ff3"),
        Job("openlineage", "job"),
        "producer"
    )
    expected_body = (
        '{"eventTime": "2020-01-01", "eventType": "START", "inputs": [], "job": '
        '{"facets": {}, "name": "job", "namespace": "openlineage"}, "outputs": [], '
        '"producer": "producer", "run": {"facets": {}, "runId": '
        '"69f4acab-b87d-4fc0-b27b-8ea950370ff3"}}'
    )

    # Act
    client.emit(minimal_event)

    # Assert
    http_session.post.assert_called_with(
        "http://example.com/api/v1/lineage",
        expected_body,
        timeout=5.0,
        verify=True
    )
Ejemplo n.º 6
0
def test_openlineage_dag_adds_custom_facets(
        mock_get_or_create_openlineage_client,
        new_lineage_run_id,
        clear_db_airflow_dags,
):
    """Check the START event emitted for a DummyOperator task on DAG run creation.

    After ``create_dagrun`` the mocked client must have received exactly one
    ``emit`` call whose run facets include ``airflow_runArgs`` and
    ``airflow_version`` next to ``nominalTime`` and ``parentRun``.
    """
    # Empty the extractor collection and drop any extractor registered for
    # the fixture operator before building the DAG.
    openlineage.airflow.dag.extractors.clear()
    openlineage.airflow.dag.extractor_mapper.extractors.pop('TestFixtureDummyOperator', None)

    dag = DAG(
        DAG_ID,
        schedule_interval='@daily',
        default_args=DAG_DEFAULT_ARGS,
        description=DAG_DESCRIPTION
    )

    # Mock the openlineage client method calls
    client_mock = mock.Mock()
    mock_get_or_create_openlineage_client.return_value = client_mock

    run_id = str(uuid.uuid4())
    job_id = f"{DAG_ID}.{TASK_ID_COMPLETED}"
    new_lineage_run_id.return_value = run_id

    # Add task that will be marked as completed
    task_will_complete = DummyOperator(task_id=TASK_ID_COMPLETED, dag=dag)
    completed_task_location = get_location(task_will_complete.dag.fileloc)

    # Start run
    dag.create_dagrun(
        run_id=DAG_RUN_ID,
        execution_date=DEFAULT_DATE,
        state=State.RUNNING)

    # Build the expected event piece by piece
    start_time = '2016-01-01T00:00:00.000000Z'
    end_time = '2016-01-02T00:00:00.000000Z'

    expected_run_facets = {
        "nominalTime": NominalTimeRunFacet(start_time, end_time),
        "parentRun": ParentRunFacet.create(
            runId=DAG_RUN_ID,
            namespace=DAG_NAMESPACE,
            name=job_id
        ),
        "airflow_runArgs": AirflowRunArgsRunFacet(False),
        "airflow_version": AirflowVersionRunFacet(
            operator="airflow.operators.dummy_operator.DummyOperator",
            taskInfo=mock.ANY,
            airflowVersion=AIRFLOW_VERSION,
            openlineageAirflowVersion=OPENLINEAGE_AIRFLOW_VERSION
        )
    }
    expected_job_facets = {
        "documentation": DocumentationJobFacet(DAG_DESCRIPTION),
        "sourceCodeLocation": SourceCodeLocationJobFacet("", completed_task_location)
    }
    expected_event = RunEvent(
        eventType=RunState.START,
        eventTime=mock.ANY,
        run=Run(run_id, expected_run_facets),
        job=Job("default", job_id, expected_job_facets),
        producer=PRODUCER,
        inputs=[],
        outputs=[]
    )

    # Assert emit calls
    client_mock.emit.assert_called_once_with(expected_event)
Ejemplo n.º 7
0
def test_openlineage_dag_with_extract_on_complete(
        job_id_mapping,
        mock_get_or_create_openlineage_client,
        get_custom_facets,
        new_lineage_run_id,
        clear_db_airflow_dags,
        session=None):
    """Check START and COMPLETE events when the extractor runs on completion.

    The START event asserted after ``create_dagrun`` has no datasets. The
    COMPLETE event asserted after running the task and invoking the DAG
    callback must carry the input and output datasets listed below.
    """

    # --- test setup

    # Register the on-complete dummy extractor for the fixture operator
    openlineage.airflow.dag.extractors.clear()
    extractor_registry = openlineage.airflow.dag.extractor_mapper.extractors
    extractor_registry[TestFixtureDummyOperator.__name__] = \
        TestFixtureDummyExtractorOnComplete

    dag_id = 'test_openlineage_dag_with_extractor_on_complete'
    dag_run_id = 'test_openlineage_dag_with_extractor_run_id'
    dag = DAG(
        dag_id,
        schedule_interval='@daily',
        default_args=DAG_DEFAULT_ARGS,
        description=DAG_DESCRIPTION
    )

    run_id = str(uuid.uuid4())
    job_id = f"{dag_id}.{TASK_ID_COMPLETED}"

    # Mock the openlineage client method calls
    client_mock = mock.Mock()
    mock_get_or_create_openlineage_client.return_value = client_mock
    get_custom_facets.return_value = {}
    new_lineage_run_id.return_value = run_id

    # Add task that will be marked as completed
    task_will_complete = TestFixtureDummyOperator(
        task_id=TASK_ID_COMPLETED,
        dag=dag
    )
    completed_task_location = get_location(task_will_complete.dag.fileloc)

    # Create DAG run and mark as running
    dag_run = dag.create_dagrun(
        run_id=dag_run_id,
        execution_date=DEFAULT_DATE,
        state=State.RUNNING)

    start_time = '2016-01-01T00:00:00.000000Z'
    end_time = '2016-01-02T00:00:00.000000Z'

    expected_start_event = RunEvent(
        eventType=RunState.START,
        eventTime=mock.ANY,
        run=Run(run_id, {
            "nominalTime": NominalTimeRunFacet(start_time, end_time),
            "parentRun": ParentRunFacet.create(
                runId=dag_run_id,
                namespace=DAG_NAMESPACE,
                name=job_id
            )
        }),
        job=Job("default", job_id, {
            "documentation": DocumentationJobFacet(DAG_DESCRIPTION),
            "sourceCodeLocation": SourceCodeLocationJobFacet("", completed_task_location)
        }),
        producer=PRODUCER,
        inputs=[],
        outputs=[]
    )
    client_mock.emit.assert_has_calls([mock.call(expected_start_event)])

    # Forget the START call so only later emits are checked below
    client_mock.reset_mock()

    # --- Pretend complete the task
    job_id_mapping.pop.return_value = run_id

    task_will_complete.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

    dag.handle_callback(dag_run, success=True, session=session)

    expected_input = OpenLineageDataset(
        namespace='dummy://localhost:1234',
        name='schema.extract_on_complete_input1',
        facets={
            'dataSource': DataSourceDatasetFacet(
                name='dummy://localhost:1234',
                uri='dummy://localhost:1234?query_tag=asdf'
            ),
            'schema': SchemaDatasetFacet(
                fields=[
                    SchemaField(name='field1', type='text', description=''),
                    SchemaField(name='field2', type='text', description='')
                ]
            )
        })
    expected_output = OpenLineageDataset(
        namespace='dummy://localhost:1234',
        name='extract_on_complete_output1',
        facets={
            'dataSource': DataSourceDatasetFacet(
                name='dummy://localhost:1234',
                uri='dummy://localhost:1234?query_tag=asdf'
            )
        })
    expected_complete_event = RunEvent(
        eventType=RunState.COMPLETE,
        eventTime=mock.ANY,
        run=Run(run_id),
        job=Job("default", job_id),
        producer=PRODUCER,
        inputs=[expected_input],
        outputs=[expected_output]
    )
    client_mock.emit.assert_has_calls([mock.call(expected_complete_event)])
Ejemplo n.º 8
0
def test_openlineage_dag(
        job_id_mapping,
        mock_get_or_create_openlineage_client,
        get_custom_facets,
        new_lineage_run_id,
        clear_db_airflow_dags,
        session=None
):
    """Check the events emitted for one completing and one failing task.

    Two START events are expected after the DAG run is created. After the
    first task is run, the second is stored as FAILED and the DAG callback is
    invoked with ``success=False``, a COMPLETE and a FAIL event are expected
    to follow in that order.

    NOTE(review): ``session`` defaults to None but is used unconditionally
    below; presumably a decorator outside this view injects it - confirm.
    """
    start_time = '2016-01-01T00:00:00.000000Z'
    end_time = '2016-01-02T00:00:00.000000Z'

    def expected_start_call(run_id, job_id, task_location):
        # Expected emit call for a task's START event.
        return mock.call(RunEvent(
            eventType=RunState.START,
            eventTime=mock.ANY,
            run=Run(run_id, {
                "nominalTime": NominalTimeRunFacet(start_time, end_time),
                "parentRun": ParentRunFacet.create(
                    runId=DAG_RUN_ID,
                    namespace=DAG_NAMESPACE,
                    name=job_id
                )
            }),
            job=Job("default", job_id, {
                "documentation": DocumentationJobFacet(DAG_DESCRIPTION),
                "sourceCodeLocation": SourceCodeLocationJobFacet("", task_location)
            }),
            producer=PRODUCER,
            inputs=[],
            outputs=[]
        ))

    def expected_end_call(state, run_id, job_id):
        # Expected emit call for a task's terminal (COMPLETE/FAIL) event.
        return mock.call(RunEvent(
            eventType=state,
            eventTime=mock.ANY,
            run=Run(run_id),
            job=Job("default", job_id),
            producer=PRODUCER,
            inputs=[],
            outputs=[]
        ))

    dag = DAG(
        DAG_ID,
        schedule_interval='@daily',
        default_args=DAG_DEFAULT_ARGS,
        description=DAG_DESCRIPTION
    )
    # (1) Mock the openlineage client method calls
    mock_ol_client = mock.Mock()
    mock_get_or_create_openlineage_client.return_value = mock_ol_client

    run_id_completed = str(uuid.uuid4())
    run_id_failed = str(uuid.uuid4())

    job_id_completed = f"{DAG_ID}.{TASK_ID_COMPLETED}"
    job_id_failed = f"{DAG_ID}.{TASK_ID_FAILED}"

    get_custom_facets.return_value = {}
    new_lineage_run_id.side_effect = [
        run_id_completed, run_id_failed, run_id_completed, run_id_failed
    ]

    # (2) Add task that will be marked as completed
    task_will_complete = DummyOperator(
        task_id=TASK_ID_COMPLETED,
        dag=dag
    )
    completed_task_location = get_location(task_will_complete.dag.fileloc)

    # (3) Add task that will be marked as failed
    task_will_fail = DummyOperator(
        task_id=TASK_ID_FAILED,
        dag=dag
    )
    # Derive the location from the failing task itself; this previously read
    # task_will_complete by copy-paste (both tasks share the same DAG).
    failed_task_location = get_location(task_will_fail.dag.fileloc)

    # (4) Create DAG run and mark as running
    dagrun = dag.create_dagrun(
        run_id=DAG_RUN_ID,
        execution_date=DEFAULT_DATE,
        state=State.RUNNING)

    # Assert emit calls
    emit_calls = [
        expected_start_call(run_id_completed, job_id_completed,
                            completed_task_location),
        expected_start_call(run_id_failed, job_id_failed,
                            failed_task_location),
    ]
    log.info(
        f"{ [name for name, args, kwargs in mock_ol_client.mock_calls]}")
    mock_ol_client.emit.assert_has_calls(emit_calls)

    # (5) Start task that will be marked as completed
    task_will_complete.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

    # (6) Start task that will be marked as failed
    ti1 = TaskInstance(task=task_will_fail, execution_date=DEFAULT_DATE)
    ti1.state = State.FAILED
    session.add(ti1)
    session.commit()

    job_id_mapping.pop.side_effect = [run_id_completed, run_id_failed]

    dag.handle_callback(dagrun, success=False, session=session)

    # The terminal events must follow the two START events, in this order.
    emit_calls += [
        expected_end_call(RunState.COMPLETE, run_id_completed,
                          job_id_completed),
        expected_end_call(RunState.FAIL, run_id_failed, job_id_failed),
    ]
    mock_ol_client.emit.assert_has_calls(emit_calls)