def execute(self, context):
        # If the DAG Run is externally triggered, then return without
        # skipping downstream tasks
        if context['dag_run'].external_trigger:
            logging.info("""Externally triggered DAG_Run:
                         allowing execution to proceed.""")
            return

        now = datetime.datetime.now()
        left_window = context['dag'].following_schedule(
            context['execution_date'])
        right_window = context['dag'].following_schedule(left_window)
        logging.info(
            'Checking latest only with left_window: %s right_window: %s '
            'now: %s', left_window, right_window, now)
        if not left_window < now <= right_window:
            logging.info('Not latest execution, skipping downstream.')
            session = settings.Session()
            for task in context['task'].downstream_list:
                ti = TaskInstance(task,
                                  execution_date=context['ti'].execution_date)
                logging.info('Skipping task: %s', ti.task_id)
                ti.state = State.SKIPPED
                ti.start_date = now
                ti.end_date = now
                session.merge(ti)
            session.commit()
            session.close()
            logging.info('Done.')
        else:
            logging.info('Latest, allowing execution to proceed.')
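
These snippets are written against the Airflow 1.x module layout. A minimal preamble they appear to assume is sketched below (the exact import paths may differ in later Airflow releases):

import datetime
import logging

from airflow import settings
from airflow.models import TaskInstance
from airflow.utils.state import State
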
Example #2
    def kill_zombies(self, zombies, session=None):
        """
        Fail given zombie tasks, which are tasks that haven't
        had a heartbeat for too long, in the current DagBag.

        :param zombies: zombie task instances to kill.
        :type zombies: list[SimpleTaskInstance]
        :param session: DB session.
        :type session: sqlalchemy.orm.session.Session
        """
        for zombie in zombies:
            if zombie.dag_id in self.dags:
                dag = self.dags[zombie.dag_id]
                if zombie.task_id in dag.task_ids:
                    task = dag.get_task(zombie.task_id)
                    ti = TaskInstance(task, zombie.execution_date)
                    # Get properties needed for failure handling from SimpleTaskInstance.
                    ti.start_date = zombie.start_date
                    ti.end_date = zombie.end_date
                    ti.try_number = zombie.try_number
                    ti.state = zombie.state
                    ti.test_mode = configuration.getboolean(
                        'core', 'unit_test_mode')
                    ti.handle_failure("{} detected as zombie".format(ti),
                                      ti.test_mode, ti.get_template_context())
                    self.log.info('Marked zombie job %s as %s', ti, ti.state)
                    Stats.incr('zombies_killed')
        session.commit()
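
A hypothetical way to exercise kill_zombies, using a namedtuple stand-in that carries only the attributes the method reads; in Airflow itself this method is normally wrapped with @provide_session so a session is injected when none is passed explicitly:

from collections import namedtuple
import datetime

# Stand-in for SimpleTaskInstance with only the fields kill_zombies() touches.
ZombieTI = namedtuple("ZombieTI", ["dag_id", "task_id", "execution_date",
                                   "start_date", "end_date", "try_number",
                                   "state"])

zombie = ZombieTI(dag_id="example_dag", task_id="stalled_task",
                  execution_date=datetime.datetime(2019, 1, 1),
                  start_date=datetime.datetime(2019, 1, 1, 0, 5),
                  end_date=None, try_number=1, state="running")

# dagbag.kill_zombies([zombie], session=session)  # fails the matching TaskInstance
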
Example #3
 def test_set_duration(self):
     task = DummyOperator(task_id='op', email='*****@*****.**')
     ti = TI(
         task=task,
         execution_date=datetime.datetime.now(),
     )
     ti.start_date = datetime.datetime(2018, 10, 1, 1)
     ti.end_date = datetime.datetime(2018, 10, 1, 2)
     ti.set_duration()
     self.assertEqual(ti.duration, 3600)
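
The assertion rests on plain datetime arithmetic: set_duration stores end_date minus start_date in seconds, so a one-hour span yields 3600.

import datetime

start = datetime.datetime(2018, 10, 1, 1)
end = datetime.datetime(2018, 10, 1, 2)
assert (end - start).total_seconds() == 3600.0
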
    def execute(self, context):
        # If the DAG Run is externally triggered, then return without
        # skipping downstream tasks
        if context['dag_run'] and context['dag_run'].external_trigger:
            logging.info("""Externally triggered DAG_Run:
                         allowing execution to proceed.""")
            return

        now = datetime.datetime.now()
        left_window = context['dag'].following_schedule(
            context['execution_date'])
        right_window = context['dag'].following_schedule(left_window)
        logging.info(
            'Checking latest only with left_window: %s right_window: %s '
            'now: %s', left_window, right_window, now)

        if not left_window < now <= right_window:
            logging.info('Not latest execution, skipping downstream.')
            downstream_task_ids = context['task'].downstream_task_ids
            if downstream_task_ids:
                session = settings.Session()
                TI = TaskInstance
                tis = session.query(TI).filter(
                    TI.execution_date == context['ti'].execution_date,
                    TI.task_id.in_(downstream_task_ids)
                ).with_for_update().all()

                for ti in tis:
                    logging.info('Skipping task: %s', ti.task_id)
                    ti.state = State.SKIPPED
                    ti.start_date = now
                    ti.end_date = now
                    session.merge(ti)

                # this is defensive against dag runs that are not complete
                for task in context['task'].downstream_list:
                    if task.task_id in {t.task_id for t in tis}:
                        continue

                    logging.warning("Task {} was not part of a dag run. "
                                    "This should not happen."
                                    .format(task))
                    now = datetime.datetime.now()
                    ti = TaskInstance(task, execution_date=context['ti'].execution_date)
                    ti.state = State.SKIPPED
                    ti.start_date = now
                    ti.end_date = now
                    session.merge(ti)

                session.commit()
                session.close()

            logging.info('Done.')
        else:
            logging.info('Latest, allowing execution to proceed.')
    def execute(self, context):
        # If the DAG Run is externally triggered, then return without
        # skipping downstream tasks
        if context['dag_run'] and context['dag_run'].external_trigger:
            logging.info("""Externally triggered DAG_Run:
                         allowing execution to proceed.""")
            return

        now = datetime.datetime.now()
        left_window = context['dag'].following_schedule(
            context['execution_date'])
        right_window = context['dag'].following_schedule(left_window)
        logging.info(
            'Checking latest only with left_window: %s right_window: %s '
            'now: %s', left_window, right_window, now)

        if not left_window < now <= right_window:
            logging.info('Not latest execution, skipping downstream.')
            session = settings.Session()

            TI = TaskInstance
            tis = session.query(TI).filter(
                TI.execution_date == context['ti'].execution_date,
                TI.task_id.in_(context['task'].downstream_task_ids)
            ).with_for_update().all()

            for ti in tis:
                logging.info('Skipping task: %s', ti.task_id)
                ti.state = State.SKIPPED
                ti.start_date = now
                ti.end_date = now
                session.merge(ti)

            # this is defensive against dag runs that are not complete
            for task in context['task'].downstream_list:
                if task.task_id in {t.task_id for t in tis}:
                    continue

                logging.warning("Task {} was not part of a dag run. "
                                "This should not happen."
                                .format(task))
                now = datetime.datetime.now()
                ti = TaskInstance(task, execution_date=context['ti'].execution_date)
                ti.state = State.SKIPPED
                ti.start_date = now
                ti.end_date = now
                session.merge(ti)

            session.commit()
            session.close()
            logging.info('Done.')
        else:
            logging.info('Latest, allowing execution to proceed.')
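
The window check above marks a run as "latest" only when the wall-clock time falls inside the schedule interval that follows the run's execution_date. A sketch of the same comparison with plain datetimes, assuming a daily schedule where following_schedule simply adds one day:

import datetime

def following_schedule(dt):          # assumed daily schedule
    return dt + datetime.timedelta(days=1)

execution_date = datetime.datetime(2020, 1, 1)
left_window = following_schedule(execution_date)   # 2020-01-02
right_window = following_schedule(left_window)     # 2020-01-03

now = datetime.datetime(2020, 1, 2, 12)
# True only for the most recent interval; older runs skip their downstream tasks.
print(left_window < now <= right_window)
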
 def execute(self, context):
     branch = super(BranchPythonOperator, self).execute(context)
     logging.info("Following branch " + branch)
     logging.info("Marking other directly downstream tasks as failed")
     session = settings.Session()
     for task in context['task'].downstream_list:
         if task.task_id != branch:
             ti = TaskInstance(task,
                               execution_date=context['ti'].execution_date)
             ti.state = State.SKIPPED
             ti.start_date = datetime.now()
             ti.end_date = datetime.now()
             session.merge(ti)
     session.commit()
     session.close()
     logging.info("Done.")
 def execute(self, context):
     branch = super(BranchPythonOperator, self).execute(context)
     logging.info("Following branch " + branch)
     logging.info("Marking other directly downstream tasks as skipped")
     session = settings.Session()
     for task in context['task'].downstream_list:
         if task.task_id != branch:
             ti = TaskInstance(
                 task, execution_date=context['ti'].execution_date)
             ti.state = State.SKIPPED
             ti.start_date = datetime.now()
             ti.end_date = datetime.now()
             session.merge(ti)
     session.commit()
     session.close()
     logging.info("Done.")
Example #9
 def execute(self, context):
     condition = super(JollyShortCircuitOperator, self).execute(context)
     logging.info("Condition result is {}".format(condition))
     if condition:
         logging.info('Proceeding with downstream tasks...')
         return
     else:
         logging.info('Skipping downstream tasks...')
         session = settings.Session()
         for task in context['task'].downstream_list:
             ti = TaskInstance(task,
                               execution_date=context['ti'].execution_date)
             ti.state = State.SKIPPED
             ti.start_date = datetime.now()
             ti.end_date = datetime.now()
             session.merge(ti)
         session.commit()
         session.close()
         logging.info("Done.")
 def execute(self, context):
     condition = super(ShortCircuitOperator, self).execute(context)
     logging.info("Condition result is {}".format(condition))
     if condition:
         logging.info('Proceeding with downstream tasks...')
         return
     else:
         logging.info('Skipping downstream tasks...')
         session = settings.Session()
         for task in context['task'].downstream_list:
             ti = TaskInstance(
                 task, execution_date=context['ti'].execution_date)
             ti.state = State.SKIPPED
             ti.start_date = datetime.now()
             ti.end_date = datetime.now()
             session.merge(ti)
         session.commit()
         session.close()
         logging.info("Done.")
    def execute(self, context):
        condition = super(ShortCircuitOperator, self).execute(context)
        logging.info("Condition result is {}".format(condition))

        if condition:
            logging.info('Proceeding with downstream tasks...')
            return

        logging.info('Skipping downstream tasks...')
        session = settings.Session()

        TI = TaskInstance
        tis = session.query(TI).filter(
            TI.execution_date == context['ti'].execution_date,
            TI.task_id.in_(context['task'].downstream_task_ids),
        ).with_for_update().all()

        for ti in tis:
            logging.info('Skipping task: %s', ti.task_id)
            ti.state = State.SKIPPED
            ti.start_date = datetime.now()
            ti.end_date = datetime.now()

        # this is defensive against dag runs that are not complete
        for task in context['task'].downstream_list:
            if task.task_id in {t.task_id for t in tis}:
                continue

            logging.warning(
                "Task {} was not part of a dag run. This should not happen.".
                format(task))
            ti = TaskInstance(task,
                              execution_date=context['ti'].execution_date)
            ti.state = State.SKIPPED
            ti.start_date = datetime.now()
            ti.end_date = datetime.now()
            session.merge(ti)

        session.commit()
        session.close()
        logging.info("Done.")
Example #12
    def skip(self, dag_run, execution_date, tasks, session=None):
        """
        Sets task instances from the same dag run to skipped.

        :param dag_run: the DagRun for which to set the tasks to skipped
        :param execution_date: execution_date
        :param tasks: tasks to skip (not task_ids)
        :param session: db session to use
        """
        if not tasks:
            return

        task_ids = [d.task_id for d in tasks]
        now = timezone.utcnow()

        if dag_run:
            session.query(TaskInstance).filter(
                TaskInstance.dag_id == dag_run.dag_id,
                TaskInstance.execution_date == dag_run.execution_date,
                TaskInstance.task_id.in_(task_ids)).update(
                    {
                        TaskInstance.state: State.SKIPPED,
                        TaskInstance.start_date: now,
                        TaskInstance.end_date: now
                    },
                    synchronize_session=False)
            session.commit()
        else:
            assert execution_date is not None, "Execution date is None and no dag run"

            self.log.warning("No DAG RUN present this should not happen")
            # this is defensive against dag runs that are not complete
            for task in tasks:
                ti = TaskInstance(task, execution_date=execution_date)
                ti.state = State.SKIPPED
                ti.start_date = now
                ti.end_date = now
                session.merge(ti)

            session.commit()
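
A hedged sketch of how an operator mixing in this helper could call it from execute(); it is a method-body sketch only, reusing the context keys and downstream_list attribute from the examples above rather than a verbatim copy of any operator:

    def execute(self, context):
        condition = super(ShortCircuitOperator, self).execute(context)
        if condition:
            return
        # Skip every direct downstream task of the current task in this dag run.
        downstream_tasks = context['task'].downstream_list
        if downstream_tasks:
            self.skip(context['dag_run'], context['ti'].execution_date,
                      downstream_tasks)
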
    def execute(self, context):
        branch = super(BranchPythonOperator, self).execute(context)
        logging.info("Following branch " + branch)
        logging.info("Marking other directly downstream tasks as skipped")
        session = settings.Session()

        TI = TaskInstance
        tis = session.query(TI).filter(
            TI.execution_date == context['ti'].execution_date,
            TI.task_id.in_(context['task'].downstream_task_ids),
            TI.task_id != branch,
        ).with_for_update().all()

        for ti in tis:
            logging.info('Skipping task: %s', ti.task_id)
            ti.state = State.SKIPPED
            ti.start_date = datetime.now()
            ti.end_date = datetime.now()

        # this is defensive against dag runs that are not complete
        for task in context['task'].downstream_list:
            if task.task_id in {t.task_id for t in tis}:
                continue

            if task.task_id == branch:
                continue

            logging.warning(
                "Task {} was not part of a dag run. This should not happen.".
                format(task))
            ti = TaskInstance(task,
                              execution_date=context['ti'].execution_date)
            ti.state = State.SKIPPED
            ti.start_date = datetime.now()
            ti.end_date = datetime.now()
            session.merge(ti)

        session.commit()
        session.close()
        logging.info("Done.")
Example #14
    def execute(self, context):
        condition = super(ShortCircuitOperator, self).execute(context)
        logging.info("Condition result is {}".format(condition))

        if condition:
            logging.info('Proceeding with downstream tasks...')
            return

        logging.info('Skipping downstream tasks...')
        session = settings.Session()

        TI = TaskInstance
        tis = session.query(TI).filter(
            TI.execution_date == context['ti'].execution_date,
            TI.task_id.in_(context['task'].downstream_task_ids),
        ).with_for_update().all()

        for ti in tis:
            logging.info('Skipping task: %s', ti.task_id)
            ti.state = State.SKIPPED
            ti.start_date = datetime.now()
            ti.end_date = datetime.now()

        # this is defensive against dag runs that are not complete
        for task in context['task'].downstream_list:
            if task.task_id in {t.task_id for t in tis}:
                continue

            logging.warning("Task {} was not part of a dag run. This should not happen."
                            .format(task))
            ti = TaskInstance(task, execution_date=context['ti'].execution_date)
            ti.state = State.SKIPPED
            ti.start_date = datetime.now()
            ti.end_date = datetime.now()
            session.merge(ti)

        session.commit()
        session.close()
        logging.info("Done.")
Example #15
    def execute(self, context):
        branch = super(BranchPythonOperator, self).execute(context)
        logging.info("Following branch " + branch)
        logging.info("Marking other directly downstream tasks as skipped")
        session = settings.Session()

        TI = TaskInstance
        tis = session.query(TI).filter(
            TI.execution_date == context['ti'].execution_date,
            TI.task_id.in_(context['task'].downstream_task_ids),
            TI.task_id != branch,
        ).with_for_update().all()

        for ti in tis:
            logging.info('Skipping task: %s', ti.task_id)
            ti.state = State.SKIPPED
            ti.start_date = datetime.now()
            ti.end_date = datetime.now()

        # this is defensive against dag runs that are not complete
        for task in context['task'].downstream_list:
            if task.task_id in {t.task_id for t in tis}:
                continue

            if task.task_id == branch:
                continue

            logging.warning("Task {} was not part of a dag run. This should not happen."
                            .format(task))
            ti = TaskInstance(task, execution_date=context['ti'].execution_date)
            ti.state = State.SKIPPED
            ti.start_date = datetime.now()
            ti.end_date = datetime.now()
            session.merge(ti)

        session.commit()
        session.close()
        logging.info("Done.")
Example #16
    def skip(self, dag_run, execution_date, tasks, session=None):
        """
        Sets task instances from the same dag run to skipped.

        :param dag_run: the DagRun for which to set the tasks to skipped
        :param execution_date: execution_date
        :param tasks: tasks to skip (not task_ids)
        :param session: db session to use
        """
        if not tasks:
            return

        task_ids = [d.task_id for d in tasks]
        now = timezone.utcnow()

        if dag_run:
            session.query(TaskInstance).filter(
                TaskInstance.dag_id == dag_run.dag_id,
                TaskInstance.execution_date == dag_run.execution_date,
                TaskInstance.task_id.in_(task_ids)
            ).update({TaskInstance.state: State.SKIPPED,
                      TaskInstance.start_date: now,
                      TaskInstance.end_date: now},
                     synchronize_session=False)
            session.commit()
        else:
            assert execution_date is not None, "Execution date is None and no dag run"

            self.log.warning("No DAG RUN present this should not happen")
            # this is defensive against dag runs that are not complete
            for task in tasks:
                ti = TaskInstance(task, execution_date=execution_date)
                ti.state = State.SKIPPED
                ti.start_date = now
                ti.end_date = now
                session.merge(ti)

            session.commit()
 def execute(self, context):
     now = datetime.datetime.now()
     left_window = context['dag'].following_schedule(
         context['execution_date'])
     right_window = context['dag'].following_schedule(left_window)
     _log.info(
         'Checking latest only with left_window: %s right_window: %s '
         'now: %s', left_window, right_window, now)
     if not left_window < now <= right_window:
         _log.info('Not latest execution, skipping downstream.')
         session = settings.Session()
         for task in context['task'].downstream_list:
             ti = TaskInstance(task,
                               execution_date=context['ti'].execution_date)
             _log.info('Skipping task: %s', ti.task_id)
             ti.state = State.SKIPPED
             ti.start_date = now
             ti.end_date = now
             session.merge(ti)
         session.commit()
         session.close()
         _log.info('Done.')
     else:
         _log.info('Latest, allowing execution to proceed.')
Example #18
def test_extra_link_in_gantt_view(dag, viewer_client):
    exec_date = dates.days_ago(2)
    start_date = timezone.datetime(2020, 4, 10, 2, 0, 0)
    end_date = exec_date + datetime.timedelta(seconds=30)

    with create_session() as session:
        for task in dag.tasks:
            ti = TaskInstance(task=task,
                              execution_date=exec_date,
                              state="success")
            ti.start_date = start_date
            ti.end_date = end_date
            session.add(ti)

    url = f'gantt?dag_id={dag.dag_id}&execution_date={exec_date}'
    resp = viewer_client.get(url, follow_redirects=True)

    check_content_in_response('"extraLinks":', resp)

    extra_links_grps = re.search(r'extraLinks\": \[(\".*?\")\]',
                                 resp.get_data(as_text=True))
    extra_links = extra_links_grps.group(0)
    assert 'airflow' in extra_links
    assert 'github' in extra_links
 def execute(self, context):
     now = datetime.datetime.now()
     left_window = context['dag'].following_schedule(
         context['execution_date'])
     right_window = context['dag'].following_schedule(left_window)
     logging.info(
         'Checking latest only with left_window: %s right_window: %s '
         'now: %s', left_window, right_window, now)
     if not left_window < now <= right_window:
         logging.info('Not latest execution, skipping downstream.')
         session = settings.Session()
         for task in context['task'].downstream_list:
             ti = TaskInstance(
                 task, execution_date=context['ti'].execution_date)
             logging.info('Skipping task: %s', ti.task_id)
             ti.state = State.SKIPPED
             ti.start_date = now
             ti.end_date = now
             session.merge(ti)
         session.commit()
         session.close()
         logging.info('Done.')
     else:
         logging.info('Latest, allowing execution to proceed.')
Example #20
def test_lineage_backend_capture_executions(mock_emit, inlets, outlets):
    DEFAULT_DATE = datetime.datetime(2020, 5, 17)
    mock_emitter = Mock()
    mock_emit.return_value = mock_emitter
    # Using autospec on xcom_pull and xcom_push methods fails on Python 3.6.
    with mock.patch.dict(
            os.environ,
        {
            "AIRFLOW__LINEAGE__BACKEND":
            "datahub_provider.lineage.datahub.DatahubLineageBackend",
            "AIRFLOW__LINEAGE__DATAHUB_CONN_ID":
            datahub_rest_connection_config.conn_id,
            "AIRFLOW__LINEAGE__DATAHUB_KWARGS":
            json.dumps({
                "graceful_exceptions": False,
                "capture_executions": True
            }),
        },
    ), mock.patch("airflow.models.BaseOperator.xcom_pull"), mock.patch(
            "airflow.models.BaseOperator.xcom_push"), patch_airflow_connection(
                datahub_rest_connection_config):
        func = mock.Mock()
        func.__name__ = "foo"

        dag = DAG(dag_id="test_lineage_is_sent_to_backend",
                  start_date=DEFAULT_DATE)

        with dag:
            op1 = DummyOperator(
                task_id="task1_upstream",
                inlets=inlets,
                outlets=outlets,
            )
            op2 = DummyOperator(
                task_id="task2",
                inlets=inlets,
                outlets=outlets,
            )
            op1 >> op2

        # Airflow < 2.2 requires the execution_date parameter. Newer Airflow
        # versions do not require it, but will attempt to find the associated
        # run_id in the database if execution_date is provided. As such, we
        # must fake the run_id parameter for newer Airflow versions.
        if AIRFLOW_VERSION < packaging.version.parse("2.2.0"):
            ti = TaskInstance(task=op2, execution_date=DEFAULT_DATE)
            # Ignoring type here because DagRun state is just a string in Airflow 1
            dag_run = DagRun(
                state="success",
                run_id=f"scheduled_{DEFAULT_DATE}")  # type: ignore
            ti.dag_run = dag_run
            ti.start_date = datetime.datetime.utcnow()
            ti.execution_date = DEFAULT_DATE

        else:
            from airflow.utils.state import DagRunState

            ti = TaskInstance(task=op2, run_id=f"test_airflow-{DEFAULT_DATE}")
            dag_run = DagRun(state=DagRunState.SUCCESS,
                             run_id=f"scheduled_{DEFAULT_DATE}")
            ti.dag_run = dag_run
            ti.start_date = datetime.datetime.utcnow()
            ti.execution_date = DEFAULT_DATE

        ctx1 = {
            "dag": dag,
            "task": op2,
            "ti": ti,
            "dag_run": dag_run,
            "task_instance": ti,
            "execution_date": DEFAULT_DATE,
            "ts": "2021-04-08T00:54:25.771575+00:00",
        }

        prep = prepare_lineage(func)
        prep(op2, ctx1)
        post = apply_lineage(func)
        post(op2, ctx1)

        # Verify that the inlets and outlets are registered and recognized by Airflow correctly,
        # or that our lineage backend forces it to.
        assert len(op2.inlets) == 1
        assert len(op2.outlets) == 1
        assert all(map(lambda let: isinstance(let, Dataset), op2.inlets))
        assert all(map(lambda let: isinstance(let, Dataset), op2.outlets))

        # Check that the right things were emitted.
        assert mock_emitter.emit.call_count == 17
        # Running further checks based on python version because args only exists in python 3.7+
        if sys.version_info[:3] > (3, 7):
            assert mock_emitter.method_calls[0].args[
                0].aspectName == "dataFlowInfo"
            assert (
                mock_emitter.method_calls[0].args[0].entityUrn ==
                "urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod)"
            )

            assert mock_emitter.method_calls[1].args[
                0].aspectName == "ownership"
            assert (
                mock_emitter.method_calls[1].args[0].entityUrn ==
                "urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod)"
            )

            assert mock_emitter.method_calls[2].args[
                0].aspectName == "globalTags"
            assert (
                mock_emitter.method_calls[2].args[0].entityUrn ==
                "urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod)"
            )

            assert mock_emitter.method_calls[3].args[
                0].aspectName == "dataJobInfo"
            assert (
                mock_emitter.method_calls[3].args[0].entityUrn ==
                "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task2)"
            )

            assert (mock_emitter.method_calls[4].args[0].aspectName ==
                    "dataJobInputOutput")
            assert (
                mock_emitter.method_calls[4].args[0].entityUrn ==
                "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task2)"
            )
            assert (
                mock_emitter.method_calls[4].args[0].aspect.inputDatajobs[0] ==
                "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task1_upstream)"
            )
            assert (
                mock_emitter.method_calls[4].args[0].aspect.inputDatasets[0] ==
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableConsumed,PROD)"
            )
            assert (
                mock_emitter.method_calls[4].args[0].aspect.outputDatasets[0]
                ==
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableProduced,PROD)"
            )

            assert mock_emitter.method_calls[5].args[0].aspectName == "status"
            assert (
                mock_emitter.method_calls[5].args[0].entityUrn ==
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableConsumed,PROD)"
            )

            assert mock_emitter.method_calls[6].args[0].aspectName == "status"
            assert (
                mock_emitter.method_calls[6].args[0].entityUrn ==
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableProduced,PROD)"
            )

            assert mock_emitter.method_calls[7].args[
                0].aspectName == "ownership"
            assert (
                mock_emitter.method_calls[7].args[0].entityUrn ==
                "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task2)"
            )

            assert mock_emitter.method_calls[8].args[
                0].aspectName == "globalTags"
            assert (
                mock_emitter.method_calls[8].args[0].entityUrn ==
                "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task2)"
            )

            assert (mock_emitter.method_calls[9].args[0].aspectName ==
                    "dataProcessInstanceProperties")
            assert (
                mock_emitter.method_calls[9].args[0].entityUrn ==
                "urn:li:dataProcessInstance:b6375e5f5faeb543cfb5d7d8a47661fb")

            assert (mock_emitter.method_calls[10].args[0].aspectName ==
                    "dataProcessInstanceRelationships")
            assert (
                mock_emitter.method_calls[10].args[0].entityUrn ==
                "urn:li:dataProcessInstance:b6375e5f5faeb543cfb5d7d8a47661fb")
            assert (mock_emitter.method_calls[11].args[0].aspectName ==
                    "dataProcessInstanceInput")
            assert (
                mock_emitter.method_calls[11].args[0].entityUrn ==
                "urn:li:dataProcessInstance:b6375e5f5faeb543cfb5d7d8a47661fb")
            assert (mock_emitter.method_calls[12].args[0].aspectName ==
                    "dataProcessInstanceOutput")
            assert (
                mock_emitter.method_calls[12].args[0].entityUrn ==
                "urn:li:dataProcessInstance:b6375e5f5faeb543cfb5d7d8a47661fb")
            assert mock_emitter.method_calls[13].args[0].aspectName == "status"
            assert (
                mock_emitter.method_calls[13].args[0].entityUrn ==
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableConsumed,PROD)"
            )
            assert mock_emitter.method_calls[14].args[0].aspectName == "status"
            assert (
                mock_emitter.method_calls[14].args[0].entityUrn ==
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableProduced,PROD)"
            )
            assert (mock_emitter.method_calls[15].args[0].aspectName ==
                    "dataProcessInstanceRunEvent")
            assert (
                mock_emitter.method_calls[15].args[0].entityUrn ==
                "urn:li:dataProcessInstance:b6375e5f5faeb543cfb5d7d8a47661fb")
            assert (mock_emitter.method_calls[16].args[0].aspectName ==
                    "dataProcessInstanceRunEvent")
            assert (
                mock_emitter.method_calls[16].args[0].entityUrn ==
                "urn:li:dataProcessInstance:b6375e5f5faeb543cfb5d7d8a47661fb")
Example #21
def create_dagrun_from_dbnd_run(
    databand_run,
    dag,
    execution_date,
    run_id,
    state=State.RUNNING,
    external_trigger=False,
    conf=None,
    session=None,
):
    """
    Create new DagRun and all relevant TaskInstances
    """
    dagrun = (session.query(DagRun).filter(
        DagRun.dag_id == dag.dag_id,
        DagRun.execution_date == execution_date).first())
    if dagrun is None:
        dagrun = DagRun(
            run_id=run_id,
            execution_date=execution_date,
            start_date=dag.start_date,
            _state=state,
            external_trigger=external_trigger,
            dag_id=dag.dag_id,
            conf=conf,
        )
        session.add(dagrun)
    else:
        logger.warning("Running with existing airflow dag run %s", dagrun)

    dagrun.dag = dag
    dagrun.run_id = run_id
    session.commit()

    # create the associated task instances
    # state is None at the moment of creation

    # dagrun.verify_integrity(session=session)
    # fetches [TaskInstance] again
    # tasks_skipped = databand_run.tasks_skipped

    # we can find a source of the completion, but also,
    # sometimes we don't know the source of the "complete"
    TI = TaskInstance
    tis = (session.query(TI).filter(TI.dag_id == dag.dag_id,
                                    TI.execution_date == execution_date).all())
    tis = {ti.task_id: ti for ti in tis}

    for af_task in dag.tasks:
        ti = tis.get(af_task.task_id)
        if ti is None:
            ti = TaskInstance(af_task, execution_date=execution_date)
            ti.start_date = timezone.utcnow()
            ti.end_date = timezone.utcnow()
            session.add(ti)
        task_run = databand_run.get_task_run_by_af_id(af_task.task_id)
        # all tasks part of the backfill are scheduled to dagrun

        # Set log file path to expected airflow log file path
        task_run.log.local_log_file.path = ti.log_filepath.replace(
            ".log", "/{0}.log".format(ti.try_number))
        if task_run.is_reused:
            # this task is completed and we don't need to run it anymore
            ti.state = State.SUCCESS

    session.commit()

    return dagrun
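
A hypothetical call site, with databand_run, dag, and session supplied by the surrounding dbnd machinery; the parameter names simply follow the signature above and the run_id value is made up:

dagrun = create_dagrun_from_dbnd_run(
    databand_run=databand_run,
    dag=dag,
    execution_date=timezone.utcnow(),
    run_id="manual__example_run",
    state=State.RUNNING,
    external_trigger=True,
    conf=None,
    session=session,
)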