Example #1
    def run(self,
            start_date=None,
            end_date=None,
            ignore_dependencies=False,
            force=False,
            mark_success=False):
        """
        Run a set of task instances for a date range.
        """
        start_date = start_date or self.start_date
        end_date = end_date or self.end_date or datetime.now()

        for dt in utils.date_range(start_date, end_date,
                                   self.schedule_interval):
            TaskInstance(self, dt).run(
                mark_success=mark_success,
                ignore_dependencies=ignore_dependencies,
                force=force,
            )
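
The loop above relies on utils.date_range to expand the schedule into concrete execution dates, creating and running one TaskInstance per date. A minimal self-contained sketch of that expansion (assumed behavior; the real utils.date_range may treat endpoints or irregular intervals differently):

    from datetime import datetime, timedelta

    def date_range(start, end, interval):
        # Yield each scheduled datetime from start to end, inclusive.
        current = start
        while current <= end:
            yield current
            current += interval

    # One TaskInstance would be created and run per yielded date:
    for dt in date_range(datetime(2015, 1, 1), datetime(2015, 1, 3),
                         timedelta(days=1)):
        print(dt.isoformat())  # 2015-01-01T00:00:00 ... 2015-01-03T00:00:00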
Example #2
    def _execute(self):
        """
        Runs a dag for a specified date range.
        """
        session = settings.Session()

        start_date = self.bf_start_date
        end_date = self.bf_end_date

        # picklin'
        pickle_id = None
        if not self.donot_pickle and self.executor.__class__ not in (
                executors.LocalExecutor, executors.SequentialExecutor):
            pickle = models.DagPickle(self.dag)
            session.add(pickle)
            session.commit()
            pickle_id = pickle.id

        executor = self.executor
        executor.start()

        # Build a list of all instances to run
        tasks_to_run = {}
        failed = []
        succeeded = []
        started = []
        wont_run = []
        for task in self.dag.tasks:
            if (not self.include_adhoc) and task.adhoc:
                continue

            start_date = start_date or task.start_date
            end_date = end_date or task.end_date or datetime.now()
            for dttm in utils.date_range(
                    start_date, end_date, task.dag.schedule_interval):
                ti = models.TaskInstance(task, dttm)
                tasks_to_run[ti.key] = ti

        # Triggering what is ready to get triggered
        while tasks_to_run:
            for key, ti in list(tasks_to_run.items()):  # copy; entries are deleted below
                ti.refresh_from_db()
                if ti.state == State.SUCCESS and key in tasks_to_run:
                    succeeded.append(key)
                    del tasks_to_run[key]
                elif ti.is_runnable():
                    executor.queue_task_instance(
                        ti,
                        mark_success=self.mark_success,
                        task_start_date=self.bf_start_date,
                        pickle_id=pickle_id,
                        ignore_dependencies=self.ignore_dependencies)
                    ti.state = State.RUNNING
                    if key not in started:
                        started.append(key)
            self.heartbeat()
            executor.heartbeat()

            # Reacting to events
            for key, state in executor.get_event_buffer().items():
                dag_id, task_id, execution_date = key
                if key not in tasks_to_run:
                    continue
                ti = tasks_to_run[key]
                ti.refresh_from_db()
                if ti.state == State.FAILED:
                    failed.append(key)
                    logging.error("Task instance " + str(key) + " failed")
                    del tasks_to_run[key]
                    # Removing downstream tasks from the one that has failed
                    for t in self.dag.get_task(task_id).get_flat_relatives(
                            upstream=False):
                        down_key = (ti.dag_id, t.task_id, execution_date)
                        if down_key in tasks_to_run:
                            wont_run.append(down_key)
                            del tasks_to_run[down_key]
                elif ti.state == State.SUCCESS:
                    succeeded.append(key)
                    del tasks_to_run[key]

            msg = (
                "[backfill progress] "
                "waiting: {0} | "
                "succeeded: {1} | "
                "kicked_off: {2} | "
                "failed: {3} | "
                "skipped: {4} ").format(
                    len(tasks_to_run),
                    len(succeeded),
                    len(started),
                    len(failed),
                    len(wont_run))
            logging.info(msg)

        executor.end()
        session.close()
        if failed:
            raise AirflowException(
                "Some task instances failed, here's the list:\n" + str(failed))
        logging.info("All done. Exiting.")
Example #3
    def tree(self):
        dag_id = request.args.get('dag_id')
        dag = dagbag.dags[dag_id]
        session = settings.Session()

        base_date = request.args.get('base_date')
        if not base_date:
            base_date = datetime.now()
        else:
            base_date = dateutil.parser.parse(base_date)

        num_runs = request.args.get('num_runs')
        num_runs = int(num_runs) if num_runs else 25
        from_date = (base_date - (num_runs * dag.schedule_interval)).date()
        from_date = datetime.combine(from_date, datetime.min.time())

        dates = utils.date_range(from_date, base_date, dag.schedule_interval)
        task_instances = {}
        for ti in dag.get_task_instances(session, from_date):
            task_instances[(ti.task_id, ti.execution_date)] = ti

        expanded = []

        def recurse_nodes(task):
            children = [recurse_nodes(t) for t in task.upstream_list]

            # D3 tree uses children vs _children to define what is
            # expanded or not. The following block makes it such that
            # repeated nodes are collapsed by default.
            children_key = 'children'
            if task.task_id not in expanded:
                expanded.append(task.task_id)
            elif children:
                children_key = "_children"

            return {
                'name': task.task_id,
                'instances': [
                    utils.alchemy_to_dict(
                        task_instances.get((task.task_id, d))) or {
                            'execution_date': d.isoformat(),
                            'task_id': task.task_id
                        }
                    for d in dates],
                children_key: children,
                'num_dep': len(task.upstream_list),
                'operator': task.task_type,
                'retries': task.retries,
                'owner': task.owner,
                'start_date': task.start_date,
                'end_date': task.end_date,
                'depends_on_past': task.depends_on_past,
            }

        if len(dag.roots) > 1:
            # d3 likes a single root
            data = {
                'name': 'root',
                'instances': [],
                'children': [recurse_nodes(t) for t in dag.roots]
            }
        else:
            data = recurse_nodes(dag.roots[0])

        data = json.dumps(data, indent=4, default=utils.json_ser)
        session.commit()
        session.close()

        return self.render('airflow/tree.html', dag=dag, data=data)
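
recurse_nodes builds one nested dict per task; the only subtle field is the children/_children switch, which the D3 tree layout reads as expanded vs. collapsed. A hypothetical node for a leaf task (field values are illustrative, not taken from a real DAG) would serialize to roughly:

    import json

    node = {
        "name": "my_task",        # the task_id
        "instances": [            # one entry per date in the window
            {"execution_date": "2015-01-01T00:00:00", "task_id": "my_task"},
        ],
        "children": [],           # "_children" here means collapsed by default
        "num_dep": 0,
        "operator": "BashOperator",
        "retries": 0,
        "owner": "airflow",
        "start_date": None,
        "end_date": None,
        "depends_on_past": False,
    }
    print(json.dumps(node, indent=4))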
Example #4
    def _execute(self):
        """
        Runs a dag for a specified date range.
        """
        start_date = self.bf_start_date
        end_date = self.bf_end_date

        session = settings.Session()
        pickle = models.DagPickle(self.dag, self)
        executor = self.executor
        executor.start()
        session.add(pickle)
        session.commit()
        pickle_id = pickle.id

        # Build a list of all instances to run
        tasks_to_run = {}
        failed = []
        succeeded = []
        started = []
        wont_run = []
        for task in self.dag.tasks:
            start_date = start_date or task.start_date
            end_date = end_date or task.end_date or datetime.now()
            for dttm in utils.date_range(
                    start_date, end_date, task.dag.schedule_interval):
                ti = models.TaskInstance(task, dttm)
                tasks_to_run[ti.key] = ti

        # Triggering what is ready to get triggered
        while tasks_to_run:
            msg = (
                "Yet to run: {0} | "
                "Succeeded: {1} | "
                "Started: {2} | "
                "Failed: {3} | "
                "Won't run: {4} ").format(
                len(tasks_to_run),
                len(succeeded),
                len(started),
                len(failed),
                len(wont_run))

            logging.info(msg)
            for key, ti in list(tasks_to_run.items()):  # copy; entries are deleted below
                ti.refresh_from_db()
                if ti.state == State.SUCCESS and key in tasks_to_run:
                    succeeded.append(key)
                    del tasks_to_run[key]
                elif ti.is_runnable():
                    executor.queue_command(
                        key=ti.key, command=ti.command(
                            mark_success=self.mark_success,
                            pickle_id=pickle_id)
                    )
                    ti.state = State.RUNNING
                    if key not in started:
                        started.append(key)
            self.heartbeat()
            executor.heartbeat()

            # Reacting to events
            for key, state in executor.get_event_buffer().items():
                dag_id, task_id, execution_date = key
                if key not in tasks_to_run:
                    continue
                ti = tasks_to_run[key]
                ti.refresh_from_db()
                if ti.state == State.FAILED:
                    failed.append(key)
                    logging.error("Task instance " + str(key) + " failed")
                    del tasks_to_run[key]
                    # Removing downstream tasks from the one that has failed
                    for t in self.dag.get_task(task_id).get_flat_relatives(
                            upstream=False):
                        down_key = (ti.dag_id, t.task_id, execution_date)
                        if down_key in tasks_to_run:
                            wont_run.append(down_key)
                            del tasks_to_run[down_key]
                elif ti.state == State.SUCCESS:
                    succeeded.append(key)
                    del tasks_to_run[key]
        executor.end()
        logging.info(
            "Run summary: succeeded: {0} | failed: {1} | won't run: {2}".format(
                len(succeeded), len(failed), len(wont_run)))
        session.close()
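
When a task instance fails, the event loop above prunes every transitive downstream task via get_flat_relatives(upstream=False), since those instances can no longer meet their dependencies. A minimal sketch of that flattening on a hypothetical task class:

    class Task:
        def __init__(self, task_id):
            self.task_id = task_id
            self.downstream = []

        def flat_downstream(self):
            # Depth-first walk over all transitive downstream tasks.
            for t in self.downstream:
                yield t
                yield from t.flat_downstream()

    a, b, c = Task("a"), Task("b"), Task("c")
    a.downstream = [b]
    b.downstream = [c]
    # If "a" fails, both "b" and "c" are moved to wont_run:
    print([t.task_id for t in a.flat_downstream()])  # ['b', 'c']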
Example #5
    def _execute(self):
        """
        Runs a dag for a specified date range.
        """
        session = settings.Session()

        start_date = self.bf_start_date
        end_date = self.bf_end_date

        # picklin'
        pickle_id = None
        if not self.donot_pickle and self.executor.__class__ not in (
                executors.LocalExecutor, executors.SequentialExecutor):
            pickle = models.DagPickle(self.dag)
            session.add(pickle)
            session.commit()
            pickle_id = pickle.id

        executor = self.executor
        executor.start()

        # Build a list of all instances to run
        tasks_to_run = {}
        failed = []
        succeeded = []
        started = []
        wont_run = []
        for task in self.dag.tasks:
            if (not self.include_adhoc) and task.adhoc:
                continue

            start_date = start_date or task.start_date
            end_date = end_date or task.end_date or datetime.now()
            for dttm in utils.date_range(start_date, end_date,
                                         task.dag.schedule_interval):
                ti = models.TaskInstance(task, dttm)
                tasks_to_run[ti.key] = ti

        # Triggering what is ready to get triggered
        while tasks_to_run:
            for key, ti in list(tasks_to_run.items()):
                ti.refresh_from_db()
                if ti.state in (State.SUCCESS,
                                State.SKIPPED) and key in tasks_to_run:
                    succeeded.append(key)
                    tasks_to_run.pop(key)
                elif ti.state in (State.RUNNING, State.QUEUED):
                    continue
                elif ti.is_runnable(flag_upstream_failed=True):
                    executor.queue_task_instance(
                        ti,
                        mark_success=self.mark_success,
                        task_start_date=self.bf_start_date,
                        pickle_id=pickle_id,
                        ignore_dependencies=self.ignore_dependencies,
                        pool=self.pool)
                    ti.state = State.RUNNING
                    if key not in started:
                        started.append(key)
            self.heartbeat()
            executor.heartbeat()

            # Reacting to events
            for key, state in list(executor.get_event_buffer().items()):
                dag_id, task_id, execution_date = key
                if key not in tasks_to_run:
                    continue
                ti = tasks_to_run[key]
                ti.refresh_from_db()
                if (ti.state in (State.FAILED, State.SKIPPED)
                        or state == State.FAILED):
                    if ti.state == State.FAILED or state == State.FAILED:
                        failed.append(key)
                        logging.error("Task instance " + str(key) + " failed")
                    elif ti.state == State.SKIPPED:
                        wont_run.append(key)
                        logging.error("Skipping " + str(key) + " failed")
                    tasks_to_run.pop(key)
                    # Removing downstream tasks that also shouldn't run
                    for t in self.dag.get_task(task_id).get_flat_relatives(
                            upstream=False):
                        down_key = (ti.dag_id, t.task_id, execution_date)
                        if down_key in tasks_to_run:
                            wont_run.append(down_key)
                            tasks_to_run.pop(down_key)
                elif ti.state == State.SUCCESS and state == State.SUCCESS:
                    succeeded.append(key)
                    tasks_to_run.pop(key)
                elif (ti.state not in (State.SUCCESS, State.QUEUED)
                      and state == State.SUCCESS):
                    logging.error(
                        "The airflow run command failed "
                        "at reporting an error. This should not occur "
                        "in normal circumstances. State is {}".format(ti.state))

            msg = ("[backfill progress] "
                   "waiting: {0} | "
                   "succeeded: {1} | "
                   "kicked_off: {2} | "
                   "failed: {3} | "
                   "wont_run: {4} ").format(len(tasks_to_run), len(succeeded),
                                            len(started), len(failed),
                                            len(wont_run))
            logging.info(msg)

        executor.end()
        session.close()
        if failed:
            logging.error("------------------------------------------\n"
                          "Some tasks instances failed, "
                          "here's the list:\n{}".format(failed))
            sys.exit(1)
        logging.info("All done. Exiting.")