Example #1
    def test_terminate_task(self):
        """If a task instance's db state get deleted, it should fail"""
        TI = TaskInstance
        dag = self.dagbag.dags.get('test_utils')
        task = dag.task_dict.get('sleeps_forever')

        ti = TI(task=task, execution_date=DEFAULT_DATE)
        job = jobs.LocalTaskJob(
            task_instance=ti, ignore_ti_state=True, executor=SequentialExecutor())

        # Running task instance asynchronously
        p = multiprocessing.Process(target=job.run)
        p.start()
        sleep(5)
        settings.engine.dispose()
        session = settings.Session()
        ti.refresh_from_db(session=session)
        # making sure it's actually running
        self.assertEqual(State.RUNNING, ti.state)
        ti = session.query(TI).filter_by(
            dag_id=task.dag_id,
            task_id=task.task_id,
            execution_date=DEFAULT_DATE
        ).one()

        # deleting the instance should result in a failure
        session.delete(ti)
        session.commit()
        # waiting for the async task to finish
        p.join()

        # making sure that the task ended up as failed
        ti.refresh_from_db(session=session)
        self.assertEqual(State.FAILED, ti.state)
        session.close()
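
The test above runs the job in a child process and relies on the LocalTaskJob heartbeat to notice that the TaskInstance row was deleted and to mark the task failed. A minimal sketch of the direct-construction pattern it builds on (a sketch only, assuming a 1.x Airflow install where these imports resolve, an initialized metadata database, and a DAG with dag_id 'test_utils' containing task 'sleeps_forever' in the configured DAGS_FOLDER, as in the test):

from datetime import datetime

from airflow import jobs
from airflow.executors.sequential_executor import SequentialExecutor
from airflow.models import DagBag, TaskInstance

EXECUTION_DATE = datetime(2016, 1, 1)  # placeholder for the test's DEFAULT_DATE

dag = DagBag().get_dag('test_utils')    # parse the DAGS_FOLDER and pick the test DAG
task = dag.get_task('sleeps_forever')   # the long-running task used in the test
ti = TaskInstance(task=task, execution_date=EXECUTION_DATE)

# run the task under a LocalTaskJob, ignoring any previous TI state
job = jobs.LocalTaskJob(task_instance=ti, ignore_ti_state=True,
                        executor=SequentialExecutor())
job.run()  # blocks until the raw task process finishes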
Example #2
def _run_task_by_local_task_job(args, ti):
    """
    Run LocalTaskJob, which monitors the raw task execution process
    """
    run_job = jobs.LocalTaskJob(
        task_instance=ti,
        mark_success=args.mark_success,
        pickle_id=args.pickle,
        ignore_all_deps=args.ignore_all_dependencies,
        ignore_depends_on_past=args.ignore_depends_on_past,
        ignore_task_deps=args.ignore_dependencies,
        ignore_ti_state=args.force,
        pool=args.pool)
    run_job.run()
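
The helper above reads everything from an argparse namespace. A minimal sketch of the attributes it expects (attribute names taken from the call above; the values are placeholders, and `ti` is a TaskInstance built as in the other examples):

from argparse import Namespace

args = Namespace(
    mark_success=False,              # do not mark the task successful without running it
    pickle=None,                     # no pickled DAG id
    ignore_all_dependencies=False,
    ignore_depends_on_past=False,
    ignore_dependencies=False,
    force=False,                     # maps to ignore_ti_state
    pool=None,
)

_run_task_by_local_task_job(args, ti)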
Example #3
def _run(args, dag, ti):
    if args.local:
        run_job = jobs.LocalTaskJob(
            task_instance=ti,
            mark_success=args.mark_success,
            pickle_id=args.pickle,
            ignore_all_deps=args.ignore_all_dependencies,
            ignore_depends_on_past=args.ignore_depends_on_past,
            ignore_task_deps=args.ignore_dependencies,
            ignore_ti_state=args.force,
            pool=args.pool)
        run_job.run()
    elif args.raw:
        ti._run_raw_task(  # pylint: disable=protected-access
            mark_success=args.mark_success,
            job_id=args.job_id,
            pool=args.pool,
        )
    else:
        pickle_id = None
        if args.ship_dag:
            try:
                # Running remotely, so pickling the DAG
                with db.create_session() as session:
                    pickle = DagPickle(dag)
                    session.add(pickle)
                    pickle_id = pickle.id
                    # TODO: This should be written to a log
                    print('Pickled dag {dag} as pickle_id: {pickle_id}'.format(
                        dag=dag, pickle_id=pickle_id))
            except Exception as e:
                print('Could not pickle the DAG')
                print(e)
                raise e

        executor = get_default_executor()
        executor.start()
        print("Sending to executor.")
        executor.queue_task_instance(
            ti,
            mark_success=args.mark_success,
            pickle_id=pickle_id,
            ignore_all_deps=args.ignore_all_dependencies,
            ignore_depends_on_past=args.ignore_depends_on_past,
            ignore_task_deps=args.ignore_dependencies,
            ignore_ti_state=args.force,
            pool=args.pool)
        executor.heartbeat()
        executor.end()
Example #4
def run(args):

    utils.pessimistic_connection_handling()
    # Setting up logging
    log = os.path.expanduser(conf.get('core', 'BASE_LOG_FOLDER'))
    directory = log + "/{args.dag_id}/{args.task_id}".format(args=args)
    if not os.path.exists(directory):
        os.makedirs(directory)
    args.execution_date = dateutil.parser.parse(args.execution_date)
    iso = args.execution_date.isoformat()
    filename = "{directory}/{iso}".format(**locals())

    # store old log (to help with S3 appends)
    if os.path.exists(filename):
        with open(filename, 'r') as logfile:
            old_log = logfile.read()
    else:
        old_log = None

    subdir = None
    if args.subdir:
        subdir = args.subdir.replace(
            "DAGS_FOLDER", conf.get("core", "DAGS_FOLDER"))
        subdir = os.path.expanduser(subdir)
    logging.basicConfig(
        filename=filename,
        level=settings.LOGGING_LEVEL,
        format=settings.LOG_FORMAT)
    if not args.pickle:
        dagbag = DagBag(subdir)
        if args.dag_id not in dagbag.dags:
            msg = 'DAG [{0}] could not be found'.format(args.dag_id)
            logging.error(msg)
            raise AirflowException(msg)
        dag = dagbag.dags[args.dag_id]
        task = dag.get_task(task_id=args.task_id)
    else:
        session = settings.Session()
        logging.info('Loading pickle id {args.pickle}'.format(**locals()))
        dag_pickle = session.query(
            DagPickle).filter(DagPickle.id == args.pickle).first()
        if not dag_pickle:
            raise AirflowException("Who hid the pickle!? [missing pickle]")
        dag = dag_pickle.pickle
        task = dag.get_task(task_id=args.task_id)

    task_start_date = None
    if args.task_start_date:
        task_start_date = dateutil.parser.parse(args.task_start_date)
        task.start_date = task_start_date
    ti = TaskInstance(task, args.execution_date)

    if args.local:
        print("Logging into: " + filename)
        run_job = jobs.LocalTaskJob(
            task_instance=ti,
            mark_success=args.mark_success,
            force=args.force,
            pickle_id=args.pickle,
            task_start_date=task_start_date,
            ignore_dependencies=args.ignore_dependencies)
        run_job.run()
    elif args.raw:
        ti.run(
            mark_success=args.mark_success,
            force=args.force,
            ignore_dependencies=args.ignore_dependencies,
            job_id=args.job_id,
        )
    else:
        pickle_id = None
        if args.ship_dag:
            try:
                # Running remotely, so pickling the DAG
                session = settings.Session()
                pickle = DagPickle(dag)
                session.add(pickle)
                session.commit()
                pickle_id = pickle.id
                print((
                    'Pickled dag {dag} '
                    'as pickle_id:{pickle_id}').format(**locals()))
            except Exception as e:
                print('Could not pickle the DAG')
                print(e)
                raise e

        executor = DEFAULT_EXECUTOR
        executor.start()
        print("Sending to executor.")
        executor.queue_task_instance(
            ti,
            mark_success=args.mark_success,
            pickle_id=pickle_id,
            ignore_dependencies=args.ignore_dependencies,
            force=args.force)
        executor.heartbeat()
        executor.end()

    if conf.get('core', 'S3_LOG_FOLDER').startswith('s3:'):
        import boto
        s3_log = filename.replace(log, conf.get('core', 'S3_LOG_FOLDER'))
        # strip the scheme explicitly; str.lstrip('s3:/') would also eat leading
        # 's', '3', ':' or '/' characters from the bucket name
        bucket, key = s3_log.replace('s3://', '', 1).split('/', 1)
        if os.path.exists(filename):

            # get logs
            with open(filename, 'r') as logfile:
                new_log = logfile.read()

            # remove old logs (since they are already in S3)
            if old_log:
                new_log = new_log.replace(old_log, '')

            try:
                s3 = boto.connect_s3()
                s3_key = boto.s3.key.Key(s3.get_bucket(bucket), key)

                # append new logs to old S3 logs, if available
                if s3_key.exists():
                    old_s3_log = s3_key.get_contents_as_string().decode()
                    new_log = old_s3_log + '\n' + new_log

                # send log to S3
                s3_key.set_contents_from_string(new_log)
            except Exception:
                print('Could not send logs to S3.')
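
The S3 branch above appends only what was written during this run: it strips the pre-run local log from the freshly read local log, then prepends whatever is already stored in S3. A minimal sketch of that merge as a pure function (the function and parameter names are illustrative, not from the source):

def merge_logs(old_s3_log, old_local_log, new_local_log):
    # keep only what this run added to the local log file
    additions = new_local_log.replace(old_local_log, '') if old_local_log else new_local_log
    # append the additions to the existing S3 contents, if any
    return old_s3_log + '\n' + additions if old_s3_log else additions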
Example #5
def run(args):

    utils.pessimistic_connection_handling()
    # Setting up logging
    log = os.path.expanduser(conf.get('core', 'BASE_LOG_FOLDER'))
    directory = log + "/{args.dag_id}/{args.task_id}".format(args=args)
    if not os.path.exists(directory):
        os.makedirs(directory)
    args.execution_date = dateutil.parser.parse(args.execution_date)
    iso = args.execution_date.isoformat()
    filename = "{directory}/{iso}".format(**locals())
    subdir = None
    if args.subdir:
        subdir = args.subdir.replace(
            "DAGS_FOLDER", conf.get("core", "DAGS_FOLDER"))
        subdir = os.path.expanduser(subdir)
    logging.basicConfig(
        filename=filename,
        level=settings.LOGGING_LEVEL,
        format=settings.LOG_FORMAT)
    if not args.pickle:
        dagbag = DagBag(subdir)
        if args.dag_id not in dagbag.dags:
            msg = 'DAG [{0}] could not be found'.format(args.dag_id)
            logging.error(msg)
            raise AirflowException(msg)
        dag = dagbag.dags[args.dag_id]
        task = dag.get_task(task_id=args.task_id)
    else:
        session = settings.Session()
        logging.info('Loading pickle id {args.pickle}'.format(**locals()))
        dag_pickle = session.query(
            DagPickle).filter(DagPickle.id == args.pickle).first()
        if not dag_pickle:
            raise AirflowException("Who hid the pickle!? [missing pickle]")
        dag = dag_pickle.pickle
        task = dag.get_task(task_id=args.task_id)

    task_start_date = None
    if args.task_start_date:
        task_start_date = dateutil.parser.parse(args.task_start_date)
        task.start_date = task_start_date
    ti = TaskInstance(task, args.execution_date)

    if args.local:
        print("Logging into: " + filename)
        run_job = jobs.LocalTaskJob(
            task_instance=ti,
            mark_success=args.mark_success,
            force=args.force,
            pickle_id=args.pickle,
            task_start_date=task_start_date,
            ignore_dependencies=args.ignore_dependencies)
        run_job.run()
    elif args.raw:
        ti.run(
            mark_success=args.mark_success,
            force=args.force,
            ignore_dependencies=args.ignore_dependencies,
            job_id=args.job_id,
        )
    else:
        pickle_id = None
        if args.ship_dag:
            try:
                # Running remotely, so pickling the DAG
                session = settings.Session()
                pickle = DagPickle(dag)
                session.add(pickle)
                session.commit()
                pickle_id = pickle.id
                print((
                    'Pickled dag {dag} '
                    'as pickle_id:{pickle_id}').format(**locals()))
            except Exception as e:
                print('Could not pickle the DAG')
                print(e)
                raise e

        executor = DEFAULT_EXECUTOR
        executor.start()
        print("Sending to executor.")
        executor.queue_task_instance(
            ti,
            mark_success=args.mark_success,
            pickle_id=pickle_id,
            ignore_dependencies=args.ignore_dependencies,
            force=args.force)
        executor.heartbeat()
        executor.end()
Example #6
def run(args, dag=None):
    db_utils.pessimistic_connection_handling()
    if dag:
        args.dag_id = dag.dag_id

    # Setting up logging
    log_base = os.path.expanduser(conf.get('core', 'BASE_LOG_FOLDER'))
    directory = log_base + "/{args.dag_id}/{args.task_id}".format(args=args)
    if not os.path.exists(directory):
        os.makedirs(directory)
    iso = args.execution_date.isoformat()
    filename = "{directory}/{iso}".format(**locals())

    logging.root.handlers = []
    logging.basicConfig(
        filename=filename,
        level=settings.LOGGING_LEVEL,
        format=settings.LOG_FORMAT)

    if not args.pickle and not dag:
        dag = get_dag(args)
    elif not dag:
        session = settings.Session()
        logging.info('Loading pickle id {args.pickle}'.format(**locals()))
        dag_pickle = session.query(
            DagPickle).filter(DagPickle.id == args.pickle).first()
        if not dag_pickle:
            raise AirflowException("Who hid the pickle!? [missing pickle]")
        dag = dag_pickle.pickle
    task = dag.get_task(task_id=args.task_id)

    ti = TaskInstance(task, args.execution_date)

    if args.local:
        print("Logging into: " + filename)
        run_job = jobs.LocalTaskJob(
            task_instance=ti,
            mark_success=args.mark_success,
            force=args.force,
            pickle_id=args.pickle,
            ignore_dependencies=args.ignore_dependencies,
            ignore_depends_on_past=args.ignore_depends_on_past,
            pool=args.pool)
        run_job.run()
    elif args.raw:
        ti.run(
            mark_success=args.mark_success,
            force=args.force,
            ignore_dependencies=args.ignore_dependencies,
            ignore_depends_on_past=args.ignore_depends_on_past,
            job_id=args.job_id,
            pool=args.pool,
        )
    else:
        pickle_id = None
        if args.ship_dag:
            try:
                # Running remotely, so pickling the DAG
                session = settings.Session()
                pickle = DagPickle(dag)
                session.add(pickle)
                session.commit()
                pickle_id = pickle.id
                print((
                    'Pickled dag {dag} '
                    'as pickle_id:{pickle_id}').format(**locals()))
            except Exception as e:
                print('Could not pickle the DAG')
                print(e)
                raise e

        executor = DEFAULT_EXECUTOR
        executor.start()
        print("Sending to executor.")
        executor.queue_task_instance(
            ti,
            mark_success=args.mark_success,
            pickle_id=pickle_id,
            ignore_dependencies=args.ignore_dependencies,
            ignore_depends_on_past=args.ignore_depends_on_past,
            force=args.force,
            pool=args.pool)
        executor.heartbeat()
        executor.end()

    # Force the log to flush, and set the handler to go back to normal so we
    # don't continue logging to the task's log file. The flush is important
    # because we subsequently read from the log to insert into S3 or Google
    # cloud storage.
    logging.root.handlers[0].flush()
    logging.root.handlers = []

    # store logs remotely
    remote_base = conf.get('core', 'REMOTE_BASE_LOG_FOLDER')

    # deprecated as of March 2016
    if not remote_base and conf.get('core', 'S3_LOG_FOLDER'):
        warnings.warn(
            'The S3_LOG_FOLDER conf key has been replaced by '
            'REMOTE_BASE_LOG_FOLDER. Your conf still works but please '
            'update airflow.cfg to ensure future compatibility.',
            DeprecationWarning)
        remote_base = conf.get('core', 'S3_LOG_FOLDER')

    if os.path.exists(filename):
        # read log and remove old logs to get just the latest additions

        with open(filename, 'r') as logfile:
            log = logfile.read()

        remote_log_location = filename.replace(log_base, remote_base)
        # S3
        if remote_base.startswith('s3:/'):
            logging_utils.S3Log().write(log, remote_log_location)
        # GCS
        elif remote_base.startswith('gs:/'):
            logging_utils.GCSLog().write(
                log,
                remote_log_location,
                append=True)
        # Other
        elif remote_base and remote_base != 'None':
            logging.error(
                'Unsupported remote log location: {}'.format(remote_base))
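
The remote-logging tail above dispatches on the scheme prefix of REMOTE_BASE_LOG_FOLDER and derives the remote key by swapping the local log base for the remote base. A small standalone sketch of that dispatch (the function name is illustrative):

def resolve_remote_log(filename, log_base, remote_base):
    # e.g. /home/user/airflow/logs/dag/task/2016-01-01T00:00:00
    #   -> s3://bucket/logs/dag/task/2016-01-01T00:00:00
    location = filename.replace(log_base, remote_base)
    if remote_base.startswith('s3:/'):
        return 'S3', location
    if remote_base.startswith('gs:/'):
        return 'GCS', location
    return None, location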
Example #7
    def test_local_task_job(self):
        TI = models.TaskInstance
        ti = TI(task=self.runme_0, execution_date=DEFAULT_DATE)
        job = jobs.LocalTaskJob(task_instance=ti, force=True)
        job.run()
Example #8
def run(args):

    utils.pessimistic_connection_handling()

    # Setting up logging
    log_base = os.path.expanduser(configuration.get('core', 'BASE_LOG_FOLDER'))
    directory = log_base + "/{args.dag_id}/{args.task_id}".format(args=args)
    if not os.path.exists(directory):
        os.makedirs(directory)
    args.execution_date = dateutil.parser.parse(args.execution_date)
    iso = args.execution_date.isoformat()
    filename = "{directory}/{iso}".format(**locals())

    subdir = process_subdir(args.subdir)
    logging.root.handlers = []
    logging.basicConfig(filename=filename,
                        level=settings.LOGGING_LEVEL,
                        format=settings.LOG_FORMAT)

    if not args.pickle:
        dagbag = DagBag(subdir)
        if args.dag_id not in dagbag.dags:
            msg = 'DAG [{0}] could not be found in {1}'.format(
                args.dag_id, subdir)
            logging.error(msg)
            raise AirflowException(msg)
        dag = dagbag.dags[args.dag_id]
        task = dag.get_task(task_id=args.task_id)
    else:
        session = settings.Session()
        logging.info('Loading pickle id {args.pickle}'.format(**locals()))
        dag_pickle = session.query(DagPickle).filter(
            DagPickle.id == args.pickle).first()
        if not dag_pickle:
            raise AirflowException("Who hid the pickle!? [missing pickle]")
        dag = dag_pickle.pickle
        task = dag.get_task(task_id=args.task_id)

    task_start_date = None
    if args.task_start_date:
        task_start_date = dateutil.parser.parse(args.task_start_date)
        task.start_date = task_start_date
    ti = TaskInstance(task, args.execution_date)

    if args.local:
        print("Logging into: " + filename)
        run_job = jobs.LocalTaskJob(
            task_instance=ti,
            mark_success=args.mark_success,
            force=args.force,
            pickle_id=args.pickle,
            task_start_date=task_start_date,
            ignore_dependencies=args.ignore_dependencies,
            pool=args.pool)
        run_job.run()
    elif args.raw:
        ti.run(
            mark_success=args.mark_success,
            force=args.force,
            ignore_dependencies=args.ignore_dependencies,
            job_id=args.job_id,
            pool=args.pool,
        )
    else:
        pickle_id = None
        if args.ship_dag:
            try:
                # Running remotely, so pickling the DAG
                session = settings.Session()
                pickle = DagPickle(dag)
                session.add(pickle)
                session.commit()
                pickle_id = pickle.id
                print(('Pickled dag {dag} '
                       'as pickle_id:{pickle_id}').format(**locals()))
            except Exception as e:
                print('Could not pickle the DAG')
                print(e)
                raise e

        executor = DEFAULT_EXECUTOR
        executor.start()
        print("Sending to executor.")
        executor.queue_task_instance(
            ti,
            mark_success=args.mark_success,
            pickle_id=pickle_id,
            ignore_dependencies=args.ignore_dependencies,
            force=args.force,
            pool=args.pool)
        executor.heartbeat()
        executor.end()

    # store logs remotely
    remote_base = configuration.get('core', 'REMOTE_BASE_LOG_FOLDER')

    # deprecated as of March 2016
    if not remote_base and configuration.get('core', 'S3_LOG_FOLDER'):
        warnings.warn(
            'The S3_LOG_FOLDER configuration key has been replaced by '
            'REMOTE_BASE_LOG_FOLDER. Your configuration still works but please '
            'update airflow.cfg to ensure future compatibility.',
            DeprecationWarning)
        remote_base = configuration.get('core', 'S3_LOG_FOLDER')

    if os.path.exists(filename):
        # read log and remove old logs to get just the latest additions

        with open(filename, 'r') as logfile:
            log = logfile.read()

        remote_log_location = filename.replace(log_base, remote_base)
        # S3

        if remote_base.startswith('s3:/'):
            utils.S3Log().write(log, remote_log_location)
        # GCS
        elif remote_base.startswith('gs:/'):
            utils.GCSLog().write(log, remote_log_location, append=True)
        # Other
        elif remote_base:
            logging.error(
                'Unsupported remote log location: {}'.format(remote_base))