Example #1
    def __init__(
            self,
            dag_id=None,
            dag_ids=None,
            subdir=None,
            test_mode=False,
            refresh_dags_every=10,
            num_runs=None,
            do_pickle=False,
            *args, **kwargs):

        # for BaseJob compatibility
        self.dag_id = dag_id
        self.dag_ids = [dag_id] if dag_id else []
        if dag_ids:
            self.dag_ids.extend(dag_ids)

        self.subdir = subdir

        if test_mode:
            self.num_runs = 1
        else:
            self.num_runs = num_runs

        self.refresh_dags_every = refresh_dags_every
        self.do_pickle = do_pickle
        super(SchedulerJob, self).__init__(*args, **kwargs)

        self.heartrate = conf.getint('scheduler', 'SCHEDULER_HEARTBEAT_SEC')
        self.max_threads = min(conf.getint('scheduler', 'max_threads'), multiprocessing.cpu_count())
        if 'sqlite' in conf.get('core', 'sql_alchemy_conn'):
            if self.max_threads > 1:
                self.logger.error("Cannot use more than 1 thread when using sqlite. Setting max_threads to 1")
            self.max_threads = 1
Example #2
 def test_some_resources_specified(self):
     resources = Resources(cpus=0, disk=1)
     self.assertEqual(resources.cpus.qty, 0)
     self.assertEqual(resources.ram.qty,
                      configuration.getint('operators', 'default_ram'))
     self.assertEqual(resources.disk.qty, 1)
     self.assertEqual(resources.gpus.qty,
                      configuration.getint('operators', 'default_gpus'))
Example #3
 def test_no_resources_specified(self):
     resources = Resources()
     self.assertEqual(resources.cpus.qty,
                      configuration.getint('operators', 'default_cpus'))
     self.assertEqual(resources.ram.qty,
                      configuration.getint('operators', 'default_ram'))
     self.assertEqual(resources.disk.qty,
                      configuration.getint('operators', 'default_disk'))
     self.assertEqual(resources.gpus.qty,
                      configuration.getint('operators', 'default_gpus'))
Example #4
def configure_orm():
    global engine
    global Session
    engine_args = {}
    if "sqlite" not in SQL_ALCHEMY_CONN:
        # Engine args not supported by sqlite
        engine_args["pool_size"] = conf.getint("core", "SQL_ALCHEMY_POOL_SIZE")
        engine_args["pool_recycle"] = conf.getint("core", "SQL_ALCHEMY_POOL_RECYCLE")

    engine = create_engine(SQL_ALCHEMY_CONN, **engine_args)
    Session = scoped_session(sessionmaker(autocommit=False, autoflush=False, bind=engine))
Example #5
def configure_orm(disable_connection_pool=False):
    log.debug("Setting up DB connection pool (PID %s)" % os.getpid())
    global engine
    global Session
    engine_args = {}

    pool_connections = conf.getboolean('core', 'SQL_ALCHEMY_POOL_ENABLED')
    if disable_connection_pool or not pool_connections:
        engine_args['poolclass'] = NullPool
        log.debug("settings.configure_orm(): Using NullPool")
    elif 'sqlite' not in SQL_ALCHEMY_CONN:
        # Pool size engine args not supported by sqlite.
        # If no config value is defined for the pool size, select a reasonable value.
        # 0 means no limit, which could lead to exceeding the Database connection limit.
        try:
            pool_size = conf.getint('core', 'SQL_ALCHEMY_POOL_SIZE')
        except conf.AirflowConfigException:
            pool_size = 5

        # The DB server already has a value for wait_timeout (number of seconds after
        # which an idle sleeping connection should be killed). Since other DBs may
        # co-exist on the same server, SQLAlchemy should set its
        # pool_recycle to an equal or smaller value.
        try:
            pool_recycle = conf.getint('core', 'SQL_ALCHEMY_POOL_RECYCLE')
        except conf.AirflowConfigException:
            pool_recycle = 1800

        log.info("settings.configure_orm(): Using pool settings. pool_size={}, "
                 "pool_recycle={}, pid={}".format(pool_size, pool_recycle, os.getpid()))
        engine_args['pool_size'] = pool_size
        engine_args['pool_recycle'] = pool_recycle

    try:
        # Allow the user to specify an encoding for their DB otherwise default
        # to utf-8 so jobs & users with non-latin1 characters can still use
        # us.
        engine_args['encoding'] = conf.get('core', 'SQL_ENGINE_ENCODING')
    except conf.AirflowConfigException:
        engine_args['encoding'] = 'utf-8'
    # For Python2 we get back a newstr and need a str
    engine_args['encoding'] = engine_args['encoding'].__str__()

    engine = create_engine(SQL_ALCHEMY_CONN, **engine_args)
    reconnect_timeout = conf.getint('core', 'SQL_ALCHEMY_RECONNECT_TIMEOUT')
    setup_event_handlers(engine, reconnect_timeout)

    Session = scoped_session(
        sessionmaker(autocommit=False,
                     autoflush=False,
                     bind=engine,
                     expire_on_commit=False))
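The pattern above (also used in Example #16 below) reads each pool setting inside a try/except so a missing option falls back to a sensible default. A minimal, self-contained sketch of that fallback read, assuming a plain configparser-backed object rather than Airflow's own conf wrapper; the helper name getint_with_default is hypothetical:

from configparser import ConfigParser, NoOptionError, NoSectionError

def getint_with_default(conf, section, key, default):
    # Fall back to `default` when the option is missing or not a valid integer.
    try:
        return conf.getint(section, key)
    except (NoOptionError, NoSectionError, ValueError):
        return default

conf = ConfigParser()
conf.read_dict({'core': {'sql_alchemy_pool_size': '10'}})
pool_size = getint_with_default(conf, 'core', 'sql_alchemy_pool_size', 5)            # 10
pool_recycle = getint_with_default(conf, 'core', 'sql_alchemy_pool_recycle', 1800)   # 1800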
Example #6
    def __init__(self, cpus=None, ram=None, disk=None, gpus=None):
        if cpus is None:
            cpus = configuration.getint('operators', 'default_cpus')
        if ram is None:
            ram = configuration.getint('operators', 'default_ram')
        if disk is None:
            disk = configuration.getint('operators', 'default_disk')
        if gpus is None:
            gpus = configuration.getint('operators', 'default_gpus')

        self.cpus = CpuResource(cpus)
        self.ram = RamResource(ram)
        self.disk = DiskResource(disk)
        self.gpus = GpuResource(gpus)
Example #7
def configure_orm():
    global engine
    global Session
    engine_args = {}
    if 'sqlite' not in SQL_ALCHEMY_CONN:
        # Engine args not supported by sqlite
        engine_args['pool_size'] = conf.getint('core', 'SQL_ALCHEMY_POOL_SIZE')
        engine_args['pool_recycle'] = conf.getint('core',
                                                  'SQL_ALCHEMY_POOL_RECYCLE')
        #engine_args['echo'] = True

    engine = create_engine(SQL_ALCHEMY_CONN, **engine_args)
    Session = scoped_session(
        sessionmaker(autocommit=False, autoflush=False, bind=engine))
Example #8
def configure_orm(disable_connection_pool=False):
    global engine
    global Session
    engine_args = {}
    if disable_connection_pool:
        engine_args['poolclass'] = NullPool
    elif 'sqlite' not in SQL_ALCHEMY_CONN:
        # Engine args not supported by sqlite
        engine_args['pool_size'] = conf.getint('core', 'SQL_ALCHEMY_POOL_SIZE')
        engine_args['pool_recycle'] = conf.getint('core',
                                                  'SQL_ALCHEMY_POOL_RECYCLE')

    engine = create_engine(SQL_ALCHEMY_CONN, **engine_args)
    Session = scoped_session(
        sessionmaker(autocommit=False, autoflush=False, bind=engine))
Example #9
def health():
    """
    An endpoint helping check the health status of the Airflow instance,
    including metadatabase and scheduler.
    """
    session = settings.Session()
    BJ = jobs.BaseJob
    payload = {}
    scheduler_health_check_threshold = timedelta(seconds=conf.getint('scheduler',
                                                                     'scheduler_health_check_threshold'
                                                                     ))

    latest_scheduler_heartbeat = None
    payload['metadatabase'] = {'status': 'healthy'}
    try:
        latest_scheduler_heartbeat = session.query(func.max(BJ.latest_heartbeat)). \
            filter(BJ.state == 'running', BJ.job_type == 'SchedulerJob'). \
            scalar()
    except Exception:
        payload['metadatabase']['status'] = 'unhealthy'

    if not latest_scheduler_heartbeat:
        scheduler_status = 'unhealthy'
    else:
        if timezone.utcnow() - latest_scheduler_heartbeat <= scheduler_health_check_threshold:
            scheduler_status = 'healthy'
        else:
            scheduler_status = 'unhealthy'

    payload['scheduler'] = {'status': scheduler_status,
                            'latest_scheduler_heartbeat': str(latest_scheduler_heartbeat)}

    return wwwutils.json_response(payload)
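For context, a minimal client-side sketch of consuming the payload this view returns; it assumes the webserver is reachable at http://localhost:8080 and exposes the view at /health:

import requests

resp = requests.get("http://localhost:8080/health", timeout=10)
payload = resp.json()

# The payload mirrors the structure built above: one entry per component.
for component in ('metadatabase', 'scheduler'):
    print(component, payload[component]['status'])
if payload['scheduler']['status'] != 'healthy':
    print("Last scheduler heartbeat:",
          payload['scheduler']['latest_scheduler_heartbeat'])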
Example #10
def send_MIME_email(e_from, e_to, mime_msg, dryrun=False):
    log = LoggingMixin().log

    SMTP_HOST = configuration.get('smtp', 'SMTP_HOST')
    SMTP_PORT = configuration.getint('smtp', 'SMTP_PORT')
    SMTP_STARTTLS = configuration.getboolean('smtp', 'SMTP_STARTTLS')
    SMTP_SSL = configuration.getboolean('smtp', 'SMTP_SSL')
    SMTP_USER = None
    SMTP_PASSWORD = None

    try:
        SMTP_USER = configuration.get('smtp', 'SMTP_USER')
        SMTP_PASSWORD = configuration.get('smtp', 'SMTP_PASSWORD')
    except AirflowConfigException:
        log.debug("No user/password found for SMTP, so logging in with no authentication.")

    if not dryrun:
        s = smtplib.SMTP_SSL(SMTP_HOST, SMTP_PORT) if SMTP_SSL else smtplib.SMTP(SMTP_HOST, SMTP_PORT)
        if SMTP_STARTTLS:
            s.starttls()
        if SMTP_USER and SMTP_PASSWORD:
            s.login(SMTP_USER, SMTP_PASSWORD)
        log.info("Sent an alert email to %s", e_to)
        s.sendmail(e_from, e_to, mime_msg.as_string())
        s.quit()
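A minimal caller sketch for the function above (the addresses are placeholders; dryrun=True still performs the SMTP config lookups but skips connecting and sending):

from email.mime.text import MIMEText

msg = MIMEText("Task example_task failed", "plain")
msg['Subject'] = "Airflow alert"
msg['From'] = "airflow@example.com"
msg['To'] = "oncall@example.com"

send_MIME_email("airflow@example.com", ["oncall@example.com"], msg, dryrun=True)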
Example #11
    def __init__(
        self,
        dag_id=None,
        subdir=None,
        test_mode=False,
        refresh_dags_every=10,
        num_runs=None,
        do_pickle=False,
        *args,
        **kwargs
    ):

        self.dag_id = dag_id
        self.subdir = subdir

        if test_mode:
            self.num_runs = 1
        else:
            self.num_runs = num_runs

        self.refresh_dags_every = refresh_dags_every
        self.do_pickle = do_pickle
        super(SchedulerJob, self).__init__(*args, **kwargs)

        self.heartrate = configuration.getint("scheduler", "SCHEDULER_HEARTBEAT_SEC")
Example #12
def renew_from_kt():
    # The config is specified in seconds. But we ask for that same amount in
    # minutes to give ourselves a large renewal buffer.
    renewal_lifetime = "%sm" % configuration.getint('kerberos', 'reinit_frequency')
    principal = configuration.get('kerberos', 'principal').replace("_HOST", socket.getfqdn())
    cmdv = [configuration.get('kerberos', 'kinit_path'),
            "-r", renewal_lifetime,
            "-k",  # host ticket
            "-t", configuration.get('kerberos', 'keytab'),   # specify keytab
            "-c", configuration.get('kerberos', 'ccache'),   # specify credentials cache
            principal]
    LOG.info("Reinitting kerberos from keytab: " +
             " ".join(cmdv))

    subp = subprocess.Popen(cmdv,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE,
                            close_fds=True,
                            bufsize=-1)
    subp.wait()
    if subp.returncode != 0:
        LOG.error("Couldn't reinit from keytab! `kinit' exited with %s.\n%s\n%s" % (
            subp.returncode,
            "\n".join(subp.stdout.readlines()),
            "\n".join(subp.stderr.readlines())))
        sys.exit(subp.returncode)

    global NEED_KRB181_WORKAROUND
    if NEED_KRB181_WORKAROUND is None:
        NEED_KRB181_WORKAROUND = detect_conf_var()
    if NEED_KRB181_WORKAROUND:
        # (From: HUE-640). Kerberos clocks have second-level granularity. Make sure we
        # renew the ticket after the initial valid time.
        time.sleep(1.5)
        perform_krb181_workaround()
Example #13
def run():
    if configuration.get('kerberos', 'keytab') is None:
        log.debug("Keytab renewer not starting, no keytab configured")
        sys.exit(0)

    while True:
        renew_from_kt()
        time.sleep(configuration.getint('kerberos', 'reinit_frequency'))
Example #14
def configure_orm(disable_connection_pool=False):
    log.debug("Setting up DB connection pool (PID %s)" % os.getpid())
    global engine
    global Session
    engine_args = {}

    pool_connections = conf.getboolean('core', 'SQL_ALCHEMY_POOL_ENABLED')
    if disable_connection_pool or not pool_connections:
        engine_args['poolclass'] = NullPool
    elif 'sqlite' not in SQL_ALCHEMY_CONN:
        # Engine args not supported by sqlite
        engine_args['pool_size'] = conf.getint('core', 'SQL_ALCHEMY_POOL_SIZE')
        engine_args['pool_recycle'] = conf.getint('core',
                                                  'SQL_ALCHEMY_POOL_RECYCLE')

    engine = create_engine(SQL_ALCHEMY_CONN, **engine_args)
    reconnect_timeout = conf.getint('core', 'SQL_ALCHEMY_RECONNECT_TIMEOUT')
    setup_event_handlers(engine, reconnect_timeout)

    Session = scoped_session(
        sessionmaker(autocommit=False, autoflush=False, bind=engine))
Example #15
    def _read(self, ti, try_number, metadata=None):
        """
        Template method that contains custom logic of reading
        logs given the try_number.
        :param ti: task instance record
        :param try_number: current try_number to read log from
        :param metadata: log metadata,
                         can be used for streaming log reading and auto-tailing.
        :return: log message as a string and metadata.
        """
        # Task instance here might be different from task instance when
        # initializing the handler. Thus explicitly getting log location
        # is needed to get correct log path.
        log_relative_path = self._render_filename(ti, try_number)
        location = os.path.join(self.local_base, log_relative_path)

        log = ""

        if os.path.exists(location):
            try:
                with open(location) as f:
                    log += "*** Reading local file: {}\n".format(location)
                    log += "".join(f.readlines())
            except Exception as e:
                log = "*** Failed to load local log file: {}\n".format(location)
                log += "*** {}\n".format(str(e))
        else:
            url = os.path.join(
                "http://{ti.hostname}:{worker_log_server_port}/log", log_relative_path
            ).format(
                ti=ti,
                worker_log_server_port=conf.get('celery', 'WORKER_LOG_SERVER_PORT')
            )
            log += "*** Log file does not exist: {}\n".format(location)
            log += "*** Fetching from: {}\n".format(url)
            try:
                timeout = None  # No timeout
                try:
                    timeout = conf.getint('webserver', 'log_fetch_timeout_sec')
                except (AirflowConfigException, ValueError):
                    pass

                response = requests.get(url, timeout=timeout)

                # Check if the resource was properly fetched
                response.raise_for_status()

                log += '\n' + response.text
            except Exception as e:
                log += "*** Failed to fetch log file from worker. {}\n".format(str(e))

        return log, {'end_of_log': True}
Example #16
def configure_orm(disable_connection_pool=False):
    log.debug("Setting up DB connection pool (PID %s)" % os.getpid())
    global engine
    global Session
    engine_args = {}

    pool_connections = conf.getboolean('core', 'SQL_ALCHEMY_POOL_ENABLED')
    if disable_connection_pool or not pool_connections:
        engine_args['poolclass'] = NullPool
        log.debug("settings.configure_orm(): Using NullPool")
    elif 'sqlite' not in SQL_ALCHEMY_CONN:
        # Engine args not supported by sqlite.
        # If no config value is defined for the pool size, select a reasonable value.
        # 0 means no limit, which could lead to exceeding the Database connection limit.
        try:
            pool_size = conf.getint('core', 'SQL_ALCHEMY_POOL_SIZE')
        except conf.AirflowConfigException:
            pool_size = 5

        # The DB server already has a value for wait_timeout (number of seconds after
        # which an idle sleeping connection should be killed). Since other DBs may
        # co-exist on the same server, SQLAlchemy should set its
        # pool_recycle to an equal or smaller value.
        try:
            pool_recycle = conf.getint('core', 'SQL_ALCHEMY_POOL_RECYCLE')
        except conf.AirflowConfigException:
            pool_recycle = 1800

        log.info("setting.configure_orm(): Using pool settings. pool_size={}, "
                 "pool_recycle={}".format(pool_size, pool_recycle))
        engine_args['pool_size'] = pool_size
        engine_args['pool_recycle'] = pool_recycle

    engine = create_engine(SQL_ALCHEMY_CONN, **engine_args)
    reconnect_timeout = conf.getint('core', 'SQL_ALCHEMY_RECONNECT_TIMEOUT')
    setup_event_handlers(engine, reconnect_timeout)

    Session = scoped_session(
        sessionmaker(autocommit=False, autoflush=False, bind=engine))
Example #17
    def start_refresh(gunicorn_master_proc):
        batch_size = conf.getint('webserver', 'worker_refresh_batch_size')
        logging.debug('%s doing a refresh of %s workers',
            state, batch_size)
        sys.stdout.flush()
        sys.stderr.flush()

        excess = 0
        for _ in range(batch_size):
            gunicorn_master_proc.send_signal(signal.SIGTTIN)
            excess += 1
            wait_until_true(lambda: num_workers_expected + excess ==
                get_num_workers_running(gunicorn_master_proc))
Example #18
    def __init__(self):
        super(CeleryExecutor, self).__init__()

        # Celery doesn't support querying the state of multiple tasks in parallel
        # (which can become a bottleneck on bigger clusters) so we use
        # a multiprocessing pool to speed this up.
        # How many worker processes are created for checking celery task state.
        self._sync_parallelism = configuration.getint('celery', 'SYNC_PARALLELISM')
        if self._sync_parallelism == 0:
            self._sync_parallelism = max(1, cpu_count() - 1)

        self._sync_pool = None
        self.tasks = {}
        self.last_state = {}
Example #19
def send_MIME_email(e_from, e_to, mime_msg, dryrun=False):
    SMTP_HOST = configuration.get('smtp', 'SMTP_HOST')
    SMTP_PORT = configuration.getint('smtp', 'SMTP_PORT')
    SMTP_USER = configuration.get('smtp', 'SMTP_USER')
    SMTP_PASSWORD = configuration.get('smtp', 'SMTP_PASSWORD')
    SMTP_STARTTLS = configuration.getboolean('smtp', 'SMTP_STARTTLS')

    if not dryrun:
        s = smtplib.SMTP(SMTP_HOST, SMTP_PORT)
        if SMTP_STARTTLS:
            s.starttls()
        if SMTP_USER and SMTP_PASSWORD:
            s.login(SMTP_USER, SMTP_PASSWORD)
        logging.info("Sent an alert email to " + str(e_to))
        s.sendmail(e_from, e_to, mime_msg.as_string())
        s.quit()
Example #20
def get_date_time_num_runs_dag_runs_form_data(request, session, dag):
    dttm = request.args.get('execution_date')
    if dttm:
        dttm = pendulum.parse(dttm)
    else:
        dttm = dag.latest_execution_date or timezone.utcnow()

    base_date = request.args.get('base_date')
    if base_date:
        base_date = timezone.parse(base_date)
    else:
        # The DateTimeField widget truncates milliseconds and would lose
        # the first dag run. Round to next second.
        base_date = (dttm + timedelta(seconds=1)).replace(microsecond=0)

    default_dag_run = conf.getint('webserver',
                                  'default_dag_run_display_number')
    num_runs = request.args.get('num_runs')
    num_runs = int(num_runs) if num_runs else default_dag_run

    DR = models.DagRun
    drs = (session.query(DR).filter(
        DR.dag_id == dag.dag_id, DR.execution_date <= base_date).order_by(
            desc(DR.execution_date)).limit(num_runs).all())
    dr_choices = []
    dr_state = None
    for dr in drs:
        dr_choices.append((dr.execution_date.isoformat(), dr.run_id))
        if dttm == dr.execution_date:
            dr_state = dr.state

    # Happens if base_date was changed and the selected dag run is not in result
    if not dr_state and drs:
        dr = drs[0]
        dttm = dr.execution_date
        dr_state = dr.state

    return {
        'dttm': dttm,
        'base_date': base_date,
        'num_runs': num_runs,
        'execution_date': dttm.isoformat(),
        'dr_choices': dr_choices,
        'dr_state': dr_state,
    }
Example #21
def renew_from_kt():
    # The config is specified in seconds. But we ask for that same amount in
    # minutes to give ourselves a large renewal buffer.
    renewal_lifetime = "%sm" % configuration.getint('kerberos',
                                                    'reinit_frequency')
    principal = configuration.get('kerberos',
                                  'principal').replace("_HOST",
                                                       socket.getfqdn())

    cmdv = [
        configuration.get('kerberos', 'kinit_path'),
        "-r",
        renewal_lifetime,
        "-k",  # host ticket
        "-t",
        configuration.get('kerberos', 'keytab'),  # specify keytab
        "-c",
        configuration.get('kerberos', 'ccache'),  # specify credentials cache
        principal
    ]
    log.info("Reinitting kerberos from keytab: " + " ".join(cmdv))

    subp = subprocess.Popen(cmdv,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE,
                            close_fds=True,
                            bufsize=-1,
                            universal_newlines=True)
    subp.wait()
    if subp.returncode != 0:
        log.error(
            "Couldn't reinit from keytab! `kinit' exited with %s.\n%s\n%s" %
            (subp.returncode, "\n".join(subp.stdout.readlines()), "\n".join(
                subp.stderr.readlines())))
        sys.exit(subp.returncode)

    global NEED_KRB181_WORKAROUND
    if NEED_KRB181_WORKAROUND is None:
        NEED_KRB181_WORKAROUND = detect_conf_var()
    if NEED_KRB181_WORKAROUND:
        # (From: HUE-640). Kerberos clocks have second-level granularity. Make sure we
        # renew the ticket after the initial valid time.
        time.sleep(1.5)
        perform_krb181_workaround()
Example #22
    def _read(self, ti, try_number):
        """
        Template method that contains custom logic of reading
        logs given the try_number.
        :param ti: task instance record
        :param try_number: current try_number to read log from
        :return: log message as a string
        """
        # Task instance here might be different from task instance when
        # initializing the handler. Thus explicitly getting log location
        # is needed to get correct log path.
        log_relative_path = self._render_filename(ti, try_number + 1)
        loc = os.path.join(self.local_base, log_relative_path)
        log = ""

        if os.path.exists(loc):
            try:
                with open(loc) as f:
                    log += "*** Reading local log.\n" + "".join(f.readlines())
            except Exception as e:
                log = "*** Failed to load local log file: {}. {}\n".format(loc, str(e))
        else:
            url = os.path.join("http://{ti.hostname}:{worker_log_server_port}/log",
                               log_relative_path).format(
                ti=ti,
                worker_log_server_port=conf.get('celery', 'WORKER_LOG_SERVER_PORT'))
            log += "*** Log file isn't local.\n"
            log += "*** Fetching here: {url}\n".format(**locals())
            try:
                import requests
                timeout = None  # No timeout
                try:
                    timeout = conf.getint('webserver', 'log_fetch_timeout_sec')
                except (AirflowConfigException, ValueError):
                    pass

                response = requests.get(url, timeout=timeout)
                response.raise_for_status()
                log += '\n' + response.text
            except Exception as e:
                log += "*** Failed to fetch log file from worker. {}\n".format(str(e))

        return log
Example #23
 def test_with_execution_date_parameter_only(self):
     """
     Tests graph view with execution_date URL parameter.
     Scenario: click link from dag runs view.
     Should only show dag runs older than execution_date in the drop down.
     Should select the particular dag run.
     Should set base date to execution date.
     """
     response = self.app.get(self.endpoint + '&execution_date={}'.format(
         self.runs[1].execution_date.isoformat()))
     self.test.assertEqual(response.status_code, 200)
     data = response.data.decode('utf-8')
     self.assertBaseDateAndNumRuns(
         self.runs[1].execution_date,
         configuration.getint('webserver',
                              'default_dag_run_display_number'), data)
     self.assertRunIsNotInDropdown(self.runs[0], data)
     self.assertRunIsSelected(self.runs[1], data)
     self.assertRunIsInDropdownNotSelected(self.runs[2], data)
     self.assertRunIsInDropdownNotSelected(self.runs[3], data)
Example #24
    def __init__(
            self,
            dag_id=None,
            subdir=None,
            test_mode=False,
            refresh_dags_every=10,
            num_runs=None,
            do_pickle=False,
            *args, **kwargs):

        self.dag_id = dag_id
        self.subdir = subdir
        if test_mode:
            self.num_runs = 1
        else:
            self.num_runs = num_runs
        self.refresh_dags_every = refresh_dags_every
        self.do_pickle = do_pickle
        super(SchedulerJob, self).__init__(*args, **kwargs)

        self.heartrate = configuration.getint('scheduler', 'SCHEDULER_HEARTBEAT_SEC')
Example #25
 def test_with_execution_date_parameter_only(self):
     """
     Tests graph view with execution_date URL parameter.
     Scenario: click link from dag runs view.
     Should only show dag runs older than execution_date in the drop down.
     Should select the particular dag run.
     Should set base date to execution date.
     """
     response = self.app.get(
         self.endpoint + '&execution_date={}'.format(
             self.runs[1].execution_date.isoformat())
     )
     self.test.assertEqual(response.status_code, 200)
     data = response.data.decode('utf-8')
     self.assertBaseDateAndNumRuns(
         self.runs[1].execution_date,
         configuration.getint('webserver', 'default_dag_run_display_number'),
         data)
     self.assertRunIsNotInDropdown(self.runs[0], data)
     self.assertRunIsSelected(self.runs[1], data)
     self.assertRunIsInDropdownNotSelected(self.runs[2], data)
     self.assertRunIsInDropdownNotSelected(self.runs[3], data)
Example #26
class CeleryConfig(object):
    CELERY_ACCEPT_CONTENT = ['json', 'pickle']
    CELERY_EVENT_SERIALIZER = 'json'
    CELERY_RESULT_SERIALIZER = 'pickle'
    CELERY_TASK_SERIALIZER = 'pickle'
    CELERYD_PREFETCH_MULTIPLIER = 1
    CELERY_ACKS_LATE = True
    BROKER_URL = configuration.get('celery', 'BROKER_URL')
    CELERY_RESULT_BACKEND = configuration.get('celery',
                                              'CELERY_RESULT_BACKEND')
    CELERYD_CONCURRENCY = configuration.getint('celery', 'CELERYD_CONCURRENCY')
    CELERY_DEFAULT_QUEUE = DEFAULT_QUEUE
    CELERY_DEFAULT_EXCHANGE = DEFAULT_QUEUE

    celery_ssl_active = False
    try:
        celery_ssl_active = configuration.getboolean('celery',
                                                     'CELERY_SSL_ACTIVE')
    except AirflowConfigException as e:
        log = LoggingMixin().logger
        log.warning("Celery Executor will run without SSL")

    try:
        if celery_ssl_active:
            BROKER_USE_SSL = {
                'keyfile': configuration.get('celery', 'CELERY_SSL_KEY'),
                'certfile': configuration.get('celery', 'CELERY_SSL_CERT'),
                'ca_certs': configuration.get('celery', 'CELERY_SSL_CACERT'),
                'cert_reqs': ssl.CERT_REQUIRED
            }
    except AirflowConfigException as e:
        raise AirflowException(
            'AirflowConfigException: CELERY_SSL_ACTIVE is True, please ensure CELERY_SSL_KEY, '
            'CELERY_SSL_CERT and CELERY_SSL_CACERT are set')
    except Exception as e:
        raise AirflowException(
            'Exception: There was an unknown Celery SSL Error.  Please ensure you want to use '
            'SSL and/or have all necessary certs and key.')
Example #27
class CeleryConfig(object):
    CELERY_ACCEPT_CONTENT = ['json', 'pickle']
    CELERY_EVENT_SERIALIZER = 'json'
    CELERY_RESULT_SERIALIZER = 'pickle'
    CELERY_TASK_SERIALIZER = 'pickle'
    CELERYD_PREFETCH_MULTIPLIER = 1
    CELERY_ACKS_LATE = True
    BROKER_URL = configuration.get('celery', 'BROKER_URL')
    CELERY_RESULT_BACKEND = configuration.get('celery', 'CELERY_RESULT_BACKEND')
    CELERYD_CONCURRENCY = configuration.getint('celery', 'CELERYD_CONCURRENCY')
    CELERY_DEFAULT_QUEUE = DEFAULT_QUEUE
    CELERY_DEFAULT_EXCHANGE = DEFAULT_QUEUE
    if configuration.getboolean('celery', 'CELERY_SSL_ACTIVE'):
        try:
            BROKER_USE_SSL = {'keyfile': configuration.get('celery', 'CELERY_SSL_KEY'),
                              'certfile': configuration.get('celery', 'CELERY_SSL_CERT'),
                              'ca_certs': configuration.get('celery', 'CELERY_SSL_CACERT'),
                              'cert_reqs': ssl.CERT_REQUIRED}
        except ValueError:
            raise AirflowException('ValueError: CELERY_SSL_ACTIVE is True, please ensure CELERY_SSL_KEY, '
                                   'CELERY_SSL_CERT and CELERY_SSL_CACERT are set')
        except Exception as e:
            raise AirflowException('Exception: There was an unknown Celery SSL Error.  Please ensure you want to use '
                                   'SSL and/or have all necessary certs and key.')
Example #28
    def kill_zombies(self, session=None):
        """
        Fail zombie tasks, which are tasks that haven't
        had a heartbeat for too long, in the current DagBag.

        :param session: DB session.
        :type session: sqlalchemy.orm.session.Session
        """
        # Avoid circular import
        from airflow.models.taskinstance import TaskInstance as TI
        from airflow.jobs import LocalTaskJob as LJ

        # How many seconds do we wait for tasks to heartbeat before marking them as zombies.
        zombie_threshold_secs = (configuration.getint(
            'scheduler', 'scheduler_zombie_task_threshold'))
        limit_dttm = timezone.utcnow() - timedelta(
            seconds=zombie_threshold_secs)
        self.log.debug("Failing jobs without heartbeat after %s", limit_dttm)

        tis = (session.query(TI).join(LJ, TI.job_id == LJ.id).filter(
            TI.state == State.RUNNING).filter(TI.dag_id.in_(self.dags)).filter(
                or_(
                    LJ.state != State.RUNNING,
                    LJ.latest_heartbeat < limit_dttm,
                )).all())
        for ti in tis:
            self.log.info(
                "Detected zombie job with dag_id %s, task_id %s, and execution date %s",
                ti.dag_id, ti.task_id, ti.execution_date.isoformat())
            ti.test_mode = configuration.getboolean('core', 'unit_test_mode')
            ti.task = self.dags[ti.dag_id].get_task(ti.task_id)
            ti.handle_failure("{} detected as zombie".format(ti), ti.test_mode,
                              ti.get_template_context())
            self.log.info('Marked zombie job %s as %s', ti, ti.state)
            Stats.incr('zombies_killed')
        session.commit()
Example #29
    def _execute(self):
        self.task_runner = get_task_runner(self)

        def signal_handler(signum, frame):
            """Setting kill signal handler"""
            self.log.error("Received SIGTERM. Terminating subprocesses")
            self.on_kill()
            raise AirflowException("LocalTaskJob received SIGTERM signal")

        signal.signal(signal.SIGTERM, signal_handler)

        if not self.task_instance._check_and_change_state_before_execution(
                mark_success=self.mark_success,
                ignore_all_deps=self.ignore_all_deps,
                ignore_depends_on_past=self.ignore_depends_on_past,
                ignore_task_deps=self.ignore_task_deps,
                ignore_ti_state=self.ignore_ti_state,
                job_id=self.id,
                pool=self.pool):
            self.log.info("Task is not able to be run")
            return

        try:
            self.task_runner.start()

            last_heartbeat_time = time.time()
            heartbeat_time_limit = conf.getint(
                'scheduler', 'scheduler_zombie_task_threshold')
            while True:
                # Monitor the task to see if it's done
                return_code = self.task_runner.return_code()
                if return_code is not None:
                    self.log.info("Task exited with return code %s",
                                  return_code)
                    return

                # Periodically heartbeat so that the scheduler doesn't think this
                # is a zombie
                try:
                    self.heartbeat()
                    last_heartbeat_time = time.time()
                except OperationalError:
                    Stats.incr('local_task_job_heartbeat_failure', 1, 1)
                    self.log.exception(
                        "Exception while trying to heartbeat! Sleeping for %s seconds",
                        self.heartrate)
                    time.sleep(self.heartrate)

                # If it's been too long since we've heartbeat, then it's possible that
                # the scheduler rescheduled this task, so kill launched processes.
                time_since_last_heartbeat = time.time() - last_heartbeat_time
                if time_since_last_heartbeat > heartbeat_time_limit:
                    Stats.incr('local_task_job_prolonged_heartbeat_failure', 1,
                               1)
                    self.log.error("Heartbeat time limited exceeded!")
                    raise AirflowException(
                        "Time since last heartbeat({:.2f}s) "
                        "exceeded limit ({}s).".format(
                            time_since_last_heartbeat, heartbeat_time_limit))
        finally:
            self.on_kill()
Example #30
    def __init__(self):
        configuration_dict = configuration.as_dict(display_sensitive=True)
        self.core_configuration = configuration_dict['core']
        self.kube_secrets = configuration_dict.get('kubernetes_secrets', {})
        self.kube_env_vars = configuration_dict.get(
            'kubernetes_environment_variables', {})
        self.env_from_configmap_ref = configuration.get(
            self.kubernetes_section, 'env_from_configmap_ref')
        self.env_from_secret_ref = configuration.get(self.kubernetes_section,
                                                     'env_from_secret_ref')
        self.airflow_home = settings.AIRFLOW_HOME
        self.dags_folder = configuration.get(self.core_section, 'dags_folder')
        self.parallelism = configuration.getint(self.core_section,
                                                'parallelism')
        self.worker_container_repository = configuration.get(
            self.kubernetes_section, 'worker_container_repository')
        self.worker_container_tag = configuration.get(self.kubernetes_section,
                                                      'worker_container_tag')
        self.kube_image = '{}:{}'.format(self.worker_container_repository,
                                         self.worker_container_tag)
        self.kube_image_pull_policy = configuration.get(
            self.kubernetes_section, "worker_container_image_pull_policy")
        self.kube_node_selectors = configuration_dict.get(
            'kubernetes_node_selectors', {})
        self.kube_annotations = configuration_dict.get(
            'kubernetes_annotations', {})
        self.kube_labels = configuration_dict.get('kubernetes_labels', {})
        self.delete_worker_pods = conf.getboolean(self.kubernetes_section,
                                                  'delete_worker_pods')
        self.worker_pods_creation_batch_size = conf.getint(
            self.kubernetes_section, 'worker_pods_creation_batch_size')
        self.worker_service_account_name = conf.get(
            self.kubernetes_section, 'worker_service_account_name')
        self.image_pull_secrets = conf.get(self.kubernetes_section,
                                           'image_pull_secrets')

        # NOTE: the user can build the dags into the docker image directly;
        # set this to True if so
        self.dags_in_image = conf.getboolean(self.kubernetes_section,
                                             'dags_in_image')

        # Run as user for pod security context
        self.worker_run_as_user = self._get_security_context_val('run_as_user')
        self.worker_fs_group = self._get_security_context_val('fs_group')

        # NOTE: `git_repo` and `git_branch` must be specified together as a pair
        # The http URL of the git repository to clone from
        self.git_repo = conf.get(self.kubernetes_section, 'git_repo')
        # The branch of the repository to be checked out
        self.git_branch = conf.get(self.kubernetes_section, 'git_branch')
        # Optionally, the directory in the git repository containing the dags
        self.git_subpath = conf.get(self.kubernetes_section, 'git_subpath')
        # Optionally, the root directory for git operations
        self.git_sync_root = conf.get(self.kubernetes_section, 'git_sync_root')
        # Optionally, the name at which to publish the checked-out files under --root
        self.git_sync_dest = conf.get(self.kubernetes_section, 'git_sync_dest')
        # Optionally, if git_dags_folder_mount_point is set the worker will use
        # {git_dags_folder_mount_point}/{git_sync_dest}/{git_subpath} as dags_folder
        self.git_dags_folder_mount_point = conf.get(
            self.kubernetes_section, 'git_dags_folder_mount_point')

        # Optionally a user may supply a (`git_user` AND `git_password`) OR
        # (`git_ssh_key_secret_name` AND `git_ssh_key_secret_key`) for private repositories
        self.git_user = conf.get(self.kubernetes_section, 'git_user')
        self.git_password = conf.get(self.kubernetes_section, 'git_password')
        self.git_ssh_key_secret_name = conf.get(self.kubernetes_section,
                                                'git_ssh_key_secret_name')
        self.git_ssh_known_hosts_configmap_name = conf.get(
            self.kubernetes_section, 'git_ssh_known_hosts_configmap_name')

        # NOTE: The user may optionally use a volume claim to mount a PV containing
        # DAGs directly
        self.dags_volume_claim = conf.get(self.kubernetes_section,
                                          'dags_volume_claim')

        # This prop may optionally be set for PV Claims and is used to write logs
        self.logs_volume_claim = conf.get(self.kubernetes_section,
                                          'logs_volume_claim')

        # This prop may optionally be set for PV Claims and is used to locate DAGs
        # on a SubPath
        self.dags_volume_subpath = conf.get(self.kubernetes_section,
                                            'dags_volume_subpath')

        # This prop may optionally be set for PV Claims and is used to locate logs
        # on a SubPath
        self.logs_volume_subpath = conf.get(self.kubernetes_section,
                                            'logs_volume_subpath')

        # Optionally, hostPath volume containing DAGs
        self.dags_volume_host = conf.get(self.kubernetes_section,
                                         'dags_volume_host')

        # Optionally, write logs to a hostPath Volume
        self.logs_volume_host = conf.get(self.kubernetes_section,
                                         'logs_volume_host')

        # This prop may optionally be set for PV Claims and is used to write logs
        self.base_log_folder = configuration.get(self.core_section,
                                                 'base_log_folder')

        # The Kubernetes Namespace in which the Scheduler and Webserver reside. Note
        # that if your
        # cluster has RBAC enabled, your scheduler may need service account permissions to
        # create, watch, get, and delete pods in this namespace.
        self.kube_namespace = conf.get(self.kubernetes_section, 'namespace')
        # The Kubernetes Namespace in which pods will be created by the executor. Note
        # that if your
        # cluster has RBAC enabled, your workers may need service account permissions to
        # interact with cluster components.
        self.executor_namespace = conf.get(self.kubernetes_section,
                                           'namespace')
        # Task secrets managed by KubernetesExecutor.
        self.gcp_service_account_keys = conf.get(self.kubernetes_section,
                                                 'gcp_service_account_keys')

        # If the user is using the git-sync container to clone their repository via git,
        # allow them to specify repository, tag, and pod name for the init container.
        self.git_sync_container_repository = conf.get(
            self.kubernetes_section, 'git_sync_container_repository')

        self.git_sync_container_tag = conf.get(self.kubernetes_section,
                                               'git_sync_container_tag')
        self.git_sync_container = '{}:{}'.format(
            self.git_sync_container_repository, self.git_sync_container_tag)

        self.git_sync_init_container_name = conf.get(
            self.kubernetes_section, 'git_sync_init_container_name')

        self.git_sync_run_as_user = self._get_security_context_val(
            'git_sync_run_as_user')

        # The worker pod may optionally have a valid Airflow config loaded via a
        # configmap
        self.airflow_configmap = conf.get(self.kubernetes_section,
                                          'airflow_configmap')

        affinity_json = conf.get(self.kubernetes_section, 'affinity')
        if affinity_json:
            self.kube_affinity = json.loads(affinity_json)
        else:
            self.kube_affinity = None

        tolerations_json = conf.get(self.kubernetes_section, 'tolerations')
        if tolerations_json:
            self.kube_tolerations = json.loads(tolerations_json)
        else:
            self.kube_tolerations = None

        kube_client_request_args = conf.get(self.kubernetes_section,
                                            'kube_client_request_args')
        if kube_client_request_args:
            self.kube_client_request_args = json.loads(
                kube_client_request_args)
            if self.kube_client_request_args['_request_timeout'] and \
                    isinstance(self.kube_client_request_args['_request_timeout'], list):
                self.kube_client_request_args['_request_timeout'] = \
                    tuple(self.kube_client_request_args['_request_timeout'])
        else:
            self.kube_client_request_args = {}
        self._validate()
Example #31
    def start(self):
        self.task_queue = Queue()
        self.result_queue = Queue()
        framework = mesos_pb2.FrameworkInfo()
        framework.user = ''

        if not configuration.get('mesos', 'MASTER'):
            self.log.error("Expecting mesos master URL for mesos executor")
            raise AirflowException(
                "mesos.master not provided for mesos executor")

        master = configuration.get('mesos', 'MASTER')

        framework.name = get_framework_name()

        if not configuration.get('mesos', 'TASK_CPU'):
            task_cpu = 1
        else:
            task_cpu = configuration.getint('mesos', 'TASK_CPU')

        if not configuration.get('mesos', 'TASK_MEMORY'):
            task_memory = 256
        else:
            task_memory = configuration.getint('mesos', 'TASK_MEMORY')

        if configuration.getboolean('mesos', 'CHECKPOINT'):
            framework.checkpoint = True

            if configuration.get('mesos', 'FAILOVER_TIMEOUT'):
                # Import here to work around a circular import error
                from airflow.models import Connection

                # Query the database to get the ID of the Mesos Framework, if available.
                conn_id = FRAMEWORK_CONNID_PREFIX + framework.name
                session = Session()
                connection = session.query(Connection).filter_by(
                    conn_id=conn_id).first()
                if connection is not None:
                    # Set the Framework ID to let the scheduler reconnect with running tasks.
                    framework.id.value = connection.extra

                framework.failover_timeout = configuration.getint(
                    'mesos', 'FAILOVER_TIMEOUT')
        else:
            framework.checkpoint = False

        self.log.info(
            'MesosFramework master : %s, name : %s, cpu : %s, mem : %s, checkpoint : %s',
            master, framework.name, str(task_cpu), str(task_memory),
            str(framework.checkpoint))

        implicit_acknowledgements = 1

        if configuration.getboolean('mesos', 'AUTHENTICATE'):
            if not configuration.get('mesos', 'DEFAULT_PRINCIPAL'):
                self.log.error(
                    "Expecting authentication principal in the environment")
                raise AirflowException(
                    "mesos.default_principal not provided in authenticated mode"
                )
            if not configuration.get('mesos', 'DEFAULT_SECRET'):
                self.log.error(
                    "Expecting authentication secret in the environment")
                raise AirflowException(
                    "mesos.default_secret not provided in authenticated mode")

            credential = mesos_pb2.Credential()
            credential.principal = configuration.get('mesos',
                                                     'DEFAULT_PRINCIPAL')
            credential.secret = configuration.get('mesos', 'DEFAULT_SECRET')

            framework.principal = credential.principal

            driver = mesos.native.MesosSchedulerDriver(
                AirflowMesosScheduler(self.task_queue, self.result_queue,
                                      task_cpu, task_memory), framework,
                master, implicit_acknowledgements, credential)
        else:
            framework.principal = 'Airflow'
            driver = mesos.native.MesosSchedulerDriver(
                AirflowMesosScheduler(self.task_queue, self.result_queue,
                                      task_cpu, task_memory), framework,
                master, implicit_acknowledgements)

        self.mesos_driver = driver
        self.mesos_driver.start()
Example #32
 def is_alive(self):
     return ((datetime.now() - self.latest_heartbeat).seconds <
             (conf.getint('scheduler', 'JOB_HEARTBEAT_SEC') * 2.1))
Example #33
 def is_alive(self):
     return (
         (datetime.now() - self.latest_heartbeat).seconds <
         (conf.getint('scheduler', 'JOB_HEARTBEAT_SEC') * 2.1)
     )
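A quick worked check of the threshold in Examples #32 and #33, assuming the default job_heartbeat_sec of 5 (verify against your airflow.cfg): the job counts as alive only while (datetime.now() - self.latest_heartbeat).seconds < 5 * 2.1 = 10.5, and since .seconds is truncated to whole seconds, that means the last heartbeat is at most 10 seconds old.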
Example #34
def auto_conn():
    logging.info('Creating connections, pool and sql path')

    session = Session()

    def create_new_conn(session, attributes):
        new_conn = models.Connection()
        new_conn.conn_id = attributes.get("conn_id")
        new_conn.conn_type = attributes.get('conn_type')
        new_conn.host = attributes.get('host')
        new_conn.port = attributes.get('port')
        new_conn.schema = attributes.get('schema')
        new_conn.login = attributes.get('login')
        new_conn.extra = attributes.get('extra')
        # new_conn.password = attributes.get('password')
        new_conn.set_password(attributes.get('password'))

        session.add(new_conn)
        session.commit()

    create_new_conn(session,
                    {"conn_id": configuration.get('s3' , 's3_conn_id'),
                     "conn_type": configuration.get('s3' , 's3_conn_type'),
                     "extra":configuration.get('s3', 's3_extra')

                     })

    create_new_conn(session,
                    {"conn_id": configuration.get('mysql', 'mysql_conn_id'),
                     "conn_type": configuration.get('mysql', 'mysql_conn_type'),
                     "schema":configuration.get('mysql', 'mysql_schema'),
                     "host": configuration.get('mysql', 'mysql_host'),
                     "port": configuration.getint('mysql', 'mysql_port'),
                     "login": configuration.get('mysql', 'mysql_login'),
                     "password": configuration.get('mysql', 'mysql_password')})

    create_new_conn(session,
                    {"conn_id": configuration.get('postgresql', 'postgresql_conn_id'),
                     "conn_type": configuration.get('postgresql', 'postgresql_conn_type'),
                     "host": configuration.get('postgresql', 'postgresql_host'),
                     "port": configuration.getint('postgresql', 'postgresql_port'),
                     "schema": configuration.get('postgresql', 'postgresql_schema'),
                     "login": configuration.get('postgresql', 'postgresql_login'),
                     "password": configuration.get('postgresql', 'postgresql_password')})

    create_new_conn(session,
                    {"conn_id": "airflow_connection",
                     "conn_type": configuration.get('mysql', 'mysql_conn_type'),
                     "schema": "airflow",
                     "host": "localhost",
                     "login": "******",
                     "password": "******"})

    create_new_conn(session, {
        "conn_id": "mongo_connection",
        "conn_type": "mongo",
        "host": "13.126.117.239",
        "port": "27017",
        "login": "******",
        "password": "******"
    })

    session.close()
Example #35
    @classmethod
    def decr(cls, stat, count=1, rate=1):
        pass

    @classmethod
    def gauge(cls, stat, value, rate=1, delta=False):
        pass


Stats = DummyStatsLogger

if conf.getboolean('scheduler', 'statsd_on'):
    from statsd import StatsClient
    statsd = StatsClient(host=conf.get('scheduler', 'statsd_host'),
                         port=conf.getint('scheduler', 'statsd_port'),
                         prefix=conf.get('scheduler', 'statsd_prefix'))
    Stats = statsd
else:
    Stats = DummyStatsLogger

HEADER = """\
  ____________       _____________
 ____    |__( )_________  __/__  /________      __
____  /| |_  /__  ___/_  /_ __  /_  __ \_ | /| / /
___  ___ |  / _  /   _  __/ _  / / /_/ /_ |/ |/ /
 _/_/  |_/_/  /_/    /_/    /_/  \____/____/|__/
 """

BASE_LOG_URL = '/admin/airflow/log'
AIRFLOW_HOME = os.path.expanduser(conf.get('core', 'AIRFLOW_HOME'))
Example #36
    def incr(cls, stat, count=1, rate=1):
        pass
    @classmethod
    def decr(cls, stat, count=1, rate=1):
        pass
    @classmethod
    def gauge(cls, stat, value, rate=1, delta=False):
        pass

Stats = DummyStatsLogger

if conf.getboolean('scheduler', 'statsd_on'):
    from statsd import StatsClient
    statsd = StatsClient(
        host=conf.get('scheduler', 'statsd_host'),
        port=conf.getint('scheduler', 'statsd_port'),
        prefix=conf.get('scheduler', 'statsd_prefix'))
    Stats = statsd
else:
    Stats = DummyStatsLogger



HEADER = """\
  ____________       _____________
 ____    |__( )_________  __/__  /________      __
____  /| |_  /__  ___/_  /_ __  /_  __ \_ | /| / /
___  ___ |  / _  /   _  __/ _  / / /_/ /_ |/ |/ /
 _/_/  |_/_/  /_/    /_/    /_/  \____/____/|__/
 """
Example #37
    def __init__(self):
        configuration_dict = configuration.as_dict(display_sensitive=True)
        self.core_configuration = configuration_dict['core']
        self.kube_secrets = configuration_dict.get('kubernetes_secrets', {})
        self.kube_env_vars = configuration_dict.get('kubernetes_environment_variables', {})
        self.airflow_home = configuration.get(self.core_section, 'airflow_home')
        self.dags_folder = configuration.get(self.core_section, 'dags_folder')
        self.parallelism = configuration.getint(self.core_section, 'PARALLELISM')
        self.worker_container_repository = configuration.get(
            self.kubernetes_section, 'worker_container_repository')
        self.worker_container_tag = configuration.get(
            self.kubernetes_section, 'worker_container_tag')
        self.kube_image = '{}:{}'.format(
            self.worker_container_repository, self.worker_container_tag)
        self.kube_image_pull_policy = configuration.get(
            self.kubernetes_section, "worker_container_image_pull_policy"
        )
        self.kube_node_selectors = configuration_dict.get('kubernetes_node_selectors', {})
        self.kube_annotations = configuration_dict.get('kubernetes_annotations', {})
        self.delete_worker_pods = conf.getboolean(
            self.kubernetes_section, 'delete_worker_pods')
        self.worker_pods_creation_batch_size = conf.getint(
            self.kubernetes_section, 'worker_pods_creation_batch_size')
        self.worker_service_account_name = conf.get(
            self.kubernetes_section, 'worker_service_account_name')
        self.image_pull_secrets = conf.get(self.kubernetes_section, 'image_pull_secrets')

        # NOTE: the user can build the dags into the docker image directly;
        # set this to True if so
        self.dags_in_image = conf.getboolean(self.kubernetes_section, 'dags_in_image')

        # NOTE: `git_repo` and `git_branch` must be specified together as a pair
        # The http URL of the git repository to clone from
        self.git_repo = conf.get(self.kubernetes_section, 'git_repo')
        # The branch of the repository to be checked out
        self.git_branch = conf.get(self.kubernetes_section, 'git_branch')
        # Optionally, the directory in the git repository containing the dags
        self.git_subpath = conf.get(self.kubernetes_section, 'git_subpath')
        # Optionally, the root directory for git operations
        self.git_sync_root = conf.get(self.kubernetes_section, 'git_sync_root')
        # Optionally, the name at which to publish the checked-out files under --root
        self.git_sync_dest = conf.get(self.kubernetes_section, 'git_sync_dest')
        # Optionally, if git_dags_folder_mount_point is set the worker will use
        # {git_dags_folder_mount_point}/{git_sync_dest}/{git_subpath} as dags_folder
        self.git_dags_folder_mount_point = conf.get(self.kubernetes_section,
                                                    'git_dags_folder_mount_point')

        # Optionally a user may supply a (`git_user` AND `git_password`) OR
        # (`git_ssh_key_secret_name` AND `git_ssh_key_secret_key`) for private repositories
        self.git_user = conf.get(self.kubernetes_section, 'git_user')
        self.git_password = conf.get(self.kubernetes_section, 'git_password')
        self.git_ssh_key_secret_name = conf.get(self.kubernetes_section, 'git_ssh_key_secret_name')
        self.git_ssh_known_hosts_configmap_name = conf.get(self.kubernetes_section,
                                                           'git_ssh_known_hosts_configmap_name')

        # NOTE: The user may optionally use a volume claim to mount a PV containing
        # DAGs directly
        self.dags_volume_claim = conf.get(self.kubernetes_section, 'dags_volume_claim')

        # This prop may optionally be set for PV Claims and is used to write logs
        self.logs_volume_claim = conf.get(self.kubernetes_section, 'logs_volume_claim')

        # This prop may optionally be set for PV Claims and is used to locate DAGs
        # on a SubPath
        self.dags_volume_subpath = conf.get(
            self.kubernetes_section, 'dags_volume_subpath')

        # This prop may optionally be set for PV Claims and is used to locate logs
        # on a SubPath
        self.logs_volume_subpath = conf.get(
            self.kubernetes_section, 'logs_volume_subpath')

        # Optionally, hostPath volume containing DAGs
        self.dags_volume_host = conf.get(self.kubernetes_section, 'dags_volume_host')

        # Optionally, write logs to a hostPath Volume
        self.logs_volume_host = conf.get(self.kubernetes_section, 'logs_volume_host')

        # This prop may optionally be set for PV Claims and is used to write logs
        self.base_log_folder = configuration.get(self.core_section, 'base_log_folder')

        # The Kubernetes Namespace in which the Scheduler and Webserver reside.
        # Note that if your cluster has RBAC enabled, your scheduler may need
        # service account permissions to create, watch, get, and delete pods in
        # this namespace.
        self.kube_namespace = conf.get(self.kubernetes_section, 'namespace')
        # The Kubernetes Namespace in which pods will be created by the executor.
        # Note that if your cluster has RBAC enabled, your workers may need
        # service account permissions to interact with cluster components.
        self.executor_namespace = conf.get(self.kubernetes_section, 'namespace')
        # Task secrets managed by KubernetesExecutor.
        self.gcp_service_account_keys = conf.get(self.kubernetes_section,
                                                 'gcp_service_account_keys')

        # If the user is using the git-sync container to clone their repository via git,
        # allow them to specify repository, tag, and pod name for the init container.
        self.git_sync_container_repository = conf.get(
            self.kubernetes_section, 'git_sync_container_repository')

        self.git_sync_container_tag = conf.get(
            self.kubernetes_section, 'git_sync_container_tag')
        self.git_sync_container = '{}:{}'.format(
            self.git_sync_container_repository, self.git_sync_container_tag)

        self.git_sync_init_container_name = conf.get(
            self.kubernetes_section, 'git_sync_init_container_name')

        # The worker pod may optionally have a valid Airflow config loaded via a
        # configmap
        self.airflow_configmap = conf.get(self.kubernetes_section, 'airflow_configmap')

        affinity_json = conf.get(self.kubernetes_section, 'affinity')
        if affinity_json:
            self.kube_affinity = json.loads(affinity_json)
        else:
            self.kube_affinity = None

        tolerations_json = conf.get(self.kubernetes_section, 'tolerations')
        if tolerations_json:
            self.kube_tolerations = json.loads(tolerations_json)
        else:
            self.kube_tolerations = None

        self._validate()
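
For reference, a minimal sketch of what the `affinity` and `tolerations` settings parsed above might contain. The JSON values here are hypothetical examples, not defaults shipped with Airflow; the point is only that `json.loads` turns the raw config strings into the dict/list structures stored in `kube_affinity` / `kube_tolerations`.

import json

# Hypothetical [kubernetes] affinity and tolerations values, as they might appear in airflow.cfg
affinity_json = """
{"nodeAffinity": {"requiredDuringSchedulingIgnoredDuringExecution":
    {"nodeSelectorTerms": [{"matchExpressions":
        [{"key": "airflow", "operator": "In", "values": ["workers"]}]}]}}}
"""
tolerations_json = """
[{"key": "dedicated", "operator": "Equal", "value": "airflow", "effect": "NoSchedule"}]
"""

# json.loads yields the structures that end up in kube_affinity / kube_tolerations
kube_affinity = json.loads(affinity_json)
kube_tolerations = json.loads(tolerations_json)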
Beispiel #38
0
    def process_file(self, filepath, only_if_updated=True):
        """
        Given a path to a python module or zip file, this method imports
        the module and looks for DAG objects within it.
        """
        found_dags = []
        # if the source file no longer exists in the DB or in the filesystem,
        # return an empty list
        # todo: raise exception?
        if filepath is None or not os.path.isfile(filepath):
            return found_dags

        try:
            # This failed before in what may have been a git sync
            # race condition
            file_last_changed = datetime.fromtimestamp(
                os.path.getmtime(filepath))
            if only_if_updated \
                    and filepath in self.dagbag.file_last_changed \
                    and file_last_changed == self.dagbag.file_last_changed[filepath]:
                return found_dags

        except Exception as e:
            self.log.exception(e)
            return found_dags

        mods = []
        if not zipfile.is_zipfile(filepath):
            if self.safe_mode and os.path.isfile(filepath):
                with open(filepath, 'rb') as f:
                    content = f.read()
                    if not all([s in content for s in (b'DAG', b'airflow')]):
                        self.dagbag.file_last_changed[
                            filepath] = file_last_changed
                        return found_dags

            self.log.debug("Importing %s", filepath)
            org_mod_name, _ = os.path.splitext(os.path.split(filepath)[-1])
            mod_name = ('unusual_prefix_' +
                        hashlib.sha1(filepath.encode('utf-8')).hexdigest() +
                        '_' + org_mod_name)

            if mod_name in sys.modules:
                del sys.modules[mod_name]

            with timeout(configuration.getint('core',
                                              "DAGBAG_IMPORT_TIMEOUT")):
                try:
                    m = imp.load_source(mod_name, filepath)
                    mods.append(m)
                except Exception as e:
                    self.log.exception("Failed to import: %s", filepath)
                    self.dagbag.import_errors[filepath] = str(e)
                    self.dagbag.file_last_changed[filepath] = file_last_changed

        else:
            zip_file = zipfile.ZipFile(filepath)
            for mod in zip_file.infolist():
                head, _ = os.path.split(mod.filename)
                mod_name, ext = os.path.splitext(mod.filename)
                if not head and (ext == '.py' or ext == '.pyc'):
                    if mod_name == '__init__':
                        self.log.warning("Found __init__.%s at root of %s",
                                         ext, filepath)
                    if self.safe_mode:
                        with zip_file.open(mod.filename) as zf:
                            self.log.debug("Reading %s from %s", mod.filename,
                                           filepath)
                            content = zf.read()
                            if not all(
                                [s in content for s in (b'DAG', b'airflow')]):
                                self.dagbag.file_last_changed[filepath] = (
                                    file_last_changed)
                                # todo: create ignore list
                                return found_dags

                    if mod_name in sys.modules:
                        del sys.modules[mod_name]

                    try:
                        sys.path.insert(0, filepath)
                        m = importlib.import_module(mod_name)
                        mods.append(m)
                    except Exception as e:
                        self.log.exception("Failed to import: %s", filepath)
                        self.dagbag.import_errors[filepath] = str(e)
                        self.dagbag.file_last_changed[
                            filepath] = file_last_changed

        for m in mods:
            for dag in list(m.__dict__.values()):
                if isinstance(dag, airflow.models.DAG):
                    if not dag.full_filepath:
                        dag.full_filepath = filepath
                        if dag.fileloc != filepath:
                            dag.fileloc = filepath
                    try:
                        dag.is_subdag = False
                        self.dagbag.bag_dag(dag, parent_dag=dag, root_dag=dag)
                        found_dags.append(dag)
                        found_dags += dag.subdags
                    except AirflowDagCycleException as cycle_exception:
                        self.log.exception("Failed to bag_dag: %s",
                                           dag.full_filepath)
                        self.dagbag.import_errors[dag.full_filepath] = \
                            str(cycle_exception)
                        self.dagbag.file_last_changed[dag.full_filepath] = \
                            file_last_changed

        self.dagbag.file_last_changed[filepath] = file_last_changed
        return found_dags
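
A small, self-contained sketch of the `unusual_prefix_` module-name scheme used above, with a hypothetical DAG file path. Hashing the full path keeps two DAG files with the same basename from colliding in `sys.modules`.

import hashlib
import os

filepath = '/usr/local/airflow/dags/example_dag.py'  # hypothetical path
org_mod_name, _ = os.path.splitext(os.path.split(filepath)[-1])
mod_name = ('unusual_prefix_' +
            hashlib.sha1(filepath.encode('utf-8')).hexdigest() +
            '_' + org_mod_name)
print(mod_name)  # unusual_prefix_<40-char sha1 of the path>_example_dag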
Beispiel #39
0
from sqlalchemy import Column, Integer, String, DateTime, func, Index, or_
from sqlalchemy.orm.session import make_transient

from airflow import executors, models, settings, utils
from airflow import configuration
from airflow.utils import AirflowException, State, LoggingMixin

Base = models.Base
ID_LEN = models.ID_LEN

# Setting up a statsd client if needed
statsd = None
if configuration.getboolean('scheduler', 'statsd_on'):
    from statsd import StatsClient
    statsd = StatsClient(host=configuration.get('scheduler', 'statsd_host'),
                         port=configuration.getint('scheduler', 'statsd_port'),
                         prefix=configuration.get('scheduler',
                                                  'statsd_prefix'))


class BaseJob(Base, LoggingMixin):
    """
    Abstract class to be derived for jobs. Jobs are processing items with state
    and duration that aren't task instances. For instance a BackfillJob is
    a collection of task instance runs, but should have its own state, start
    and end time.
    """

    __tablename__ = "job"

    id = Column(Integer, primary_key=True)
Beispiel #40
0
from datetime import datetime
import getpass
import imp
import os
import re
import signal
import subprocess
import sys
import warnings

from airflow import configuration
from airflow.exceptions import AirflowException

# When killing processes, time to wait after issuing a SIGTERM before issuing a
# SIGKILL.
DEFAULT_TIME_TO_WAIT_AFTER_SIGTERM = configuration.getint('core', 'KILLED_TASK_CLEANUP_TIME')


def validate_key(k, max_length=250):
    if not isinstance(k, basestring):
        raise TypeError("The key has to be a string")
    elif len(k) > max_length:
        raise AirflowException(
            "The key has to be less than {0} characters".format(max_length))
    elif not re.match(r'^[A-Za-z0-9_\-\.]+$', k):
        raise AirflowException(
            "The key ({k}) has to be made of alphanumeric characters, dashes, "
            "dots and underscores exclusively".format(**locals()))
    else:
        return True
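
A rough usage sketch for validate_key. Note that the snippet above assumes Python 2 (or a builtins backport), where `basestring` is defined.

validate_key('my_task_id')   # returns True
validate_key('a' * 300)      # raises AirflowException: key longer than 250 characters
validate_key('bad key!')     # raises AirflowException: illegal characters
validate_key(123)            # raises TypeError: the key has to be a string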
Beispiel #41
0
import imp
import os
import re
import signal
import subprocess
import sys
import warnings

from jinja2 import Template

from airflow import configuration
from airflow.exceptions import AirflowException

# When killing processes, time to wait after issuing a SIGTERM before issuing a
# SIGKILL.
DEFAULT_TIME_TO_WAIT_AFTER_SIGTERM = configuration.getint(
    'core', 'KILLED_TASK_CLEANUP_TIME')


def validate_key(k, max_length=250):
    if not isinstance(k, basestring):
        raise TypeError("The key has to be a string")
    elif len(k) > max_length:
        raise AirflowException(
            "The key has to be less than {0} characters".format(max_length))
    elif not re.match(r'^[A-Za-z0-9_\-\.]+$', k):
        raise AirflowException(
            "The key ({k}) has to be made of alphanumeric characters, dashes, "
            "dots and underscores exclusively".format(**locals()))
    else:
        return True
Beispiel #42
0
    def __init__(self):
        configuration_dict = configuration.as_dict(display_sensitive=True)
        self.core_configuration = configuration_dict['core']
        self.kube_secrets = configuration_dict.get('kubernetes_secrets', {})
        self.airflow_home = configuration.get(self.core_section, 'airflow_home')
        self.dags_folder = configuration.get(self.core_section, 'dags_folder')
        self.parallelism = configuration.getint(self.core_section, 'PARALLELISM')
        self.worker_container_repository = configuration.get(
            self.kubernetes_section, 'worker_container_repository')
        self.worker_container_tag = configuration.get(
            self.kubernetes_section, 'worker_container_tag')
        self.worker_dags_folder = configuration.get(
            self.kubernetes_section, 'worker_dags_folder')
        self.kube_image = '{}:{}'.format(
            self.worker_container_repository, self.worker_container_tag)
        self.kube_image_pull_policy = configuration.get(
            self.kubernetes_section, "worker_container_image_pull_policy"
        )
        self.kube_node_selectors = configuration_dict.get('kubernetes_node_selectors', {})
        self.delete_worker_pods = conf.getboolean(
            self.kubernetes_section, 'delete_worker_pods')

        self.worker_service_account_name = conf.get(
            self.kubernetes_section, 'worker_service_account_name')
        self.image_pull_secrets = conf.get(self.kubernetes_section, 'image_pull_secrets')

        # NOTE: `git_repo` and `git_branch` must be specified together as a pair
        # The http URL of the git repository to clone from
        self.git_repo = conf.get(self.kubernetes_section, 'git_repo')
        # The branch of the repository to be checked out
        self.git_branch = conf.get(self.kubernetes_section, 'git_branch')
        # Optionally, the directory in the git repository containing the dags
        self.git_subpath = conf.get(self.kubernetes_section, 'git_subpath')

        # Optionally a user may supply a `git_user` and `git_password` for private
        # repositories
        self.git_user = conf.get(self.kubernetes_section, 'git_user')
        self.git_password = conf.get(self.kubernetes_section, 'git_password')

        # NOTE: The user may optionally use a volume claim to mount a PV containing
        # DAGs directly
        self.dags_volume_claim = conf.get(self.kubernetes_section, 'dags_volume_claim')

        # This prop may optionally be set for PV Claims and is used to write logs
        self.logs_volume_claim = conf.get(self.kubernetes_section, 'logs_volume_claim')

        # This prop may optionally be set for PV Claims and is used to locate DAGs
        # on a SubPath
        self.dags_volume_subpath = conf.get(
            self.kubernetes_section, 'dags_volume_subpath')

        # This prop may optionally be set for PV Claims and is used to locate logs
        # on a SubPath
        self.logs_volume_subpath = conf.get(
            self.kubernetes_section, 'logs_volume_subpath')

        # This prop may optionally be set for PV Claims and is used to write logs
        self.base_log_folder = configuration.get(self.core_section, 'base_log_folder')

        # The Kubernetes Namespace in which the Scheduler and Webserver reside.
        # Note that if your cluster has RBAC enabled, your scheduler may need
        # service account permissions to create, watch, get, and delete pods in
        # this namespace.
        self.kube_namespace = conf.get(self.kubernetes_section, 'namespace')
        # The Kubernetes Namespace in which pods will be created by the executor.
        # Note that if your cluster has RBAC enabled, your workers may need
        # service account permissions to interact with cluster components.
        self.executor_namespace = conf.get(self.kubernetes_section, 'namespace')
        # Task secrets managed by KubernetesExecutor.
        self.gcp_service_account_keys = conf.get(self.kubernetes_section,
                                                 'gcp_service_account_keys')

        # If the user is using the git-sync container to clone their repository via git,
        # allow them to specify repository, tag, and pod name for the init container.
        self.git_sync_container_repository = conf.get(
            self.kubernetes_section, 'git_sync_container_repository')

        self.git_sync_container_tag = conf.get(
            self.kubernetes_section, 'git_sync_container_tag')
        self.git_sync_container = '{}:{}'.format(
            self.git_sync_container_repository, self.git_sync_container_tag)

        self.git_sync_init_container_name = conf.get(
            self.kubernetes_section, 'git_sync_init_container_name')

        # The worker pod may optionally have a valid Airflow config loaded via a
        # configmap
        self.airflow_configmap = conf.get(self.kubernetes_section, 'airflow_configmap')

        self._validate()
Beispiel #43
0
from airflow.www.forms import (DateTimeForm, DateTimeWithNumRunsForm,
                               DateTimeWithNumRunsWithDagRunsForm)
from airflow.www.validators import GreaterEqualThan

QUERY_LIMIT = 100000
CHART_LIMIT = 200000

UTF8_READER = codecs.getreader('utf-8')

dagbag = models.DagBag(settings.DAGS_FOLDER)

# logout_user = airflow.login.logout_user

FILTER_BY_OWNER = False

PAGE_SIZE = conf.getint('webserver', 'page_size')

if conf.getboolean('webserver', 'FILTER_BY_OWNER'):
    # filter_by_owner if authentication is enabled and filter_by_owner is true
    FILTER_BY_OWNER = not current_app.config['LOGIN_DISABLED']


def dag_link(v, c, m, p):
    if m.dag_id is None:
        return Markup()

    kwargs = {'dag_id': m.dag_id}

    # This is called with various objects, TIs, (ORM) DAG - some have this,
    # some don't
    if hasattr(m, 'execution_date'):
Beispiel #44
0
    def prioritize_queued(self, session, executor, dagbag):
        # Prioritizing queued task instances

        pools = {p.pool: p for p in session.query(models.Pool).all()}
        TI = models.TaskInstance
        queued_tis = (
            session.query(TI)
            .filter(TI.state == State.QUEUED)
            .all()
        )
        self.logger.info(
            "Prioritizing {} queued jobs".format(len(queued_tis)))
        session.expunge_all()
        d = defaultdict(list)
        for ti in queued_tis:
            if ti.dag_id not in dagbag.dags:
                self.logger.info(
                    "DAG no longer in dagbag, deleting {}".format(ti))
                session.delete(ti)
                session.commit()
            elif not dagbag.dags[ti.dag_id].has_task(ti.task_id):
                self.logger.info(
                    "Task no longer exists, deleting {}".format(ti))
                session.delete(ti)
                session.commit()
            else:
                d[ti.pool].append(ti)

        dag_blacklist = set(dagbag.paused_dags())
        for pool, tis in list(d.items()):
            if not pool:
                # Arbitrary:
                # If queued outside of a pool, trigger no more than
                # non_pooled_task_slot_count per run
                open_slots = conf.getint('core', 'non_pooled_task_slot_count')
            else:
                open_slots = pools[pool].open_slots(session=session)

            queue_size = len(tis)
            self.logger.info("Pool {pool} has {open_slots} slots, {queue_size} "
                             "task instances in queue".format(**locals()))
            if open_slots <= 0:
                continue
            tis = sorted(
                tis, key=lambda ti: (-ti.priority_weight, ti.start_date))
            for ti in tis:
                if open_slots <= 0:
                    continue
                task = None
                try:
                    task = dagbag.dags[ti.dag_id].get_task(ti.task_id)
                except:
                    self.logger.error("Queued task {} seems gone".format(ti))
                    session.delete(ti)
                    session.commit()
                    continue

                if not task:
                    continue

                ti.task = task

                # picklin'
                dag = dagbag.dags[ti.dag_id]
                pickle_id = None
                if self.do_pickle and self.executor.__class__ not in (
                        executors.LocalExecutor,
                        executors.SequentialExecutor):
                    self.logger.info("Pickling DAG {}".format(dag))
                    pickle_id = dag.pickle(session).id

                if dag.dag_id in dag_blacklist:
                    continue
                if dag.concurrency_reached:
                    dag_blacklist.add(dag.dag_id)
                    continue
                if ti.are_dependencies_met():
                    executor.queue_task_instance(ti, pickle_id=pickle_id)
                    open_slots -= 1
                else:
                    session.delete(ti)
                    session.commit()
                    continue
                ti.task = task

                session.commit()
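
A minimal illustration (with made-up values) of the ordering applied to queued task instances above: higher priority_weight first, ties broken by the earlier start_date.

from collections import namedtuple
from datetime import datetime

TI = namedtuple('TI', 'task_id priority_weight start_date')
tis = [TI('a', 1, datetime(2019, 1, 2)),
       TI('b', 5, datetime(2019, 1, 3)),
       TI('c', 5, datetime(2019, 1, 1))]
ordered = sorted(tis, key=lambda ti: (-ti.priority_weight, ti.start_date))
# ordered task_ids: ['c', 'b', 'a']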
Beispiel #45
0
def restart_workers(gunicorn_master_proc, num_workers_expected):
    """
    Runs forever, monitoring the child processes of @gunicorn_master_proc and
    restarting workers occasionally.

    Each iteration of the loop traverses one edge of this state transition
    diagram, where each state (node) represents
    [ num_ready_workers_running / num_workers_running ]. We expect most time to
    be spent in [n / n]. `bs` is the setting webserver.worker_refresh_batch_size.

    The horizontal transition at ? happens after the new worker parses all the
    dags (so it could take a while!)

       V ────────────────────────────────────────────────────────────────────────┐
    [n / n] ──TTIN──> [ [n, n+bs) / n + bs ]  ────?───> [n + bs / n + bs] ──TTOU─┘
       ^                          ^───────────────┘
       │
       │      ┌────────────────v
       └──────┴────── [ [0, n) / n ] <─── start

    We change the number of workers by sending TTIN and TTOU to the gunicorn
    master process, which increases and decreases the number of child workers
    respectively. Gunicorn guarantees that on TTOU workers are terminated
    gracefully and that the oldest worker is terminated.
    """
    def wait_until_true(fn):
        """
        Sleeps until fn is true
        """
        while not fn():
            time.sleep(0.1)

    def get_num_workers_running(gunicorn_master_proc):
        workers = psutil.Process(gunicorn_master_proc.pid).children()
        return len(workers)

    def get_num_ready_workers_running(gunicorn_master_proc):
        workers = psutil.Process(gunicorn_master_proc.pid).children()
        ready_workers = [
            proc for proc in workers
            if settings.GUNICORN_WORKER_READY_PREFIX in proc.cmdline()[0]
        ]
        return len(ready_workers)

    def start_refresh(gunicorn_master_proc):
        batch_size = conf.getint('webserver', 'worker_refresh_batch_size')
        logging.debug('%s doing a refresh of %s workers', state, batch_size)
        sys.stdout.flush()
        sys.stderr.flush()

        excess = 0
        for _ in range(batch_size):
            gunicorn_master_proc.send_signal(signal.SIGTTIN)
            excess += 1
            wait_until_true(lambda: num_workers_expected + excess ==
                            get_num_workers_running(gunicorn_master_proc))

    wait_until_true(lambda: num_workers_expected == get_num_workers_running(
        gunicorn_master_proc))

    while True:
        num_workers_running = get_num_workers_running(gunicorn_master_proc)
        num_ready_workers_running = get_num_ready_workers_running(
            gunicorn_master_proc)

        state = '[{0} / {1}]'.format(num_ready_workers_running,
                                     num_workers_running)

        # Whenever some workers are not ready, wait until all workers are ready
        if num_ready_workers_running < num_workers_running:
            logging.debug('%s some workers are starting up, waiting...', state)
            sys.stdout.flush()
            time.sleep(1)

        # Kill a worker gracefully by asking gunicorn to reduce number of workers
        elif num_workers_running > num_workers_expected:
            excess = num_workers_running - num_workers_expected
            logging.debug('%s killing %s workers', state, excess)

            for _ in range(excess):
                gunicorn_master_proc.send_signal(signal.SIGTTOU)
                excess -= 1
                wait_until_true(lambda: num_workers_expected + excess ==
                                get_num_workers_running(gunicorn_master_proc))

        # Start a new worker by asking gunicorn to increase number of workers
        elif num_workers_running == num_workers_expected:
            refresh_interval = conf.getint('webserver',
                                           'worker_refresh_interval')
            logging.debug('%s sleeping for %ss before starting a refresh...',
                          state, refresh_interval)
            time.sleep(refresh_interval)
            start_refresh(gunicorn_master_proc)

        else:
            # num_ready_workers_running == num_workers_running < num_workers_expected
            logging.error(("%s some workers seem to have died and gunicorn"
                           "did not restart them as expected"), state)
            time.sleep(10)
            if len(psutil.Process(gunicorn_master_proc.pid).children()
                   ) < num_workers_expected:
                start_refresh(gunicorn_master_proc)
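
For illustration only, a sketch of the signalling mechanism the docstring describes, using a hypothetical gunicorn master PID (Unix only): TTIN asks gunicorn for one more worker, TTOU gracefully retires the oldest one.

import signal
import psutil

master = psutil.Process(12345)       # hypothetical gunicorn master PID; substitute your own
master.send_signal(signal.SIGTTIN)   # spawn one additional worker
master.send_signal(signal.SIGTTOU)   # gracefully terminate the oldest worker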
Beispiel #46
0
    def prioritize_queued(self, session, executor, dagbag):
        # Prioritizing queued task instances

        pools = {p.pool: p for p in session.query(models.Pool).all()}

        self.logger.info("Prioritizing {} queued jobs".format(
            len(self.queued_tis)))
        session.expunge_all()
        d = defaultdict(list)
        for ti in self.queued_tis:
            if ti.dag_id not in dagbag.dags:
                self.logger.info(
                    "DAG no longer in dagbag, deleting {}".format(ti))
                session.delete(ti)
                session.commit()
            elif not dagbag.dags[ti.dag_id].has_task(ti.task_id):
                self.logger.info(
                    "Task no longer exists, deleting {}".format(ti))
                session.delete(ti)
                session.commit()
            else:
                d[ti.pool].append(ti)

        self.queued_tis.clear()

        dag_blacklist = set(dagbag.paused_dags())
        for pool, tis in list(d.items()):
            if not pool:
                # Arbitrary:
                # If queued outside of a pool, trigger no more than
                # non_pooled_task_slot_count per run
                open_slots = conf.getint('core', 'non_pooled_task_slot_count')
            else:
                open_slots = pools[pool].open_slots(session=session)

            queue_size = len(tis)
            self.logger.info(
                "Pool {pool} has {open_slots} slots, {queue_size} "
                "task instances in queue".format(**locals()))
            if open_slots <= 0:
                continue
            tis = sorted(tis,
                         key=lambda ti: (-ti.priority_weight, ti.start_date))
            for ti in tis:
                if open_slots <= 0:
                    continue
                task = None
                try:
                    task = dagbag.dags[ti.dag_id].get_task(ti.task_id)
                except:
                    self.logger.error("Queued task {} seems gone".format(ti))
                    session.delete(ti)
                    session.commit()
                    continue

                if not task:
                    continue

                ti.task = task

                # picklin'
                dag = dagbag.dags[ti.dag_id]
                pickle_id = None
                if self.do_pickle and self.executor.__class__ not in (
                        executors.LocalExecutor, executors.SequentialExecutor):
                    self.logger.info("Pickling DAG {}".format(dag))
                    pickle_id = dag.pickle(session).id

                if dag.dag_id in dag_blacklist:
                    continue
                if dag.concurrency_reached:
                    dag_blacklist.add(dag.dag_id)
                    continue
                if ti.are_dependencies_met():
                    executor.queue_task_instance(ti, pickle_id=pickle_id)
                    open_slots -= 1
                else:
                    session.delete(ti)
                    continue
                ti.task = task

                session.commit()
Beispiel #47
0
def webserver(args):
    print(settings.HEADER)

    app = cached_app(conf)
    access_logfile = args.access_logfile or conf.get('webserver',
                                                     'access_logfile')
    error_logfile = args.error_logfile or conf.get('webserver',
                                                   'error_logfile')
    num_workers = args.workers or conf.get('webserver', 'workers')
    worker_timeout = (args.worker_timeout
                      or conf.get('webserver', 'webserver_worker_timeout'))
    ssl_cert = args.ssl_cert or conf.get('webserver', 'web_server_ssl_cert')
    ssl_key = args.ssl_key or conf.get('webserver', 'web_server_ssl_key')
    if ssl_cert is None and ssl_key is not None:
        raise AirflowException(
            'An SSL certificate must also be provided for use with ' + ssl_key)
    if ssl_cert is not None and ssl_key is None:
        raise AirflowException(
            'An SSL key must also be provided for use with ' + ssl_cert)

    if args.debug:
        print("Starting the web server on port {0} and host {1}.".format(
            args.port, args.hostname))
        app.run(debug=True,
                port=args.port,
                host=args.hostname,
                ssl_context=(ssl_cert, ssl_key))
    else:
        pid, stdout, stderr, log_file = setup_locations("webserver",
                                                        pid=args.pid)
        print(
            textwrap.dedent('''\
                Running the Gunicorn Server with:
                Workers: {num_workers} {args.workerclass}
                Host: {args.hostname}:{args.port}
                Timeout: {worker_timeout}
                Logfiles: {access_logfile} {error_logfile}
                =================================================================\
            '''.format(**locals())))

        run_args = [
            'gunicorn', '-w',
            str(num_workers), '-k',
            str(args.workerclass), '-t',
            str(worker_timeout), '-b', args.hostname + ':' + str(args.port),
            '-n', 'airflow-webserver', '-p',
            str(pid), '-c', 'airflow.www.gunicorn_config'
        ]

        if args.access_logfile:
            run_args += ['--access-logfile', str(args.access_logfile)]

        if args.error_logfile:
            run_args += ['--error-logfile', str(args.error_logfile)]

        if args.daemon:
            run_args += ["-D"]
        if ssl_cert:
            run_args += ['--certfile', ssl_cert, '--keyfile', ssl_key]

        run_args += ["airflow.www.app:cached_app()"]

        gunicorn_master_proc = subprocess.Popen(run_args)

        def kill_proc(dummy_signum, dummy_frame):
            gunicorn_master_proc.terminate()
            gunicorn_master_proc.wait()
            sys.exit(0)

        signal.signal(signal.SIGINT, kill_proc)
        signal.signal(signal.SIGTERM, kill_proc)

        # These run forever until SIG{INT, TERM, KILL, ...} signal is sent
        if conf.getint('webserver', 'worker_refresh_interval') > 0:
            restart_workers(gunicorn_master_proc, num_workers)
        else:
            while True:
                time.sleep(1)
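
Purely as an illustration, with hypothetical settings (4 sync workers, a 120 second timeout, a 0.0.0.0:8080 bind and a made-up pid file path), the command assembled above would come out roughly as:

run_args = ['gunicorn', '-w', '4', '-k', 'sync', '-t', '120',
            '-b', '0.0.0.0:8080', '-n', 'airflow-webserver',
            '-p', '/usr/local/airflow/airflow-webserver.pid',
            '-c', 'airflow.www.gunicorn_config',
            'airflow.www.app:cached_app()']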
Beispiel #48
0
    def _process_backfill_task_instances(self,
                                         ti_status,
                                         executor,
                                         pickle_id,
                                         start_date=None,
                                         session=None):
        """
        Process a set of task instances from a set of dag runs. Special handling is done
        to account for different task instance states that could be present when running
        them in a backfill process.

        :param ti_status: the internal status of the job
        :type ti_status: BackfillJob._DagRunTaskStatus
        :param executor: the executor to run the task instances
        :type executor: BaseExecutor
        :param pickle_id: the pickle_id if dag is pickled, None otherwise
        :type pickle_id: int
        :param start_date: the start date of the backfill job
        :type start_date: datetime.datetime
        :param session: the current session object
        :type session: sqlalchemy.orm.session.Session
        :return: the list of execution_dates for the finished dag runs
        :rtype: list
        """

        executed_run_dates = []

        while ((len(ti_status.to_run) > 0 or len(ti_status.running) > 0)
               and len(ti_status.deadlocked) == 0):
            self.log.debug("*** Clearing out not_ready list ***")
            ti_status.not_ready.clear()

            # we need to execute the tasks bottom to top
            # or leaf to root, as otherwise tasks might be
            # determined deadlocked while they are actually
            # waiting for their upstream to finish
            @provide_session
            def _per_task_process(task, key, ti, session=None):
                ti.refresh_from_db()

                task = self.dag.get_task(ti.task_id)
                ti.task = task

                ignore_depends_on_past = (self.ignore_first_depends_on_past
                                          and ti.execution_date
                                          == (start_date or ti.start_date))
                self.log.debug("Task instance to run %s state %s", ti,
                               ti.state)

                # The task was already marked successful or skipped by a
                # different Job. Don't rerun it.
                if ti.state == State.SUCCESS:
                    ti_status.succeeded.add(key)
                    self.log.debug("Task instance %s succeeded. Don't rerun.",
                                   ti)
                    ti_status.to_run.pop(key)
                    if key in ti_status.running:
                        ti_status.running.pop(key)
                    return
                elif ti.state == State.SKIPPED:
                    ti_status.skipped.add(key)
                    self.log.debug("Task instance %s skipped. Don't rerun.",
                                   ti)
                    ti_status.to_run.pop(key)
                    if key in ti_status.running:
                        ti_status.running.pop(key)
                    return

                # guard against externally modified tasks instances or
                # in case max concurrency has been reached at task runtime
                elif ti.state == State.NONE:
                    self.log.warning(
                        "FIXME: task instance {} state was set to None "
                        "externally. This should not happen".format(ti))
                    ti.set_state(State.SCHEDULED, session=session)
                if self.rerun_failed_tasks:
                    # Rerun failed tasks or upstreamed failed tasks
                    if ti.state in (State.FAILED, State.UPSTREAM_FAILED):
                        self.log.error("Task instance {ti} "
                                       "with state {state}".format(
                                           ti=ti, state=ti.state))
                        if key in ti_status.running:
                            ti_status.running.pop(key)
                        # Reset the failed task in backfill to scheduled state
                        ti.set_state(State.SCHEDULED, session=session)
                else:
                    # Default behaviour which works for subdag.
                    if ti.state in (State.FAILED, State.UPSTREAM_FAILED):
                        self.log.error("Task instance {ti} "
                                       "with {state} state".format(
                                           ti=ti, state=ti.state))
                        ti_status.failed.add(key)
                        ti_status.to_run.pop(key)
                        if key in ti_status.running:
                            ti_status.running.pop(key)
                        return

                backfill_context = DepContext(
                    deps=RUN_DEPS,
                    ignore_depends_on_past=ignore_depends_on_past,
                    ignore_task_deps=self.ignore_task_deps,
                    flag_upstream_failed=True)

                # Is the task runnable? -- then run it
                # the dependency checker can change states of tis
                if ti.are_dependencies_met(dep_context=backfill_context,
                                           session=session,
                                           verbose=self.verbose):
                    ti.refresh_from_db(lock_for_update=True, session=session)
                    if ti.state in (State.SCHEDULED, State.UP_FOR_RETRY,
                                    State.UP_FOR_RESCHEDULE):
                        if executor.has_task(ti):
                            self.log.debug(
                                "Task Instance %s already in executor "
                                "waiting for queue to clear", ti)
                        else:
                            self.log.debug('Sending %s to executor', ti)
                            # Skip scheduled state, we are executing immediately
                            ti.state = State.QUEUED
                            ti.queued_dttm = (timezone.utcnow()
                                              if not ti.queued_dttm
                                              else ti.queued_dttm)
                            session.merge(ti)

                            cfg_path = None
                            if executor.__class__ in (
                                    executors.LocalExecutor,
                                    executors.SequentialExecutor):
                                cfg_path = tmp_configuration_copy()

                            executor.queue_task_instance(
                                ti,
                                mark_success=self.mark_success,
                                pickle_id=pickle_id,
                                ignore_task_deps=self.ignore_task_deps,
                                ignore_depends_on_past=ignore_depends_on_past,
                                pool=self.pool,
                                cfg_path=cfg_path)
                            ti_status.running[key] = ti
                            ti_status.to_run.pop(key)
                    session.commit()
                    return

                if ti.state == State.UPSTREAM_FAILED:
                    self.log.error("Task instance %s upstream failed", ti)
                    ti_status.failed.add(key)
                    ti_status.to_run.pop(key)
                    if key in ti_status.running:
                        ti_status.running.pop(key)
                    return

                # special case
                if ti.state == State.UP_FOR_RETRY:
                    self.log.debug(
                        "Task instance %s retry period not "
                        "expired yet", ti)
                    if key in ti_status.running:
                        ti_status.running.pop(key)
                    ti_status.to_run[key] = ti
                    return

                # special case
                if ti.state == State.UP_FOR_RESCHEDULE:
                    self.log.debug(
                        "Task instance %s reschedule period not "
                        "expired yet", ti)
                    if key in ti_status.running:
                        ti_status.running.pop(key)
                    ti_status.to_run[key] = ti
                    return

                # all remaining tasks
                self.log.debug('Adding %s to not_ready', ti)
                ti_status.not_ready.add(key)

            non_pool_slots = conf.getint(
                'core', 'non_pooled_backfill_task_slot_count')

            try:
                for task in self.dag.topological_sort():
                    for key, ti in list(ti_status.to_run.items()):
                        if task.task_id != ti.task_id:
                            continue
                        if task.pool:
                            pool = session.query(models.Pool) \
                                .filter(models.Pool.pool == task.pool) \
                                .first()
                            if not pool:
                                raise PoolNotFound('Unknown pool: {}'.format(
                                    task.pool))

                            open_slots = pool.open_slots(session=session)
                            if open_slots <= 0:
                                raise NoAvailablePoolSlot(
                                    "Not scheduling since there are "
                                    "%s open slots in pool %s".format(
                                        open_slots, task.pool))
                        else:
                            if non_pool_slots <= 0:
                                raise NoAvailablePoolSlot(
                                    "Not scheduling since there are no "
                                    "non_pooled_backfill_task_slot_count.")
                            non_pool_slots -= 1

                        num_running_task_instances_in_dag = DAG.get_num_task_instances(
                            self.dag_id,
                            states=self.STATES_COUNT_AS_RUNNING,
                        )

                        if num_running_task_instances_in_dag >= self.dag.concurrency:
                            raise DagConcurrencyLimitReached(
                                "Not scheduling since DAG concurrency limit "
                                "is reached.")

                        if task.task_concurrency:
                            num_running_task_instances_in_task = DAG.get_num_task_instances(
                                dag_id=self.dag_id,
                                task_ids=[task.task_id],
                                states=self.STATES_COUNT_AS_RUNNING,
                            )

                            if num_running_task_instances_in_task >= task.task_concurrency:
                                raise TaskConcurrencyLimitReached(
                                    "Not scheduling since Task concurrency limit "
                                    "is reached.")

                        _per_task_process(task, key, ti)
            except (NoAvailablePoolSlot, DagConcurrencyLimitReached,
                    TaskConcurrencyLimitReached) as e:
                self.log.debug(e)

            # execute the tasks in the queue
            self.heartbeat()
            executor.heartbeat()

            # If the set of tasks that aren't ready ever equals the set of
            # tasks to run and there are no running tasks then the backfill
            # is deadlocked
            if (ti_status.not_ready
                    and ti_status.not_ready == set(ti_status.to_run)
                    and len(ti_status.running) == 0):
                self.log.warning("Deadlock discovered for ti_status.to_run=%s",
                                 ti_status.to_run.values())
                ti_status.deadlocked.update(ti_status.to_run.values())
                ti_status.to_run.clear()

            # check executor state
            self._manage_executor_state(ti_status.running)

            # update the task counters
            self._update_counters(ti_status=ti_status)

            # update dag run state
            _dag_runs = ti_status.active_runs[:]
            for run in _dag_runs:
                run.update_state(session=session)
                if run.state in State.finished():
                    ti_status.finished_runs += 1
                    ti_status.active_runs.remove(run)
                    executed_run_dates.append(run.execution_date)

            self._log_progress(ti_status)

        # return updated status
        return executed_run_dates
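
A toy illustration (with hypothetical task instance keys) of the deadlock test near the end of the loop: the backfill is declared deadlocked when everything still to run is in not_ready and nothing is running.

to_run = {('example_dag', 'task_b', '2019-01-01'): 'ti_b'}   # hypothetical key/value
not_ready = {('example_dag', 'task_b', '2019-01-01')}
running = {}

deadlocked = bool(not_ready) and not_ready == set(to_run) and len(running) == 0
# deadlocked -> True: everything left to run is blocked and nothing is executing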
Beispiel #49
0
    @classmethod
    def gauge(cls, stat, value, rate=1, delta=False):
        pass

    @classmethod
    def timing(cls, stat, dt):
        pass

Stats = DummyStatsLogger

if conf.getboolean('scheduler', 'statsd_on'):
    from statsd import StatsClient
    statsd = StatsClient(
        host=conf.get('scheduler', 'statsd_host'),
        port=conf.getint('scheduler', 'statsd_port'),
        prefix=conf.get('scheduler', 'statsd_prefix'))
    Stats = statsd
else:
    Stats = DummyStatsLogger


HEADER = """\
  ____________       _____________
 ____    |__( )_________  __/__  /________      __
____  /| |_  /__  ___/_  /_ __  /_  __ \_ | /| / /
___  ___ |  / _  /   _  __/ _  / / /_/ /_ |/ |/ /
 _/_/  |_/_/  /_/    /_/    /_/  \____/____/|__/
 """

BASE_LOG_URL = '/admin/airflow/log'
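
A brief usage sketch: because Stats is either the statsd client or the dummy logger, callers can emit metrics unconditionally. The metric names below are hypothetical.

Stats.gauge('scheduler.tasks.running', 12)                  # hypothetical gauge
Stats.timing('dag_processing.example_dag.duration', 3500)   # hypothetical timer, in ms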
Beispiel #50
0
    def mesos_driver(self):
        """
        Lazily instantiates the Mesos scheduler driver if one was not injected
        via the constructor
        """
        if self._mesos_driver is None:
            framework = Dict()
            framework.user = '******'

            if not configuration.get('mesos', 'MASTER'):
                logging.error("Expecting mesos master URL for mesos executor")
                raise AirflowException(
                    "mesos.master not provided for mesos executor")

            master = configuration.get('mesos', 'MASTER')

            framework.name = get_framework_name()

            if configuration.getboolean('mesos', 'CHECKPOINT'):
                framework.checkpoint = True

                if configuration.get('mesos', 'FAILOVER_TIMEOUT'):
                    # Import here to work around a circular import error
                    from airflow.models import Connection

                    # Query the database to get the ID of the Mesos Framework, if available.
                    conn_id = FRAMEWORK_CONNID_PREFIX + framework.name
                    session = Session()
                    connection = session.query(Connection).filter_by(
                        conn_id=conn_id).first()
                    if connection is not None:
                        # Set the Framework ID to let the scheduler reconnect with running tasks.
                        framework.id.value = connection.extra

                    framework.failover_timeout = configuration.getint(
                        'mesos', 'FAILOVER_TIMEOUT')
            else:
                framework.checkpoint = False

            logging.info(
                'MesosFramework master : %s, name : %s, checkpoint : %s',
                master, framework.name, str(framework.checkpoint))

            if configuration.getboolean('mesos', 'AUTHENTICATE'):
                if not configuration.get('mesos', 'DEFAULT_PRINCIPAL'):
                    logging.error(
                        "Expecting authentication principal in the environment"
                    )
                    raise AirflowException(
                        "mesos.default_principal not provided in authenticated mode"
                    )
                if not configuration.get('mesos', 'DEFAULT_SECRET'):
                    logging.error(
                        "Expecting authentication secret in the environment")
                    raise AirflowException(
                        "mesos.default_secret not provided in authenticated mode"
                    )

                principal = configuration.get('mesos', 'DEFAULT_PRINCIPAL')
                secret = configuration.get('mesos', 'DEFAULT_SECRET')

                framework.principal = principal

                self._mesos_driver = MesosSchedulerDriver(
                    AirflowMesosScheduler(self.task_queue, self.result_queue),
                    framework,
                    master,
                    use_addict=True,
                    principal=principal,
                    secret=secret)
            else:
                framework.principal = 'Airflow'
                self._mesos_driver = MesosSchedulerDriver(
                    AirflowMesosScheduler(self.task_queue, self.result_queue),
                    framework,
                    master,
                    use_addict=True)
        return self._mesos_driver
Beispiel #51
0
broker_transport_options = configuration.getsection(
    'celery_broker_transport_options')
if broker_transport_options is None:
    broker_transport_options = {'visibility_timeout': 21600}

DEFAULT_CELERY_CONFIG = {
    'accept_content': ['json', 'pickle'],
    'event_serializer': 'json',
    'worker_prefetch_multiplier': 1,
    'task_acks_late': True,
    'task_default_queue': configuration.get('celery', 'DEFAULT_QUEUE'),
    'task_default_exchange': configuration.get('celery', 'DEFAULT_QUEUE'),
    'broker_url': configuration.get('celery', 'BROKER_URL'),
    'broker_transport_options': broker_transport_options,
    'result_backend': configuration.get('celery', 'RESULT_BACKEND'),
    'worker_concurrency': configuration.getint('celery', 'WORKER_CONCURRENCY'),
}

celery_ssl_active = False
try:
    celery_ssl_active = configuration.getboolean('celery', 'SSL_ACTIVE')
except AirflowConfigException as e:
    log.warning("Celery Executor will run without SSL")

try:
    if celery_ssl_active:
        broker_use_ssl = {
            'keyfile': configuration.get('celery', 'SSL_KEY'),
            'certfile': configuration.get('celery', 'SSL_CERT'),
            'ca_certs': configuration.get('celery', 'SSL_CACERT'),
            'cert_reqs': ssl.CERT_REQUIRED
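
The snippet above is cut off mid-dict. As a hedged sketch of how such a config dict is typically consumed, one might feed it into a Celery app along these lines (the app name is hypothetical):

from celery import Celery

app = Celery('airflow.executors.celery_executor')  # hypothetical app name
app.conf.update(DEFAULT_CELERY_CONFIG)             # apply the settings defined above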
Beispiel #52
0
    def __init__(self,
                 dag_directory,
                 file_paths,
                 max_runs,
                 processor_factory,
                 processor_timeout,
                 signal_conn,
                 async_mode=True):
        """
        :param dag_directory: Directory where DAG definitions are kept. All
            files in file_paths should be under this directory
        :type dag_directory: unicode
        :param file_paths: list of file paths that contain DAG definitions
        :type file_paths: list[unicode]
        :param max_runs: The number of times to parse and schedule each file. -1
            for unlimited.
        :type max_runs: int
        :param processor_factory: function that creates processors for DAG
            definition files. Arguments are (dag_definition_path)
        :type processor_factory: (unicode, unicode, list) -> (AbstractDagFileProcessor)
        :param processor_timeout: How long to wait before timing out a DAG file processor
        :type processor_timeout: timedelta
        :param signal_conn: connection to communicate signal with processor agent.
        :type signal_conn: airflow.models.connection.Connection
        :param async_mode: whether to start the manager in async mode
        :type async_mode: bool
        """
        self._file_paths = file_paths
        self._file_path_queue = []
        self._dag_directory = dag_directory
        self._max_runs = max_runs
        self._processor_factory = processor_factory
        self._signal_conn = signal_conn
        self._async_mode = async_mode

        self._parallelism = conf.getint('scheduler', 'max_threads')
        if 'sqlite' in conf.get('core',
                                'sql_alchemy_conn') and self._parallelism > 1:
            self.log.error("Cannot use more than 1 thread when using sqlite. "
                           "Setting parallelism to 1")
            self._parallelism = 1

        # Parse and schedule each file no faster than this interval.
        self._file_process_interval = conf.getint('scheduler',
                                                  'min_file_process_interval')
        # How often to print out DAG file processing stats to the log. Default to
        # 30 seconds.
        self.print_stats_interval = conf.getint('scheduler',
                                                'print_stats_interval')
        # Map from file path to the processor
        self._processors = {}
        # Map from file path to the last runtime
        self._last_runtime = {}
        # Map from file path to the last finish time
        self._last_finish_time = {}
        self._last_zombie_query_time = timezone.utcnow()
        # Last time that the DAG dir was traversed to look for files
        self.last_dag_dir_refresh_time = timezone.utcnow()
        # Last time stats were printed
        self.last_stat_print_time = timezone.datetime(2000, 1, 1)
        # TODO: Remove magic number
        self._zombie_query_interval = 10
        # Map from file path to the number of runs
        self._run_count = defaultdict(int)
        # Manager heartbeat key.
        self._heart_beat_key = 'heart-beat'
        # How long to wait before timing out a process to parse a DAG file
        self._processor_timeout = processor_timeout

        # How often to scan the DAGs directory for new files. Default to 5 minutes.
        self.dag_dir_list_interval = conf.getint('scheduler',
                                                 'dag_dir_list_interval')

        self._log = logging.getLogger('airflow.processor_manager')

        signal.signal(signal.SIGINT, self._exit_gracefully)
        signal.signal(signal.SIGTERM, self._exit_gracefully)
Beispiel #53
0
    def __init__(self):
        configuration_dict = configuration.as_dict(display_sensitive=True)
        self.core_configuration = configuration_dict['core']
        self.kube_secrets = configuration_dict.get('kubernetes_secrets', {})
        self.airflow_home = configuration.get(self.core_section, 'airflow_home')
        self.dags_folder = configuration.get(self.core_section, 'dags_folder')
        self.parallelism = configuration.getint(self.core_section, 'PARALLELISM')
        self.worker_container_repository = configuration.get(
            self.kubernetes_section, 'worker_container_repository')
        self.worker_container_tag = configuration.get(
            self.kubernetes_section, 'worker_container_tag')
        self.worker_dags_folder = configuration.get(
            self.kubernetes_section, 'worker_dags_folder')
        self.kube_image = '{}:{}'.format(
            self.worker_container_repository, self.worker_container_tag)
        self.kube_image_pull_policy = configuration.get(
            self.kubernetes_section, "worker_container_image_pull_policy"
        )
        self.kube_node_selectors = configuration_dict.get('kubernetes_node_selectors', {})
        self.delete_worker_pods = conf.getboolean(
            self.kubernetes_section, 'delete_worker_pods')

        self.worker_service_account_name = conf.get(
            self.kubernetes_section, 'worker_service_account_name')
        self.image_pull_secrets = conf.get(self.kubernetes_section, 'image_pull_secrets')

        # NOTE: `git_repo` and `git_branch` must be specified together as a pair
        # The http URL of the git repository to clone from
        self.git_repo = conf.get(self.kubernetes_section, 'git_repo')
        # The branch of the repository to be checked out
        self.git_branch = conf.get(self.kubernetes_section, 'git_branch')
        # Optionally, the directory in the git repository containing the dags
        self.git_subpath = conf.get(self.kubernetes_section, 'git_subpath')

        # Optionally a user may supply a `git_user` and `git_password` for private
        # repositories
        self.git_user = conf.get(self.kubernetes_section, 'git_user')
        self.git_password = conf.get(self.kubernetes_section, 'git_password')

        # NOTE: The user may optionally use a volume claim to mount a PV containing
        # DAGs directly
        self.dags_volume_claim = conf.get(self.kubernetes_section, 'dags_volume_claim')

        # This prop may optionally be set for PV Claims and is used to write logs
        self.logs_volume_claim = conf.get(self.kubernetes_section, 'logs_volume_claim')

        # This prop may optionally be set for PV Claims and is used to locate DAGs
        # on a SubPath
        self.dags_volume_subpath = conf.get(
            self.kubernetes_section, 'dags_volume_subpath')

        # This prop may optionally be set for PV Claims and is used to locate logs
        # on a SubPath
        self.logs_volume_subpath = conf.get(
            self.kubernetes_section, 'logs_volume_subpath')

        # This prop may optionally be set for PV Claims and is used to write logs
        self.base_log_folder = configuration.get(self.core_section, 'base_log_folder')

        # The Kubernetes Namespace in which the Scheduler and Webserver reside.
        # Note that if your cluster has RBAC enabled, your scheduler may need
        # service account permissions to create, watch, get, and delete pods in
        # this namespace.
        self.kube_namespace = conf.get(self.kubernetes_section, 'namespace')
        # The Kubernetes Namespace in which pods will be created by the executor.
        # Note that if your cluster has RBAC enabled, your workers may need
        # service account permissions to interact with cluster components.
        self.executor_namespace = conf.get(self.kubernetes_section, 'namespace')
        # Task secrets managed by KubernetesExecutor.
        self.gcp_service_account_keys = conf.get(self.kubernetes_section,
                                                 'gcp_service_account_keys')

        # If the user is using the git-sync container to clone their repository via git,
        # allow them to specify repository, tag, and pod name for the init container.
        self.git_sync_container_repository = conf.get(
            self.kubernetes_section, 'git_sync_container_repository')

        self.git_sync_container_tag = conf.get(
            self.kubernetes_section, 'git_sync_container_tag')
        self.git_sync_container = '{}:{}'.format(
            self.git_sync_container_repository, self.git_sync_container_tag)

        self.git_sync_init_container_name = conf.get(
            self.kubernetes_section, 'git_sync_init_container_name')

        # The worker pod may optionally have a valid Airflow config loaded via a
        # configmap
        self.airflow_configmap = conf.get(self.kubernetes_section, 'airflow_configmap')

        self._validate()
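The comments above say that `git_repo` and `git_branch` must be supplied as a pair, with `dags_volume_claim` as the alternative way to provide DAGs. A minimal standalone sketch of that kind of consistency check follows; the function name and the exact rule are assumptions for illustration, not the body of `_validate()`:

def check_dag_source_settings(dags_volume_claim, git_repo, git_branch):
    """Illustrative check: require either a DAGs volume claim or a complete
    git_repo/git_branch pair."""
    if dags_volume_claim:
        return
    if git_repo and git_branch:
        return
    raise ValueError(
        'Set either dags_volume_claim, or both git_repo and git_branch, '
        'in the [kubernetes] section')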
Beispiel #54
0
from airflow import executors, models, settings, utils
from airflow import configuration
from airflow.utils import AirflowException, State


Base = models.Base
ID_LEN = models.ID_LEN

# Setting up a statsd client if needed
statsd = None
if configuration.getboolean('scheduler', 'statsd_on'):
    from statsd import StatsClient
    statsd = StatsClient(
        host=configuration.get('scheduler', 'statsd_host'),
        port=configuration.getint('scheduler', 'statsd_port'),
        prefix=configuration.get('scheduler', 'statsd_prefix'))


class BaseJob(Base):
    """
    Abstract class to be derived for jobs. Jobs are processing items with state
    and duration that aren't task instances. For instance, a BackfillJob is
    a collection of task instance runs, but should have its own state, start
    and end time.
    """

    __tablename__ = "job"

    id = Column(Integer, primary_key=True)
    dag_id = Column(String(ID_LEN),)
Beispiel #55
0
broker_transport_options = {'visibility_timeout': 21600}

DEFAULT_CELERY_CONFIG = {
    'accept_content': ['json', 'pickle'],
    'event_serializer': 'json',
    'result_serializer': 'pickle',
    'worker_prefetch_multiplier': 1,
    'task_acks_late': True,
    'task_default_queue': configuration.get('celery', 'DEFAULT_QUEUE'),
    'task_default_exchange': configuration.get('celery', 'DEFAULT_QUEUE'),
    'broker_url': configuration.get('celery', 'BROKER_URL'),
    'broker_transport_options': broker_transport_options,
    'result_backend': configuration.get('celery', 'CELERY_RESULT_BACKEND'),
    'worker_concurrency': configuration.getint('celery',
                                               'CELERYD_CONCURRENCY'),
}
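
# Illustrative usage only, not part of the original example: a dict like
# DEFAULT_CELERY_CONFIG is typically applied to a Celery app through its
# configuration object. The app name below is an assumption for this sketch.
from celery import Celery

app = Celery('airflow.executors.celery_executor')
app.conf.update(DEFAULT_CELERY_CONFIG)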

celery_ssl_active = False
try:
    celery_ssl_active = configuration.getboolean('celery', 'CELERY_SSL_ACTIVE')
except AirflowConfigException as e:
    log.warning("Celery Executor will run without SSL")

try:
    if celery_ssl_active:
        broker_use_ssl = {
            'keyfile': configuration.get('celery', 'CELERY_SSL_KEY'),
            'certfile': configuration.get('celery', 'CELERY_SSL_CERT'),
            'ca_certs': configuration.get('celery', 'CELERY_SSL_CACERT'),
            'cert_reqs': ssl.CERT_REQUIRED
Beispiel #56
0
    @classmethod
    def gauge(cls, stat, value, rate=1, delta=False):
        pass

    @classmethod
    def timing(cls, stat, dt):
        pass


Stats = DummyStatsLogger

if conf.getboolean('scheduler', 'statsd_on'):
    from statsd import StatsClient
    statsd = StatsClient(host=conf.get('scheduler', 'statsd_host'),
                         port=conf.getint('scheduler', 'statsd_port'),
                         prefix=conf.get('scheduler', 'statsd_prefix'))
    Stats = statsd
else:
    Stats = DummyStatsLogger
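
# Illustrative usage, not part of the original example: callers emit metrics
# through the Stats facade, which falls back to the no-op DummyStatsLogger when
# statsd is disabled. The metric names below are made up for this sketch.
Stats.gauge('dagbag_size', 42)
Stats.timing('dag_processing.total_parse_time', 1.5)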

HEADER = """\
  ____________       _____________
 ____    |__( )_________  __/__  /________      __
____  /| |_  /__  ___/_  /_ __  /_  __ \_ | /| / /
___  ___ |  / _  /   _  __/ _  / / /_/ /_ |/ |/ /
 _/_/  |_/_/  /_/    /_/    /_/  \____/____/|__/
 """

BASE_LOG_URL = '/admin/airflow/log'
LOGGING_LEVEL = logging.INFO
    def __init__(self,
                 dag_directory,
                 file_paths,
                 max_runs,
                 processor_factory,
                 signal_conn,
                 stat_queue,
                 result_queue,
                 async_mode=True):
        """
        :param dag_directory: Directory where DAG definitions are kept. All
            files in file_paths should be under this directory
        :type dag_directory: unicode
        :param file_paths: list of file paths that contain DAG definitions
        :type file_paths: list[unicode]
        :param max_runs: The number of times to parse and schedule each file. -1
            for unlimited.
        :type max_runs: int
        :param processor_factory: function that creates processors for DAG
            definition files. Arguments are (dag_definition_path)
        :type processor_factory: (unicode, unicode, list) -> (AbstractDagFileProcessor)
        :param signal_conn: connection to communicate signal with processor agent.
        :type signal_conn: airflow.models.connection.Connection
        :param stat_queue: the queue to use for passing back parsing stat to agent.
        :type stat_queue: multiprocessing.Queue
        :param result_queue: the queue to use for passing back the result to agent.
        :type result_queue: multiprocessing.Queue
        :param async_mode: whether to start the manager in async mode
        :type async_mode: bool
        """
        self._file_paths = file_paths
        self._file_path_queue = []
        self._dag_directory = dag_directory
        self._max_runs = max_runs
        self._processor_factory = processor_factory
        self._signal_conn = signal_conn
        self._stat_queue = stat_queue
        self._result_queue = result_queue
        self._async_mode = async_mode

        self._parallelism = conf.getint('scheduler', 'max_threads')
        if 'sqlite' in conf.get('core', 'sql_alchemy_conn') and self._parallelism > 1:
            self.log.error("Cannot use more than 1 thread when using sqlite. "
                           "Setting parallelism to 1")
            self._parallelism = 1

        # Parse and schedule each file no faster than this interval.
        self._file_process_interval = conf.getint('scheduler',
                                                  'min_file_process_interval')
        # How often to print out DAG file processing stats to the log. Defaults
        # to 30 seconds.
        self.print_stats_interval = conf.getint('scheduler',
                                                'print_stats_interval')
        # How many seconds to wait for a task heartbeat before marking the task
        # as a zombie.
        self._zombie_threshold_secs = (
            conf.getint('scheduler', 'scheduler_zombie_task_threshold'))
        # Map from file path to the processor
        self._processors = {}
        # Map from file path to the last runtime
        self._last_runtime = {}
        # Map from file path to the last finish time
        self._last_finish_time = {}
        self._last_zombie_query_time = timezone.utcnow()
        # Last time that the DAG dir was traversed to look for files
        self.last_dag_dir_refresh_time = timezone.utcnow()
        # Last time stats were printed
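        # (set far in the past, presumably so that stats get printed on the
        # first check)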
        self.last_stat_print_time = timezone.datetime(2000, 1, 1)
        # Interval, in seconds, between zombie queries. TODO: Remove magic number
        self._zombie_query_interval = 10
        # Map from file path to the number of runs
        self._run_count = defaultdict(int)
        # Manager heartbeat key.
        self._heart_beat_key = 'heart-beat'

        # How often to scan the DAGs directory for new files. Defaults to 5 minutes.
        self.dag_dir_list_interval = conf.getint('scheduler',
                                                 'dag_dir_list_interval')

        self._log = logging.getLogger('airflow.processor_manager')

        signal.signal(signal.SIGINT, self._exit_gracefully)
        signal.signal(signal.SIGTERM, self._exit_gracefully)
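The `_zombie_threshold_secs` and `_zombie_query_interval` fields above drive zombie detection: the `_last_zombie_query_time` / `_zombie_query_interval` pair throttles how often zombies are looked up, and a task whose last heartbeat is older than the threshold is treated as a zombie. A minimal standalone sketch of that age check, assuming `airflow.utils.timezone` as used in the snippet (the helper name is made up):

from datetime import timedelta

from airflow.utils import timezone


def heartbeat_is_zombie(latest_heartbeat, threshold_secs):
    """Return True if the given heartbeat is older than the zombie threshold."""
    return timezone.utcnow() - latest_heartbeat > timedelta(seconds=threshold_secs)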
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from builtins import range

from airflow import configuration
from airflow.utils.log.logging_mixin import LoggingMixin
from airflow.utils.state import State

PARALLELISM = configuration.getint('core', 'PARALLELISM')


class BaseExecutor(LoggingMixin):
    def __init__(self, parallelism=PARALLELISM):
        """
        Class to derive in order to interface with executor-type systems
        like Celery, Mesos, Yarn and the like.

        :param parallelism: how many jobs should run at one time. Set to
            ``0`` for infinity
        :type parallelism: int
        """
        self.parallelism = parallelism
        self.queued_tasks = {}
        self.running = {}
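BaseExecutor is meant to be subclassed; a concrete executor typically overrides methods such as `execute_async()` and `sync()`. The sketch below is illustrative only: the method names, the `change_state()` helper, and the shell-based execution are assumptions, not code from this example.

import subprocess


class ShellExecutor(BaseExecutor):
    """Toy executor that runs each task command in a local shell."""

    def execute_async(self, key, command, queue=None):
        # Run the command synchronously for simplicity and record the outcome.
        returncode = subprocess.call(command, shell=True)
        self.change_state(key, State.SUCCESS if returncode == 0 else State.FAILED)

    def sync(self):
        # Nothing to reconcile for this toy executor.
        pass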
    def start(self):
        self.task_queue = Queue()
        self.result_queue = Queue()
        framework = mesos_pb2.FrameworkInfo()
        framework.user = ''

        if not configuration.get('mesos', 'MASTER'):
            logging.error("Expecting mesos master URL for mesos executor")
            raise AirflowException("mesos.master not provided for mesos executor")

        master = configuration.get('mesos', 'MASTER')

        framework.name = get_framework_name()

        if not configuration.get('mesos', 'TASK_CPU'):
            task_cpu = 1
        else:
            task_cpu = configuration.getint('mesos', 'TASK_CPU')

        if not configuration.get('mesos', 'TASK_MEMORY'):
            task_memory = 256
        else:
            task_memory = configuration.getint('mesos', 'TASK_MEMORY')

        if configuration.getboolean('mesos', 'CHECKPOINT'):
            framework.checkpoint = True

            if configuration.get('mesos', 'FAILOVER_TIMEOUT'):
                # Import here to work around a circular import error
                from airflow.models import Connection

                # Query the database to get the ID of the Mesos Framework, if available.
                conn_id = FRAMEWORK_CONNID_PREFIX + framework.name
                session = Session()
                connection = session.query(Connection).filter_by(conn_id=conn_id).first()
                if connection is not None:
                    # Set the Framework ID to let the scheduler reconnect with running tasks.
                    framework.id.value = connection.extra

                framework.failover_timeout = configuration.getint('mesos', 'FAILOVER_TIMEOUT')
        else:
            framework.checkpoint = False

        logging.info('MesosFramework master : %s, name : %s, cpu : %s, mem : %s, checkpoint : %s',
            master, framework.name, str(task_cpu), str(task_memory), str(framework.checkpoint))

        implicit_acknowledgements = 1

        if configuration.getboolean('mesos', 'AUTHENTICATE'):
            if not configuration.get('mesos', 'DEFAULT_PRINCIPAL'):
                logging.error("Expecting authentication principal in the environment")
                raise AirflowException("mesos.default_principal not provided in authenticated mode")
            if not configuration.get('mesos', 'DEFAULT_SECRET'):
                logging.error("Expecting authentication secret in the environment")
                raise AirflowException("mesos.default_secret not provided in authenticated mode")

            credential = mesos_pb2.Credential()
            credential.principal = configuration.get('mesos', 'DEFAULT_PRINCIPAL')
            credential.secret = configuration.get('mesos', 'DEFAULT_SECRET')

            framework.principal = credential.principal

            driver = mesos.native.MesosSchedulerDriver(
                AirflowMesosScheduler(self.task_queue, self.result_queue, task_cpu, task_memory),
                framework,
                master,
                implicit_acknowledgements,
                credential)
        else:
            framework.principal = 'Airflow'
            driver = mesos.native.MesosSchedulerDriver(
                AirflowMesosScheduler(self.task_queue, self.result_queue, task_cpu, task_memory),
                framework,
                master,
                implicit_acknowledgements)

        self.mesos_driver = driver
        self.mesos_driver.start()
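The failover branch above reads a previously stored framework ID from a `Connection` row so the scheduler can reattach to running tasks; the ID presumably gets written back once the framework registers with the master. A rough sketch of that write path is shown below, assuming the same `Session`, `Connection`, and `FRAMEWORK_CONNID_PREFIX` objects used in this example; the helper name and the `conn_type` value are made up for illustration.

def persist_framework_id(framework_name, framework_id_value):
    # Store the Mesos framework ID in the Connection's `extra` field so that a
    # restarted scheduler can reconnect to tasks that are still running.
    conn_id = FRAMEWORK_CONNID_PREFIX + framework_name
    session = Session()
    connection = session.query(Connection).filter_by(conn_id=conn_id).first()
    if connection is None:
        connection = Connection(conn_id=conn_id, conn_type='mesos_framework-id')
        session.add(connection)
    connection.extra = framework_id_value
    session.commit()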