Example #1
def flower(args):
    broka = conf.get('celery', 'BROKER_URL')
    args.port = args.port or conf.get('celery', 'FLOWER_PORT')
    port = '--port=' + args.port
    api = ''
    if args.broker_api:
        api = '--broker_api=' + args.broker_api

    if not args.foreground:
        pid, stdout, stderr, log_file = setup_locations("flower", args.pid, args.stdout, args.stderr, args.log_file)
        stdout = open(stdout, 'w+')
        stderr = open(stderr, 'w+')

        ctx = daemon.DaemonContext(
            pidfile=TimeoutPIDLockFile(pid, -1),
            stdout=stdout,
            stderr=stderr,
        )

        with ctx:
            sp = subprocess.Popen(['flower', '-b', broka, port, api])
            sp.wait()

        stdout.close()
        stderr.close()
    else:
        signal.signal(signal.SIGINT, sigint_handler)
        signal.signal(signal.SIGTERM, sigint_handler)

        sp = subprocess.Popen(['flower', '-b', broka, port, api])
        sp.wait()
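A stripped-down sketch of the daemonization pattern used above, assuming the python-daemon package; the file locations and broker URL are made up (the real code derives them from setup_locations() and the config):

import subprocess

import daemon
from daemon.pidfile import TimeoutPIDLockFile

# Hypothetical paths; Example #1 gets these from setup_locations().
stdout = open('/tmp/flower.out', 'w+')
stderr = open('/tmp/flower.err', 'w+')

ctx = daemon.DaemonContext(
    pidfile=TimeoutPIDLockFile('/tmp/flower.pid', -1),
    stdout=stdout,
    stderr=stderr,
)
with ctx:
    # The child process inherits the daemonized stdout/stderr.
    subprocess.Popen(['flower', '-b', 'redis://localhost:6379/0']).wait()

stdout.close()
stderr.close()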
Example #2
def renew_from_kt():
    # The config is specified in seconds. But we ask for that same amount in
    # minutes to give ourselves a large renewal buffer.
    renewal_lifetime = "%sm" % configuration.getint('kerberos', 'reinit_frequency')
    principal = configuration.get('kerberos', 'principal').replace("_HOST", socket.getfqdn())
    cmdv = [configuration.get('kerberos', 'kinit_path'),
            "-r", renewal_lifetime,
            "-k",  # host ticket
            "-t", configuration.get('kerberos', 'keytab'),   # specify keytab
            "-c", configuration.get('kerberos', 'ccache'),   # specify credentials cache
            principal]
    LOG.info("Reinitting kerberos from keytab: " +
             " ".join(cmdv))

    subp = subprocess.Popen(cmdv,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE,
                            close_fds=True,
                            bufsize=-1)
    subp.wait()
    if subp.returncode != 0:
        LOG.error("Couldn't reinit from keytab! `kinit' exited with %s.\n%s\n%s" % (
            subp.returncode,
            "\n".join(subp.stdout.readlines()),
            "\n".join(subp.stderr.readlines())))
        sys.exit(subp.returncode)

    global NEED_KRB181_WORKAROUND
    if NEED_KRB181_WORKAROUND is None:
        NEED_KRB181_WORKAROUND = detect_conf_var()
    if NEED_KRB181_WORKAROUND:
        # (From: HUE-640). Kerberos clocks have second-level granularity. Make sure we
        # renew the ticket after the initial valid time.
        time.sleep(1.5)
        perform_krb181_workaround()
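With made-up config values, the kinit invocation assembled above has this shape (the paths and principal are illustrative, not from the source):

cmdv = [
    "/usr/bin/kinit",                      # kinit_path
    "-r", "3600m",                         # reinit_frequency (seconds) reused as minutes
    "-k",                                  # host ticket
    "-t", "/etc/security/airflow.keytab",  # keytab
    "-c", "/tmp/airflow_krb5_ccache",      # credentials cache
    "airflow/worker1.example.com",         # principal with _HOST expanded
]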
Example #3
    def try_login(username, password):
        conn = get_ldap_connection(configuration.get("ldap", "bind_user"), configuration.get("ldap", "bind_password"))

        search_filter = "(&({0})({1}={2}))".format(
            configuration.get("ldap", "user_filter"),
            configuration.get("ldap", "user_name_attr"),
            username
        )

        # todo: BASE or ONELEVEL?

        res = conn.search(configuration.get("ldap", "basedn"), search_filter, search_scope=LEVEL)

        # todo: use list or result?
        if not res:
            LOG.info("Cannot find user %s", username)
            raise AuthenticationError("Invalid username or password")

        entry = conn.response[0]

        conn.unbind()
        conn = get_ldap_connection(entry['dn'], password)

        if not conn:
            LOG.info("Password incorrect for user %s", username)
            raise AuthenticationError("Invalid username or password")
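For reference, a small sketch of the filter string this builds, using hypothetical config values:

search_filter = "(&({0})({1}={2}))".format(
    "objectClass=person",  # hypothetical user_filter
    "uid",                 # hypothetical user_name_attr
    "jdoe",                # username supplied at login
)
assert search_filter == "(&(objectClass=person)(uid=jdoe))"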
Example #4
def webserver(args):
    print(settings.HEADER)

    from airflow.www.app import cached_app
    app = cached_app(conf)
    workers = args.workers or conf.get('webserver', 'workers')
    worker_timeout = (args.worker_timeout or
                      conf.get('webserver', 'webserver_worker_timeout'))
    if args.debug:
        print(
            "Starting the web server on port {0} and host {1}.".format(
                args.port, args.hostname))
        app.run(debug=True, port=args.port, host=args.hostname)
    else:
        pid, stdout, stderr, log_file = setup_locations("webserver", pid=args.pid)
        print(
            'Running the Gunicorn server with {workers} {args.workerclass} '
            'workers on host {args.hostname} and port '
            '{args.port} with a timeout of {worker_timeout}...'.format(**locals()))
        sp = subprocess.Popen([
            'gunicorn', '-w', str(args.workers), '-k', str(args.workerclass),
            '-t', str(args.worker_timeout), '-b', args.hostname + ':' + str(args.port),
            '-n', 'airflow-webserver', '--pid', pid,
            'airflow.www.app:cached_app()']
        )
        if args.foreground:
            sp.wait()
Example #5
def configure_vars():
    global AIRFLOW_HOME
    global SQL_ALCHEMY_CONN
    global DAGS_FOLDER
    AIRFLOW_HOME = os.path.expanduser(conf.get('core', 'AIRFLOW_HOME'))
    SQL_ALCHEMY_CONN = conf.get('core', 'SQL_ALCHEMY_CONN')
    DAGS_FOLDER = os.path.expanduser(conf.get('core', 'DAGS_FOLDER'))
Example #6
def send_MIME_email(e_from, e_to, mime_msg, dryrun=False):
    log = LoggingMixin().log

    SMTP_HOST = configuration.get('smtp', 'SMTP_HOST')
    SMTP_PORT = configuration.getint('smtp', 'SMTP_PORT')
    SMTP_STARTTLS = configuration.getboolean('smtp', 'SMTP_STARTTLS')
    SMTP_SSL = configuration.getboolean('smtp', 'SMTP_SSL')
    SMTP_USER = None
    SMTP_PASSWORD = None

    try:
        SMTP_USER = configuration.get('smtp', 'SMTP_USER')
        SMTP_PASSWORD = configuration.get('smtp', 'SMTP_PASSWORD')
    except AirflowConfigException:
        log.debug("No user/password found for SMTP, so logging in with no authentication.")

    if not dryrun:
        s = smtplib.SMTP_SSL(SMTP_HOST, SMTP_PORT) if SMTP_SSL else smtplib.SMTP(SMTP_HOST, SMTP_PORT)
        if SMTP_STARTTLS:
            s.starttls()
        if SMTP_USER and SMTP_PASSWORD:
            s.login(SMTP_USER, SMTP_PASSWORD)
        log.info("Sent an alert email to %s", e_to)
        s.sendmail(e_from, e_to, mime_msg.as_string())
        s.quit()
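A minimal usage sketch for the helper above; the addresses are placeholders, and dryrun=True means only the configuration lookups are exercised, no SMTP session is opened:

from email.mime.text import MIMEText

mime_msg = MIMEText("<b>Task failed</b>", "html")
mime_msg["Subject"] = "Airflow alert"
mime_msg["From"] = "airflow@example.com"   # hypothetical sender
mime_msg["To"] = "oncall@example.com"      # hypothetical recipient

# With dryrun=True the function returns before contacting the SMTP server.
send_MIME_email("airflow@example.com", ["oncall@example.com"], mime_msg, dryrun=True)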
Example #7
    def get_metastore_client(self):
        """
        Returns a Hive thrift client.
        """
        from thrift.transport import TSocket, TTransport
        from thrift.protocol import TBinaryProtocol
        from hive_service import ThriftHive
        ms = self.metastore_conn
        auth_mechanism = ms.extra_dejson.get('authMechanism', 'NOSASL')
        if configuration.get('core', 'security') == 'kerberos':
            auth_mechanism = ms.extra_dejson.get('authMechanism', 'GSSAPI')
            kerberos_service_name = ms.extra_dejson.get('kerberos_service_name', 'hive')

        socket = TSocket.TSocket(ms.host, ms.port)
        if configuration.get('core', 'security') == 'kerberos' and auth_mechanism == 'GSSAPI':
            try:
                import saslwrapper as sasl
            except ImportError:
                import sasl

            def sasl_factory():
                sasl_client = sasl.Client()
                sasl_client.setAttr("host", ms.host)
                sasl_client("service", kerberos_service_name)
                sasl_client.init()

            from thrift_sasl import TSaslClientTransport
            transport = TSaslClientTransport(sasl_factory, "GSSAPI", socket)
        else:
            transport = TTransport.TBufferedTransport(socket)

        protocol = TBinaryProtocol.TBinaryProtocol(transport)

        return ThriftHive.Client(protocol)
Example #8
def flower(args):
    broka = configuration.get("celery", "BROKER_URL")
    args.port = args.port or configuration.get("celery", "FLOWER_PORT")
    port = "--port=" + args.port
    api = ""
    if args.broker_api:
        api = "--broker_api=" + args.broker_api
    sp = subprocess.Popen(["flower", "-b", broka, port, api])
    sp.wait()
Example #9
def flower(args):
    broka = configuration.get('celery', 'BROKER_URL')
    args.port = args.port or configuration.get('celery', 'FLOWER_PORT')
    port = '--port=' + args.port
    api = ''
    if args.broker_api:
        api = '--broker_api=' + args.broker_api
    sp = subprocess.Popen(['flower', '-b', broka, port, api])
    sp.wait()
Example #10
def flower(args):
    broka = configuration.get('celery', 'BROKER_URL')
    args.port = args.port or configuration.get('celery', 'FLOWER_PORT')
    port = '--port=' + args.port
    api = ''
    if args.broker_api:
        api = '--broker_api=' + args.broker_api

    flower = distutils.spawn.find_executable('flower')
    os.execv(flower, [flower, '-b', broka, port, api])
Example #11
    def __init__(self, cluster_address=None):
        if cluster_address is None:
            cluster_address = configuration.conf.get('dask', 'cluster_address')
        if not cluster_address:
            raise ValueError(
                'Please provide a Dask cluster address in airflow.cfg')
        self.cluster_address = cluster_address
        # ssl / tls parameters
        self.tls_ca = configuration.get('dask', 'tls_ca')
        self.tls_key = configuration.get('dask', 'tls_key')
        self.tls_cert = configuration.get('dask', 'tls_cert')
        super(DaskExecutor, self).__init__(parallelism=0)
Example #12
def webserver(args):
    print(settings.HEADER)

    from airflow.www.app import cached_app

    app = cached_app(conf)
    access_logfile = args.access_logfile or conf.get("webserver", "access_logfile")
    error_logfile = args.error_logfile or conf.get("webserver", "error_logfile")
    workers = args.workers or conf.get("webserver", "workers")
    worker_timeout = args.worker_timeout or conf.get("webserver", "webserver_worker_timeout")
    if args.debug:
        print("Starting the web server on port {0} and host {1}.".format(args.port, args.hostname))
        app.run(debug=True, port=args.port, host=args.hostname)
    else:
        pid, stdout, stderr, log_file = setup_locations("webserver", pid=args.pid)
        print(
            textwrap.dedent(
                """\
                Running the Gunicorn Server with:
                Workers: {workers} {args.workerclass}
                Host: {args.hostname}:{args.port}
                Timeout: {worker_timeout}
                Logfiles: {access_logfile} {error_logfile}
                =================================================================\
            """.format(
                    **locals()
                )
            )
        )

        run_args = [
            "gunicorn",
            "-w " + str(args.workers),
            "-k " + str(args.workerclass),
            "-t " + str(args.worker_timeout),
            "-b " + args.hostname + ":" + str(args.port),
            "-n " + "airflow-webserver",
            "-p " + str(pid),
        ]

        if args.access_logfile:
            run_args += ["--access-logfile", str(args.access_logfile)]

        if args.error_logfile:
            run_args += ["--error-logfile", str(args.error_logfile)]

        if args.daemon:
            run_args += ["-D"]

        module = "airflow.www.app:cached_app()".encode()
        run_args += [module]
        os.execvp("gunicorn", run_args)
Example #13
def webserver(args):
    print(settings.HEADER)

    from airflow.www.app import cached_app
    app = cached_app(conf)
    access_logfile = args.access_logfile or conf.get('webserver', 'access_logfile')
    error_logfile = args.error_logfile or conf.get('webserver', 'error_logfile')
    workers = args.workers or conf.get('webserver', 'workers')
    worker_timeout = (args.worker_timeout or
                      conf.get('webserver', 'webserver_worker_timeout'))
    if args.debug:
        print(
            "Starting the web server on port {0} and host {1}.".format(
                args.port, args.hostname))
        app.run(debug=True, port=args.port, host=args.hostname)
    else:
        pid, stdout, stderr, log_file = setup_locations("webserver", pid=args.pid)
        print(
            textwrap.dedent('''\
                Running the Gunicorn Server with:
                Workers: {workers} {args.workerclass}
                Host: {args.hostname}:{args.port}
                Timeout: {worker_timeout}
                Logfiles: {access_logfile} {error_logfile}
                =================================================================\
            '''.format(**locals())))

        run_args = [
            'gunicorn',
            '-w ' + str(args.workers),
            '-k ' + str(args.workerclass),
            '-t ' + str(args.worker_timeout),
            '-b ' + args.hostname + ':' + str(args.port),
            '-n ' + 'airflow-webserver',
            '-p ' + str(pid),
        ]

        if args.access_logfile:
            run_args += ['--access-logfile', str(args.access_logfile)]

        if args.error_logfile:
            run_args += ['--error-logfile', str(args.error_logfile)]

        if args.daemon:
            run_args += ["-D"]

        module = "airflow.www.app:cached_app()".encode()
        run_args += [module]
        os.execvp(
            'gunicorn', run_args
        )
Example #14
    def authenticate(username, password):
        service_principal = "%s/%s" % (configuration.get('kerberos', 'principal'), utils.get_fqdn())
        realm = configuration.get("kerberos", "default_realm")
        user_principal = utils.principal_from_username(username)

        try:
            # this is pykerberos specific, verify = True is needed to prevent KDC spoofing
            if not kerberos.checkPassword(user_principal, password, service_principal, realm, True):
                raise AuthenticationError()
        except kerberos.KrbError as e:
            logging.error('Password validation for principal %s failed %s', user_principal, e)
            raise AuthenticationError(e)

        return
Example #15
    def __init__(self,
                 task_queue,
                 result_queue,
                 task_cpu=1,
                 task_mem=256):
        self.task_queue = task_queue
        self.result_queue = result_queue
        self.task_cpu = task_cpu
        self.task_mem = task_mem
        self.task_counter = 0
        self.task_key_map = {}
        if configuration.get('mesos', 'DOCKER_IMAGE_SLAVE'):
            self.mesos_slave_docker_image = configuration.get(
                'mesos', 'DOCKER_IMAGE_SLAVE'
            )
Example #16
    def get_conn(self):
        """
        Returns a snakebite HDFSClient object.
        """
        connections = self.get_connections(self.hdfs_conn_id)
        use_sasl = False
        if configuration.get('core', 'security') == 'kerberos':
            use_sasl = True

        # When using HAClient, proxy_user must be the same, so it is okay to always take the first.
        effective_user = self.proxy_user or connections[0].login
        if len(connections) == 1:
            autoconfig = connections[0].extra_dejson.get('autoconfig', False)
            if autoconfig:
                client = AutoConfigClient(effective_user=effective_user, use_sasl=use_sasl)
            else:
                hdfs_namenode_principal = connections[0].extra_dejson.get('hdfs_namenode_principal')
                client = Client(connections[0].host, connections[0].port,
                                effective_user=effective_user, use_sasl=use_sasl,
                                hdfs_namenode_principal=hdfs_namenode_principal)
        elif len(connections) > 1:
            hdfs_namenode_principal = connections[0].extra_dejson.get('hdfs_namenode_principal')
            nn = [Namenode(conn.host, conn.port) for conn in connections]
            client = HAClient(nn, effective_user=effective_user, use_sasl=use_sasl,
                              hdfs_namenode_principal=hdfs_namenode_principal)
        else:
            raise HDFSHookException("conn_id doesn't exist in the repository")

        return client
Example #17
def flower(args):
    broka = conf.get("celery", "BROKER_URL")
    address = "--address={}".format(args.hostname)
    port = "--port={}".format(args.port)
    api = ""
    if args.broker_api:
        api = "--broker_api=" + args.broker_api

    flower_conf = ""
    if args.flower_conf:
        flower_conf = "--conf=" + args.flower_conf

    if args.daemon:
        pid, stdout, stderr, log_file = setup_locations("flower", args.pid, args.stdout, args.stderr, args.log_file)
        stdout = open(stdout, "w+")
        stderr = open(stderr, "w+")

        ctx = daemon.DaemonContext(pidfile=TimeoutPIDLockFile(pid, -1), stdout=stdout, stderr=stderr)

        with ctx:
            os.execvp("flower", ["flower", "-b", broka, address, port, api, flower_conf])

        stdout.close()
        stderr.close()
    else:
        signal.signal(signal.SIGINT, sigint_handler)
        signal.signal(signal.SIGTERM, sigint_handler)

        os.execvp("flower", ["flower", "-b", broka, address, port, api, flower_conf])
Example #18
def send_email(to, subject, html_content, files=None, dryrun=False):
    """
    Send an email with html content

    >>> send_email('*****@*****.**', 'foo', '<b>Foo</b> bar', ['/dev/null'], dryrun=True)
    """
    SMTP_MAIL_FROM = configuration.get('smtp', 'SMTP_MAIL_FROM')

    if isinstance(to, basestring):
        if ',' in to:
            to = to.split(',')
        elif ';' in to:
            to = to.split(';')
        else:
            to = [to]

    msg = MIMEMultipart('alternative')
    msg['Subject'] = subject
    msg['From'] = SMTP_MAIL_FROM
    msg['To'] = ", ".join(to)
    mime_text = MIMEText(html_content, 'html')
    msg.attach(mime_text)

    for fname in files or []:
        basename = os.path.basename(fname)
        with open(fname, "rb") as f:
            msg.attach(MIMEApplication(
                f.read(),
                Content_Disposition='attachment; filename="%s"' % basename,
                Name=basename
            ))

    send_MIME_email(SMTP_MAIL_FROM, to, msg, dryrun)
Example #19
def send_MIME_email(e_from, e_to, mime_msg, dryrun=False):
    SMTP_HOST = configuration.get('smtp', 'SMTP_HOST')
    SMTP_PORT = configuration.getint('smtp', 'SMTP_PORT')
    SMTP_USER = configuration.get('smtp', 'SMTP_USER')
    SMTP_PASSWORD = configuration.get('smtp', 'SMTP_PASSWORD')
    SMTP_STARTTLS = configuration.getboolean('smtp', 'SMTP_STARTTLS')

    if not dryrun:
        s = smtplib.SMTP(SMTP_HOST, SMTP_PORT)
        if SMTP_STARTTLS:
            s.starttls()
        if SMTP_USER and SMTP_PASSWORD:
            s.login(SMTP_USER, SMTP_PASSWORD)
        logging.info("Sent an alert email to " + str(e_to))
        s.sendmail(e_from, e_to, mime_msg.as_string())
        s.quit()
Example #20
def flower(args):
    broka = conf.get('celery', 'BROKER_URL')
    port = '--port={}'.format(args.port)
    api = ''
    if args.broker_api:
        api = '--broker_api=' + args.broker_api

    if args.daemon:
        pid, stdout, stderr, log_file = setup_locations("flower", args.pid, args.stdout, args.stderr, args.log_file)
        stdout = open(stdout, 'w+')
        stderr = open(stderr, 'w+')

        ctx = daemon.DaemonContext(
            pidfile=TimeoutPIDLockFile(pid, -1),
            stdout=stdout,
            stderr=stderr,
        )

        with ctx:
            os.execvp("flower", ['flower', '-b', broka, port, api])

        stdout.close()
        stderr.close()
    else:
        signal.signal(signal.SIGINT, sigint_handler)
        signal.signal(signal.SIGTERM, sigint_handler)

        os.execvp("flower", ['flower', '-b', broka, port, api])
Example #21
    def get_results(self, ti=None, fp=None, inline=True, delim=None, fetch=True):
        """
        Get results (or just s3 locations) of a command from Qubole and save into a file
        :param ti: Task Instance of the dag, used to determine the Quboles command id
        :param fp: Optional file pointer, will create one and return if None passed
        :param inline: True to download actual results, False to get s3 locations only
        :param delim: Replaces the CTL-A chars with the given delim, defaults to ','
        :param fetch: when inline is True, get results directly from s3 (if large)
        :return: file location containing actual results or s3 locations of results
        """
        if fp is None:
            iso = datetime.datetime.utcnow().isoformat()
            logpath = os.path.expanduser(configuration.get('core', 'BASE_LOG_FOLDER'))
            resultpath = logpath + '/' + self.dag_id + '/' + self.task_id + '/results'
            configuration.mkdir_p(resultpath)
            fp = open(resultpath + '/' + iso, 'wb')

        if self.cmd is None:
            cmd_id = ti.xcom_pull(key="qbol_cmd_id", task_ids=self.task_id)
            self.cmd = self.cls.find(cmd_id)

        self.cmd.get_results(fp, inline, delim, fetch)
        fp.flush()
        fp.close()
        return fp.name
Example #22
    def __init__(
            self,
            dag_id=None,
            dag_ids=None,
            subdir=None,
            test_mode=False,
            refresh_dags_every=10,
            num_runs=None,
            do_pickle=False,
            *args, **kwargs):

        # for BaseJob compatibility
        self.dag_id = dag_id
        self.dag_ids = [dag_id] if dag_id else []
        if dag_ids:
            self.dag_ids.extend(dag_ids)

        self.subdir = subdir

        if test_mode:
            self.num_runs = 1
        else:
            self.num_runs = num_runs

        self.refresh_dags_every = refresh_dags_every
        self.do_pickle = do_pickle
        super(SchedulerJob, self).__init__(*args, **kwargs)

        self.heartrate = conf.getint('scheduler', 'SCHEDULER_HEARTBEAT_SEC')
        self.max_threads = min(conf.getint('scheduler', 'max_threads'), multiprocessing.cpu_count())
        if 'sqlite' in conf.get('core', 'sql_alchemy_conn'):
            if self.max_threads > 1:
                self.logger.error("Cannot use more than 1 thread when using sqlite. Setting max_threads to 1")
            self.max_threads = 1
Example #23
    def __init__(
            self, hql,
            hive_cli_conn_id='hive_cli_default',
            schema='default',
            hiveconfs=None,
            hiveconf_jinja_translate=False,
            script_begin_tag=None,
            run_as_owner=False,
            mapred_queue=None,
            mapred_queue_priority=None,
            mapred_job_name=None,
            *args, **kwargs):

        super(HiveOperator, self).__init__(*args, **kwargs)
        self.hql = hql
        self.hive_cli_conn_id = hive_cli_conn_id
        self.schema = schema
        self.hiveconfs = hiveconfs or {}
        self.hiveconf_jinja_translate = hiveconf_jinja_translate
        self.script_begin_tag = script_begin_tag
        self.run_as = None
        if run_as_owner:
            self.run_as = self.dag.owner
        self.mapred_queue = mapred_queue
        self.mapred_queue_priority = mapred_queue_priority
        self.mapred_job_name = mapred_job_name
        self.mapred_job_name_template = configuration.get('hive',
                                                          'mapred_job_name_template')

        # assigned lazily - just for consistency we can create the attribute with a
        # `None` initial value, later it will be populated by the execute method.
        # This also makes `on_kill` implementation consistent since it assumes `self.hook`
        # is defined.
        self.hook = None
Example #24
def load_login():
    log = LoggingMixin().log

    auth_backend = 'airflow.default_login'
    try:
        if conf.getboolean('webserver', 'AUTHENTICATE'):
            auth_backend = conf.get('webserver', 'auth_backend')
    except conf.AirflowConfigException:
        if conf.getboolean('webserver', 'AUTHENTICATE'):
            log.warning(
                "auth_backend not found in webserver config reverting to "
                "*deprecated*  behavior of importing airflow_login")
            auth_backend = "airflow_login"

    try:
        global login
        login = import_module(auth_backend)
    except ImportError as err:
        log.critical(
            "Cannot import authentication module %s. "
            "Please correct your authentication backend or disable authentication: %s",
            auth_backend, err
        )
        if conf.getboolean('webserver', 'AUTHENTICATE'):
            raise AirflowException("Failed to import authentication backend")
Example #25
def resetdb(args):
    print("DB: " + configuration.get("core", "SQL_ALCHEMY_CONN"))
    if input("This will drop existing tables if they exist. " "Proceed? (y/n)").upper() == "Y":
        logging.basicConfig(level=settings.LOGGING_LEVEL, format=settings.SIMPLE_LOG_FORMAT)
        utils.resetdb()
    else:
        print("Bail.")
Example #26
def validate_logging_config(logging_config):
    # Now lets validate the other logging-related settings
    task_log_reader = conf.get('core', 'task_log_reader')

    logger = logging.getLogger('airflow.task')

    def _get_handler(name):
        return next((h for h in logger.handlers if h.name == name), None)

    if _get_handler(task_log_reader) is None:
        # Check for pre 1.10 setting that might be in deployed airflow.cfg files
        if task_log_reader == "file.task" and _get_handler("task"):
            warnings.warn(
                "task_log_reader setting in [core] has a deprecated value of "
                "{!r}, but no handler with this name was found. Please update "
                "your config to use {!r}. Running config has been adjusted to "
                "match".format(
                    task_log_reader,
                    "task",
                ),
                DeprecationWarning,
            )
            conf.set('core', 'task_log_reader', 'task')
        else:
            raise AirflowConfigException(
                "Configured task_log_reader {!r} was not a handler of the 'airflow.task' "
                "logger.".format(task_log_reader)
            )
Example #27
def webserver(args):
    print(settings.HEADER)
    log_to_stdout()
    from airflow.www.app import cached_app

    app = cached_app(configuration)
    threads = args.threads or configuration.get("webserver", "threads")
    if args.debug:
        print("Starting the web server on port {0} and host {1}.".format(args.port, args.hostname))
        app.run(debug=True, port=args.port, host=args.hostname)
    else:
        print(
            "Running the Gunicorn server with {threads}"
            "on host {args.hostname} and port "
            "{args.port}...".format(**locals())
        )
        sp = subprocess.Popen(
            [
                "gunicorn",
                "-w",
                str(args.threads),
                "-t",
                "120",
                "-b",
                args.hostname + ":" + str(args.port),
                "airflow.www.app:cached_app()",
            ]
        )
        sp.wait()
Example #28
    def serve_logs(filename):  # noqa
        log = os.path.expanduser(conf.get('core', 'BASE_LOG_FOLDER'))
        return flask.send_from_directory(
            log,
            filename,
            mimetype="application/json",
            as_attachment=False)
Example #29
    def write(self, log, remote_log_location, append=False):
        """
        Writes the log to the remote_log_location. Fails silently if no hook
        was created.

        :param log: the log to write to the remote_log_location
        :type log: string
        :param remote_log_location: the log's location in remote storage
        :type remote_log_location: string (path)
        :param append: if False, any existing log file is overwritten. If True,
            the new log is appended to any existing logs.
        :type append: bool

        """
        if self.hook:

            if append:
                old_log = self.read(remote_log_location)
                log = old_log + '\n' + log
            try:
                self.hook.load_string(
                    log,
                    key=remote_log_location,
                    replace=True,
                    encrypt=configuration.get('core', 'ENCRYPT_S3_LOGS'))
                return
            except:
                pass

        # raise/return error if we get here
        logging.error('Could not write logs to {}'.format(remote_log_location))
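A hedged usage sketch of the method above; the class name S3Log and the bucket path are assumptions for illustration, not taken from the snippet:

handler = S3Log()  # hypothetical class owning the write() method shown above
handler.write(
    "task run finished",
    remote_log_location="s3://my-bucket/logs/my_dag/my_task/2017-01-01",  # made-up path
    append=True,
)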
Example #30
    def __init__(
            self,
            hive_cli_conn_id="hive_cli_default",
            run_as=None,
            mapred_queue=None,
            mapred_queue_priority=None,
            mapred_job_name=None):
        conn = self.get_connection(hive_cli_conn_id)
        self.hive_cli_params = conn.extra_dejson.get('hive_cli_params', '')
        self.use_beeline = conn.extra_dejson.get('use_beeline', False)
        self.auth = conn.extra_dejson.get('auth', 'noSasl')
        self.conn = conn
        self.run_as = run_as

        if mapred_queue_priority:
            mapred_queue_priority = mapred_queue_priority.upper()
            if mapred_queue_priority not in HIVE_QUEUE_PRIORITIES:
                raise AirflowException(
                    "Invalid Mapred Queue Priority.  Valid values are: "
                    "{}".format(', '.join(HIVE_QUEUE_PRIORITIES)))

        self.mapred_queue = mapred_queue or configuration.get('hive',
                                                              'default_hive_mapred_queue')
        self.mapred_queue_priority = mapred_queue_priority
        self.mapred_job_name = mapred_job_name
Example #31
def run(args):

    utils.pessimistic_connection_handling()
    # Setting up logging
    log = os.path.expanduser(configuration.get('core', 'BASE_LOG_FOLDER'))
    directory = log + "/{args.dag_id}/{args.task_id}".format(args=args)
    if not os.path.exists(directory):
        os.makedirs(directory)
    args.execution_date = dateutil.parser.parse(args.execution_date)
    iso = args.execution_date.isoformat()
    filename = "{directory}/{iso}".format(**locals())

    # store old log (to help with S3 appends)
    if os.path.exists(filename):
        with open(filename, 'r') as logfile:
            old_log = logfile.read()
    else:
        old_log = None

    subdir = process_subdir(args.subdir)
    logging.basicConfig(filename=filename,
                        level=settings.LOGGING_LEVEL,
                        format=settings.LOG_FORMAT)
    if not args.pickle:
        dagbag = DagBag(subdir)
        if args.dag_id not in dagbag.dags:
            msg = 'DAG [{0}] could not be found in {1}'.format(
                args.dag_id, subdir)
            logging.error(msg)
            raise AirflowException(msg)
        dag = dagbag.dags[args.dag_id]
        task = dag.get_task(task_id=args.task_id)
    else:
        session = settings.Session()
        logging.info('Loading pickle id {args.pickle}'.format(**locals()))
        dag_pickle = session.query(DagPickle).filter(
            DagPickle.id == args.pickle).first()
        if not dag_pickle:
            raise AirflowException("Who hid the pickle!? [missing pickle]")
        dag = dag_pickle.pickle
        task = dag.get_task(task_id=args.task_id)

    task_start_date = None
    if args.task_start_date:
        task_start_date = dateutil.parser.parse(args.task_start_date)
        task.start_date = task_start_date
    ti = TaskInstance(task, args.execution_date)

    if args.local:
        print("Logging into: " + filename)
        run_job = jobs.LocalTaskJob(
            task_instance=ti,
            mark_success=args.mark_success,
            force=args.force,
            pickle_id=args.pickle,
            task_start_date=task_start_date,
            ignore_dependencies=args.ignore_dependencies,
            pool=args.pool)
        run_job.run()
    elif args.raw:
        ti.run(
            mark_success=args.mark_success,
            force=args.force,
            ignore_dependencies=args.ignore_dependencies,
            job_id=args.job_id,
            pool=args.pool,
        )
    else:
        pickle_id = None
        if args.ship_dag:
            try:
                # Running remotely, so pickling the DAG
                session = settings.Session()
                pickle = DagPickle(dag)
                session.add(pickle)
                session.commit()
                pickle_id = pickle.id
                print(('Pickled dag {dag} '
                       'as pickle_id:{pickle_id}').format(**locals()))
            except Exception as e:
                print('Could not pickle the DAG')
                print(e)
                raise e

        executor = DEFAULT_EXECUTOR
        executor.start()
        print("Sending to executor.")
        executor.queue_task_instance(
            ti,
            mark_success=args.mark_success,
            pickle_id=pickle_id,
            ignore_dependencies=args.ignore_dependencies,
            force=args.force)
        executor.heartbeat()
        executor.end()

    if configuration.get('core', 'S3_LOG_FOLDER').startswith('s3:'):
        import boto
        s3_log = filename.replace(log,
                                  configuration.get('core', 'S3_LOG_FOLDER'))
        bucket, key = s3_log.lstrip('s3:/').split('/', 1)
        if os.path.exists(filename):

            # get logs
            with open(filename, 'r') as logfile:
                new_log = logfile.read()

            # remove old logs (since they are already in S3)
            if old_log:
                new_log = new_log.replace(old_log, '')

            try:
                s3 = boto.connect_s3()
                s3_key = boto.s3.key.Key(s3.get_bucket(bucket), key)

                # append new logs to old S3 logs, if available
                if s3_key.exists():
                    old_s3_log = s3_key.get_contents_as_string().decode()
                    new_log = old_s3_log + '\n' + new_log

                # send log to S3
                s3_key.set_contents_from_string(new_log)
            except:
                print('Could not send logs to S3.')
Example #32
from airflow import DAG
from datetime import datetime, timedelta
from airflow.contrib.operators.kubernetes_pod_operator import KubernetesPodOperator
from airflow import configuration as conf

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2019, 1, 1),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

namespace = conf.get('kubernetes', 'NAMESPACE')

# This will detect the default namespace locally and read the
# environment namespace when deployed to Astronomer.
if namespace == 'default':
    config_file = '/usr/local/airflow/include/.kube/config'
    in_cluster = False
else:
    in_cluster = True
    config_file = None

dag = DAG('example_kubernetes_pod',
          schedule_interval='@once',
          default_args=default_args)

compute_resource = {
Example #33
import traceback
import time
import psutil

import airflow
from airflow import jobs, settings
from airflow import configuration as conf
from airflow.exceptions import AirflowException
from airflow.executors import DEFAULT_EXECUTOR
from airflow.models import DagModel, DagBag, TaskInstance, DagPickle, DagRun, Variable
from airflow.utils import db as db_utils
from airflow.utils import logging as logging_utils
from airflow.utils.state import State
from airflow.www.app import cached_app

DAGS_FOLDER = os.path.expanduser(conf.get('core', 'DAGS_FOLDER'))


def sigint_handler(sig, frame):
    sys.exit(0)


def sigquit_handler(sig, frame):
    """Helps debug deadlocks by printing stacktraces when this gets a SIGQUIT
    e.g. kill -s QUIT <PID> or CTRL+\
    """
    print("Dumping stack traces for all threads in PID {}".format(os.getpid()))
    id_to_name = dict([(th.ident, th.name) for th in threading.enumerate()])
    code = []
    for thread_id, stack in sys._current_frames().items():
        code.append("\n# Thread: {}({})"
Example #34
from airflow import DAG
from airflow.operators import RedshiftVacuumOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow import configuration
from datetime import datetime

args = {
    'owner': 'scopeworker',
    'provide_context': True
}

dag = DAG('redshift_vacuum_plugin', description='REDSHIFT VACUUM DAG',
          schedule_interval='0 5 * * *',
          start_date=datetime(2017, 3, 20),
          catchup=False,
          default_args=args)


redshift_operator = RedshiftVacuumOperator(
    task_id="vacumming_task",
    redshift_connection_id=configuration.get("postgresql", "postgresql_conn_id"),
    query="COMMIT;vacuum; ANALYZE; COMMIT;",
    dag=dag)

dummy_operator = DummyOperator(task_id='dummy_task', retries=3, dag=dag)

dummy_operator >> redshift_operator
Example #35
from airflow.operators import python_operator
from airflow.utils.trigger_rule import TriggerRule
from airflow.operators import email_operator

# We set the start_date of the DAG to the previous date. This will
# make the DAG immediately available for scheduling.
YESTERDAY = datetime.datetime.combine(
    datetime.datetime.today() - datetime.timedelta(1),
    datetime.datetime.min.time())

# We define some variables that we will use in the DAG tasks.
SUCCESS_TAG = 'success'
FAILURE_TAG = 'failure'

DS_TAG = '{{ ds }}'
DATAFLOW_FILE = os.path.join(configuration.get('core', 'dags_folder'),
                             'dataflow', 'process_json.py')

DEFAULT_DAG_ARGS = {
    'start_date': YESTERDAY,
    'email': models.Variable.get('email'),
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 0,
    'project_id': models.Variable.get('gcp_project'),
    'dataflow_default_options': {
        'project': models.Variable.get('gcp_project'),
        'temp_location': models.Variable.get('gcp_temp_location'),
        'runner': 'DataflowRunner'
    }
}
Example #36
def get_es_hosts():
    result = conf.get("core", "elasticsearch_hosts")
    assert result
    return [x.strip().split(':') for x in result.split(',')]
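The same parsing applied to a hypothetical configuration value, to show the returned shape:

result = "es1.example.com:9200, es2.example.com:9200"   # made-up elasticsearch_hosts value
hosts = [x.strip().split(':') for x in result.split(',')]
# hosts == [['es1.example.com', '9200'], ['es2.example.com', '9200']]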
Example #37
def verify_s3_prefix():
    reach_s3_prefix = conf.get("core", "reach_s3_prefix")
    assert reach_s3_prefix.startswith('s3://')
    assert not reach_s3_prefix.endswith('/')
Example #38
import unittest

from airflow import configuration
from airflow.models import DAG, DagBag, TaskInstance, State
from airflow.jobs import BackfillJob
from airflow.operators.python_operator import PythonOperator

try:
    from airflow.executors.dask_executor import DaskExecutor
    from distributed import LocalCluster
    SKIP_DASK = False
except ImportError:
    logging.error('Dask unavailable, skipping DaskExecutor tests')
    SKIP_DASK = True

if 'sqlite' in configuration.get('core', 'sql_alchemy_conn'):
    logging.error('sqlite does not support concurrent access')
    SKIP_DASK = True

DEFAULT_DATE = datetime.datetime(2017, 1, 1)


class DaskExecutorTest(unittest.TestCase):
    def setUp(self):
        self.dagbag = DagBag(include_examples=True)

    @unittest.skipIf(SKIP_DASK, 'Dask unsupported by this configuration')
    def test_dask_executor_functions(self):
        cluster = LocalCluster()

        executor = DaskExecutor(cluster_address=cluster.scheduler_address)
Example #39
import os
import pendulum

from sqlalchemy import create_engine
from sqlalchemy.orm import scoped_session, sessionmaker
from sqlalchemy.pool import NullPool

from airflow import configuration as conf
from airflow.logging_config import configure_logging

log = logging.getLogger(__name__)


TIMEZONE = pendulum.timezone('UTC')
try:
    tz = conf.get("core", "default_timezone")
    if tz == "system":
        TIMEZONE = pendulum.local_timezone()
    else:
        TIMEZONE = pendulum.timezone(tz)
except:
    pass
log.info("Configured default timezone %s" % TIMEZONE)


class DummyStatsLogger(object):
    @classmethod
    def incr(cls, stat, count=1, rate=1):
        pass

    @classmethod
Example #40
class CLIFactory(object):
    args = {
        # Shared
        'dag_id':
        Arg(("dag_id", ), "The id of the dag"),
        'task_id':
        Arg(("task_id", ), "The id of the task"),
        'execution_date':
        Arg(("execution_date", ),
            help="The execution date of the DAG",
            type=parsedate),
        'task_regex':
        Arg(("-t", "--task_regex"),
            "The regex to filter specific task_ids to backfill (optional)"),
        'subdir':
        Arg(("-sd", "--subdir"),
            "File location or directory from which to look for the dag",
            default=DAGS_FOLDER),
        'start_date':
        Arg(("-s", "--start_date"),
            "Override start_date YYYY-MM-DD",
            type=parsedate),
        'end_date':
        Arg(("-e", "--end_date"),
            "Override end_date YYYY-MM-DD",
            type=parsedate),
        'dry_run':
        Arg(("-dr", "--dry_run"), "Perform a dry run", "store_true"),
        'pid':
        Arg(("--pid", ), "PID file location", nargs='?'),
        'foreground':
        Arg(("-f", "--foreground"), "Do not detach. Run in foreground",
            "store_true"),
        'stderr':
        Arg(("--stderr", ), "Redirect stderr to this file"),
        'stdout':
        Arg(("--stdout", ), "Redirect stdout to this file"),
        'log_file':
        Arg(("-l", "--log-file"), "Location of the log file"),

        # backfill
        'mark_success':
        Arg(("-m", "--mark_success"),
            "Mark jobs as succeeded without running them", "store_true"),
        'local':
        Arg(("-l", "--local"), "Run the task using the LocalExecutor",
            "store_true"),
        'donot_pickle':
        Arg(("-x", "--donot_pickle"),
            ("Do not attempt to pickle the DAG object to send over "
             "to the workers, just tell the workers to run their version "
             "of the code."), "store_true"),
        'include_adhoc':
        Arg(("-a", "--include_adhoc"),
            "Include dags with the adhoc parameter.", "store_true"),
        'bf_ignore_dependencies':
        Arg(("-i", "--ignore_dependencies"),
            ("Skip upstream tasks, run only the tasks "
             "matching the regexp. Only works in conjunction "
             "with task_regex"), "store_true"),
        'bf_ignore_first_depends_on_past':
        Arg(("-I", "--ignore_first_depends_on_past"),
            ("Ignores depends_on_past dependencies for the first "
             "set of tasks only (subsequent executions in the backfill "
             "DO respect depends_on_past)."), "store_true"),
        'pool':
        Arg(("--pool", ), "Resource pool to use"),
        # list_dags
        'tree':
        Arg(("-t", "--tree"), "Tree view", "store_true"),
        # clear
        'upstream':
        Arg(("-u", "--upstream"), "Include upstream tasks", "store_true"),
        'only_failed':
        Arg(("-f", "--only_failed"), "Only failed jobs", "store_true"),
        'only_running':
        Arg(("-r", "--only_running"), "Only running jobs", "store_true"),
        'downstream':
        Arg(("-d", "--downstream"), "Include downstream tasks", "store_true"),
        'no_confirm':
        Arg(("-c", "--no_confirm"), "Do not request confirmation",
            "store_true"),
        # trigger_dag
        'run_id':
        Arg(("-r", "--run_id"), "Helps to indentify this run"),
        'conf':
        Arg(('-c', '--conf'),
            "json string that gets pickled into the DagRun's conf attribute"),
        # kerberos
        'principal':
        Arg(("principal", ),
            "kerberos principal",
            nargs='?',
            default=conf.get('kerberos', 'principal')),
        'keytab':
        Arg(("-kt", "--keytab"),
            "keytab",
            nargs='?',
            default=conf.get('kerberos', 'keytab')),
        # run
        'force':
        Arg(("-f", "--force"), "Force a run regardless or previous success",
            "store_true"),
        'raw':
        Arg(("-r", "--raw"), argparse.SUPPRESS, "store_true"),
        'ignore_dependencies':
        Arg(("-i", "--ignore_dependencies"),
            "Ignore upstream and depends_on_past dependencies", "store_true"),
        'ignore_depends_on_past':
        Arg(("-I", "--ignore_depends_on_past"),
            "Ignore depends_on_past dependencies (but respect "
            "upstream dependencies)", "store_true"),
        'ship_dag':
        Arg(("--ship_dag", ),
            "Pickles (serializes) the DAG and ships it to the worker",
            "store_true"),
        'pickle':
        Arg(("-p", "--pickle"),
            "Serialized pickle object of the entire dag (used internally)"),
        'job_id':
        Arg(("-j", "--job_id"), argparse.SUPPRESS),
        # webserver
        'port':
        Arg(("-p", "--port"),
            default=conf.get('webserver', 'WEB_SERVER_PORT'),
            type=int,
            help="The port on which to run the server"),
        'workers':
        Arg(("-w", "--workers"),
            default=conf.get('webserver', 'WORKERS'),
            type=int,
            help="Number of workers to run the webserver on"),
        'workerclass':
        Arg(("-k", "--workerclass"),
            default=conf.get('webserver', 'WORKER_CLASS'),
            choices=['sync', 'eventlet', 'gevent', 'tornado'],
            help="The worker class to use for gunicorn"),
        'worker_timeout':
        Arg(("-t", "--worker_timeout"),
            default=conf.get('webserver', 'WEB_SERVER_WORKER_TIMEOUT'),
            type=int,
            help="The timeout for waiting on webserver workers"),
        'hostname':
        Arg(("-hn", "--hostname"),
            default=conf.get('webserver', 'WEB_SERVER_HOST'),
            help="Set the hostname on which to run the web server"),
        'debug':
        Arg(("-d", "--debug"),
            "Use the server that ships with Flask in debug mode",
            "store_true"),
        # resetdb
        'yes':
        Arg(("-y", "--yes"),
            "Do not prompt to confirm reset. Use with care!",
            "store_true",
            default=False),
        # scheduler
        'dag_id_opt':
        Arg(("-d", "--dag_id"), help="The id of the dag to run"),
        'num_runs':
        Arg(("-n", "--num_runs"),
            default=None,
            type=int,
            help="Set the number of runs to execute before exiting"),
        # worker
        'do_pickle':
        Arg(("-p", "--do_pickle"),
            default=False,
            help=(
                "Attempt to pickle the DAG object to send over "
                "to the workers, instead of letting workers run their version "
                "of the code."),
            action="store_true"),
        'queues':
        Arg(("-q", "--queues"),
            help="Comma delimited list of queues to serve",
            default=conf.get('celery', 'DEFAULT_QUEUE')),
        'concurrency':
        Arg(("-c", "--concurrency"),
            type=int,
            help="The number of worker processes",
            default=conf.get('celery', 'celeryd_concurrency')),
        # flower
        'broker_api':
        Arg(("-a", "--broker_api"), help="Broker api"),
        'flower_port':
        Arg(("-p", "--port"),
            default=conf.get('webserver', 'WEB_SERVER_PORT'),
            type=int,
            help="The port on which to run the server"),
        'task_params':
        Arg(("-tp", "--task_params"),
            help="Sends a JSON params dict to the task"),
    }
    subparsers = (
        {
            'func':
            backfill,
            'help':
            "Run subsections of a DAG for a specified date range",
            'args':
            ('dag_id', 'task_regex', 'start_date', 'end_date', 'mark_success',
             'local', 'donot_pickle', 'include_adhoc',
             'bf_ignore_dependencies', 'bf_ignore_first_depends_on_past',
             'subdir', 'pool', 'dry_run')
        },
        {
            'func': list_tasks,
            'help': "List the tasks within a DAG",
            'args': ('dag_id', 'tree', 'subdir'),
        },
        {
            'func':
            clear,
            'help':
            "Clear a set of task instance, as if they never ran",
            'args': ('dag_id', 'task_regex', 'start_date', 'end_date',
                     'subdir', 'upstream', 'downstream', 'no_confirm'),
        },
        {
            'func': pause,
            'help': "Pause a DAG",
            'args': ('dag_id', 'subdir'),
        },
        {
            'func': unpause,
            'help': "Pause a DAG",
            'args': ('dag_id', 'subdir'),
        },
        {
            'func': trigger_dag,
            'help': "Trigger a DAG run",
            'args': ('dag_id', 'subdir', 'run_id', 'conf'),
        },
        {
            'func':
            kerberos,
            'help':
            "Start a kerberos ticket renewer",
            'args': ('principal', 'keytab', 'pid', 'foreground', 'stdout',
                     'stderr', 'log_file'),
        },
        {
            'func': render,
            'help': "Render a task instance's template(s)",
            'args': ('dag_id', 'task_id', 'execution_date', 'subdir'),
        },
        {
            'func':
            run,
            'help':
            "Run a single task instance",
            'args':
            ('dag_id', 'task_id', 'execution_date', 'subdir', 'mark_success',
             'force', 'pool', 'local', 'raw', 'ignore_dependencies',
             'ignore_depends_on_past', 'ship_dag', 'pickle', 'job_id'),
        },
        {
            'func': initdb,
            'help': "Initialize the metadata database",
            'args': tuple(),
        },
        {
            'func': list_dags,
            'help': "List all the DAGs",
            'args': ('subdir', ),
        },
        {
            'func': task_state,
            'help': "Get the status of a task instance",
            'args': ('dag_id', 'task_id', 'execution_date', 'subdir'),
        },
        {
            'func': serve_logs,
            'help': "Serve logs generate by worker",
            'args': tuple(),
        },
        {
            'func':
            test,
            'help':
            ("Test a task instance. This will run a task without checking for "
             "dependencies or recording it's state in the database."),
            'args': ('dag_id', 'task_id', 'execution_date', 'subdir',
                     'dry_run', 'task_params'),
        },
        {
            'func':
            webserver,
            'help':
            "Start a Airflow webserver instance",
            'args':
            ('port', 'workers', 'workerclass', 'worker_timeout', 'hostname',
             'pid', 'foreground', 'stdout', 'stderr', 'log_file', 'debug'),
        },
        {
            'func': resetdb,
            'help': "Burn down and rebuild the metadata database",
            'args': ('yes', ),
        },
        {
            'func': upgradedb,
            'help': "Upgrade metadata database to latest version",
            'args': tuple(),
        },
        {
            'func':
            scheduler,
            'help':
            "Start a scheduler scheduler instance",
            'args': ('dag_id_opt', 'subdir', 'num_runs', 'do_pickle', 'pid',
                     'foreground', 'stdout', 'stderr', 'log_file'),
        },
        {
            'func':
            worker,
            'help':
            "Start a Celery worker node",
            'args': ('do_pickle', 'queues', 'concurrency', 'pid', 'foreground',
                     'stdout', 'stderr', 'log_file'),
        },
        {
            'func':
            flower,
            'help':
            "Start a Celery Flower",
            'args': ('flower_port', 'broker_api', 'pid', 'foreground',
                     'stdout', 'stderr', 'log_file'),
        },
        {
            'func': version,
            'help': "Show the version",
            'args': tuple(),
        },
    )
    subparsers_dict = {sp['func'].__name__: sp for sp in subparsers}
    dag_subparsers = ('list_tasks', 'backfill', 'test', 'run', 'pause',
                      'unpause')

    @classmethod
    def get_parser(cls, dag_parser=False):
        parser = argparse.ArgumentParser()
        subparsers = parser.add_subparsers(help='sub-command help',
                                           dest='subcommand')
        subparsers.required = True

        subparser_list = cls.dag_subparsers if dag_parser else cls.subparsers_dict.keys()
        for sub in subparser_list:
            sub = cls.subparsers_dict[sub]
            sp = subparsers.add_parser(sub['func'].__name__, help=sub['help'])
            for arg in sub['args']:
                if 'dag_id' in arg and dag_parser:
                    continue
                arg = cls.args[arg]
                kwargs = {
                    f: getattr(arg, f)
                    for f in arg._fields if f != 'flags' and getattr(arg, f)
                }
                sp.add_argument(*arg.flags, **kwargs)
            sp.set_defaults(func=sub['func'])
        return parser
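A hedged usage sketch of the factory above; the command line is illustrative:

parser = CLIFactory.get_parser()
args = parser.parse_args(['webserver', '-p', '8080', '--hostname', '0.0.0.0'])
args.func(args)  # set_defaults(func=...) above dispatches this to webserver()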
Example #41
    def get_service_token(self, method, json):

        if method == 'POST':
            request_func = requests.post
            endpoint = self.service_connection.extra_dejson.get(
                'POST_END_POINT', None)
            if endpoint is None:
                endpoint = Service_Token_EndPoint
        else:
            raise AirflowException('Unexpected HTTP Method: ' + method)

        url = 'https://{host}/{endpoint}'.format(
            host=self.parse_host(self.service_connection.host),
            endpoint=endpoint)
        logging.info('URL :: ' + url)
        logging.info(json)

        for attempt_num in range(1, self.retry_limit + 1):
            try:
                if os.getenv("id") is not None:
                    id = os.environ['id']
                else:
                    id = configuration.get('service', 'id')
                if os.getenv("service") is not None:
                    service = os.environ['service']
                else:
                    service = configuration.get('service', 'service')
                if os.getenv("code") is not None:
                    code = os.environ['code']
                else:
                    code = configuration.get('service', 'code')
                if os.getenv("type") is not None:
                    type = os.environ['type']
                else:
                    type = configuration.get('service', 'type')
                logging.info('URL :: ' + url)
                logging.info(' id :: ' + id)
                logging.info(' service :: ' + service)
                logging.info(' code :: ' + code)
                logging.info(' type :: ' + type)
                query_params = '?type=%s&id=%s&service=%s&code=%s' % (
                    type, id, service, code)
                logging.info('Final query_params :: ' + query_params)
                url = url + query_params
                logging.info('Final Appended URL :: ' + url)

                response = request_func(url,
                                        json=json,
                                        headers=User_Headers,
                                        timeout=self.timeout_seconds)
                if response.status_code == 200:
                    return response.json()
                else:
                    raise AirflowException(
                        'Response: {0}, Status Code: {1}'.format(
                            response.content, response.status_code))
            except (requests_exceptions.ConnectionError,
                    requests_exceptions.Timeout) as e:
                logging.info(
                    'Attempt %s API Request to Query Service failed with reason: %s',
                    attempt_num, e)
        raise AirflowException(
            ('API requests to IMS Gateway Service failed {} times. ' +
             'Giving up.').format(self.retry_limit))
Example #42
    def __init__(self,
                 dag_directory,
                 file_paths,
                 max_runs,
                 processor_factory,
                 processor_timeout,
                 signal_conn,
                 async_mode=True):
        """
        :param dag_directory: Directory where DAG definitions are kept. All
            files in file_paths should be under this directory
        :type dag_directory: unicode
        :param file_paths: list of file paths that contain DAG definitions
        :type file_paths: list[unicode]
        :param max_runs: The number of times to parse and schedule each file. -1
            for unlimited.
        :type max_runs: int
        :param processor_factory: function that creates processors for DAG
            definition files. Arguments are (dag_definition_path)
        :type processor_factory: (unicode, unicode, list) -> (AbstractDagFileProcessor)
        :param processor_timeout: How long to wait before timing out a DAG file processor
        :type processor_timeout: timedelta
        :param signal_conn: connection to communicate signal with processor agent.
        :type signal_conn: airflow.models.connection.Connection
        :param async_mode: whether to start the manager in async mode
        :type async_mode: bool
        """
        self._file_paths = file_paths
        self._file_path_queue = []
        self._dag_directory = dag_directory
        self._max_runs = max_runs
        self._processor_factory = processor_factory
        self._signal_conn = signal_conn
        self._async_mode = async_mode

        self._parallelism = conf.getint('scheduler', 'max_threads')
        if 'sqlite' in conf.get('core',
                                'sql_alchemy_conn') and self._parallelism > 1:
            self.log.error("Cannot use more than 1 thread when using sqlite. "
                           "Setting parallelism to 1")
            self._parallelism = 1

        # Parse and schedule each file no faster than this interval.
        self._file_process_interval = conf.getint('scheduler',
                                                  'min_file_process_interval')
        # How often to print out DAG file processing stats to the log. Default to
        # 30 seconds.
        self.print_stats_interval = conf.getint('scheduler',
                                                'print_stats_interval')
        # Map from file path to the processor
        self._processors = {}
        # Map from file path to the last runtime
        self._last_runtime = {}
        # Map from file path to the last finish time
        self._last_finish_time = {}
        self._last_zombie_query_time = timezone.utcnow()
        # Last time that the DAG dir was traversed to look for files
        self.last_dag_dir_refresh_time = timezone.utcnow()
        # Last time stats were printed
        self.last_stat_print_time = timezone.datetime(2000, 1, 1)
        # TODO: Remove magic number
        self._zombie_query_interval = 10
        # Map from file path to the number of runs
        self._run_count = defaultdict(int)
        # Manager heartbeat key.
        self._heart_beat_key = 'heart-beat'
        # How long to wait before timing out a process to parse a DAG file
        self._processor_timeout = processor_timeout

        # How often to scan the DAGs directory for new files. Default to 5 minutes.
        self.dag_dir_list_interval = conf.getint('scheduler',
                                                 'dag_dir_list_interval')

        self._log = logging.getLogger('airflow.processor_manager')

        signal.signal(signal.SIGINT, self._exit_gracefully)
        signal.signal(signal.SIGTERM, self._exit_gracefully)
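For orientation, a hedged sketch of how a constructor with this signature might be exercised; the import path, the processor factory body, and the multiprocessing pipe used for signal_conn are assumptions drawn from the docstring above, not taken verbatim from the source:

import multiprocessing
from datetime import timedelta

# Import path assumed from the Airflow 1.10-era code layout.
from airflow.utils.dag_processing import DagFileProcessorManager


def processor_factory(file_path, *args):
    # Return an AbstractDagFileProcessor implementation for ``file_path``;
    # the concrete class is deployment-specific and deliberately left out.
    raise NotImplementedError


parent_conn, child_conn = multiprocessing.Pipe()

manager = DagFileProcessorManager(
    dag_directory='/usr/local/airflow/dags',
    file_paths=['/usr/local/airflow/dags/example_dag.py'],
    max_runs=-1,                          # -1 means parse indefinitely
    processor_factory=processor_factory,
    processor_timeout=timedelta(minutes=3),
    signal_conn=child_conn,
    async_mode=True,
)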
Example #43
0
def run(args, dag=None):
    db_utils.pessimistic_connection_handling()
    if dag:
        args.dag_id = dag.dag_id

    # Setting up logging
    log_base = os.path.expanduser(conf.get('core', 'BASE_LOG_FOLDER'))
    directory = log_base + "/{args.dag_id}/{args.task_id}".format(args=args)
    if not os.path.exists(directory):
        os.makedirs(directory)
    iso = args.execution_date.isoformat()
    filename = "{directory}/{iso}".format(**locals())

    logging.root.handlers = []
    logging.basicConfig(filename=filename,
                        level=settings.LOGGING_LEVEL,
                        format=settings.LOG_FORMAT)

    if not args.pickle and not dag:
        dag = get_dag(args)
    elif not dag:
        session = settings.Session()
        logging.info('Loading pickle id {args.pickle}'.format(**locals()))
        dag_pickle = session.query(DagPickle).filter(
            DagPickle.id == args.pickle).first()
        if not dag_pickle:
            raise AirflowException("Who hid the pickle!? [missing pickle]")
        dag = dag_pickle.pickle
    task = dag.get_task(task_id=args.task_id)

    ti = TaskInstance(task, args.execution_date)

    if args.local:
        print("Logging into: " + filename)
        run_job = jobs.LocalTaskJob(
            task_instance=ti,
            mark_success=args.mark_success,
            force=args.force,
            pickle_id=args.pickle,
            ignore_dependencies=args.ignore_dependencies,
            ignore_depends_on_past=args.ignore_depends_on_past,
            pool=args.pool)
        run_job.run()
    elif args.raw:
        ti.run(
            mark_success=args.mark_success,
            force=args.force,
            ignore_dependencies=args.ignore_dependencies,
            ignore_depends_on_past=args.ignore_depends_on_past,
            job_id=args.job_id,
            pool=args.pool,
        )
    else:
        pickle_id = None
        if args.ship_dag:
            try:
                # Running remotely, so pickling the DAG
                session = settings.Session()
                pickle = DagPickle(dag)
                session.add(pickle)
                session.commit()
                pickle_id = pickle.id
                print(('Pickled dag {dag} '
                       'as pickle_id:{pickle_id}').format(**locals()))
            except Exception as e:
                print('Could not pickle the DAG')
                print(e)
                raise e

        executor = DEFAULT_EXECUTOR
        executor.start()
        print("Sending to executor.")
        executor.queue_task_instance(
            ti,
            mark_success=args.mark_success,
            pickle_id=pickle_id,
            ignore_dependencies=args.ignore_dependencies,
            ignore_depends_on_past=args.ignore_depends_on_past,
            force=args.force,
            pool=args.pool)
        executor.heartbeat()
        executor.end()

    # store logs remotely
    remote_base = conf.get('core', 'REMOTE_BASE_LOG_FOLDER')

    # deprecated as of March 2016
    if not remote_base and conf.get('core', 'S3_LOG_FOLDER'):
        warnings.warn(
            'The S3_LOG_FOLDER conf key has been replaced by '
            'REMOTE_BASE_LOG_FOLDER. Your conf still works but please '
            'update airflow.cfg to ensure future compatibility.',
            DeprecationWarning)
        remote_base = conf.get('core', 'S3_LOG_FOLDER')

    if os.path.exists(filename):
        # read log and remove old logs to get just the latest additions

        with open(filename, 'r') as logfile:
            log = logfile.read()

        remote_log_location = filename.replace(log_base, remote_base)
        # S3
        if remote_base.startswith('s3:/'):
            logging_utils.S3Log().write(log, remote_log_location)
        # GCS
        elif remote_base.startswith('gs:/'):
            logging_utils.GCSLog().write(log, remote_log_location, append=True)
        # Other
        elif remote_base and remote_base != 'None':
            logging.error(
                'Unsupported remote log location: {}'.format(remote_base))
Example #44
0
    def __init__(self):
        configuration_dict = configuration.as_dict(display_sensitive=True)
        self.core_configuration = configuration_dict['core']
        self.kube_secrets = configuration_dict.get('kubernetes_secrets', {})
        self.airflow_home = configuration.get(self.core_section,
                                              'airflow_home')
        self.dags_folder = configuration.get(self.core_section, 'dags_folder')
        self.parallelism = configuration.getint(self.core_section,
                                                'PARALLELISM')
        self.worker_container_repository = configuration.get(
            self.kubernetes_section, 'worker_container_repository')
        self.worker_container_tag = configuration.get(self.kubernetes_section,
                                                      'worker_container_tag')
        self.kube_image = '{}:{}'.format(self.worker_container_repository,
                                         self.worker_container_tag)
        self.kube_image_pull_policy = configuration.get(
            self.kubernetes_section, "worker_container_image_pull_policy")
        self.kube_node_selectors = configuration_dict.get(
            'kubernetes_node_selectors', {})
        self.delete_worker_pods = conf.getboolean(self.kubernetes_section,
                                                  'delete_worker_pods')
        self.worker_pods_creation_batch_size = conf.getint(
            self.kubernetes_section, 'worker_pods_creation_batch_size')
        self.worker_service_account_name = conf.get(
            self.kubernetes_section, 'worker_service_account_name')
        self.image_pull_secrets = conf.get(self.kubernetes_section,
                                           'image_pull_secrets')

        # NOTE: user can build the dags into the docker image directly,
        # this will set to True if so
        self.dags_in_image = conf.getboolean(self.kubernetes_section,
                                             'dags_in_image')

        # NOTE: `git_repo` and `git_branch` must be specified together as a pair
        # The http URL of the git repository to clone from
        self.git_repo = conf.get(self.kubernetes_section, 'git_repo')
        # The branch of the repository to be checked out
        self.git_branch = conf.get(self.kubernetes_section, 'git_branch')
        # Optionally, the directory in the git repository containing the dags
        self.git_subpath = conf.get(self.kubernetes_section, 'git_subpath')
        # Optionally, the root directory for git operations
        self.git_sync_root = conf.get(self.kubernetes_section, 'git_sync_root')
        # Optionally, the name at which to publish the checked-out files under --root
        self.git_sync_dest = conf.get(self.kubernetes_section, 'git_sync_dest')
        # Optionally, if git_dags_folder_mount_point is set the worker will use
        # {git_dags_folder_mount_point}/{git_sync_dest}/{git_subpath} as dags_folder
        self.git_dags_folder_mount_point = conf.get(
            self.kubernetes_section, 'git_dags_folder_mount_point')

        # Optionally a user may supply a `git_user` and `git_password` for private
        # repositories
        self.git_user = conf.get(self.kubernetes_section, 'git_user')
        self.git_password = conf.get(self.kubernetes_section, 'git_password')

        # NOTE: The user may optionally use a volume claim to mount a PV containing
        # DAGs directly
        self.dags_volume_claim = conf.get(self.kubernetes_section,
                                          'dags_volume_claim')

        # This prop may optionally be set for PV Claims and is used to write logs
        self.logs_volume_claim = conf.get(self.kubernetes_section,
                                          'logs_volume_claim')

        # This prop may optionally be set for PV Claims and is used to locate DAGs
        # on a SubPath
        self.dags_volume_subpath = conf.get(self.kubernetes_section,
                                            'dags_volume_subpath')

        # This prop may optionally be set for PV Claims and is used to locate logs
        # on a SubPath
        self.logs_volume_subpath = conf.get(self.kubernetes_section,
                                            'logs_volume_subpath')

        # Optionally, hostPath volume containing DAGs
        self.dags_volume_host = conf.get(self.kubernetes_section,
                                         'dags_volume_host')

        # Optionally, write logs to a hostPath Volume
        self.logs_volume_host = conf.get(self.kubernetes_section,
                                         'logs_volume_host')

        # This prop may optionally be set for PV Claims and is used to write logs
        self.base_log_folder = configuration.get(self.core_section,
                                                 'base_log_folder')

        # The Kubernetes Namespace in which the Scheduler and Webserver reside.
        # Note that if your cluster has RBAC enabled, your scheduler may need
        # service account permissions to create, watch, get, and delete pods
        # in this namespace.
        self.kube_namespace = conf.get(self.kubernetes_section, 'namespace')
        # The Kubernetes Namespace in which pods will be created by the
        # executor. Note that if your cluster has RBAC enabled, your workers
        # may need service account permissions to interact with cluster
        # components.
        self.executor_namespace = conf.get(self.kubernetes_section,
                                           'namespace')
        # Task secrets managed by KubernetesExecutor.
        self.gcp_service_account_keys = conf.get(self.kubernetes_section,
                                                 'gcp_service_account_keys')

        # If the user is using the git-sync container to clone their repository via git,
        # allow them to specify repository, tag, and pod name for the init container.
        self.git_sync_container_repository = conf.get(
            self.kubernetes_section, 'git_sync_container_repository')

        self.git_sync_container_tag = conf.get(self.kubernetes_section,
                                               'git_sync_container_tag')
        self.git_sync_container = '{}:{}'.format(
            self.git_sync_container_repository, self.git_sync_container_tag)

        self.git_sync_init_container_name = conf.get(
            self.kubernetes_section, 'git_sync_init_container_name')

        # The worker pod may optionally have a valid Airflow config loaded via
        # a configmap
        self.airflow_configmap = conf.get(self.kubernetes_section,
                                          'airflow_configmap')

        affinity_json = conf.get(self.kubernetes_section, 'affinity')
        if affinity_json:
            self.kube_affinity = json.loads(affinity_json)
        else:
            self.kube_affinity = None

        tolerations_json = conf.get(self.kubernetes_section, 'tolerations')
        if tolerations_json:
            self.kube_tolerations = json.loads(tolerations_json)
        else:
            self.kube_tolerations = None

        self._validate()
Example #45
0
def create_app(config=None):
    app = Flask(__name__)
    app.secret_key = configuration.get('webserver', 'SECRET_KEY')
    app.config['LOGIN_DISABLED'] = not configuration.getboolean('webserver', 'AUTHENTICATE')

    csrf.init_app(app)

    #app.config = config
    airflow.load_login()
    airflow.login.login_manager.init_app(app)

    cache = Cache(
        app=app, config={'CACHE_TYPE': 'filesystem', 'CACHE_DIR': '/tmp'})

    app.register_blueprint(ck, url_prefix='/ck')
    app.register_blueprint(routes)
    app.jinja_env.add_extension("chartkick.ext.charts")

    with app.app_context():
        from airflow.www import views

        admin = Admin(
            app, name='Airflow',
            static_url_path='/admin',
            index_view=views.HomeView(endpoint='', url='/admin', name="DAGs"),
            template_mode='bootstrap3',
        )
        av = admin.add_view
        vs = views
        av(vs.Airflow(name='DAGs', category='DAGs'))

        av(vs.QueryView(name='Ad Hoc Query', category="Data Profiling"))
        av(vs.ChartModelView(
            models.Chart, Session, name="Charts", category="Data Profiling"))
        av(vs.KnowEventView(
            models.KnownEvent,
            Session, name="Known Events", category="Data Profiling"))
        av(vs.SlaMissModelView(
            models.SlaMiss,
            Session, name="SLA Misses", category="Browse"))
        av(vs.TaskInstanceModelView(models.TaskInstance,
            Session, name="Task Instances", category="Browse"))
        av(vs.LogModelView(
            models.Log, Session, name="Logs", category="Browse"))
        av(vs.JobModelView(
            jobs.BaseJob, Session, name="Jobs", category="Browse"))
        av(vs.PoolModelView(
            models.Pool, Session, name="Pools", category="Admin"))
        av(vs.ConfigurationView(
            name='Configuration', category="Admin"))
        av(vs.UserModelView(
            models.User, Session, name="Users", category="Admin"))
        av(vs.ConnectionModelView(
            models.Connection, Session, name="Connections", category="Admin"))
        av(vs.VariableView(
            models.Variable, Session, name="Variables", category="Admin"))

        admin.add_link(base.MenuLink(
            category='Docs', name='Documentation',
            url='http://pythonhosted.org/airflow/'))
        admin.add_link(
            base.MenuLink(category='Docs',
                          name='Github',
                          url='https://github.com/airbnb/airflow'))

        av(vs.VersionView(name='Version', category="About"))

        av(vs.DagRunModelView(
            models.DagRun, Session, name="DAG Runs", category="Browse"))
        av(vs.DagModelView(models.DagModel, Session, name=None))
        # Hack to not add this view to the menu
        admin._menu = admin._menu[:-1]

        def integrate_plugins():
            """Integrate plugins to the context"""
            from airflow.plugins_manager import (
                admin_views, flask_blueprints, menu_links)
            for v in admin_views:
                admin.add_view(v)
            for bp in flask_blueprints:
                app.register_blueprint(bp)
            for ml in sorted(menu_links, key=lambda x: x.name):
                admin.add_link(ml)

        integrate_plugins()

        @app.context_processor
        def jinja_globals():
            return {
                'hostname': socket.getfqdn(),
            }

        @app.teardown_appcontext
        def shutdown_session(exception=None):
            settings.Session.remove()

        return app
Example #46
0
class BackfillJobTest(unittest.TestCase):
    def setUp(self):
        self.parser = cli.CLIFactory.get_parser()
        self.dagbag = DagBag(include_examples=True)

    @unittest.skipIf('sqlite' in configuration.get('core', 'sql_alchemy_conn'),
                     "concurrent access not supported in sqlite")
    def test_trigger_controller_dag(self):
        dag = self.dagbag.get_dag('example_trigger_controller_dag')
        target_dag = self.dagbag.get_dag('example_trigger_target_dag')
        dag.clear()
        target_dag.clear()

        scheduler = SchedulerJob()
        queue = mock.Mock()
        scheduler._process_task_instances(target_dag, queue=queue)
        self.assertFalse(queue.append.called)

        job = BackfillJob(dag=dag,
                          start_date=DEFAULT_DATE,
                          end_date=DEFAULT_DATE,
                          ignore_first_depends_on_past=True)
        job.run()

        scheduler = SchedulerJob()
        queue = mock.Mock()
        scheduler._process_task_instances(target_dag, queue=queue)

        self.assertTrue(queue.append.called)
        target_dag.clear()
        dag.clear()

    @unittest.skipIf('sqlite' in configuration.get('core', 'sql_alchemy_conn'),
                     "concurrent access not supported in sqlite")
    def test_backfill_multi_dates(self):
        dag = self.dagbag.get_dag('example_bash_operator')
        dag.clear()

        job = BackfillJob(dag=dag,
                          start_date=DEFAULT_DATE,
                          end_date=DEFAULT_DATE + datetime.timedelta(days=1),
                          ignore_first_depends_on_past=True)
        job.run()

        session = settings.Session()
        drs = session.query(DagRun).filter(
            DagRun.dag_id == 'example_bash_operator').order_by(
                DagRun.execution_date).all()

        self.assertTrue(drs[0].execution_date == DEFAULT_DATE)
        self.assertTrue(drs[0].state == State.SUCCESS)
        self.assertTrue(drs[1].execution_date == DEFAULT_DATE +
                        datetime.timedelta(days=1))
        self.assertTrue(drs[1].state == State.SUCCESS)

        dag.clear()
        session.close()

    @unittest.skipIf('sqlite' in configuration.get('core', 'sql_alchemy_conn'),
                     "concurrent access not supported in sqlite")
    def test_backfill_examples(self):
        """
        Test backfilling example dags
        """

        # some DAGs really are just examples... but try to make them work!
        skip_dags = [
            'example_http_operator',
            'example_twitter_dag',
            'example_trigger_target_dag',
            'example_trigger_controller_dag',  # tested above
            'test_utils',  # sleeps forever
        ]

        logger = logging.getLogger('BackfillJobTest.test_backfill_examples')
        dags = [
            dag for dag in self.dagbag.dags.values() if
            'example_dags' in dag.full_filepath and dag.dag_id not in skip_dags
        ]

        for dag in dags:
            dag.clear(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        for i, dag in enumerate(sorted(dags, key=lambda d: d.dag_id)):
            logger.info('*** Running example DAG #{}: {}'.format(
                i, dag.dag_id))
            job = BackfillJob(dag=dag,
                              start_date=DEFAULT_DATE,
                              end_date=DEFAULT_DATE,
                              ignore_first_depends_on_past=True)
            job.run()

    def test_backfill_pooled_tasks(self):
        """
        Test that queued tasks are executed by BackfillJob

        Test for https://github.com/airbnb/airflow/pull/1225
        """
        session = settings.Session()
        pool = Pool(pool='test_backfill_pooled_task_pool', slots=1)
        session.add(pool)
        session.commit()

        dag = self.dagbag.get_dag('test_backfill_pooled_task_dag')
        dag.clear()

        job = BackfillJob(dag=dag,
                          start_date=DEFAULT_DATE,
                          end_date=DEFAULT_DATE)

        # run with timeout because this creates an infinite loop if not
        # caught
        with timeout(seconds=30):
            job.run()

        ti = TI(task=dag.get_task('test_backfill_pooled_task'),
                execution_date=DEFAULT_DATE)
        ti.refresh_from_db()
        self.assertEqual(ti.state, State.SUCCESS)

    def test_backfill_depends_on_past(self):
        """
        Test that backfill respects ignore_depends_on_past
        """
        dag = self.dagbag.get_dag('test_depends_on_past')
        dag.clear()
        run_date = DEFAULT_DATE + datetime.timedelta(days=5)

        # backfill should deadlock
        self.assertRaisesRegexp(
            AirflowException, 'BackfillJob is deadlocked',
            BackfillJob(dag=dag, start_date=run_date, end_date=run_date).run)

        BackfillJob(dag=dag,
                    start_date=run_date,
                    end_date=run_date,
                    ignore_first_depends_on_past=True).run()

        # ti should have succeeded
        ti = TI(dag.tasks[0], run_date)
        ti.refresh_from_db()
        self.assertEquals(ti.state, State.SUCCESS)

    def test_cli_backfill_depends_on_past(self):
        """
        Test that CLI respects -I argument
        """
        dag_id = 'test_dagrun_states_deadlock'
        run_date = DEFAULT_DATE + datetime.timedelta(days=1)
        args = [
            'backfill',
            dag_id,
            '-l',
            '-s',
            run_date.isoformat(),
        ]
        dag = self.dagbag.get_dag(dag_id)
        dag.clear()

        self.assertRaisesRegexp(AirflowException, 'BackfillJob is deadlocked',
                                cli.backfill, self.parser.parse_args(args))

        cli.backfill(self.parser.parse_args(args + ['-I']))
        ti = TI(dag.get_task('test_depends_on_past'), run_date)
        ti.refresh_from_db()
        # task ran
        self.assertEqual(ti.state, State.SUCCESS)
        dag.clear()
Example #47
0
    def index(self, session=None):
        TASK_NAME = "Task Name"
        COMMAND = "Command"
        request_args_filter = RequestArgsFilter(DcmpDag, request.args, (
            ("Category", {
                "operations": ["contains"]
            }),
            (TASK_NAME, {
                "operations": ["contains"],
                "no_filters": True
            }),
            (COMMAND, {
                "operations": ["contains"],
                "no_filters": True
            }),
        ))
        confs = OrderedDict()

        current_user = get_current_user()
        curr_user = airflow.login.current_user

        do_filter = FILTER_BY_OWNER and (not curr_user.is_superuser())
        owner_mode = conf.get('webserver', 'OWNER_MODE').strip().lower()

        if wwwutils.get_filter_by_user():
            dcmp_dags = session.query(DcmpDag).order_by(
                DcmpDag.dag_name).filter(
                    DcmpDag.last_editor_user_name == curr_user.user.username,
                    *request_args_filter.filters)
        else:
            dcmp_dags = session.query(DcmpDag).order_by(
                DcmpDag.dag_name).filter(*request_args_filter.filters)

        dcmp_dags_count = dcmp_dags.count()
        dcmp_dags = dcmp_dags[:]
        for dcmp_dag in dcmp_dags:
            dcmp_dag.conf = dcmp_dag.get_conf(session=session)

        if request_args_filter.filters_dict.get(TASK_NAME):
            task_name_value = request_args_filter.filters_dict.get(
                TASK_NAME)["value"]

            def filter_dcmp_dags_by_task_name(dcmp_dag):
                for task in dcmp_dag.conf["tasks"]:
                    if task_name_value in task["task_name"]:
                        return True
                return False

            dcmp_dags = filter(filter_dcmp_dags_by_task_name, dcmp_dags)

        if request_args_filter.filters_dict.get(COMMAND):
            command_value = request_args_filter.filters_dict.get(
                COMMAND)["value"]

            def filter_dcmp_dags_by_command(dcmp_dag):
                for task in dcmp_dag.conf["tasks"]:
                    if command_value in task["command"]:
                        return True
                return False

            dcmp_dags = filter(filter_dcmp_dags_by_command, dcmp_dags)

        search = request.args.get("search", "")
        if search:
            searched_dcmp_dags = []
            for dcmp_dag in dcmp_dags:
                dcmp_dag.search_results = []
                for result_task_name, result_key, result_line in search_conf_iter(
                        search, dcmp_dag.conf):
                    dcmp_dag.search_results.append({
                        "key": result_key,
                        "full_key": "%s__%s" % (result_task_name, result_key),
                        "line": result_line,
                        "html_line": ('<span class="nb">[%s]</span> ' %
                                      result_key if result_key else "") +
                        result_line.replace(
                            search,
                            '<span class="highlighted">%s</span>' % search),
                    })
                if dcmp_dag.search_results:
                    searched_dcmp_dags.append(dcmp_dag)
            dcmp_dags = searched_dcmp_dags

        return self.render(
            "dcmp/index.html",
            can_access_approver=can_access_approver(),
            dcmp_dags=dcmp_dags,
            dcmp_dags_count=dcmp_dags_count,
            filter_groups=request_args_filter.filter_groups,
            active_filters=request_args_filter.active_filters,
            search=search,
        )
Example #48
0
def create_app(config=None, testing=False):
    app = Flask(__name__)
    app.secret_key = configuration.get('webserver', 'SECRET_KEY')
    app.config['LOGIN_DISABLED'] = not configuration.getboolean(
        'webserver', 'AUTHENTICATE')
    app.config['PREFERRED_URL_SCHEME'] = configuration.get(
        'webserver', 'WEB_PREFERRED_URL_SCHEME')

    csrf.init_app(app)

    app.config['TESTING'] = testing

    airflow.load_login()
    airflow.login.login_manager.init_app(app)

    from airflow import api
    api.load_auth()
    api.api_auth.init_app(app)

    cache = Cache(app=app,
                  config={
                      'CACHE_TYPE': 'filesystem',
                      'CACHE_DIR': '/tmp'
                  })

    app.register_blueprint(routes)

    log_format = airflow.settings.LOG_FORMAT_WITH_PID
    airflow.settings.configure_logging(log_format=log_format)

    with app.app_context():
        from airflow.www import views

        admin = Admin(
            app,
            name='Airflow',
            static_url_path='/admin',
            index_view=views.HomeView(endpoint='', url='/admin', name="DAGs"),
            template_mode='bootstrap3',
        )
        av = admin.add_view
        vs = views
        av(vs.Airflow(name='DAGs', category='DAGs'))

        av(vs.QueryView(name='Ad Hoc Query', category="Data Profiling"))
        av(
            vs.ChartModelView(models.Chart,
                              Session,
                              name="Charts",
                              category="Data Profiling"))
        av(
            vs.KnowEventView(models.KnownEvent,
                             Session,
                             name="Known Events",
                             category="Data Profiling"))
        av(
            vs.SlaMissModelView(models.SlaMiss,
                                Session,
                                name="SLA Misses",
                                category="Browse"))
        av(
            vs.TaskInstanceModelView(models.TaskInstance,
                                     Session,
                                     name="Task Instances",
                                     category="Browse"))
        av(vs.LogModelView(models.Log, Session, name="Logs",
                           category="Browse"))
        av(
            vs.JobModelView(jobs.BaseJob,
                            Session,
                            name="Jobs",
                            category="Browse"))
        av(
            vs.PoolModelView(models.Pool,
                             Session,
                             name="Pools",
                             category="Admin"))
        av(vs.ConfigurationView(name='Configuration', category="Admin"))
        av(
            vs.UserModelView(models.User,
                             Session,
                             name="Users",
                             category="Admin"))
        av(
            vs.ConnectionModelView(models.Connection,
                                   Session,
                                   name="Connections",
                                   category="Admin"))
        av(
            vs.VariableView(models.Variable,
                            Session,
                            name="Variables",
                            category="Admin"))
        av(vs.XComView(models.XCom, Session, name="XComs", category="Admin"))

        admin.add_link(
            base.MenuLink(category='Docs',
                          name='Documentation',
                          url='http://pythonhosted.org/airflow/'))
        admin.add_link(
            base.MenuLink(category='Docs',
                          name='Github',
                          url='https://github.com/apache/incubator-airflow'))

        av(vs.VersionView(name='Version', category="About"))

        av(
            vs.DagRunModelView(models.DagRun,
                               Session,
                               name="DAG Runs",
                               category="Browse"))
        av(vs.DagModelView(models.DagModel, Session, name=None))
        # Hack to not add this view to the menu
        admin._menu = admin._menu[:-1]

        def integrate_plugins():
            """Integrate plugins to the context"""
            from airflow.plugins_manager import (admin_views, flask_blueprints,
                                                 menu_links)
            for v in admin_views:
                logging.debug('Adding view ' + v.name)
                admin.add_view(v)
            for bp in flask_blueprints:
                logging.debug('Adding blueprint ' + bp.name)
                app.register_blueprint(bp)
            for ml in sorted(menu_links, key=lambda x: x.name):
                logging.debug('Adding menu link ' + ml.name)
                admin.add_link(ml)

        integrate_plugins()

        import airflow.www.api.experimental.endpoints as e
        # required for testing purposes otherwise the module retains
        # a link to the default_auth
        if app.config['TESTING']:
            if six.PY2:
                reload(e)
            else:
                import importlib
                importlib.reload(e)

        app.register_blueprint(e.api_experimental,
                               url_prefix='/api/experimental')

        @app.context_processor
        def jinja_globals():
            return {
                'hostname': socket.getfqdn(),
            }

        @app.teardown_appcontext
        def shutdown_session(exception=None):
            settings.Session.remove()

        return app
Example #49
0
def create_app(config=None, session=None, testing=False, app_name="Airflow"):
    global app, appbuilder
    app = Flask(__name__)
    if conf.getboolean('webserver', 'ENABLE_PROXY_FIX'):
        app.wsgi_app = ProxyFix(app.wsgi_app)
    app.secret_key = conf.get('webserver', 'SECRET_KEY')

    airflow_home_path = conf.get('core', 'AIRFLOW_HOME')
    webserver_config_path = airflow_home_path + '/webserver_config.py'
    app.config.from_pyfile(webserver_config_path, silent=True)
    app.config['APP_NAME'] = app_name
    app.config['TESTING'] = testing

    csrf.init_app(app)

    db = SQLA(app)

    from airflow import api
    api.load_auth()
    api.api_auth.init_app(app)

    # flake8: noqa: F841
    cache = Cache(app=app,
                  config={
                      'CACHE_TYPE': 'filesystem',
                      'CACHE_DIR': '/tmp'
                  })

    from airflow.www.blueprints import routes
    app.register_blueprint(routes)

    configure_logging()
    configure_manifest_files(app)

    with app.app_context():

        from airflow.www.security import AirflowSecurityManager
        security_manager_class = app.config.get('SECURITY_MANAGER_CLASS') or \
            AirflowSecurityManager

        if not issubclass(security_manager_class, AirflowSecurityManager):
            raise Exception(
                """Your CUSTOM_SECURITY_MANAGER must now extend AirflowSecurityManager,
                 not FAB's security manager.""")

        appbuilder = AppBuilder(app,
                                db.session if not session else session,
                                security_manager_class=security_manager_class,
                                base_template='appbuilder/baselayout.html')

        def init_views(appbuilder):
            from airflow.www import views
            appbuilder.add_view_no_menu(views.Airflow())
            appbuilder.add_view_no_menu(views.DagModelView())
            appbuilder.add_view_no_menu(views.ConfigurationView())
            appbuilder.add_view_no_menu(views.VersionView())
            appbuilder.add_view(views.DagRunModelView,
                                "DAG Runs",
                                category="Browse",
                                category_icon="fa-globe")
            appbuilder.add_view(views.JobModelView, "Jobs", category="Browse")
            appbuilder.add_view(views.LogModelView, "Logs", category="Browse")
            appbuilder.add_view(views.SlaMissModelView,
                                "SLA Misses",
                                category="Browse")
            appbuilder.add_view(views.TaskInstanceModelView,
                                "Task Instances",
                                category="Browse")
            appbuilder.add_link("Configurations",
                                href='/configuration',
                                category="Admin",
                                category_icon="fa-user")
            appbuilder.add_view(views.ConnectionModelView,
                                "Connections",
                                category="Admin")
            appbuilder.add_view(views.PoolModelView, "Pools", category="Admin")
            appbuilder.add_view(views.VariableModelView,
                                "Variables",
                                category="Admin")
            appbuilder.add_view(views.XComModelView, "XComs", category="Admin")
            appbuilder.add_link("Documentation",
                                href='https://airflow.apache.org/',
                                category="Docs",
                                category_icon="fa-cube")
            appbuilder.add_link("Github",
                                href='https://github.com/apache/airflow',
                                category="Docs")
            appbuilder.add_link('Version',
                                href='/version',
                                category='About',
                                category_icon='fa-th')

            def integrate_plugins():
                """Integrate plugins to the context"""
                from airflow.plugins_manager import (
                    flask_appbuilder_views, flask_appbuilder_menu_links)

                for v in flask_appbuilder_views:
                    log.debug("Adding view %s", v["name"])
                    appbuilder.add_view(v["view"],
                                        v["name"],
                                        category=v["category"])
                for ml in sorted(flask_appbuilder_menu_links,
                                 key=lambda x: x["name"]):
                    log.debug("Adding menu link %s", ml["name"])
                    appbuilder.add_link(ml["name"],
                                        href=ml["href"],
                                        category=ml["category"],
                                        category_icon=ml["category_icon"])

            integrate_plugins()
            # Garbage collect old permissions/views after they have been modified.
            # Otherwise, when the name of a view or menu is changed, the framework
            # will add the new Views and Menus names to the backend, but will not
            # delete the old ones.

        init_views(appbuilder)

        security_manager = appbuilder.sm
        security_manager.sync_roles()

        from airflow.www.api.experimental import endpoints as e
        # required for testing purposes otherwise the module retains
        # a link to the default_auth
        if app.config['TESTING']:
            if six.PY2:
                reload(e)  # noqa
            else:
                import importlib
                importlib.reload(e)

        app.register_blueprint(e.api_experimental,
                               url_prefix='/api/experimental')

        @app.context_processor
        def jinja_globals():
            return {
                'hostname': socket.getfqdn(),
                'navbar_color': conf.get('webserver', 'NAVBAR_COLOR'),
            }

        @app.teardown_appcontext
        def shutdown_session(exception=None):
            settings.Session.remove()

    return app, appbuilder
Example #50
0
 def jinja_globals():
     return {
         'hostname': get_hostname(),
         'navbar_color': configuration.get('webserver', 'NAVBAR_COLOR'),
     }
Example #51
0
# specific language governing permissions and limitations
# under the License.
"""Default configuration for the Airflow webserver"""
import os
from flask_appbuilder.security.manager import AUTH_DB
# from flask_appbuilder.security.manager import AUTH_LDAP
# from flask_appbuilder.security.manager import AUTH_OAUTH
# from flask_appbuilder.security.manager import AUTH_OID
# from flask_appbuilder.security.manager import AUTH_REMOTE_USER

from airflow import configuration as conf

basedir = os.path.abspath(os.path.dirname(__file__))

# The SQLAlchemy connection string.
SQLALCHEMY_DATABASE_URI = conf.get('core', 'SQL_ALCHEMY_CONN')

# Flask-WTF flag for CSRF
CSRF_ENABLED = True

# ----------------------------------------------------
# AUTHENTICATION CONFIG
# ----------------------------------------------------
# For details on how to set up each of the following authentication, see
# http://flask-appbuilder.readthedocs.io/en/latest/security.html#authentication-methods
# for details.

# The authentication type
# AUTH_OID : Is for OpenID
# AUTH_DB : Is for database
# AUTH_LDAP : Is for LDAP
Example #52
0
def upgradedb(args):
    print("DB: " + configuration.get('core', 'SQL_ALCHEMY_CONN'))
    utils.upgradedb()
Example #53
0
 def serve_logs(filename):
     log = os.path.expanduser(configuration.get('core', 'BASE_LOG_FOLDER'))
     return flask.send_from_directory(log,
                                      filename,
                                      mimetype="application/json",
                                      as_attachment=False)
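The handler above is only an excerpt; a self-contained sketch (hypothetical route and port lookup) of how such a log-serving endpoint can be wired into a small Flask app:

import os

import flask

from airflow import configuration

app = flask.Flask(__name__)


@app.route('/log/<path:filename>')
def serve_logs(filename):
    # Resolve the base log folder from airflow.cfg and stream the file back.
    log = os.path.expanduser(configuration.get('core', 'BASE_LOG_FOLDER'))
    return flask.send_from_directory(log,
                                     filename,
                                     mimetype="application/json",
                                     as_attachment=False)


if __name__ == '__main__':
    # The [celery] worker_log_server_port option is assumed here; adjust to
    # whatever port setting your deployment actually uses.
    app.run(host='0.0.0.0',
            port=configuration.getint('celery', 'WORKER_LOG_SERVER_PORT'))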
Example #54
0
def get_parser():
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers(help='sub-command help')

    ht = "Run subsections of a DAG for a specified date range"
    parser_backfill = subparsers.add_parser('backfill', help=ht)
    parser_backfill.add_argument("dag_id", help="The id of the dag to run")
    parser_backfill.add_argument(
        "-t",
        "--task_regex",
        help="The regex to filter specific task_ids to backfill (optional)")
    parser_backfill.add_argument("-s",
                                 "--start_date",
                                 help="Override start_date YYYY-MM-DD")
    parser_backfill.add_argument("-e",
                                 "--end_date",
                                 help="Override end_date YYYY-MM-DD")
    parser_backfill.add_argument("-m",
                                 "--mark_success",
                                 help=mark_success_help,
                                 action="store_true")
    parser_backfill.add_argument("-l",
                                 "--local",
                                 help="Run the task using the LocalExecutor",
                                 action="store_true")
    parser_backfill.add_argument(
        "-x",
        "--donot_pickle",
        help=("Do not attempt to pickle the DAG object to send over "
              "to the workers, just tell the workers to run their version "
              "of the code."),
        action="store_true")
    parser_backfill.add_argument("-a",
                                 "--include_adhoc",
                                 help="Include dags with the adhoc parameter.",
                                 action="store_true")
    parser_backfill.add_argument(
        "-i",
        "--ignore_dependencies",
        help=(
            "Skip upstream tasks, run only the tasks "
            "matching the regexp. Only works in conjunction with task_regex"),
        action="store_true")
    parser_backfill.add_argument("-sd",
                                 "--subdir",
                                 help=subdir_help,
                                 default=DAGS_FOLDER)
    parser_backfill.add_argument("-p",
                                 "--pool",
                                 help="Pool to use to run the backfill")
    parser_backfill.add_argument("-dr",
                                 "--dry_run",
                                 help="Perform a dry run",
                                 action="store_true")
    parser_backfill.set_defaults(func=backfill)

    ht = "Clear a set of task instance, as if they never ran"
    parser_clear = subparsers.add_parser('clear', help=ht)
    parser_clear.add_argument("dag_id", help="The id of the dag to run")
    parser_clear.add_argument(
        "-t",
        "--task_regex",
        help="The regex to filter specific task_ids to clear (optional)")
    parser_clear.add_argument("-s",
                              "--start_date",
                              help="Override start_date YYYY-MM-DD")
    parser_clear.add_argument("-e",
                              "--end_date",
                              help="Override end_date YYYY-MM-DD")
    ht = "Include upstream tasks"
    parser_clear.add_argument("-u", "--upstream", help=ht, action="store_true")
    ht = "Only failed jobs"
    parser_clear.add_argument("-f",
                              "--only_failed",
                              help=ht,
                              action="store_true")
    ht = "Only running jobs"
    parser_clear.add_argument("-r",
                              "--only_running",
                              help=ht,
                              action="store_true")
    ht = "Include downstream tasks"
    parser_clear.add_argument("-d",
                              "--downstream",
                              help=ht,
                              action="store_true")
    parser_clear.add_argument("-sd",
                              "--subdir",
                              help=subdir_help,
                              default=DAGS_FOLDER)
    parser_clear.add_argument("-c",
                              "--no_confirm",
                              help=ht,
                              action="store_true")
    parser_clear.set_defaults(func=clear)

    ht = "Trigger a DAG"
    parser_trigger_dag = subparsers.add_parser('trigger_dag', help=ht)
    parser_trigger_dag.add_argument("dag_id", help="The id of the dag to run")
    parser_trigger_dag.add_argument("-r",
                                    "--run_id",
                                    help="Helps to indentify this run")
    parser_trigger_dag.set_defaults(func=trigger_dag)

    ht = "Run a single task instance"
    parser_run = subparsers.add_parser('run', help=ht)
    parser_run.add_argument("dag_id", help="The id of the dag to run")
    parser_run.add_argument("task_id", help="The task_id to run")
    parser_run.add_argument("execution_date", help="The execution date to run")
    parser_run.add_argument("-sd",
                            "--subdir",
                            help=subdir_help,
                            default=DAGS_FOLDER)
    parser_run.add_argument(
        "-s",
        "--task_start_date",
        help="Override the tasks's start_date (used internally)",
    )
    parser_run.add_argument("-m",
                            "--mark_success",
                            help=mark_success_help,
                            action="store_true")
    parser_run.add_argument("-f",
                            "--force",
                            help="Force a run regardless or previous success",
                            action="store_true")
    parser_run.add_argument(
        "-l",
        "--local",
        help="Runs the task locally, don't use the executor",
        action="store_true")
    parser_run.add_argument("-r",
                            "--raw",
                            help=argparse.SUPPRESS,
                            action="store_true")
    parser_run.add_argument("--pool",
                            help="Pool to use to run the task instance")
    parser_run.add_argument(
        "-i",
        "--ignore_dependencies",
        help="Ignore upstream and depends_on_past dependencies",
        action="store_true")
    parser_run.add_argument(
        "--ship_dag",
        help="Pickles (serializes) the DAG and ships it to the worker",
        action="store_true")
    parser_run.add_argument(
        "-p",
        "--pickle",
        help="Serialized pickle object of the entire dag (used internally)")
    parser_run.add_argument("-j", "--job_id", help=argparse.SUPPRESS)
    parser_run.set_defaults(func=run)

    ht = ("Test a task instance. This will run a task without checking for "
          "dependencies or recording it's state in the database.")
    parser_test = subparsers.add_parser('test', help=ht)
    parser_test.add_argument("dag_id", help="The id of the dag to run")
    parser_test.add_argument("task_id", help="The task_id to run")
    parser_test.add_argument("execution_date",
                             help="The execution date to run")
    parser_test.add_argument("-sd",
                             "--subdir",
                             help=subdir_help,
                             default=DAGS_FOLDER)
    parser_test.add_argument("-dr",
                             "--dry_run",
                             help="Perform a dry run",
                             action="store_true")
    parser_test.set_defaults(func=test)

    ht = "Get the status of a task instance."
    parser_task_state = subparsers.add_parser('task_state', help=ht)
    parser_task_state.add_argument("dag_id", help="The id of the dag to check")
    parser_task_state.add_argument("task_id", help="The task_id to check")
    parser_task_state.add_argument("execution_date",
                                   help="The execution date to check")
    parser_task_state.add_argument("-sd",
                                   "--subdir",
                                   help=subdir_help,
                                   default=DAGS_FOLDER)
    parser_task_state.set_defaults(func=task_state)

    ht = "Start a Airflow webserver instance"
    parser_webserver = subparsers.add_parser('webserver', help=ht)
    parser_webserver.add_argument(
        "-p",
        "--port",
        default=configuration.get('webserver', 'WEB_SERVER_PORT'),
        type=int,
        help="Set the port on which to run the web server")
    parser_webserver.add_argument(
        "-w",
        "--workers",
        default=configuration.get('webserver', 'WORKERS'),
        type=int,
        help="Number of workers to run the webserver on")
    parser_webserver.add_argument(
        "-k",
        "--workerclass",
        default=configuration.get('webserver', 'WORKER_CLASS'),
        choices=['sync', 'eventlet', 'gevent', 'tornado'],
        help="The worker class to use for gunicorn")
    parser_webserver.add_argument(
        "-hn",
        "--hostname",
        default=configuration.get('webserver', 'WEB_SERVER_HOST'),
        help="Set the hostname on which to run the web server")
    ht = "Use the server that ships with Flask in debug mode"
    parser_webserver.add_argument("-d",
                                  "--debug",
                                  help=ht,
                                  action="store_true")
    parser_webserver.set_defaults(func=webserver)

    ht = "Start a scheduler scheduler instance"
    parser_scheduler = subparsers.add_parser('scheduler', help=ht)
    parser_scheduler.add_argument("-d",
                                  "--dag_id",
                                  help="The id of the dag to run")
    parser_scheduler.add_argument("-sd",
                                  "--subdir",
                                  help=subdir_help,
                                  default=DAGS_FOLDER)
    parser_scheduler.add_argument(
        "-n",
        "--num_runs",
        default=None,
        type=int,
        help="Set the number of runs to execute before exiting")
    parser_scheduler.add_argument(
        "-p",
        "--do_pickle",
        default=False,
        help=("Attempt to pickle the DAG object to send over "
              "to the workers, instead of letting workers run their version "
              "of the code."),
        action="store_true")
    parser_scheduler.set_defaults(func=scheduler)

    ht = "Initialize the metadata database"
    parser_initdb = subparsers.add_parser('initdb', help=ht)
    parser_initdb.set_defaults(func=initdb)

    ht = "Burn down and rebuild the metadata database"
    parser_resetdb = subparsers.add_parser('resetdb', help=ht)
    parser_resetdb.add_argument(
        "-y",
        "--yes",
        default=False,
        help="Do not prompt to confirm reset. Use with care!",
        action="store_true")
    parser_resetdb.set_defaults(func=resetdb)

    ht = "Upgrade metadata database to latest version"
    parser_upgradedb = subparsers.add_parser('upgradedb', help=ht)
    parser_upgradedb.set_defaults(func=upgradedb)

    ht = "List the DAGs"
    parser_list_dags = subparsers.add_parser('list_dags', help=ht)
    parser_list_dags.add_argument("-sd",
                                  "--subdir",
                                  help=subdir_help,
                                  default=DAGS_FOLDER)
    parser_list_dags.set_defaults(func=list_dags)

    ht = "List the tasks within a DAG"
    parser_list_tasks = subparsers.add_parser('list_tasks', help=ht)
    parser_list_tasks.add_argument("-t",
                                   "--tree",
                                   help="Tree view",
                                   action="store_true")
    parser_list_tasks.add_argument("dag_id", help="The id of the dag")
    parser_list_tasks.add_argument("-sd",
                                   "--subdir",
                                   help=subdir_help,
                                   default=DAGS_FOLDER)
    parser_list_tasks.set_defaults(func=list_tasks)

    ht = "Start a Celery worker node"
    parser_worker = subparsers.add_parser('worker', help=ht)
    parser_worker.add_argument("-q",
                               "--queues",
                               help="Comma delimited list of queues to serve",
                               default=configuration.get(
                                   'celery', 'DEFAULT_QUEUE'))
    parser_worker.add_argument("-c",
                               "--concurrency",
                               type=int,
                               help="The number of worker processes",
                               default=configuration.get(
                                   'celery', 'celeryd_concurrency'))
    parser_worker.set_defaults(func=worker)

    ht = "Serve logs generate by worker"
    parser_logs = subparsers.add_parser('serve_logs', help=ht)
    parser_logs.set_defaults(func=serve_logs)

    ht = "Start a Celery Flower"
    parser_flower = subparsers.add_parser('flower', help=ht)
    parser_flower.add_argument("-p", "--port", help="The port")
    parser_flower.add_argument("-a", "--broker_api", help="Broker api")
    parser_flower.set_defaults(func=flower)

    parser_version = subparsers.add_parser('version', help="Show version")
    parser_version.set_defaults(func=version)

    ht = "Start a kerberos ticket renewer"
    parser_kerberos = subparsers.add_parser('kerberos', help=ht)
    parser_kerberos.add_argument("-kt",
                                 "--keytab",
                                 help="keytab",
                                 nargs='?',
                                 default=configuration.get(
                                     'kerberos', 'keytab'))
    parser_kerberos.add_argument("principal",
                                 help="kerberos principal",
                                 nargs='?',
                                 default=configuration.get(
                                     'kerberos', 'principal'))
    parser_kerberos.set_defaults(func=kerberos)

    return parser
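Each subparser above stores its handler via set_defaults(func=...), so an entry point only needs to parse and dispatch. A minimal sketch of that dispatch (the __main__ guard is illustrative, not from the source):

if __name__ == '__main__':
    parser = get_parser()
    args = parser.parse_args()
    # ``func`` was attached by set_defaults() on the chosen subparser.
    args.func(args)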
Example #55
0
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import multiprocessing
import subprocess
import time

from builtins import range

from airflow import configuration
from airflow.executors.base_executor import BaseExecutor
from airflow.utils.log.logging_mixin import LoggingMixin
from airflow.utils.state import State

PARALLELISM = configuration.get('core', 'PARALLELISM')


class LocalWorker(multiprocessing.Process, LoggingMixin):
    def __init__(self, task_queue, result_queue):
        multiprocessing.Process.__init__(self)
        self.task_queue = task_queue
        self.result_queue = result_queue
        self.daemon = True

    def run(self):
        while True:
            key, command = self.task_queue.get()
            if key is None:
                # Received poison pill, no more tasks to run
                self.task_queue.task_done()
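The run() loop above is cut off right after the poison-pill check; a self-contained sketch of the same consumer pattern, using plain callables instead of Airflow commands, for reference:

import multiprocessing


def say_hello():
    print('hello from the worker')


def worker_loop(task_queue, result_queue):
    while True:
        key, work = task_queue.get()
        if key is None:
            # Poison pill: acknowledge it and stop consuming.
            task_queue.task_done()
            break
        try:
            work()
            result_queue.put((key, 'success'))
        except Exception:
            result_queue.put((key, 'failed'))
        finally:
            task_queue.task_done()


if __name__ == '__main__':
    tasks = multiprocessing.JoinableQueue()
    results = multiprocessing.Queue()
    worker = multiprocessing.Process(target=worker_loop, args=(tasks, results))
    worker.daemon = True
    worker.start()

    tasks.put(('hello', say_hello))
    tasks.put((None, None))   # poison pill shuts the worker down
    tasks.join()
    worker.join()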
Example #56
0
def initdb(args):
    print("DB: " + configuration.get('core', 'SQL_ALCHEMY_CONN'))
    utils.initdb()
    print("Done.")
Example #57
0
from sqlalchemy.orm.session import make_transient

from airflow import executors, models, settings, utils
from airflow import configuration
from airflow.utils import AirflowException, State, LoggingMixin


Base = models.Base
ID_LEN = models.ID_LEN

# Setting up a statsd client if needed
statsd = None
if configuration.getboolean('scheduler', 'statsd_on'):
    from statsd import StatsClient
    statsd = StatsClient(
        host=configuration.get('scheduler', 'statsd_host'),
        port=configuration.getint('scheduler', 'statsd_port'),
        prefix=configuration.get('scheduler', 'statsd_prefix'))


class BaseJob(Base, LoggingMixin):
    """
    Abstract class to be derived for jobs. Jobs are processing items with state
    and duration that aren't task instances. For instance a BackfillJob is
    a collection of task instance runs, but should have it's own state, start
    and end time.
    """

    __tablename__ = "job"

    id = Column(Integer, primary_key=True)
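Because the module-level statsd client above stays None when statsd_on is disabled, callers guard every metric emission. A hedged sketch of that guard pattern using the statsd client's standard incr/timing calls (the metric names are illustrative, not from the source):

import time


def heartbeat_with_stats(job_name='scheduler'):
    # Only emit metrics when the module-level ``statsd`` client was configured.
    if statsd:
        statsd.incr('{}_heartbeat'.format(job_name))

    start = time.time()
    # ... perform the actual heartbeat work here ...
    if statsd:
        # statsd timings are reported in milliseconds.
        statsd.timing('{}_heartbeat_duration'.format(job_name),
                      (time.time() - start) * 1000)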
Example #58
0
import os
import subprocess
from datetime import datetime

from builtins import input
import argparse
import dateutil.parser

import airflow
from airflow import jobs, settings, utils
from airflow import configuration
from airflow.executors import DEFAULT_EXECUTOR
from airflow.models import DagBag, TaskInstance, DagPickle, DagRun
from airflow.utils import AirflowException, State

DAGS_FOLDER = os.path.expanduser(configuration.get('core', 'DAGS_FOLDER'))

# Common help text across subcommands
mark_success_help = "Mark jobs as succeeded without running them"
subdir_help = "File location or directory from which to look for the dag"


def process_subdir(subdir):
    dags_folder = configuration.get("core", "DAGS_FOLDER")
    dags_folder = os.path.expanduser(dags_folder)
    if subdir:
        subdir = os.path.expanduser(subdir)
        if "DAGS_FOLDER" in subdir:
            subdir = subdir.replace("DAGS_FOLDER", dags_folder)
        if dags_folder not in subdir:
            raise AirflowException(
Example #59
0
def check_hive_conf():
    from airflow import configuration as conf
    assert conf.get('hive', 'default_hive_mapred_queue') == 'airflow'
Example #60
0
 def jinja_globals():
     return {
         'hostname': socket.getfqdn(),
         'navbar_color': conf.get('webserver', 'NAVBAR_COLOR'),
     }