Code Example #1
    def start_go_pipeline(
        self,
        variables: dict,
        go_file: str,
        process_line_callback: Optional[Callable[[str], None]] = None,
        should_init_module: bool = False,
    ) -> None:
        """
        Starts Apache Beam Go pipeline.

        :param variables: Variables passed to the job.
        :param go_file: Path to the Go file with your beam pipeline.
        :param process_line_callback: (optional) Callback that can be used to process each line of
            the stdout and stderr file descriptors.
        :param should_init_module: If False (default), will just execute a `go run` command. If True, will
            init a module and dependencies with a ``go mod init`` and ``go mod tidy``, useful when pulling
            source with GCSHook.
        :return:
        """
        if shutil.which("go") is None:
            raise AirflowConfigException(
                "You need to have Go installed to run beam go pipeline. See https://go.dev/doc/install "
                "installation guide. If you are running airflow in Docker see more info at "
                "'https://airflow.apache.org/docs/docker-stack/recipes.html'.")

        if "labels" in variables:
            variables["labels"] = json.dumps(variables["labels"],
                                             separators=(",", ":"))

        working_directory = os.path.dirname(go_file)
        basename = os.path.basename(go_file)

        if should_init_module:
            init_module("main", working_directory)
            install_dependencies(working_directory)

        command_prefix = ["go", "run", basename]
        self._start_pipeline(
            variables=variables,
            command_prefix=command_prefix,
            process_line_callback=process_line_callback,
            working_directory=working_directory,
        )
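For context, this method lives on the Beam provider's hook in recent Airflow releases. Below is a minimal, hypothetical caller sketch; the import path, runner name, label values, and Go file path are assumptions rather than part of the snippet above, and the Go toolchain must be present on the worker.

from airflow.providers.apache.beam.hooks.beam import BeamHook

def run_go_wordcount() -> None:
    # Hypothetical task body; assumes the BeamHook import path above is available.
    hook = BeamHook(runner="DirectRunner")
    hook.start_go_pipeline(
        variables={"labels": {"team": "data"}},  # serialized to JSON by the method
        go_file="/tmp/wordcount/main.go",        # placeholder path
        should_init_module=True,                 # run `go mod init` / `go mod tidy` first
    )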
Code Example #2
def run_command(command):
    """
    Runs command and returns stdout
    """
    process = subprocess.Popen(shlex.split(command),
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE,
                               close_fds=True)
    output, stderr = [
        stream.decode(sys.getdefaultencoding(), 'ignore')
        for stream in process.communicate()
    ]

    if process.returncode != 0:
        raise AirflowConfigException(
            "Cannot execute {}. Error code is: {}. Output: {}, Stderr: {}".
            format(command, process.returncode, output, stderr))

    return output
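This helper mirrors the one Airflow's configuration module uses to resolve ``*_cmd`` options, where a secret is produced by a shell command instead of being stored in airflow.cfg. A small usage sketch, assuming the function above (together with its subprocess, shlex, and sys imports) is in scope; the command is a placeholder:

# Placeholder command; in airflow.cfg this would be something like sql_alchemy_conn_cmd.
db_password = run_command("echo not-a-real-secret")
print(db_password.strip())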
Code Example #3
File: platform.py  Project: wolvery/airflow
def getuser() -> str:
    """
    Gets the username associated with the current user, or error with a nice
    error message if there's no current user.

    We don't want to fall back to os.getuid() because not having a username
    probably means the rest of the user environment is wrong (e.g. no $HOME).
    Explicit failure is better than silently trying to work badly.
    """
    try:
        return getpass.getuser()
    except KeyError:
        # Inner import to avoid circular import
        from airflow.exceptions import AirflowConfigException

        raise AirflowConfigException(
            "The user that Airflow is running as has no username; you must run"
            "Airflow as a full user, with a username and home directory, "
            "in order for it to function properly.")
Code Example #4
def fetch_config(ds, **kwargs):
    CONFIG_PARAM, config = 'file', None
    conf_file = kwargs['dag_run'].conf.get(CONFIG_PARAM)
    if conf_file is None or conf_file.strip() == '':
        raise AirflowException(
            'Config parameter {} is not specified.'.format(CONFIG_PARAM))
    print('Config file for the job: {}'.format(conf_file))
    print('Reading configuration from {}'.format(conf_file))
    try:
        with open(conf_file, "r") as f:
            config = yaml.safe_load(f)
    except Exception as error:
        raise AirflowException(
            'Error while reading the config file: {}'.format(error))
    try:
        validate(instance=config, schema=CONFIG_SCHEMA)
    except ValidationError as error:
        raise AirflowConfigException(
            'Invalid configuration specified: {}'.format(error))
    if 'staging_table_id' not in config['bigquery']:
        config['bigquery']['staging_table_id'] = 'staging_{}'.format(
            str(uuid4())[:8])
    config['bigquery']['merge_table'] = '{}:{}.{}'.format(
        config['bigquery']['project_id'], config['bigquery']['dataset_id'],
        config['bigquery']['table_id'])
    config['bigquery']['staging_table'] = '{}:{}.{}'.format(
        config['bigquery']['project_id'], config['bigquery']['dataset_id'],
        config['bigquery']['staging_table_id'])
    if 'jobname' not in config['dataflow']:
        config['dataflow']['jobname'] = '{}-{}'.format(
            config['dataflow']['job_name_prefix'],
            str(uuid4())[:8])
    config['bigquery'][
        'merge_query'] = 'MERGE `{}` t USING `{}` s ON {} WHEN MATCHED THEN {} WHEN NOT MATCHED THEN {}'.format(
            config['bigquery']['merge_table'].replace(':', '.'),
            config['bigquery']['staging_table'].replace(':', '.'),
            config['bigquery']['merge']['condition'],
            config['bigquery']['merge']['matched'],
            config['bigquery']['merge']['notmatched'])
    print('Airflow config: {}'.format(config))
    config_var = 'config-{}'.format(kwargs['dag_run'].run_id)
    print('Writing config to variable: {}'.format(config_var))
    Variable.set(config_var, config, serialize_json=True)
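One way this callable might be wired into a DAG, shown here with the Airflow 1.x style ``provide_context`` flag; the dag_id, schedule, and trigger command are illustrative assumptions.

from datetime import datetime

from airflow import DAG
from airflow.operators.python_operator import PythonOperator

with DAG(dag_id="bq_merge_pipeline", start_date=datetime(2021, 1, 1),
         schedule_interval=None) as dag:
    fetch_config_task = PythonOperator(
        task_id="fetch_config",
        python_callable=fetch_config,
        provide_context=True,  # passes ds and dag_run into **kwargs
    )

# Hypothetical trigger: airflow trigger_dag bq_merge_pipeline --conf '{"file": "/path/to/job.yaml"}'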
Code Example #5
    def load_executor(cls, executor_name: str) -> BaseExecutor:
        """
        Loads the executor.

        This supports the following formats:
        * by executor name for core executor
        * by ``{plugin_name}.{class_name}`` for executor from plugins
        * by import path.

        :return: an instance of the executor class resolved from ``executor_name``
        """
        if executor_name == CELERY_KUBERNETES_EXECUTOR:
            return cls.__load_celery_kubernetes_executor()

        if executor_name in cls.executors:
            log.debug("Loading core executor: %s", executor_name)
            return import_string(cls.executors[executor_name])()
        # If the executor name looks like "plugin executor path" then try to load plugins.
        if executor_name.count(".") == 1:
            log.debug(
                "The executor name looks like the plugin path (executor_name=%s). Trying to load a "
                "executor from a plugin",
                executor_name,
            )
            with suppress(ImportError), suppress(AttributeError):
                # Load plugins here for executors as at that time the plugins might not have been
                # initialized yet
                from airflow import plugins_manager

                plugins_manager.integrate_executor_plugins()
                return import_string(f"airflow.executors.{executor_name}")()

        log.debug("Loading executor from custom path: %s", executor_name)
        try:
            executor = import_string(executor_name)()
        except ImportError as e:
            log.error(e)
            raise AirflowConfigException(
                f'The module/attribute could not be loaded. Please check "executor" key in "core" section. '
                f'Current value: "{executor_name}".')
        log.info("Loaded executor: %s", executor_name)

        return executor
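A minimal sketch of the import-path branch; it assumes ``ExecutorLoader`` in ``airflow.executors.executor_loader`` exposes this as a classmethod (as in Airflow 2.x) and that a working Airflow installation is available.

from airflow.executors.executor_loader import ExecutorLoader

# Full import path, so the lookup falls through to the "custom path" branch above.
executor = ExecutorLoader.load_executor("airflow.executors.local_executor.LocalExecutor")
print(type(executor).__name__)  # LocalExecutor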
Code Example #6
File: configuration.py  Project: redjohn03/airflow
    def getimport(self, section, key, **kwargs):  # noqa
        """
        Reads options, imports the full qualified name, and returns the object.

        In case of failure, it raises an exception with a clear message containing the key and the section names

        :return: The object or None, if the option is empty
        """
        full_qualified_path = conf.get(section=section, key=key, **kwargs)
        if not full_qualified_path:
            return None

        try:
            return import_string(full_qualified_path)
        except ImportError as e:
            log.error(e)
            raise AirflowConfigException(
                f'The object could not be loaded. Please check "{key}" key in "{section}" section. '
                f'Current value: "{full_qualified_path}".')
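A hedged usage sketch of ``getimport``: it resolves a dotted path stored in the config, or returns None when the option is empty. The section/key below reference the statsd custom client option used elsewhere on this page; whether it is set depends on your deployment.

from airflow.configuration import conf

stats_class = conf.getimport("scheduler", "statsd_custom_client_path", fallback=None)
if stats_class is None:
    print("Option empty or unset; falling back to the default client")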
Code Example #7
    def execute(self, context):
        ambari_hook = HdpAmbariHook(ambari_conn_id=self.ambari_conn_id)
        datas = {"user.name": ambari_hook.cluster_name}

        for attr_name in ["statusdir", "files", "callback"]:
            attr_value = getattr(self, attr_name)
            if attr_value is not None and attr_value != "":
                datas[attr_name] = attr_value

        if is_not_null_and_is_not_empty_str(self.file):
            datas["file"] = self.file
        elif is_not_null_and_is_not_empty_str(self.execute_query):
            datas["execute"] = self.execute_query
        else:
            raise AirflowConfigException("Request body must include file or execute params")

        if self.enablelog:
            datas["enablelog"] = self.enablelog

        job_id = ambari_hook.submit_hive_job(datas, self.arg)
        if self.do_xcom_push:
            context['ti'].xcom_push(key='hive_job_id', value=job_id)
Code Example #8
    def submit_hive_job(self, body_params: dict, arg: str = None) -> str:
        """
        Executes HQL code or a Hive script in an Azure HDInsight cluster.

        See https://cwiki.apache.org/confluence/display/Hive/WebHCat+Reference+Job

        :param body_params: parameters of the Hive script
        :type body_params: dict
        :param arg: ``define`` parameters for Hive, e.g. ``key1=value1;key2=value2``
        :type arg: str

        """

        if not ("file" in body_params or "execute" in body_params):
            raise AirflowConfigException(
                "Request body must include file or execute params")

        body_params.update(self.default_params)

        if arg is not None:
            hive_defines = urlencode([("define", x)
                                      for x in str(arg).split(";")])
            self.query = urlencode(body_params) + "&" + hive_defines
        else:
            self.query = urlencode(body_params)

        self.method = "POST"
        submit_endpoint = self.hive_endpoint + "hive"

        self.log.debug("Submiting hive  Script: %s", str(self.query))
        response = self.run(endpoint=submit_endpoint,
                            data=self.query,
                            headers=self.headers)

        job_id = response["id"]
        self.log.info("Finished submitting hive script job_id: %s", job_id)
        return job_id
Code Example #9
    def get(self, section, key, **kwargs):
        section = str(section).lower()
        key = str(key).lower()

        # first check environment variables
        option = self._get_env_var_option(section, key)
        if option is not None:
            return option

        # ...then the config file
        if self.has_option(section, key):
            return expand_env_var(
                ConfigParser.get(self, section, key, **kwargs))

        # ...then commands
        option = self._get_cmd_option(section, key)
        if option:
            return option

        log.warning("section/key [{section}/{key}] not found in config".format(
            **locals()))

        raise AirflowConfigException("section/key [{section}/{key}] not found "
                                     "in config".format(**locals()))
Code Example #10
    def _validate(self):
        if (self.get("core",
                     "executor") not in ('DebugExecutor', 'SequentialExecutor')
                and "sqlite" in self.get('core', 'sql_alchemy_conn')):
            raise AirflowConfigException(
                "error: cannot use sqlite with the {}".format(
                    self.get('core', 'executor')))

        for section, replacement in self.deprecated_values.items():
            for name, info in replacement.items():
                old, new, version = info
                current_value = self.get(section, name, fallback=None)
                if self._using_old_value(old, current_value):
                    new_value = re.sub(old, new, current_value)
                    self._update_env_var(section=section,
                                         name=name,
                                         new_value=new_value)
                    self._create_future_warning(name=name,
                                                section=section,
                                                current_value=current_value,
                                                new_value=new_value,
                                                version=version)

        self.is_validated = True
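For reference, the ``deprecated_values`` table the loop walks has the shape ``{section: {option: (old, new, version)}}``, where ``old`` is a pattern usable with ``re.sub``. An illustrative (made-up) entry:

import re

deprecated_values = {
    "core": {
        "hostname_callable": (
            re.compile(r":"),  # old: value still using the ':' separator
            ".",               # new: replacement applied via re.sub
            "2.1",             # version in which the old form stops being accepted
        ),
    },
}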
Code Example #11
    def execute(self, context):
        try:
            OUTLOOK_USER = configuration.conf.get('outlook', 'OUTLOOK_USER')
            OUTLOOK_PASSWORD = configuration.conf.get('outlook',
                                                      'OUTLOOK_PASSWORD')
        except AirflowConfigException:
            raise AirflowConfigException(
                "No user/password found for Outlook in the Airflow configuration; "
                "cannot authenticate."
            )
        authentication = (OUTLOOK_USER, OUTLOOK_PASSWORD)

        m = Message(auth=authentication)
        m.setRecipients(self.to)
        m.setSubject(self.subject)
        if self.html_flag:
            m.setBodyHTML(self.body)
        else:
            m.setBody(self.body)
        if isinstance(self.files, (list, )):
            for file in self.files:
                att = Attachment(path=file)
                m.attachments.append(att)
        m.sendMessage()
        print("True")
Code Example #12
File: logging_config.py  Project: vipadm/airflow
def validate_logging_config(logging_config):
    """Validate the provided Logging Config"""
    # Now lets validate the other logging-related settings
    task_log_reader = conf.get('logging', 'task_log_reader')

    logger = logging.getLogger('airflow.task')

    def _get_handler(name):
        return next((h for h in logger.handlers if h.name == name), None)

    if _get_handler(task_log_reader) is None:
        # Check for pre 1.10 setting that might be in deployed airflow.cfg files
        if task_log_reader == "file.task" and _get_handler("task"):
            warnings.warn(
                f"task_log_reader setting in [logging] has a deprecated value of {task_log_reader!r}, "
                "but no handler with this name was found. Please update your config to use task. "
                "Running config has been adjusted to match",
                DeprecationWarning,
            )
            conf.set('logging', 'task_log_reader', 'task')
        else:
            raise AirflowConfigException(
                f"Configured task_log_reader {task_log_reader!r} was not a handler of "
                f"the 'airflow.task' logger.")
Code Example #13
    def _validate(self):
        if (self.get("core", "executor") != 'SequentialExecutor'
                and "sqlite" in self.get('core', 'sql_alchemy_conn')):
            raise AirflowConfigException(
                "error: cannot use sqlite with the {}".format(
                    self.get('core', 'executor')))

        for section, replacement in self.deprecated_values.items():
            for name, info in replacement.items():
                old, new, version = info
                if self.get(section, name, fallback=None) == old:
                    # Make sure the env var option is removed, otherwise it
                    # would be read and used instead of the value we set
                    env_var = self._env_var_name(section, name)
                    os.environ.pop(env_var, None)

                    self.set(section, name, new)
                    warnings.warn(
                        self.deprecation_value_format_string.format(
                            **locals()),
                        FutureWarning,
                    )

        self.is_validated = True
Code Example #14
    def get_statsd_logger(self):
        if conf.getboolean('scheduler', 'statsd_on'):
            from statsd import StatsClient

            if conf.has_option('scheduler', 'statsd_custom_client_path'):
                stats_class = conf.getimport('scheduler', 'statsd_custom_client_path')

                if not issubclass(stats_class, StatsClient):
                    raise AirflowConfigException(
                        "Your custom Statsd client must extend the statsd.StatsClient in order to ensure "
                        "backwards compatibility."
                    )
                else:
                    log.info("Successfully loaded custom Statsd client")

            else:
                stats_class = StatsClient

            statsd = stats_class(
                host=conf.get('scheduler', 'statsd_host'),
                port=conf.getint('scheduler', 'statsd_port'),
                prefix=conf.get('scheduler', 'statsd_prefix'))
            allow_list_validator = AllowListValidator(conf.get('scheduler', 'statsd_allow_list', fallback=None))
            return SafeStatsdLogger(statsd, allow_list_validator)
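A minimal sketch of a custom client that would satisfy the ``issubclass`` check above; the class name and metric prefix are made up, and the module path you point ``statsd_custom_client_path`` at is deployment-specific.

from statsd import StatsClient

class PrefixedStatsClient(StatsClient):
    """Illustrative subclass; a real one might add tags or sampling rules."""

    def incr(self, stat, count=1, rate=1):
        # Namespace every counter under a team prefix before delegating.
        return super().incr(f"data-team.{stat}", count, rate)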
Code Example #15
def create_app(config=None, testing=False):
    """Create a new instance of Airflow WWW app"""
    flask_app = Flask(__name__)
    flask_app.secret_key = conf.get('webserver', 'SECRET_KEY')

    flask_app.config['PERMANENT_SESSION_LIFETIME'] = timedelta(
        minutes=settings.get_session_lifetime_config())
    flask_app.config.from_pyfile(settings.WEBSERVER_CONFIG, silent=True)
    flask_app.config['APP_NAME'] = conf.get(section="webserver",
                                            key="instance_name",
                                            fallback="Airflow")
    flask_app.config['TESTING'] = testing
    flask_app.config['SQLALCHEMY_DATABASE_URI'] = conf.get(
        'database', 'SQL_ALCHEMY_CONN')

    url = make_url(flask_app.config['SQLALCHEMY_DATABASE_URI'])
    if url.drivername == 'sqlite' and url.database and not url.database.startswith(
            '/'):
        raise AirflowConfigException(
            f'Cannot use relative path: `{conf.get("database", "SQL_ALCHEMY_CONN")}` to connect to sqlite. '
            'Please use absolute path such as `sqlite:////tmp/airflow.db`.')

    flask_app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False

    flask_app.config['SESSION_COOKIE_HTTPONLY'] = True
    flask_app.config['SESSION_COOKIE_SECURE'] = conf.getboolean(
        'webserver', 'COOKIE_SECURE')

    cookie_samesite_config = conf.get('webserver', 'COOKIE_SAMESITE')
    if cookie_samesite_config == "":
        warnings.warn(
            "Old deprecated value found for `cookie_samesite` option in `[webserver]` section. "
            "Using `Lax` instead. Change the value to `Lax` in airflow.cfg to remove this warning.",
            DeprecationWarning,
        )
        cookie_samesite_config = "Lax"
    flask_app.config['SESSION_COOKIE_SAMESITE'] = cookie_samesite_config

    if config:
        flask_app.config.from_mapping(config)

    if 'SQLALCHEMY_ENGINE_OPTIONS' not in flask_app.config:
        flask_app.config[
            'SQLALCHEMY_ENGINE_OPTIONS'] = settings.prepare_engine_args()

    # Configure the JSON encoder used by `|tojson` filter from Flask
    flask_app.json_encoder = AirflowJsonEncoder

    csrf.init_app(flask_app)

    init_wsgi_middleware(flask_app)

    db = SQLA()
    db.session = settings.Session
    db.init_app(flask_app)

    init_dagbag(flask_app)

    init_api_experimental_auth(flask_app)

    init_robots(flask_app)

    cache_config = {
        'CACHE_TYPE': 'flask_caching.backends.filesystem',
        'CACHE_DIR': gettempdir()
    }
    Cache(app=flask_app, config=cache_config)

    init_flash_views(flask_app)

    configure_logging()
    configure_manifest_files(flask_app)

    with flask_app.app_context():
        init_appbuilder(flask_app)

        init_appbuilder_views(flask_app)
        init_appbuilder_links(flask_app)
        init_plugins(flask_app)
        init_connection_form()
        init_error_handlers(flask_app)
        init_api_connexion(flask_app)
        init_api_experimental(flask_app)

        sync_appbuilder_roles(flask_app)

        init_jinja_globals(flask_app)
        init_xframe_protection(flask_app)
        init_airflow_session_interface(flask_app)
    return flask_app
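The sqlite guard in the factory above can be illustrated with SQLAlchemy's URL parser: three slashes yield a relative database path, four an absolute one. This is a standalone sketch, not part of the factory.

from sqlalchemy.engine.url import make_url

relative = make_url("sqlite:///relative/airflow.db")  # database: "relative/airflow.db"
absolute = make_url("sqlite:////tmp/airflow.db")      # database: "/tmp/airflow.db"
print(relative.database.startswith("/"), absolute.database.startswith("/"))  # False True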
Code Example #16
    def _validate(self):
        if not self.dags_volume_claim and (not self.git_repo or not self.git_branch):
            raise AirflowConfigException(
                'In kubernetes mode the following must be set in the `kubernetes` '
                'config section: `dags_volume_claim` or `git_repo and git_branch`')
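The validation above expects either a DAGs volume claim or a git repo plus branch under the ``[kubernetes]`` section. A hedged sketch of supplying them via environment variables; the values are placeholders and the option names assume the older Kubernetes-executor config layout referenced by this snippet.

import os

# Option 1: pre-created PersistentVolumeClaim holding the DAGs.
os.environ["AIRFLOW__KUBERNETES__DAGS_VOLUME_CLAIM"] = "airflow-dags-pvc"

# Option 2: git-sync the DAGs instead.
os.environ["AIRFLOW__KUBERNETES__GIT_REPO"] = "https://github.com/example-org/dags.git"
os.environ["AIRFLOW__KUBERNETES__GIT_BRANCH"] = "main"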
Code Example #17
    def get(self, section, key, **kwargs):
        section = str(section).lower()
        key = str(key).lower()

        deprecated_name = self.deprecated_options.get(section, {}).get(key, None)

        # first check environment variables
        option = self._get_env_var_option(section, key)
        if option is not None:
            return option
        if deprecated_name:
            option = self._get_env_var_option(section, deprecated_name)
            if option is not None:
                self._warn_deprecate(section, key, deprecated_name)
                return option

        # ...then the config file
        if super(AirflowConfigParser, self).has_option(section, key):
            # Use the parent's methods to get the actual config here to be able to
            # separate the config from default config.
            return expand_env_var(
                super(AirflowConfigParser, self).get(section, key, **kwargs))
        if deprecated_name:
            if super(AirflowConfigParser, self).has_option(section, deprecated_name):
                self._warn_deprecate(section, key, deprecated_name)
                return expand_env_var(super(AirflowConfigParser, self).get(
                    section,
                    deprecated_name,
                    **kwargs
                ))

        # ...then commands
        option = self._get_cmd_option(section, key)
        if option:
            return option
        if deprecated_name:
            option = self._get_cmd_option(section, deprecated_name)
            if option:
                self._warn_deprecate(section, key, deprecated_name)
                return option

        # ...then from secret backends
        option = self._get_secret_option(section, key)
        if option:
            return option
        if deprecated_name:
            option = self._get_secret_option(section, deprecated_name)
            if option:
                self._warn_deprecate(section, key, deprecated_name)
                return option

        # ...then the default config
        if self.airflow_defaults.has_option(section, key) or 'fallback' in kwargs:
            return expand_env_var(
                self.airflow_defaults.get(section, key, **kwargs))

        else:
            log.warning(
                "section/key [%s/%s] not found in config", section, key
            )

            raise AirflowConfigException(
                "section/key [{section}/{key}] not found "
                "in config".format(section=section, key=key))
Code Example #18
File: kube_config.py  Project: vipadm/airflow
    def __init__(self):
        configuration_dict = conf.as_dict(display_sensitive=True)
        self.core_configuration = configuration_dict[self.core_section]
        self.airflow_home = AIRFLOW_HOME
        self.dags_folder = conf.get(self.core_section, 'dags_folder')
        self.parallelism = conf.getint(self.core_section, 'parallelism')
        self.pod_template_file = conf.get(self.kubernetes_section,
                                          'pod_template_file',
                                          fallback=None)

        self.delete_worker_pods = conf.getboolean(self.kubernetes_section,
                                                  'delete_worker_pods')
        self.delete_worker_pods_on_failure = conf.getboolean(
            self.kubernetes_section, 'delete_worker_pods_on_failure')
        self.worker_pods_creation_batch_size = conf.getint(
            self.kubernetes_section, 'worker_pods_creation_batch_size')

        self.worker_container_repository = conf.get(
            self.kubernetes_section, 'worker_container_repository')
        self.worker_container_tag = conf.get(self.kubernetes_section,
                                             'worker_container_tag')
        if self.worker_container_repository and self.worker_container_tag:
            self.kube_image = f'{self.worker_container_repository}:{self.worker_container_tag}'
        else:
            self.kube_image = None

        # The Kubernetes Namespace in which the Scheduler and Webserver reside. Note
        # that if your
        # cluster has RBAC enabled, your scheduler may need service account permissions to
        # create, watch, get, and delete pods in this namespace.
        self.kube_namespace = conf.get(self.kubernetes_section, 'namespace')
        self.multi_namespace_mode = conf.getboolean(self.kubernetes_section,
                                                    'multi_namespace_mode')
        # The Kubernetes Namespace in which pods will be created by the executor. Note
        # that if your
        # cluster has RBAC enabled, your workers may need service account permissions to
        # interact with cluster components.
        self.executor_namespace = conf.get(self.kubernetes_section,
                                           'namespace')

        self.worker_pods_pending_timeout = conf.getint(
            self.kubernetes_section, 'worker_pods_pending_timeout')
        self.worker_pods_pending_timeout_check_interval = conf.getint(
            self.kubernetes_section,
            'worker_pods_pending_timeout_check_interval')
        self.worker_pods_pending_timeout_batch_size = conf.getint(
            self.kubernetes_section, 'worker_pods_pending_timeout_batch_size')
        self.worker_pods_queued_check_interval = conf.getint(
            self.kubernetes_section, 'worker_pods_queued_check_interval')

        self.kube_client_request_args = conf.getjson(
            self.kubernetes_section, 'kube_client_request_args', fallback={})
        if not isinstance(self.kube_client_request_args, dict):
            raise AirflowConfigException(
                f"[{self.kubernetes_section}] 'kube_client_request_args' expected a JSON dict, got "
                + type(self.kube_client_request_args).__name__)
        if self.kube_client_request_args:
            if '_request_timeout' in self.kube_client_request_args and isinstance(
                    self.kube_client_request_args['_request_timeout'], list):
                self.kube_client_request_args['_request_timeout'] = tuple(
                    self.kube_client_request_args['_request_timeout'])
        self.delete_option_kwargs = conf.getjson(self.kubernetes_section,
                                                 'delete_option_kwargs',
                                                 fallback={})
        if not isinstance(self.delete_option_kwargs, dict):
            raise AirflowConfigException(
                f"[{self.kubernetes_section}] 'delete_option_kwargs' expected a JSON dict, got "
                + type(self.delete_option_kwargs).__name__)
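As a quick illustration of the ``kube_client_request_args`` handling above: the option is parsed as JSON, and a list ``_request_timeout`` is converted to the tuple form the Kubernetes client accepts (connect timeout, read timeout). The values here are made up.

import json

kube_client_request_args = json.loads('{"_request_timeout": [60, 60]}')
timeout = kube_client_request_args.get("_request_timeout")
if isinstance(timeout, list):
    kube_client_request_args["_request_timeout"] = tuple(timeout)
print(kube_client_request_args)  # {'_request_timeout': (60, 60)}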