def start_go_pipeline(
    self,
    variables: dict,
    go_file: str,
    process_line_callback: Optional[Callable[[str], None]] = None,
    should_init_module: bool = False,
) -> None:
    """
    Starts Apache Beam Go pipeline.

    :param variables: Variables passed to the job.
    :param go_file: Path to the Go file with your beam pipeline.
    :param process_line_callback: (optional) Callback that can be used to process
        each line of the stdout and stderr file descriptors.
    :param should_init_module: If False (default), will just execute a `go run` command.
        If True, will init a module and dependencies with a ``go mod init`` and
        ``go mod tidy``, useful when pulling source with GCSHook.
    """
    if shutil.which("go") is None:
        raise AirflowConfigException(
            "You need to have Go installed to run beam go pipeline. See https://go.dev/doc/install "
            "installation guide. If you are running airflow in Docker see more info at "
            "'https://airflow.apache.org/docs/docker-stack/recipes.html'."
        )

    if "labels" in variables:
        variables["labels"] = json.dumps(variables["labels"], separators=(",", ":"))

    working_directory = os.path.dirname(go_file)
    basename = os.path.basename(go_file)

    if should_init_module:
        init_module("main", working_directory)
        install_dependencies(working_directory)

    command_prefix = ["go", "run", basename]
    self._start_pipeline(
        variables=variables,
        command_prefix=command_prefix,
        process_line_callback=process_line_callback,
        working_directory=working_directory,
    )
def run_command(command):
    """Runs command and returns stdout."""
    process = subprocess.Popen(
        shlex.split(command),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        close_fds=True,
    )
    output, stderr = [
        stream.decode(sys.getdefaultencoding(), 'ignore')
        for stream in process.communicate()
    ]

    if process.returncode != 0:
        raise AirflowConfigException(
            "Cannot execute {}. Error code is: {}. Output: {}, Stderr: {}".format(
                command, process.returncode, output, stderr
            )
        )

    return output
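# Hedged usage sketch (not part of the original module): run_command is the
# helper behind Airflow's "_cmd" configuration options, where an option value
# is the stdout of a shell command, and a non-zero exit code surfaces as an
# AirflowConfigException. The commands below are purely illustrative.
print(run_command("echo sqlite:////tmp/airflow.db").strip())
# -> "sqlite:////tmp/airflow.db"

try:
    run_command("false")  # exits with a non-zero return code
except AirflowConfigException as err:
    print(err)  # "Cannot execute false. Error code is: 1. ..."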
def getuser() -> str:
    """
    Gets the username associated with the current user, or error with a nice
    error message if there's no current user.

    We don't want to fall back to os.getuid() because not having a username
    probably means the rest of the user environment is wrong (e.g. no $HOME).
    Explicit failure is better than silently trying to work badly.
    """
    try:
        return getpass.getuser()
    except KeyError:
        # Inner import to avoid circular import
        from airflow.exceptions import AirflowConfigException

        raise AirflowConfigException(
            "The user that Airflow is running as has no username; you must run "
            "Airflow as a full user, with a username and home directory, "
            "in order for it to function properly."
        )
def fetch_config(ds, **kwargs):
    CONFIG_PARAM, config = 'file', None
    conf_file = kwargs['dag_run'].conf[CONFIG_PARAM]
    if conf_file is None or conf_file.strip() == '':
        raise AirflowException(
            'Config parameter {} is not specified.'.format(CONFIG_PARAM))

    print('Config file for the job: {}'.format(conf_file))
    print('Reading configuration from {}'.format(conf_file))
    try:
        with open(conf_file, "r") as f:
            config = yaml.safe_load(f)
    except Exception as error:
        raise AirflowException(
            'Error while reading the config file: {}'.format(error))

    try:
        validate(instance=config, schema=CONFIG_SCHEMA)
    except ValidationError as error:
        raise AirflowConfigException(
            'Invalid configuration specified: {}'.format(error))

    if 'staging_table_id' not in config['bigquery']:
        config['bigquery']['staging_table_id'] = 'staging_{}'.format(
            str(uuid4())[:8])

    config['bigquery']['merge_table'] = '{}:{}.{}'.format(
        config['bigquery']['project_id'],
        config['bigquery']['dataset_id'],
        config['bigquery']['table_id'])
    config['bigquery']['staging_table'] = '{}:{}.{}'.format(
        config['bigquery']['project_id'],
        config['bigquery']['dataset_id'],
        config['bigquery']['staging_table_id'])

    if 'jobname' not in config['dataflow']:
        config['dataflow']['jobname'] = '{}-{}'.format(
            config['dataflow']['job_name_prefix'], str(uuid4())[:8])

    config['bigquery']['merge_query'] = (
        'MERGE `{}` t USING `{}` s ON {} '
        'WHEN MATCHED THEN {} WHEN NOT MATCHED THEN {}'.format(
            config['bigquery']['merge_table'].replace(':', '.'),
            config['bigquery']['staging_table'].replace(':', '.'),
            config['bigquery']['merge']['condition'],
            config['bigquery']['merge']['matched'],
            config['bigquery']['merge']['notmatched']))

    print('Airflow config: {}'.format(config))
    config_var = 'config-{}'.format(kwargs['dag_run'].run_id)
    print('Writing config to variable: {}'.format(config_var))
    Variable.set(config_var, config, serialize_json=True)
def load_executor(cls, executor_name: str) -> BaseExecutor:
    """
    Loads the executor.

    This supports the following formats:

    * by executor name for core executor
    * by ``{plugin_name}.{class_name}`` for executor from plugins
    * by import path.

    :return: an instance of executor class via executor_name
    """
    if executor_name == CELERY_KUBERNETES_EXECUTOR:
        return cls.__load_celery_kubernetes_executor()

    if executor_name in cls.executors:
        log.debug("Loading core executor: %s", executor_name)
        return import_string(cls.executors[executor_name])()

    # If the executor name looks like "plugin executor path" then try to load plugins.
    if executor_name.count(".") == 1:
        log.debug(
            "The executor name looks like the plugin path (executor_name=%s). Trying to load an "
            "executor from a plugin",
            executor_name,
        )
        with suppress(ImportError), suppress(AttributeError):
            # Load plugins here for executors as at that time the plugins might not have been
            # initialized yet
            from airflow import plugins_manager

            plugins_manager.integrate_executor_plugins()
            return import_string(f"airflow.executors.{executor_name}")()

    log.debug("Loading executor from custom path: %s", executor_name)
    try:
        executor = import_string(executor_name)()
    except ImportError as e:
        log.error(e)
        raise AirflowConfigException(
            f'The module/attribute could not be loaded. Please check "executor" key in "core" section. '
            f'Current value: "{executor_name}".'
        )

    log.info("Loaded executor: %s", executor_name)
    return executor
def getimport(self, section, key, **kwargs):  # noqa
    """
    Reads options, imports the fully qualified name, and returns the object.

    In case of failure, it throws an exception with a clear message containing
    the key and the section names.

    :return: The object or None, if the option is empty
    """
    full_qualified_path = conf.get(section=section, key=key, **kwargs)
    if not full_qualified_path:
        return None

    try:
        return import_string(full_qualified_path)
    except ImportError as e:
        log.error(e)
        raise AirflowConfigException(
            f'The object could not be loaded. Please check "{key}" key in "{section}" section. '
            f'Current value: "{full_qualified_path}".'
        )
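# Hedged usage sketch (the option shown is an assumption, not taken from the
# snippet above): getimport() suits options that store a dotted import path,
# for example an email backend configured as
#
#   [email]
#   email_backend = airflow.utils.email.send_email_smtp
#
# The call returns the imported object, None when the option is empty, or
# raises AirflowConfigException when the path cannot be imported.
email_backend = conf.getimport('email', 'email_backend')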
def execute(self, context):
    ambari_hook = HdpAmbariHook(ambari_conn_id=self.ambari_conn_id)
    datas = {"user.name": ambari_hook.cluster_name}

    for attr_name in ["statusdir", "files", "callback"]:
        attr_value = getattr(self, attr_name)
        if attr_value is not None and attr_value != "":
            datas[attr_name] = attr_value

    if is_not_null_and_is_not_empty_str(self.file):
        datas["file"] = self.file
    elif is_not_null_and_is_not_empty_str(self.execute_query):
        datas["execute"] = self.execute_query
    else:
        raise AirflowConfigException("Request body must include file or execute params")

    if self.enablelog:
        datas["enablelog"] = self.enablelog

    job_id = ambari_hook.submit_hive_job(datas, self.arg)

    if self.do_xcom_push:
        context['ti'].xcom_push(key='hive_job_id', value=job_id)
def submit_hive_job(self, body_params: dict, arg: str = None) -> str:
    """
    Executes hql code or hive script in Azure HDInsight Cluster.

    See https://cwiki.apache.org/confluence/display/Hive/WebHCat+Reference+Job

    :param body_params: parameters of the Hive script
    :type body_params: dict
    :param arg: define arg params for hive => key1=value1;key2=value2
    :type arg: str
    """
    if not ("file" in body_params or "execute" in body_params):
        raise AirflowConfigException(
            "Request body must include file or execute params")

    body_params.update(self.default_params)

    if arg is not None:
        hive_defines = urlencode([("define", x) for x in str(arg).split(";")])
        self.query = urlencode(body_params) + "&" + hive_defines
    else:
        self.query = urlencode(body_params)

    self.method = "POST"
    submit_endpoint = self.hive_endpoint + "hive"

    self.log.debug("Submitting hive script: %s", str(self.query))
    response = self.run(endpoint=submit_endpoint, data=self.query, headers=self.headers)
    job_id = response["id"]
    self.log.info("Finished submitting hive script job_id: %s", job_id)
    return job_id
def get(self, section, key, **kwargs):
    section = str(section).lower()
    key = str(key).lower()

    # first check environment variables
    option = self._get_env_var_option(section, key)
    if option is not None:
        return option

    # ...then the config file
    if self.has_option(section, key):
        return expand_env_var(
            ConfigParser.get(self, section, key, **kwargs))

    # ...then commands
    option = self._get_cmd_option(section, key)
    if option:
        return option

    log.warning("section/key [{section}/{key}] not found in config".format(**locals()))

    raise AirflowConfigException(
        "section/key [{section}/{key}] not found "
        "in config".format(**locals()))
def _validate(self): if (self.get("core", "executor") not in ('DebugExecutor', 'SequentialExecutor') and "sqlite" in self.get('core', 'sql_alchemy_conn')): raise AirflowConfigException( "error: cannot use sqlite with the {}".format( self.get('core', 'executor'))) for section, replacement in self.deprecated_values.items(): for name, info in replacement.items(): old, new, version = info current_value = self.get(section, name, fallback=None) if self._using_old_value(old, current_value): new_value = re.sub(old, new, current_value) self._update_env_var(section=section, name=name, new_value=new_value) self._create_future_warning(name=name, section=section, current_value=current_value, new_value=new_value, version=version) self.is_validated = True
def execute(self, context):
    try:
        OUTLOOK_USER = configuration.conf.get('outlook', 'OUTLOOK_USER')
        OUTLOOK_PASSWORD = configuration.conf.get('outlook', 'OUTLOOK_PASSWORD')
    except AirflowConfigException:
        raise AirflowConfigException(
            "No user/password found for Outlook, so logging in with no authentication."
        )

    authentication = (OUTLOOK_USER, OUTLOOK_PASSWORD)
    m = Message(auth=authentication)
    m.setRecipients(self.to)
    m.setSubject(self.subject)

    if self.html_flag:
        m.setBodyHTML(self.body)
    else:
        m.setBody(self.body)

    if isinstance(self.files, (list,)):
        for file in self.files:
            att = Attachment(path=file)
            m.attachments.append(att)

    m.sendMessage()
    print("True")
def validate_logging_config(logging_config):
    """Validate the provided Logging Config"""
    # Now let's validate the other logging-related settings
    task_log_reader = conf.get('logging', 'task_log_reader')

    logger = logging.getLogger('airflow.task')

    def _get_handler(name):
        return next((h for h in logger.handlers if h.name == name), None)

    if _get_handler(task_log_reader) is None:
        # Check for pre 1.10 setting that might be in deployed airflow.cfg files
        if task_log_reader == "file.task" and _get_handler("task"):
            warnings.warn(
                f"task_log_reader setting in [logging] has a deprecated value of {task_log_reader!r}, "
                "but no handler with this name was found. Please update your config to use task. "
                "Running config has been adjusted to match",
                DeprecationWarning,
            )
            conf.set('logging', 'task_log_reader', 'task')
        else:
            raise AirflowConfigException(
                f"Configured task_log_reader {task_log_reader!r} was not a handler of "
                f"the 'airflow.task' logger."
            )
def _validate(self): if (self.get("core", "executor") != 'SequentialExecutor' and "sqlite" in self.get('core', 'sql_alchemy_conn')): raise AirflowConfigException( "error: cannot use sqlite with the {}".format( self.get('core', 'executor'))) for section, replacement in self.deprecated_values.items(): for name, info in replacement.items(): old, new, version = info if self.get(section, name, fallback=None) == old: # Make sure the env var option is removed, otherwise it # would be read and used instead of the value we set env_var = self._env_var_name(section, name) os.environ.pop(env_var, None) self.set(section, name, new) warnings.warn( self.deprecation_value_format_string.format( **locals()), FutureWarning, ) self.is_validated = True
def get_statsd_logger(self):
    if conf.getboolean('scheduler', 'statsd_on'):
        from statsd import StatsClient

        if conf.has_option('scheduler', 'statsd_custom_client_path'):
            stats_class = conf.getimport('scheduler', 'statsd_custom_client_path')

            if not issubclass(stats_class, StatsClient):
                raise AirflowConfigException(
                    "Your custom Statsd client must extend the statsd.StatsClient in order to ensure "
                    "backwards compatibility."
                )
            else:
                log.info("Successfully loaded custom Statsd client")
        else:
            stats_class = StatsClient

        statsd = stats_class(
            host=conf.get('scheduler', 'statsd_host'),
            port=conf.getint('scheduler', 'statsd_port'),
            prefix=conf.get('scheduler', 'statsd_prefix'))
        allow_list_validator = AllowListValidator(conf.get('scheduler', 'statsd_allow_list', fallback=None))
        return SafeStatsdLogger(statsd, allow_list_validator)
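# Hedged sketch (module and class names are illustrative): a client referenced
# by [scheduler] statsd_custom_client_path must subclass statsd.StatsClient,
# otherwise get_statsd_logger raises AirflowConfigException.
from statsd import StatsClient


class TaggedStatsClient(StatsClient):
    """Example custom client that namespaces every counter under one prefix."""

    def incr(self, stat, count=1, rate=1):
        return super().incr(f"myteam.{stat}", count, rate)


# airflow.cfg (illustrative):
#   [scheduler]
#   statsd_on = True
#   statsd_custom_client_path = my_company.metrics.TaggedStatsClient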
def create_app(config=None, testing=False):
    """Create a new instance of Airflow WWW app"""
    flask_app = Flask(__name__)
    flask_app.secret_key = conf.get('webserver', 'SECRET_KEY')

    flask_app.config['PERMANENT_SESSION_LIFETIME'] = timedelta(
        minutes=settings.get_session_lifetime_config())
    flask_app.config.from_pyfile(settings.WEBSERVER_CONFIG, silent=True)
    flask_app.config['APP_NAME'] = conf.get(section="webserver", key="instance_name", fallback="Airflow")
    flask_app.config['TESTING'] = testing
    flask_app.config['SQLALCHEMY_DATABASE_URI'] = conf.get('database', 'SQL_ALCHEMY_CONN')

    url = make_url(flask_app.config['SQLALCHEMY_DATABASE_URI'])
    if url.drivername == 'sqlite' and url.database and not url.database.startswith('/'):
        raise AirflowConfigException(
            f'Cannot use relative path: `{conf.get("database", "SQL_ALCHEMY_CONN")}` to connect to sqlite. '
            'Please use absolute path such as `sqlite:////tmp/airflow.db`.')

    flask_app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False

    flask_app.config['SESSION_COOKIE_HTTPONLY'] = True
    flask_app.config['SESSION_COOKIE_SECURE'] = conf.getboolean('webserver', 'COOKIE_SECURE')

    cookie_samesite_config = conf.get('webserver', 'COOKIE_SAMESITE')
    if cookie_samesite_config == "":
        warnings.warn(
            "Old deprecated value found for `cookie_samesite` option in `[webserver]` section. "
            "Using `Lax` instead. Change the value to `Lax` in airflow.cfg to remove this warning.",
            DeprecationWarning,
        )
        cookie_samesite_config = "Lax"
    flask_app.config['SESSION_COOKIE_SAMESITE'] = cookie_samesite_config

    if config:
        flask_app.config.from_mapping(config)

    if 'SQLALCHEMY_ENGINE_OPTIONS' not in flask_app.config:
        flask_app.config['SQLALCHEMY_ENGINE_OPTIONS'] = settings.prepare_engine_args()

    # Configure the JSON encoder used by `|tojson` filter from Flask
    flask_app.json_encoder = AirflowJsonEncoder

    csrf.init_app(flask_app)

    init_wsgi_middleware(flask_app)

    db = SQLA()
    db.session = settings.Session
    db.init_app(flask_app)

    init_dagbag(flask_app)

    init_api_experimental_auth(flask_app)

    init_robots(flask_app)

    cache_config = {'CACHE_TYPE': 'flask_caching.backends.filesystem', 'CACHE_DIR': gettempdir()}
    Cache(app=flask_app, config=cache_config)

    init_flash_views(flask_app)

    configure_logging()
    configure_manifest_files(flask_app)

    with flask_app.app_context():
        init_appbuilder(flask_app)
        init_appbuilder_views(flask_app)
        init_appbuilder_links(flask_app)
        init_plugins(flask_app)
        init_connection_form()
        init_error_handlers(flask_app)
        init_api_connexion(flask_app)
        init_api_experimental(flask_app)

        sync_appbuilder_roles(flask_app)

        init_jinja_globals(flask_app)
        init_xframe_protection(flask_app)
        init_airflow_session_interface(flask_app)

    return flask_app
def _validate(self):
    if not self.dags_volume_claim and (not self.git_repo or not self.git_branch):
        raise AirflowConfigException(
            'In kubernetes mode the following must be set in the `kubernetes` '
            'config section: `dags_volume_claim` or `git_repo and git_branch`')
def get(self, section, key, **kwargs):
    section = str(section).lower()
    key = str(key).lower()

    deprecated_name = self.deprecated_options.get(section, {}).get(key, None)

    # first check environment variables
    option = self._get_env_var_option(section, key)
    if option is not None:
        return option
    if deprecated_name:
        option = self._get_env_var_option(section, deprecated_name)
        if option is not None:
            self._warn_deprecate(section, key, deprecated_name)
            return option

    # ...then the config file
    if super(AirflowConfigParser, self).has_option(section, key):
        # Use the parent's methods to get the actual config here to be able to
        # separate the config from default config.
        return expand_env_var(
            super(AirflowConfigParser, self).get(section, key, **kwargs))
    if deprecated_name:
        if super(AirflowConfigParser, self).has_option(section, deprecated_name):
            self._warn_deprecate(section, key, deprecated_name)
            return expand_env_var(super(AirflowConfigParser, self).get(
                section,
                deprecated_name,
                **kwargs
            ))

    # ...then commands
    option = self._get_cmd_option(section, key)
    if option:
        return option
    if deprecated_name:
        option = self._get_cmd_option(section, deprecated_name)
        if option:
            self._warn_deprecate(section, key, deprecated_name)
            return option

    # ...then from secret backends
    option = self._get_secret_option(section, key)
    if option:
        return option
    if deprecated_name:
        option = self._get_secret_option(section, deprecated_name)
        if option:
            self._warn_deprecate(section, key, deprecated_name)
            return option

    # ...then the default config
    if self.airflow_defaults.has_option(section, key) or 'fallback' in kwargs:
        return expand_env_var(
            self.airflow_defaults.get(section, key, **kwargs))
    else:
        log.warning(
            "section/key [%s/%s] not found in config", section, key
        )

        raise AirflowConfigException(
            "section/key [{section}/{key}] not found "
            "in config".format(section=section, key=key))
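# Hedged usage sketch: the lookup order above means an AIRFLOW__{SECTION}__{KEY}
# environment variable wins over airflow.cfg, "_cmd" commands, secret backends,
# and the shipped defaults. The value below is illustrative.
import os

os.environ["AIRFLOW__CORE__PARALLELISM"] = "64"
print(conf.get("core", "parallelism"))  # -> "64", regardless of airflow.cfg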
def __init__(self):
    configuration_dict = conf.as_dict(display_sensitive=True)
    self.core_configuration = configuration_dict[self.core_section]
    self.airflow_home = AIRFLOW_HOME
    self.dags_folder = conf.get(self.core_section, 'dags_folder')
    self.parallelism = conf.getint(self.core_section, 'parallelism')
    self.pod_template_file = conf.get(self.kubernetes_section, 'pod_template_file', fallback=None)

    self.delete_worker_pods = conf.getboolean(self.kubernetes_section, 'delete_worker_pods')
    self.delete_worker_pods_on_failure = conf.getboolean(
        self.kubernetes_section, 'delete_worker_pods_on_failure')
    self.worker_pods_creation_batch_size = conf.getint(
        self.kubernetes_section, 'worker_pods_creation_batch_size')

    self.worker_container_repository = conf.get(
        self.kubernetes_section, 'worker_container_repository')
    self.worker_container_tag = conf.get(self.kubernetes_section, 'worker_container_tag')
    if self.worker_container_repository and self.worker_container_tag:
        self.kube_image = f'{self.worker_container_repository}:{self.worker_container_tag}'
    else:
        self.kube_image = None

    # The Kubernetes Namespace in which the Scheduler and Webserver reside. Note that if your
    # cluster has RBAC enabled, your scheduler may need service account permissions to
    # create, watch, get, and delete pods in this namespace.
    self.kube_namespace = conf.get(self.kubernetes_section, 'namespace')
    self.multi_namespace_mode = conf.getboolean(self.kubernetes_section, 'multi_namespace_mode')
    # The Kubernetes Namespace in which pods will be created by the executor. Note that if your
    # cluster has RBAC enabled, your workers may need service account permissions to
    # interact with cluster components.
    self.executor_namespace = conf.get(self.kubernetes_section, 'namespace')

    self.worker_pods_pending_timeout = conf.getint(
        self.kubernetes_section, 'worker_pods_pending_timeout')
    self.worker_pods_pending_timeout_check_interval = conf.getint(
        self.kubernetes_section, 'worker_pods_pending_timeout_check_interval')
    self.worker_pods_pending_timeout_batch_size = conf.getint(
        self.kubernetes_section, 'worker_pods_pending_timeout_batch_size')
    self.worker_pods_queued_check_interval = conf.getint(
        self.kubernetes_section, 'worker_pods_queued_check_interval')

    self.kube_client_request_args = conf.getjson(
        self.kubernetes_section, 'kube_client_request_args', fallback={})
    if not isinstance(self.kube_client_request_args, dict):
        raise AirflowConfigException(
            f"[{self.kubernetes_section}] 'kube_client_request_args' expected a JSON dict, got "
            + type(self.kube_client_request_args).__name__)
    if self.kube_client_request_args:
        if '_request_timeout' in self.kube_client_request_args and isinstance(
            self.kube_client_request_args['_request_timeout'], list
        ):
            self.kube_client_request_args['_request_timeout'] = tuple(
                self.kube_client_request_args['_request_timeout'])

    self.delete_option_kwargs = conf.getjson(
        self.kubernetes_section, 'delete_option_kwargs', fallback={})
    if not isinstance(self.delete_option_kwargs, dict):
        raise AirflowConfigException(
            f"[{self.kubernetes_section}] 'delete_option_kwargs' expected a JSON dict, got "
            + type(self.delete_option_kwargs).__name__)