def check_events_for_failures(events):
    check.list_param(events, "events", of_type=DagsterEvent)
    for event in events:
        if event.event_type_value == "STEP_FAILURE":
            raise AirflowException(
                "step failed with error: %s" % event.event_specific_data.error.to_string()
            )
def resp_check(_): raise AirflowException('AirflowException raised here!')
def create_instance( self, location: str, instance_id: str, instance: Union[Dict, Instance], project_id: str, retry: Optional[Retry] = None, timeout: Optional[float] = None, metadata: Optional[Sequence[Tuple[str, str]]] = None, ): """ Creates a Redis instance based on the specified tier and memory size. By default, the instance is accessible from the project's `default network <https://cloud.google.com/compute/docs/networks-and-firewalls#networks>`__. :param location: The location of the Cloud Memorystore instance (for example europe-west1) :type location: str :param instance_id: Required. The logical name of the Redis instance in the customer project with the following restrictions: - Must contain only lowercase letters, numbers, and hyphens. - Must start with a letter. - Must be between 1-40 characters. - Must end with a number or a letter. - Must be unique within the customer project / location :type instance_id: str :param instance: Required. A Redis [Instance] resource If a dict is provided, it must be of the same form as the protobuf message :class:`~google.cloud.redis_v1.types.Instance` :type instance: Union[Dict, google.cloud.redis_v1.types.Instance] :param project_id: Project ID of the project that contains the instance. If set to None or missing, the default project_id from the Google Cloud connection is used. :type project_id: str :param retry: A retry object used to retry requests. If ``None`` is specified, requests will not be retried. :type retry: google.api_core.retry.Retry :param timeout: The amount of time, in seconds, to wait for the request to complete. Note that if ``retry`` is specified, the timeout applies to each individual attempt. :type timeout: float :param metadata: Additional metadata that is provided to the method. :type metadata: Sequence[Tuple[str, str]] """ client = self.get_conn() parent = CloudRedisClient.location_path(project_id, location) instance_name = CloudRedisClient.instance_path(project_id, location, instance_id) try: instance = client.get_instance( name=instance_name, retry=retry, timeout=timeout, metadata=metadata ) self.log.info("Instance exists. Skipping creation.") return instance except NotFound: self.log.info("Instance not exists.") if isinstance(instance, dict): instance = ParseDict(instance, Instance()) elif not isinstance(instance, Instance): raise AirflowException("instance is not instance of Instance type or python dict") self._append_label(instance, "airflow-version", "v" + version.version) result = client.create_instance( parent=parent, instance_id=instance_id, instance=instance, retry=retry, timeout=timeout, metadata=metadata, ) result.result() self.log.info("Instance created.") return client.get_instance(name=instance_name, retry=retry, timeout=timeout, metadata=metadata)
def _validate_inputs(self):
    if self.project_id == '':
        raise AirflowException("The required parameter 'project_id' is empty")
    if not self.instance_id:
        raise AirflowException("The required parameter 'instance_id' is empty or None")
def is_terminated(self):
    if self.result_state not in APPLICATION_GATEWAY_JOB_STATES:
        raise AirflowException(
            'Unexpected result state {} while running the Spark job. '
            'Please check with the Application Gateway team.'.format(self.result_state)
        )
    return self.result_state in ('TASK_KILLED', 'TASK_FAILED')
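# A minimal, self-contained sketch of the terminal-state check above, assuming a plain
# result_state string. KNOWN_JOB_STATES and TERMINAL_FAILURE_STATES are illustrative
# stand-ins, not the real APPLICATION_GATEWAY_JOB_STATES constant.
KNOWN_JOB_STATES = {'TASK_RUNNING', 'TASK_FINISHED', 'TASK_KILLED', 'TASK_FAILED'}
TERMINAL_FAILURE_STATES = {'TASK_KILLED', 'TASK_FAILED'}


def is_terminated_sketch(result_state: str) -> bool:
    # Reject states we do not recognise so the error names the offending value.
    if result_state not in KNOWN_JOB_STATES:
        raise ValueError(f'Unexpected Spark job state: {result_state}')
    return result_state in TERMINAL_FAILURE_STATES


assert is_terminated_sketch('TASK_FAILED') is True
assert is_terminated_sketch('TASK_RUNNING') is False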
def execute(self, context):
    """
    Execute the bash command in a temporary directory
    which will be cleaned afterwards.
    """
    self.log.info("Tmp dir root location: \n %s", gettempdir())
    self.lineage_data = self.bash_command

    # Create a temporary directory
    with TemporaryDirectory(prefix='airflowtmp') as tmp_dir:
        # Create a temporary file inside it
        with NamedTemporaryFile(dir=tmp_dir, prefix=self.task_id) as f:
            # Write the bash command into the temporary file
            f.write(bytes(self.bash_command, 'utf_8'))
            f.flush()
            # Get the name of the temporary file
            fname = f.name
            script_location = os.path.abspath(fname)
            self.log.info("Temporary script location: %s", script_location)

            if USE_WINDOWS:
                pre_exec = None
            else:
                def pre_exec():
                    # Restore default signal disposition and invoke setsid
                    # SIG_DFL: the default signal handler
                    for sig in ('SIGPIPE', 'SIGXFZ', 'SIGXFSZ'):
                        if hasattr(signal, sig):
                            signal.signal(getattr(signal, sig), signal.SIG_DFL)
                    # Create a new session with no controlling terminal, detaching the
                    # process from the open terminal, process group and session it
                    # inherited from its parent.
                    os.setsid()

            self.log.info("Running command: %s", self.bash_command)
            # Run the temporary bash script
            sp = Popen(['bash', fname],
                       stdout=PIPE,
                       stderr=STDOUT,
                       cwd=tmp_dir,
                       env=self.env,
                       preexec_fn=pre_exec)
            # Keep a handle on the bash process
            self.sp = sp

            # Stream the output of the bash process
            self.log.info("Output:")
            line = ''
            for line in iter(sp.stdout.readline, b''):
                line = line.decode(self.output_encoding).rstrip()
                self.log.info(line)
            sp.wait()
            # Get the bash process return code
            self.log.info("Command exited with return code %s", sp.returncode)

            # Raise an exception if the script failed
            if sp.returncode:
                raise AirflowException("Bash command failed")

    if self.xcom_push_flag:
        return line
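# A standalone sketch of the pattern used by execute() above: write the command to a
# temporary script, stream its output line by line, and fail on a nonzero exit code.
# Only the standard library is used; run_bash_sketch is an illustrative helper, not Airflow API.
from subprocess import PIPE, STDOUT, Popen
from tempfile import NamedTemporaryFile, TemporaryDirectory


def run_bash_sketch(bash_command: str, output_encoding: str = 'utf-8') -> str:
    last_line = ''
    with TemporaryDirectory(prefix='bashtmp') as tmp_dir:
        with NamedTemporaryFile(dir=tmp_dir, prefix='script') as f:
            f.write(bash_command.encode('utf-8'))
            f.flush()
            sp = Popen(['bash', f.name], stdout=PIPE, stderr=STDOUT, cwd=tmp_dir)
            for raw_line in iter(sp.stdout.readline, b''):
                last_line = raw_line.decode(output_encoding).rstrip()
                print(last_line)
            sp.wait()
            if sp.returncode:
                raise RuntimeError(f'Bash command failed with return code {sp.returncode}')
    return last_line


if __name__ == '__main__':
    print(run_bash_sketch('echo hello && echo world'))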
def handler(signum, frame): raise AirflowException(f"Timeout {timeout}s reached")
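# A minimal sketch of how a handler like the one above is usually wired up: install it with
# signal.signal, arm an alarm, and restore the previous handler afterwards. timeout_guard is
# an illustrative helper (POSIX-only, main thread only), not the surrounding project's API,
# and it raises TimeoutError rather than AirflowException.
import signal
from contextlib import contextmanager


@contextmanager
def timeout_guard(timeout: int):
    def handler(signum, frame):
        raise TimeoutError(f"Timeout {timeout}s reached")

    previous = signal.signal(signal.SIGALRM, handler)
    signal.alarm(timeout)
    try:
        yield
    finally:
        signal.alarm(0)  # disarm the alarm
        signal.signal(signal.SIGALRM, previous)


# Usage: raises TimeoutError if the body takes longer than 2 seconds.
# with timeout_guard(2):
#     do_slow_work()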
def _execute(self): """ Runs a dag for a specified date range. """ session = settings.Session() start_date = self.bf_start_date end_date = self.bf_end_date # picklin' pickle_id = None if not self.donot_pickle and self.executor.__class__ not in ( executors.LocalExecutor, executors.SequentialExecutor): pickle = models.DagPickle(self.dag) session.add(pickle) session.commit() pickle_id = pickle.id executor = self.executor executor.start() executor_fails = Counter() # Build a list of all instances to run tasks_to_run = {} failed = set() succeeded = set() started = set() skipped = set() not_ready = set() deadlocked = set() for task in self.dag.tasks: if (not self.include_adhoc) and task.adhoc: continue start_date = start_date or task.start_date end_date = end_date or task.end_date or datetime.now() for dttm in self.dag.date_range(start_date, end_date=end_date): ti = models.TaskInstance(task, dttm) tasks_to_run[ti.key] = ti session.merge(ti) session.commit() # Triggering what is ready to get triggered while tasks_to_run and not deadlocked: not_ready.clear() for key, ti in list(tasks_to_run.items()): ti.refresh_from_db() ignore_depends_on_past = (self.ignore_first_depends_on_past and ti.execution_date == (start_date or ti.start_date)) # The task was already marked successful or skipped by a # different Job. Don't rerun it. if key not in started: if ti.state == State.SUCCESS: succeeded.add(key) tasks_to_run.pop(key) continue elif ti.state == State.SKIPPED: skipped.add(key) tasks_to_run.pop(key) continue # Is the task runnable? -- then run it if ti.is_queueable( include_queued=True, ignore_depends_on_past=ignore_depends_on_past, flag_upstream_failed=True): self.logger.debug('Sending {} to executor'.format(ti)) executor.queue_task_instance( ti, mark_success=self.mark_success, pickle_id=pickle_id, ignore_dependencies=self.ignore_dependencies, ignore_depends_on_past=ignore_depends_on_past, pool=self.pool) started.add(key) # Mark the task as not ready to run elif ti.state in (State.NONE, State.UPSTREAM_FAILED): self.logger.debug('Added {} to not_ready'.format(ti)) not_ready.add(key) self.heartbeat() executor.heartbeat() # If the set of tasks that aren't ready ever equals the set of # tasks to run, then the backfill is deadlocked if not_ready and not_ready == set(tasks_to_run): deadlocked.update(tasks_to_run.values()) tasks_to_run.clear() # Reacting to events for key, state in list(executor.get_event_buffer().items()): dag_id, task_id, execution_date = key if key not in tasks_to_run: continue ti = tasks_to_run[key] ti.refresh_from_db() # executor reports failure if state == State.FAILED: # task reports running if ti.state == State.RUNNING: msg = ('Executor reports that task instance {} failed ' 'although the task says it is running.'.format( key)) self.logger.error(msg) ti.handle_failure(msg) tasks_to_run.pop(key) # task reports skipped elif ti.state == State.SKIPPED: self.logger.error("Skipping {} ".format(key)) skipped.add(key) tasks_to_run.pop(key) # anything else is a failure else: self.logger.error( "Task instance {} failed".format(key)) failed.add(key) tasks_to_run.pop(key) # executor reports success elif state == State.SUCCESS: # task reports success if ti.state == State.SUCCESS: self.logger.info( 'Task instance {} succeeded'.format(key)) succeeded.add(key) tasks_to_run.pop(key) # task reports failure elif ti.state == State.FAILED: self.logger.error( "Task instance {} failed".format(key)) failed.add(key) tasks_to_run.pop(key) # task reports skipped elif ti.state == State.SKIPPED: self.logger.info( 
"Task instance {} skipped".format(key)) skipped.add(key) tasks_to_run.pop(key) # this probably won't ever be triggered elif ti in not_ready: self.logger.info( "{} wasn't expected to run, but it did".format(ti)) # executor reports success but task does not - this is weird elif ti.state not in (State.SUCCESS, State.QUEUED, State.UP_FOR_RETRY): self.logger.error( "The airflow run command failed " "at reporting an error. This should not occur " "in normal circumstances. Task state is '{}'," "reported state is '{}'. TI is {}" "".format(ti.state, state, ti)) # if the executor fails 3 or more times, stop trying to # run the task executor_fails[key] += 1 if executor_fails[key] >= 3: msg = ( 'The airflow run command failed to report an ' 'error for task {} three or more times. The ' 'task is being marked as failed. This is very ' 'unusual and probably means that an error is ' 'taking place before the task even ' 'starts.'.format(key)) self.logger.error(msg) ti.handle_failure(msg) tasks_to_run.pop(key) msg = ' | '.join([ "[backfill progress]", "waiting: {0}", "succeeded: {1}", "kicked_off: {2}", "failed: {3}", "skipped: {4}", "deadlocked: {5}" ]).format(len(tasks_to_run), len(succeeded), len(started), len(failed), len(skipped), len(deadlocked)) self.logger.info(msg) executor.end() session.close() err = '' if failed: err += ("---------------------------------------------------\n" "Some task instances failed:\n{}\n".format(failed)) if deadlocked: err += ('---------------------------------------------------\n' 'BackfillJob is deadlocked.') deadlocked_depends_on_past = any( t.are_dependencies_met() != t.are_dependencies_met( ignore_depends_on_past=True) for t in deadlocked) if deadlocked_depends_on_past: err += ( 'Some of the deadlocked tasks were unable to run because ' 'of "depends_on_past" relationships. Try running the ' 'backfill with the option ' '"ignore_first_depends_on_past=True" or passing "-I" at ' 'the command line.') err += ' These tasks were unable to run:\n{}\n'.format(deadlocked) if err: raise AirflowException(err) self.logger.info("Backfill done. Exiting.")
def __init__( self, *, external_dag_id: str, external_task_id: Optional[str] = None, external_task_ids: Optional[Collection[str]] = None, allowed_states: Optional[Iterable[str]] = None, failed_states: Optional[Iterable[str]] = None, execution_delta: Optional[datetime.timedelta] = None, execution_date_fn: Optional[Callable] = None, check_existence: bool = False, **kwargs, ): super().__init__(**kwargs) self.allowed_states = list(allowed_states) if allowed_states else [ State.SUCCESS ] self.failed_states = list(failed_states) if failed_states else [] total_states = set(self.allowed_states + self.failed_states) if set(self.failed_states).intersection(set(self.allowed_states)): raise AirflowException( f"Duplicate values provided as allowed " f"`{self.allowed_states}` and failed states `{self.failed_states}`" ) if external_task_id is not None and external_task_ids is not None: raise ValueError( 'Only one of `external_task_id` or `external_task_ids` may ' 'be provided to ExternalTaskSensor; not both.') if external_task_id is not None: external_task_ids = [external_task_id] if external_task_ids: if not total_states <= set(State.task_states): raise ValueError( f'Valid values for `allowed_states` and `failed_states` ' f'when `external_task_id` or `external_task_ids` is not `None`: {State.task_states}' ) if len(external_task_ids) > len(set(external_task_ids)): raise ValueError( 'Duplicate task_ids passed in external_task_ids parameter') elif not total_states <= set(State.dag_states): raise ValueError( f'Valid values for `allowed_states` and `failed_states` ' f'when `external_task_id` is `None`: {State.dag_states}') if execution_delta is not None and execution_date_fn is not None: raise ValueError( 'Only one of `execution_delta` or `execution_date_fn` may ' 'be provided to ExternalTaskSensor; not both.') self.execution_delta = execution_delta self.execution_date_fn = execution_date_fn self.external_dag_id = external_dag_id self.external_task_id = external_task_id self.external_task_ids = external_task_ids self.check_existence = check_existence self._has_checked_existence = False
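# A small, dependency-free sketch of the validation rules enforced by the constructor above:
# allowed and failed states must not overlap, external_task_id and external_task_ids are
# mutually exclusive, and all states must come from the known task states. The state set and
# helper name below are illustrative stand-ins for State.task_states and the sensor itself.
from typing import Collection, Iterable, Optional

TASK_STATES_SKETCH = {'success', 'failed', 'skipped', 'running', 'queued'}


def validate_sensor_args_sketch(
    allowed_states: Optional[Iterable[str]] = None,
    failed_states: Optional[Iterable[str]] = None,
    external_task_id: Optional[str] = None,
    external_task_ids: Optional[Collection[str]] = None,
):
    allowed = set(allowed_states) if allowed_states else {'success'}
    failed = set(failed_states) if failed_states else set()
    if allowed & failed:
        raise ValueError(f'Duplicate values in allowed {allowed} and failed states {failed}')
    if external_task_id is not None and external_task_ids is not None:
        raise ValueError('Only one of external_task_id or external_task_ids may be provided')
    if not (allowed | failed) <= TASK_STATES_SKETCH:
        raise ValueError(f'States must be drawn from {TASK_STATES_SKETCH}')


# Example: this call passes, while allowed_states={'success'} together with
# failed_states={'success'} would raise.
validate_sensor_args_sketch(allowed_states={'success'}, failed_states={'failed'})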
def update_parameters( self, update_mask: Union[Dict, cloud_memcache.field_mask.FieldMask], parameters: Union[Dict, cloud_memcache.MemcacheParameters], project_id: str, location: str, instance_id: Optional[str] = None, retry: Optional[Retry] = None, timeout: Optional[float] = None, metadata: Optional[Sequence[Tuple[str, str]]] = None, ): """ Updates the defined Memcached Parameters for an existing Instance. This method only stages the parameters, it must be followed by apply_parameters to apply the parameters to nodes of the Memcached Instance. :param update_mask: Required. Mask of fields to update. If a dict is provided, it must be of the same form as the protobuf message :class:`~google.cloud.memcache_v1beta2.types.cloud_memcache.field_mask.FieldMask` :type update_mask: Union[Dict, google.cloud.memcache_v1beta2.types.cloud_memcache.field_mask.FieldMask] :param parameters: The parameters to apply to the instance. If a dict is provided, it must be of the same form as the protobuf message :class:`~google.cloud.memcache_v1beta2.types.cloud_memcache.MemcacheParameters` :type parameters: Union[Dict, google.cloud.memcache_v1beta2.types.cloud_memcache.MemcacheParameters] :param location: The location of the Cloud Memorystore instance (for example europe-west1) :type location: str :param instance_id: The logical name of the Memcached instance in the customer project. :type instance_id: str :param project_id: Project ID of the project that contains the instance. If set to None or missing, the default project_id from the Google Cloud connection is used. :type project_id: str :param retry: A retry object used to retry requests. If ``None`` is specified, requests will not be retried. :type retry: google.api_core.retry.Retry :param timeout: The amount of time, in seconds, to wait for the request to complete. Note that if ``retry`` is specified, the timeout applies to each individual attempt. :type timeout: float :param metadata: Additional metadata that is provided to the method. :type metadata: Sequence[Tuple[str, str]] """ client = self.get_conn() metadata = metadata or () if isinstance(parameters, dict): parameters = cloud_memcache.MemcacheParameters(parameters) elif not isinstance(parameters, cloud_memcache.MemcacheParameters): raise AirflowException( "instance is not instance of MemcacheParameters type or python dict" ) name = CloudMemcacheClient.instance_path(project_id, location, instance_id) self.log.info("Staging update to instance: %s", instance_id) result = client.update_parameters( name=name, update_mask=update_mask, parameters=parameters, retry=retry, timeout=timeout, metadata=metadata, ) result.result() self.log.info("Update staged for instance: %s", instance_id)
def _validate_inputs(self):
    for attr_name in self.REQUIRED_ATTRIBUTES:
        if not getattr(self, attr_name):
            raise AirflowException('Empty parameter: {}'.format(attr_name))
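# Illustration of the REQUIRED_ATTRIBUTES pattern used above: a class lists the attribute
# names that must be truthy, and _validate_inputs is called once everything is assigned.
# DummyTransferJob is a made-up example class, not part of the surrounding codebase.
class DummyTransferJob:
    REQUIRED_ATTRIBUTES = ('project_id', 'instance_id')

    def __init__(self, project_id: str, instance_id: str):
        self.project_id = project_id
        self.instance_id = instance_id
        self._validate_inputs()

    def _validate_inputs(self):
        for attr_name in self.REQUIRED_ATTRIBUTES:
            if not getattr(self, attr_name):
                raise ValueError('Empty parameter: {}'.format(attr_name))


DummyTransferJob(project_id='my-project', instance_id='redis-1')   # passes
# DummyTransferJob(project_id='', instance_id='redis-1')           # would raise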
def copy_object( self, source_bucket_key: str, dest_bucket_key: str, source_bucket_name: Optional[str] = None, dest_bucket_name: Optional[str] = None, source_version_id: Optional[str] = None, acl_policy: Optional[str] = None, ) -> None: """ Creates a copy of an object that is already stored in S3. Note: the S3 connection used here needs to have access to both source and destination bucket/key. :param source_bucket_key: The key of the source object. It can be either full s3:// style url or relative path from root level. When it's specified as a full s3:// url, please omit source_bucket_name. :type source_bucket_key: str :param dest_bucket_key: The key of the object to copy to. The convention to specify `dest_bucket_key` is the same as `source_bucket_key`. :type dest_bucket_key: str :param source_bucket_name: Name of the S3 bucket where the source object is in. It should be omitted when `source_bucket_key` is provided as a full s3:// url. :type source_bucket_name: str :param dest_bucket_name: Name of the S3 bucket to where the object is copied. It should be omitted when `dest_bucket_key` is provided as a full s3:// url. :type dest_bucket_name: str :param source_version_id: Version ID of the source object (OPTIONAL) :type source_version_id: str :param acl_policy: The string to specify the canned ACL policy for the object to be copied which is private by default. :type acl_policy: str """ acl_policy = acl_policy or 'private' if dest_bucket_name is None: dest_bucket_name, dest_bucket_key = self.parse_s3_url( dest_bucket_key) else: parsed_url = urlparse(dest_bucket_key) if parsed_url.scheme != '' or parsed_url.netloc != '': raise AirflowException( 'If dest_bucket_name is provided, ' + 'dest_bucket_key should be relative path ' + 'from root level, rather than a full s3:// url') if source_bucket_name is None: source_bucket_name, source_bucket_key = self.parse_s3_url( source_bucket_key) else: parsed_url = urlparse(source_bucket_key) if parsed_url.scheme != '' or parsed_url.netloc != '': raise AirflowException( 'If source_bucket_name is provided, ' + 'source_bucket_key should be relative path ' + 'from root level, rather than a full s3:// url') copy_source = { 'Bucket': source_bucket_name, 'Key': source_bucket_key, 'VersionId': source_version_id } response = self.get_conn().copy_object(Bucket=dest_bucket_name, Key=dest_bucket_key, CopySource=copy_source, ACL=acl_policy) return response
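# A hedged usage sketch for the copy_object hook method above, assuming it lives on Airflow's
# S3Hook (airflow.providers.amazon.aws.hooks.s3) and that an 'aws_default' connection with
# access to both buckets exists. Bucket and key names are placeholders.
from airflow.providers.amazon.aws.hooks.s3 import S3Hook


def copy_report_sketch():
    hook = S3Hook(aws_conn_id='aws_default')
    # Full s3:// URLs: bucket names come from the URLs, so the *_bucket_name
    # arguments must be omitted.
    hook.copy_object(
        source_bucket_key='s3://source-bucket/reports/2021-01-01.csv',
        dest_bucket_key='s3://dest-bucket/archive/2021-01-01.csv',
    )
    # Equivalent call with relative keys plus explicit bucket names.
    hook.copy_object(
        source_bucket_key='reports/2021-01-01.csv',
        dest_bucket_key='archive/2021-01-01.csv',
        source_bucket_name='source-bucket',
        dest_bucket_name='dest-bucket',
    )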
def execute(self, context): try: if self.ssh_conn_id: if self.ssh_hook and isinstance(self.ssh_hook, SSHHook): self.log.info( "ssh_conn_id is ignored when ssh_hook is provided.") else: self.log.info("ssh_hook is not provided or invalid. " + "Trying ssh_conn_id to create SSHHook.") self.ssh_hook = SSHHook(ssh_conn_id=self.ssh_conn_id, timeout=self.timeout) if not self.ssh_hook: raise AirflowException( "Cannot operate without ssh_hook or ssh_conn_id.") if self.remote_host is not None: self.log.info( "remote_host is provided explicitly. " + "It will replace the remote_host which was defined " + "in ssh_hook or predefined in connection of ssh_conn_id.") self.ssh_hook.remote_host = self.remote_host if not self.command: raise AirflowException("SSH command not specified. Aborting.") with self.ssh_hook.get_conn() as ssh_client: # Auto apply tty when its required in case of sudo get_pty = False if self.command.startswith('sudo'): get_pty = True # set timeout taken as params stdin, stdout, stderr = ssh_client.exec_command( command=self.command, get_pty=get_pty, timeout=self.timeout) # get channels channel = stdout.channel # closing stdin stdin.close() channel.shutdown_write() agg_stdout = b'' agg_stderr = b'' # capture any initial output in case channel is closed already stdout_buffer_length = len(stdout.channel.in_buffer) if stdout_buffer_length > 0: agg_stdout += stdout.channel.recv(stdout_buffer_length) # read from both stdout and stderr while not channel.closed or \ channel.recv_ready() or \ channel.recv_stderr_ready(): readq, _, _ = select([channel], [], [], self.timeout) for c in readq: if c.recv_ready(): line = stdout.channel.recv(len(c.in_buffer)) line = line agg_stdout += line self.log.info(line.decode('utf-8').strip('\n')) if c.recv_stderr_ready(): line = stderr.channel.recv_stderr( len(c.in_stderr_buffer)) line = line agg_stderr += line self.log.warning(line.decode('utf-8').strip('\n')) if stdout.channel.exit_status_ready()\ and not stderr.channel.recv_stderr_ready()\ and not stdout.channel.recv_ready(): stdout.channel.shutdown_read() stdout.channel.close() break stdout.close() stderr.close() exit_status = stdout.channel.recv_exit_status() if exit_status == 0: enable_pickling = configuration.conf.getboolean( 'core', 'enable_xcom_pickling') if enable_pickling: return agg_stdout else: return b64encode(agg_stdout).decode('utf-8') else: error_msg = agg_stderr.decode('utf-8') raise AirflowException( "error running cmd: {0}, error: {1}".format( self.command, error_msg)) except Exception as e: raise AirflowException("SSH operator error: {0}".format(str(e))) return True
def securestring(value: str):
    if not native:
        raise AirflowException(
            "Filter 'securestring' not applicable to non-native templating environment"
        )
    return TaggedValue("SS", value)
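# Sketch of how a filter like securestring is typically exposed to templates: register it on a
# Jinja2 NativeEnvironment so rendered values keep their Python type. The wrap_secret filter
# and the ('SS', value) tuple below are illustrative stand-ins for the TaggedValue used above.
from jinja2.nativetypes import NativeEnvironment


def wrap_secret(value: str):
    return ('SS', value)


env = NativeEnvironment()
env.filters['securestring'] = wrap_secret

rendered = env.from_string("{{ password | securestring }}").render(password='hunter2')
assert rendered == ('SS', 'hunter2')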
def execute(self, context): try: from dagster_graphql.client.mutations import ( DagsterGraphQLClientError, handle_execution_errors, handle_execute_plan_result_raw, ) except ImportError: raise AirflowException( 'To use the DagsterDockerOperator, dagster and dagster_graphql must be installed ' 'in your Airflow environment.') if 'run_id' in self.params: self._run_id = self.params['run_id'] elif 'dag_run' in context and context['dag_run'] is not None: self._run_id = context['dag_run'].run_id try: if self.instance: run = self.instance.register_managed_run( pipeline_name=self.pipeline_name, run_id=self.run_id, run_config=self.run_config, mode=self.mode, solids_to_execute=None, step_keys_to_execute=None, tags=None, root_run_id=None, parent_run_id=None, pipeline_snapshot=self.pipeline_snapshot, execution_plan_snapshot=self.execution_plan_snapshot, parent_pipeline_snapshot=self.parent_pipeline_snapshot, ) raw_res = super(DagsterDockerOperator, self).execute(context) self.log.info('Finished executing container.') res = parse_raw_log_lines(raw_res) try: handle_execution_errors(res, 'executePlan') except DagsterGraphQLClientError as err: if self.instance: self.instance.report_engine_event( str(err), run, EngineEventData.engine_error( serializable_error_info_from_exc_info( sys.exc_info())), self.__class__, ) raise events = handle_execute_plan_result_raw(res) if self.instance: for event in events: self.instance.handle_new_event(event) events = [e.dagster_event for e in events] check_events_for_failures(events) check_events_for_skips(events) return events finally: self._run_id = None
def create_bucket(self, bucket_name, storage_class='MULTI_REGIONAL', location='US', project_id=None, labels=None ): """ Creates a new bucket. Google Cloud Storage uses a flat namespace, so you can't create a bucket with a name that is already in use. .. seealso:: For more information, see Bucket Naming Guidelines: https://cloud.google.com/storage/docs/bucketnaming.html#requirements :param bucket_name: The name of the bucket. :type bucket_name: str :param storage_class: This defines how objects in the bucket are stored and determines the SLA and the cost of storage. Values include - ``MULTI_REGIONAL`` - ``REGIONAL`` - ``STANDARD`` - ``NEARLINE`` - ``COLDLINE``. If this value is not specified when the bucket is created, it will default to STANDARD. :type storage_class: str :param location: The location of the bucket. Object data for objects in the bucket resides in physical storage within this region. Defaults to US. .. seealso:: https://developers.google.com/storage/docs/bucket-locations :type location: str :param project_id: The ID of the GCP Project. :type project_id: str :param labels: User-provided labels, in key/value pairs. :type labels: dict :return: If successful, it returns the ``id`` of the bucket. """ project_id = project_id if project_id is not None else self.project_id storage_classes = [ 'MULTI_REGIONAL', 'REGIONAL', 'NEARLINE', 'COLDLINE', 'STANDARD', # alias for MULTI_REGIONAL/REGIONAL, based on location ] self.log.info('Creating Bucket: %s; Location: %s; Storage Class: %s', bucket_name, location, storage_class) if storage_class not in storage_classes: raise ValueError( 'Invalid value ({}) passed to storage_class. Value should be ' 'one of {}'.format(storage_class, storage_classes)) if not re.match('[a-zA-Z0-9]+', bucket_name[0]): raise ValueError('Bucket names must start with a number or letter.') if not re.match('[a-zA-Z0-9]+', bucket_name[-1]): raise ValueError('Bucket names must end with a number or letter.') service = self.get_conn() bucket_resource = { 'name': bucket_name, 'location': location, 'storageClass': storage_class } self.log.info('The Default Project ID is %s', self.project_id) if labels is not None: bucket_resource['labels'] = labels try: response = service.buckets().insert( project=project_id, body=bucket_resource ).execute() self.log.info('Bucket: %s created successfully.', bucket_name) return response['id'] except errors.HttpError as ex: raise AirflowException( 'Bucket creation failed. Error was: {}'.format(ex.content) )
def execute(self, context): '''Modified only to use the get_host_tmp_dir helper.''' self.log.info('Starting docker container from image %s', self.image) tls_config = self.__get_tls_config() if self.docker_conn_id: self.cli = self.get_hook().get_conn() else: self.cli = APIClient(base_url=self.docker_url, version=self.api_version, tls=tls_config) if self.force_pull or len(self.cli.images(name=self.image)) == 0: self.log.info('Pulling docker image %s', self.image) for l in self.cli.pull(self.image, stream=True): output = seven.json.loads(l.decode('utf-8').strip()) if 'status' in output: self.log.info("%s", output['status']) with self.get_host_tmp_dir() as host_tmp_dir: self.environment['AIRFLOW_TMP_DIR'] = self.tmp_dir self.volumes.append('{0}:{1}'.format(host_tmp_dir, self.tmp_dir)) self.container = self.cli.create_container( command=self.get_command(), environment=self.environment, host_config=self.cli.create_host_config( auto_remove=self.auto_remove, binds=self.volumes, network_mode=self.network_mode, shm_size=self.shm_size, dns=self.dns, dns_search=self.dns_search, cpu_shares=int(round(self.cpus * 1024)), mem_limit=self.mem_limit, ), image=self.image, user=self.user, working_dir=self.working_dir, ) self.cli.start(self.container['Id']) res = [] line = '' for new_line in self.cli.logs(container=self.container['Id'], stream=True): line = new_line.strip() if hasattr(line, 'decode'): line = line.decode('utf-8') self.log.info(line) res.append(line) result = self.cli.wait(self.container['Id']) if result['StatusCode'] != 0: raise AirflowException( 'docker container failed with result: {result} and logs: {logs}' .format(result=repr(result), logs='\n'.join(res))) if self.xcom_push_flag: # Try to avoid any kind of race condition? return res if self.xcom_all else str(line)
def check_uuids(**kwargs): print('dag_run conf follows:') pprint(kwargs['dag_run'].conf) try: assert_json_matches_schema(kwargs['dag_run'].conf, 'launch_multi_metadata_schema.yml') except AssertionError as e: print('invalid metadata follows:') pprint(kwargs['dag_run'].conf) raise uuid_l = kwargs['dag_run'].conf['uuid_list'] collection_type = kwargs['dag_run'].conf['collection_type'] filtered_uuid_l = [] filtered_path_l = [] filtered_data_types = [] for uuid in uuid_l: print(f'Starting uuid {uuid}') my_callable = lambda **kwargs: uuid ds_rslt = utils.pythonop_get_dataset_state( dataset_uuid_callable=my_callable, http_conn_id='ingest_api_connection', **kwargs) if not ds_rslt: raise AirflowException(f'Invalid uuid/doi for group: {uuid}') print('ds_rslt:') pprint(ds_rslt) for key in [ 'status', 'uuid', 'data_types', 'local_directory_full_path' ]: assert key in ds_rslt, f"Dataset status for {uuid} has no {key}" if not ds_rslt['status'] in ['QA', 'Published']: raise AirflowException(f'Dataset {uuid} is not QA or better') dt = ds_rslt['data_types'] if isinstance(dt, str) and dt.startswith('[') and dt.endswith(']'): dt = ast.literal_eval(dt) print(f'parsed dt: {dt}') if isinstance(dt, list): if dt: if len(dt) == 1: filtered_data_types.append(dt[0]) else: filtered_data_types.append(tuple(dt)) else: raise AirflowException( f'Dataset data_types for {uuid} is empty') else: filtered_data_types.append(dt) lz_path = ds_rslt['local_directory_full_path'] filtered_path_l.append(lz_path) filtered_uuid_l.append(ds_rslt['uuid']) print(f'Finished uuid {uuid}') print(f'filtered data types: {filtered_data_types}') print(f'filtered paths: {filtered_path_l}') print(f'filtered uuids: {filtered_uuid_l}') kwargs['ti'].xcom_push(key='collectiontype', value=collection_type) kwargs['ti'].xcom_push(key='assay_type', value=filtered_data_types) kwargs['ti'].xcom_push(key='lz_paths', value=filtered_path_l) kwargs['ti'].xcom_push(key='uuids', value=filtered_uuid_l)
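# The data_types normalisation embedded in check_uuids above, pulled out as a small helper for
# clarity: a stringified list is parsed with ast.literal_eval, single-element lists collapse to
# the element, and longer lists become tuples. normalize_data_types is an illustrative helper,
# not part of the surrounding DAG code.
import ast
from typing import Union


def normalize_data_types(dt: Union[str, list]) -> Union[str, tuple]:
    if isinstance(dt, str) and dt.startswith('[') and dt.endswith(']'):
        dt = ast.literal_eval(dt)
    if isinstance(dt, list):
        if not dt:
            raise ValueError('data_types is empty')
        return dt[0] if len(dt) == 1 else tuple(dt)
    return dt


assert normalize_data_types("['codex']") == 'codex'
assert normalize_data_types(['scRNAseq-10xGenomics', 'codex']) == ('scRNAseq-10xGenomics', 'codex')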
def run_cli(self, hql, schema=None, verbose=True, hive_conf=None): """ Run an hql statement using the hive cli. If hive_conf is specified it should be a dict and the entries will be set as key/value pairs in HiveConf :param hive_conf: if specified these key value pairs will be passed to hive as ``-hiveconf "key"="value"``. Note that they will be passed after the ``hive_cli_params`` and thus will override whatever values are specified in the database. :type hive_conf: dict >>> hh = HiveCliHook() >>> result = hh.run_cli("USE airflow;") >>> ("OK" in result) True """ conn = self.conn schema = schema or conn.schema if schema: hql = "USE {schema};\n{hql}".format(**locals()) with TemporaryDirectory(prefix='airflow_hiveop_') as tmp_dir: with NamedTemporaryFile(dir=tmp_dir) as f: f.write(hql.encode('UTF-8')) f.flush() hive_cmd = self._prepare_cli_cmd() hive_conf_params = self._prepare_hiveconf(hive_conf) if self.mapred_queue: hive_conf_params.extend([ '-hiveconf', 'mapreduce.job.queuename={}'.format(self.mapred_queue) ]) if self.mapred_queue_priority: hive_conf_params.extend([ '-hiveconf', 'mapreduce.job.priority={}'.format( self.mapred_queue_priority) ]) if self.mapred_job_name: hive_conf_params.extend([ '-hiveconf', 'mapred.job.name={}'.format(self.mapred_job_name) ]) hive_cmd.extend(hive_conf_params) hive_cmd.extend(['-f', f.name]) if verbose: self.log.info(" ".join(hive_cmd)) sp = subprocess.Popen(hive_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, cwd=tmp_dir) self.sp = sp stdout = '' while True: line = sp.stdout.readline() if not line: break stdout += line.decode('UTF-8') if verbose: self.log.info(line.decode('UTF-8').strip()) sp.wait() if sp.returncode: raise AirflowException(stdout) return stdout
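# The hive_conf handling above expands a dict into repeated "-hiveconf key=value" CLI
# arguments before the script file is appended. A dependency-free sketch of that expansion
# (prepare_hiveconf_sketch is illustrative, not the hook's private _prepare_hiveconf):
def prepare_hiveconf_sketch(d):
    if not d:
        return []
    return [item for key, value in d.items() for item in ('-hiveconf', f'{key}={value}')]


assert prepare_hiveconf_sketch({'mapreduce.job.queuename': 'default'}) == [
    '-hiveconf', 'mapreduce.job.queuename=default'
]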
def __init__( self, # pylint: disable=too-many-arguments,too-many-locals *, namespace: Optional[str] = None, image: Optional[str] = None, name: Optional[str] = None, cmds: Optional[List[str]] = None, arguments: Optional[List[str]] = None, ports: Optional[List[Port]] = None, volume_mounts: Optional[List[VolumeMount]] = None, volumes: Optional[List[Volume]] = None, env_vars: Optional[Dict] = None, secrets: Optional[List[Secret]] = None, in_cluster: Optional[bool] = None, cluster_context: Optional[str] = None, labels: Optional[Dict] = None, reattach_on_restart: bool = True, startup_timeout_seconds: int = 120, get_logs: bool = True, image_pull_policy: str = 'IfNotPresent', annotations: Optional[Dict] = None, resources: Optional[Dict] = None, affinity: Optional[Dict] = None, config_file: Optional[str] = None, node_selectors: Optional[Dict] = None, image_pull_secrets: Optional[str] = None, service_account_name: str = 'default', is_delete_operator_pod: bool = False, hostnetwork: bool = False, tolerations: Optional[List] = None, configmaps: Optional[List] = None, security_context: Optional[Dict] = None, pod_runtime_info_envs: Optional[List[PodRuntimeInfoEnv]] = None, dnspolicy: Optional[str] = None, schedulername: Optional[str] = None, full_pod_spec: Optional[k8s.V1Pod] = None, init_containers: Optional[List[k8s.V1Container]] = None, log_events_on_failure: bool = False, do_xcom_push: bool = False, pod_template_file: Optional[str] = None, priority_class_name: Optional[str] = None, **kwargs): if kwargs.get('xcom_push') is not None: raise AirflowException( "'xcom_push' was deprecated, use 'do_xcom_push' instead") super().__init__(resources=None, **kwargs) self.pod = None self.do_xcom_push = do_xcom_push self.image = image self.namespace = namespace self.cmds = cmds or [] self.arguments = arguments or [] self.labels = labels or {} self.startup_timeout_seconds = startup_timeout_seconds self.env_vars = env_vars or {} self.ports = ports or [] self.volume_mounts = volume_mounts or [] self.volumes = volumes or [] self.secrets = secrets or [] self.in_cluster = in_cluster self.cluster_context = cluster_context self.reattach_on_restart = reattach_on_restart self.get_logs = get_logs self.image_pull_policy = image_pull_policy self.node_selectors = node_selectors or {} self.annotations = annotations or {} self.affinity = affinity or {} self.resources = self._set_resources(resources) self.config_file = config_file self.image_pull_secrets = image_pull_secrets self.service_account_name = service_account_name self.is_delete_operator_pod = is_delete_operator_pod self.hostnetwork = hostnetwork self.tolerations = tolerations or [] self.configmaps = configmaps or [] self.security_context = security_context or {} self.pod_runtime_info_envs = pod_runtime_info_envs or [] self.dnspolicy = dnspolicy self.schedulername = schedulername self.full_pod_spec = full_pod_spec self.init_containers = init_containers or [] self.log_events_on_failure = log_events_on_failure self.priority_class_name = priority_class_name self.pod_template_file = pod_template_file self.name = self._set_name(name)
def execute(self, context: "Context") -> List[str]: # Define intervals and prefixes. try: timespan_start = context["data_interval_start"] timespan_end = context["data_interval_end"] except KeyError: # Data interval context variables are only available in Airflow 2.2+ timespan_start = timezone.coerce_datetime(context["execution_date"]) timespan_end = timezone.coerce_datetime(context["dag"].following_schedule(timespan_start)) if timespan_end is None: # Only possible in Airflow before 2.2. self.log.warning("No following schedule found, setting timespan end to max %s", timespan_end) timespan_end = timezone.coerce_datetime(DateTime.max) elif timespan_start >= timespan_end: # Airflow 2.2 sets start == end for non-perodic schedules. self.log.warning("DAG schedule not periodic, setting timespan end to max %s", timespan_end) timespan_end = timezone.coerce_datetime(DateTime.max) timespan_start = timespan_start.in_timezone(timezone.utc) timespan_end = timespan_end.in_timezone(timezone.utc) source_prefix_interp = GCSTimeSpanFileTransformOperator.interpolate_prefix( self.source_prefix, timespan_start, ) destination_prefix_interp = GCSTimeSpanFileTransformOperator.interpolate_prefix( self.destination_prefix, timespan_start, ) source_hook = GCSHook( gcp_conn_id=self.source_gcp_conn_id, impersonation_chain=self.source_impersonation_chain, ) destination_hook = GCSHook( gcp_conn_id=self.destination_gcp_conn_id, impersonation_chain=self.destination_impersonation_chain, ) # Fetch list of files. blobs_to_transform = source_hook.list_by_timespan( bucket_name=self.source_bucket, prefix=source_prefix_interp, timespan_start=timespan_start, timespan_end=timespan_end, ) with TemporaryDirectory() as temp_input_dir, TemporaryDirectory() as temp_output_dir: temp_input_dir_path = Path(temp_input_dir) temp_output_dir_path = Path(temp_output_dir) # TODO: download in parallel. for blob_to_transform in blobs_to_transform: destination_file = temp_input_dir_path / blob_to_transform destination_file.parent.mkdir(parents=True, exist_ok=True) try: source_hook.download( bucket_name=self.source_bucket, object_name=blob_to_transform, filename=str(destination_file), chunk_size=self.chunk_size, num_max_attempts=self.download_num_attempts, ) except GoogleCloudError: if self.download_continue_on_fail: continue raise self.log.info("Starting the transformation") cmd = [self.transform_script] if isinstance(self.transform_script, str) else self.transform_script cmd += [ str(temp_input_dir_path), str(temp_output_dir_path), timespan_start.replace(microsecond=0).isoformat(), timespan_end.replace(microsecond=0).isoformat(), ] with subprocess.Popen( args=cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True ) as process: self.log.info("Process output:") if process.stdout: for line in iter(process.stdout.readline, b''): self.log.info(line.decode(self.output_encoding).rstrip()) process.wait() if process.returncode: raise AirflowException(f"Transform script failed: {process.returncode}") self.log.info("Transformation succeeded. Output temporarily located at %s", temp_output_dir_path) files_uploaded = [] # TODO: upload in parallel. 
for upload_file in temp_output_dir_path.glob("**/*"): if upload_file.is_dir(): continue upload_file_name = str(upload_file.relative_to(temp_output_dir_path)) if self.destination_prefix is not None: upload_file_name = f"{destination_prefix_interp}/{upload_file_name}" self.log.info("Uploading file %s to %s", upload_file, upload_file_name) try: destination_hook.upload( bucket_name=self.destination_bucket, object_name=upload_file_name, filename=str(upload_file), chunk_size=self.chunk_size, num_max_attempts=self.upload_num_attempts, ) files_uploaded.append(str(upload_file_name)) except GoogleCloudError: if self.upload_continue_on_fail: continue raise return files_uploaded
def deserialize_operator(cls, encoded_op: Dict[str, Any]) -> BaseOperator: """Deserializes an operator from a JSON object.""" op = SerializedBaseOperator(task_id=encoded_op['task_id']) if "label" not in encoded_op: # Handle deserialization of old data before the introduction of TaskGroup encoded_op["label"] = encoded_op["task_id"] # Extra Operator Links defined in Plugins op_extra_links_from_plugin = {} # We don't want to load Extra Operator links in Scheduler if cls._load_operator_extra_links: from airflow import plugins_manager plugins_manager.initialize_extra_operators_links_plugins() if plugins_manager.operator_extra_links is None: raise AirflowException("Can not load plugins") for ope in plugins_manager.operator_extra_links: for operator in ope.operators: if (operator.__name__ == encoded_op["_task_type"] and operator.__module__ == encoded_op["_task_module"]): op_extra_links_from_plugin.update({ope.name: ope}) # If OperatorLinks are defined in Plugins but not in the Operator that is being Serialized # set the Operator links attribute # The case for "If OperatorLinks are defined in the operator that is being Serialized" # is handled in the deserialization loop where it matches k == "_operator_extra_links" if op_extra_links_from_plugin and "_operator_extra_links" not in encoded_op: setattr(op, "operator_extra_links", list(op_extra_links_from_plugin.values())) for k, v in encoded_op.items(): if k == "_downstream_task_ids": v = set(v) elif k == "subdag": v = SerializedDAG.deserialize_dag(v) elif k in { "retry_delay", "execution_timeout", "sla", "max_retry_delay" }: v = cls._deserialize_timedelta(v) elif k in encoded_op["template_fields"]: pass elif k.endswith("_date"): v = cls._deserialize_datetime(v) elif k == "_operator_extra_links": if cls._load_operator_extra_links: op_predefined_extra_links = cls._deserialize_operator_extra_links( v) # If OperatorLinks with the same name exists, Links via Plugin have higher precedence op_predefined_extra_links.update( op_extra_links_from_plugin) else: op_predefined_extra_links = {} v = list(op_predefined_extra_links.values()) k = "operator_extra_links" elif k == "deps": v = cls._deserialize_deps(v) elif k == "params": v = cls._deserialize_params_dict(v) elif k in cls._decorated_fields or k not in op.get_serialized_fields( ): v = cls._deserialize(v) # else use v as it is setattr(op, k, v) for k in op.get_serialized_fields() - encoded_op.keys( ) - cls._CONSTRUCTOR_PARAMS.keys(): setattr(op, k, None) # Set all the template_field to None that were not present in Serialized JSON for field in op.template_fields: if not hasattr(op, field): setattr(op, field, None) # Used to determine if an Operator is inherited from DummyOperator setattr(op, "_is_dummy", bool(encoded_op.get("_is_dummy", False))) return op
def execute(self, context): self.log.info('Preparing Singularity container %s', self.image) self.cli = Client if not self.command: raise AirflowException('You must define a command.') # Pull the container if asked, and ensure not a binary file if self.force_pull and not os.path.exists(self.image): self.log.info('Pulling container %s', self.image) image = self.cli.pull(self.image, stream=True, pull_folder=self.pull_folder) # If we need to stream result for the user, returns lines if isinstance(image, list): lines = image.pop() image = image[0] for line in lines: self.log.info(line) # Update the image to be a filepath on the system self.image = image # Prepare list of binds for bind in self.volumes: self.options = self.options + ['--bind', bind] # Does the user want a custom working directory? if self.working_dir is not None: self.options = self.options + ['--workdir', self.working_dir] # Export environment before instance is run for enkey, envar in self.environment.items(): self.log.debug('Exporting %s=%s', envar, enkey) os.putenv(enkey, envar) os.environ[enkey] = envar # Create a container instance self.log.debug('Options include: %s', self.options) self.instance = self.cli.instance(self.image, options=self.options, args=self.start_command, start=False) self.instance.start() self.log.info(self.instance.cmd) self.log.info('Created instance %s from %s', self.instance, self.image) self.log.info('Running command %s', self._get_command()) self.cli.quiet = True result = self.cli.execute(self.instance, self._get_command(), return_result=True) # Stop the instance self.log.info('Stopping instance %s', self.instance) self.instance.stop() if self.auto_remove is True: if self.auto_remove and os.path.exists(self.image): shutil.rmtree(self.image) # If the container failed, raise the exception if result['return_code'] != 0: message = result['message'] raise AirflowException(f'Singularity failed: {message}') self.log.info('Output from command %s', result['message'])
def _deserialize_operator_extra_links( cls, encoded_op_links: list) -> Dict[str, BaseOperatorLink]: """ Deserialize Operator Links if the Classes are registered in Airflow Plugins. Error is raised if the OperatorLink is not found in Plugins too. :param encoded_op_links: Serialized Operator Link :return: De-Serialized Operator Link """ from airflow import plugins_manager plugins_manager.initialize_extra_operators_links_plugins() if plugins_manager.registered_operator_link_classes is None: raise AirflowException("Can't load plugins") op_predefined_extra_links = {} for _operator_links_source in encoded_op_links: # Get the key, value pair as Tuple where key is OperatorLink ClassName # and value is the dictionary containing the arguments passed to the OperatorLink # # Example of a single iteration: # # _operator_links_source = # { # 'airflow.providers.google.cloud.operators.bigquery.BigQueryConsoleIndexableLink': { # 'index': 0 # } # }, # # list(_operator_links_source.items()) = # [ # ( # 'airflow.providers.google.cloud.operators.bigquery.BigQueryConsoleIndexableLink', # {'index': 0} # ) # ] # # list(_operator_links_source.items())[0] = # ( # 'airflow.providers.google.cloud.operators.bigquery.BigQueryConsoleIndexableLink', # { # 'index': 0 # } # ) _operator_link_class_path, data = list( _operator_links_source.items())[0] if _operator_link_class_path in get_operator_extra_links(): single_op_link_class = import_string(_operator_link_class_path) elif _operator_link_class_path in plugins_manager.registered_operator_link_classes: single_op_link_class = plugins_manager.registered_operator_link_classes[ _operator_link_class_path] else: log.error("Operator Link class %r not registered", _operator_link_class_path) return {} op_predefined_extra_link: BaseOperatorLink = cattr.structure( data, single_op_link_class) op_predefined_extra_links.update( {op_predefined_extra_link.name: op_predefined_extra_link}) return op_predefined_extra_links
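# Shape of the serialized operator-link entries handled above, and the idiom used to pull out
# the single (class path, kwargs) pair from each entry. The class path and kwargs below are
# illustrative data only; no plugin lookup or cattr.structure call happens in this sketch.
encoded_op_links_example = [
    {
        'airflow.providers.google.cloud.operators.bigquery.BigQueryConsoleIndexableLink': {
            'index': 0
        }
    },
]

for _operator_links_source in encoded_op_links_example:
    # Each entry is a single-key dict: key = OperatorLink class path, value = its kwargs.
    class_path, data = list(_operator_links_source.items())[0]
    print(class_path, data)  # -> '...BigQueryConsoleIndexableLink' {'index': 0}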
def resp_check(_, execution_date):
    if execution_date == DEFAULT_DATE:
        return True
    raise AirflowException('AirflowException raised here!')
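# Hedged sketch of how a response_check callable like resp_check is typically attached to an
# HTTP sensor (here Airflow's HttpSensor from the http provider). The DAG id, connection id and
# endpoint are placeholders. The callable returns True to stop poking and may raise to fail.
from datetime import datetime

from airflow import DAG
from airflow.providers.http.sensors.http import HttpSensor

with DAG(dag_id='http_check_sketch', start_date=datetime(2021, 1, 1), schedule_interval=None):
    wait_for_endpoint = HttpSensor(
        task_id='wait_for_endpoint',
        http_conn_id='http_default',
        endpoint='health',
        response_check=lambda response: response.status_code == 200,
        poke_interval=30,
        timeout=300,
    )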
def slots_stats( *, lock_rows: bool = False, session: Session = None, ) -> Dict[str, PoolStats]: """ Get Pool stats (Number of Running, Queued, Open & Total tasks) If ``lock_rows`` is True, and the database engine in use supports the ``NOWAIT`` syntax, then a non-blocking lock will be attempted -- if the lock is not available then SQLAlchemy will throw an OperationalError. :param lock_rows: Should we attempt to obtain a row-level lock on all the Pool rows returns :param session: SQLAlchemy ORM Session """ from airflow.models.taskinstance import TaskInstance # Avoid circular import pools: Dict[str, PoolStats] = {} query = session.query(Pool.pool, Pool.slots) if lock_rows: query = with_row_locks(query, **nowait(session)) pool_rows: Iterable[Tuple[str, int]] = query.all() for (pool_name, total_slots) in pool_rows: pools[pool_name] = PoolStats(total=total_slots, running=0, queued=0, open=0) state_count_by_pool = (session.query( TaskInstance.pool, TaskInstance.state, func.count()).filter(TaskInstance.state.in_( list(EXECUTION_STATES))).group_by(TaskInstance.pool, TaskInstance.state)).all() # calculate queued and running metrics count: int for (pool_name, state, count) in state_count_by_pool: stats_dict: Optional[PoolStats] = pools.get(pool_name) if not stats_dict: continue # TypedDict key must be a string literal, so we use if-statements to set value if state == "running": stats_dict["running"] = count elif state == "queued": stats_dict["queued"] = count else: raise AirflowException( f"Unexpected state. Expected values: {EXECUTION_STATES}.") # calculate open metric for pool_name, stats_dict in pools.items(): if stats_dict["total"] == -1: # -1 means infinite stats_dict["open"] = -1 else: stats_dict["open"] = stats_dict["total"] - stats_dict[ "running"] - stats_dict["queued"] return pools
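# Sketch of consuming the mapping returned by slots_stats above. The pool names and numbers
# are illustrative; each PoolStats entry carries "total", "running", "queued" and "open"
# counts, with -1 meaning an unbounded pool.
example_stats = {
    'default_pool': {'total': 128, 'running': 3, 'queued': 1, 'open': 124},
    'unbounded_pool': {'total': -1, 'running': 7, 'queued': 0, 'open': -1},
}

for pool_name, stats in example_stats.items():
    if stats['open'] == -1:
        print(f"{pool_name}: unlimited slots ({stats['running']} running, {stats['queued']} queued)")
    else:
        print(f"{pool_name}: {stats['open']} of {stats['total']} slots open")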
def get_hook(self): if self.conn_type == 'mysql': from airflow.hooks.mysql_hook import MySqlHook return MySqlHook(mysql_conn_id=self.conn_id) elif self.conn_type == 'google_cloud_platform': from airflow.gcp.hooks.bigquery import BigQueryHook return BigQueryHook(bigquery_conn_id=self.conn_id) elif self.conn_type == 'postgres': from airflow.hooks.postgres_hook import PostgresHook return PostgresHook(postgres_conn_id=self.conn_id) elif self.conn_type == 'pig_cli': from airflow.hooks.pig_hook import PigCliHook return PigCliHook(pig_conn_id=self.conn_id) elif self.conn_type == 'hive_cli': from airflow.hooks.hive_hooks import HiveCliHook return HiveCliHook(hive_cli_conn_id=self.conn_id) elif self.conn_type == 'presto': from airflow.hooks.presto_hook import PrestoHook return PrestoHook(presto_conn_id=self.conn_id) elif self.conn_type == 'hiveserver2': from airflow.hooks.hive_hooks import HiveServer2Hook return HiveServer2Hook(hiveserver2_conn_id=self.conn_id) elif self.conn_type == 'sqlite': from airflow.hooks.sqlite_hook import SqliteHook return SqliteHook(sqlite_conn_id=self.conn_id) elif self.conn_type == 'jdbc': from airflow.hooks.jdbc_hook import JdbcHook return JdbcHook(jdbc_conn_id=self.conn_id) elif self.conn_type == 'mssql': from airflow.hooks.mssql_hook import MsSqlHook return MsSqlHook(mssql_conn_id=self.conn_id) elif self.conn_type == 'oracle': from airflow.hooks.oracle_hook import OracleHook return OracleHook(oracle_conn_id=self.conn_id) elif self.conn_type == 'vertica': from airflow.contrib.hooks.vertica_hook import VerticaHook return VerticaHook(vertica_conn_id=self.conn_id) elif self.conn_type == 'cloudant': from airflow.contrib.hooks.cloudant_hook import CloudantHook return CloudantHook(cloudant_conn_id=self.conn_id) elif self.conn_type == 'jira': from airflow.contrib.hooks.jira_hook import JiraHook return JiraHook(jira_conn_id=self.conn_id) elif self.conn_type == 'redis': from airflow.contrib.hooks.redis_hook import RedisHook return RedisHook(redis_conn_id=self.conn_id) elif self.conn_type == 'wasb': from airflow.contrib.hooks.wasb_hook import WasbHook return WasbHook(wasb_conn_id=self.conn_id) elif self.conn_type == 'docker': from airflow.hooks.docker_hook import DockerHook return DockerHook(docker_conn_id=self.conn_id) elif self.conn_type == 'azure_data_lake': from airflow.contrib.hooks.azure_data_lake_hook import AzureDataLakeHook return AzureDataLakeHook(azure_data_lake_conn_id=self.conn_id) elif self.conn_type == 'azure_cosmos': from airflow.contrib.hooks.azure_cosmos_hook import AzureCosmosDBHook return AzureCosmosDBHook(azure_cosmos_conn_id=self.conn_id) elif self.conn_type == 'cassandra': from airflow.contrib.hooks.cassandra_hook import CassandraHook return CassandraHook(cassandra_conn_id=self.conn_id) elif self.conn_type == 'mongo': from airflow.contrib.hooks.mongo_hook import MongoHook return MongoHook(conn_id=self.conn_id) elif self.conn_type == 'gcpcloudsql': from airflow.gcp.hooks.cloud_sql import CloudSqlDatabaseHook return CloudSqlDatabaseHook(gcp_cloudsql_conn_id=self.conn_id) elif self.conn_type == 'grpc': from airflow.contrib.hooks.grpc_hook import GrpcHook return GrpcHook(grpc_conn_id=self.conn_id) raise AirflowException("Unknown hook type {}".format(self.conn_type))
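# The if/elif chain above maps conn_type to a hook class. A table-driven sketch of the same
# idea, using a deliberately tiny, illustrative subset of the real mapping: the lookup lives in
# data and each import is deferred until that hook is actually requested.
def get_hook_sketch(conn_type: str, conn_id: str):
    def _mysql():
        from airflow.hooks.mysql_hook import MySqlHook
        return MySqlHook(mysql_conn_id=conn_id)

    def _postgres():
        from airflow.hooks.postgres_hook import PostgresHook
        return PostgresHook(postgres_conn_id=conn_id)

    factories = {'mysql': _mysql, 'postgres': _postgres}
    try:
        return factories[conn_type]()
    except KeyError:
        raise ValueError("Unknown hook type {}".format(conn_type))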
def _get_required_param(name):
    """Extract required parameter from extra JSON, raise exception if not found."""
    value = conn.extra_dejson.get(name)
    if not value:
        raise AirflowException(f'Extra connection option is missing required parameter: `{name}`')
    return value
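# Usage sketch for the helper above: it expects a Connection-like object whose extra field is
# JSON (exposed as extra_dejson). FakeConnection and get_required_param_sketch below are
# illustrative stand-ins so the behaviour can be exercised without an Airflow metadata database.
class FakeConnection:
    extra_dejson = {'tenant': 'my-tenant', 'region': ''}


def get_required_param_sketch(conn, name):
    value = conn.extra_dejson.get(name)
    if not value:
        raise ValueError(f'Extra connection option is missing required parameter: `{name}`')
    return value


assert get_required_param_sketch(FakeConnection(), 'tenant') == 'my-tenant'
# get_required_param_sketch(FakeConnection(), 'region')        # would raise: present but empty
# get_required_param_sketch(FakeConnection(), 'subscription')  # would raise: key missing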
def update_instance( self, update_mask: Union[Dict, FieldMask], instance: Union[Dict, Instance], project_id: str, location: Optional[str] = None, instance_id: Optional[str] = None, retry: Optional[Retry] = None, timeout: Optional[float] = None, metadata: Optional[Sequence[Tuple[str, str]]] = None, ): """ Updates the metadata and configuration of a specific Redis instance. :param update_mask: Required. Mask of fields to update. At least one path must be supplied in this field. The elements of the repeated paths field may only include these fields from ``Instance``: - ``displayName`` - ``labels`` - ``memorySizeGb`` - ``redisConfig`` If a dict is provided, it must be of the same form as the protobuf message :class:`~google.cloud.redis_v1.types.FieldMask` :type update_mask: Union[Dict, google.cloud.redis_v1.types.FieldMask] :param instance: Required. Update description. Only fields specified in ``update_mask`` are updated. If a dict is provided, it must be of the same form as the protobuf message :class:`~google.cloud.redis_v1.types.Instance` :type instance: Union[Dict, google.cloud.redis_v1.types.Instance] :param location: The location of the Cloud Memorystore instance (for example europe-west1) :type location: str :param instance_id: The logical name of the Redis instance in the customer project. :type instance_id: str :param project_id: Project ID of the project that contains the instance. If set to None or missing, the default project_id from the Google Cloud connection is used. :type project_id: str :param retry: A retry object used to retry requests. If ``None`` is specified, requests will not be retried. :type retry: google.api_core.retry.Retry :param timeout: The amount of time, in seconds, to wait for the request to complete. Note that if ``retry`` is specified, the timeout applies to each individual attempt. :type timeout: float :param metadata: Additional metadata that is provided to the method. :type metadata: Sequence[Tuple[str, str]] """ client = self.get_conn() if isinstance(instance, dict): instance = ParseDict(instance, Instance()) elif not isinstance(instance, Instance): raise AirflowException("instance is not instance of Instance type or python dict") if location and instance_id: name = CloudRedisClient.instance_path(project_id, location, instance_id) instance.name = name self.log.info("Updating instances: %s", instance.name) result = client.update_instance( update_mask=update_mask, instance=instance, retry=retry, timeout=timeout, metadata=metadata ) result.result() self.log.info("Instance updated: %s", instance.name)
def _sync_dag_view_permissions(self, dag_id, access_control): """Set the access policy on the given DAG's ViewModel. :param dag_id: the ID of the DAG whose permissions should be updated :type dag_id: str :param access_control: a dict where each key is a rolename and each value is a set() of permission names (e.g., {'can_dag_read'} :type access_control: dict """ def _get_or_create_dag_permission(perm_name): dag_perm = self.find_permission_view_menu(perm_name, dag_id) if not dag_perm: self.log.info( "Creating new permission '%s' on view '%s'", perm_name, dag_id ) dag_perm = self.add_permission_view_menu(perm_name, dag_id) return dag_perm def _revoke_stale_permissions(dag_view): existing_dag_perms = self.find_permissions_view_menu(dag_view) for perm in existing_dag_perms: non_admin_roles = [role for role in perm.role if role.name != 'Admin'] for role in non_admin_roles: target_perms_for_role = access_control.get(role.name, {}) if perm.permission.name not in target_perms_for_role: self.log.info( "Revoking '%s' on DAG '%s' for role '%s'", perm.permission, dag_id, role.name ) self.del_permission_role(role, perm) dag_view = self.find_view_menu(dag_id) if dag_view: _revoke_stale_permissions(dag_view) for rolename, perms in access_control.items(): role = self.find_role(rolename) if not role: raise AirflowException( "The access_control mapping for DAG '{}' includes a role " "named '{}', but that role does not exist".format( dag_id, rolename)) perms = set(perms) invalid_perms = perms - self.DAG_PERMS if invalid_perms: raise AirflowException( "The access_control map for DAG '{}' includes the following " "invalid permissions: {}; The set of valid permissions " "is: {}".format(dag_id, (perms - self.DAG_PERMS), self.DAG_PERMS)) for perm_name in perms: dag_perm = _get_or_create_dag_permission(perm_name) self.add_permission_role(role, dag_perm)
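# Example of the access_control mapping consumed above: keys are role names, values are sets
# of DAG-level permission names (the docstring mentions 'can_dag_read'). Role names here are
# illustrative placeholders.
access_control_example = {
    'data-engineering': {'can_dag_read', 'can_dag_edit'},
    'analysts': {'can_dag_read'},
}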