Example #1
def check_events_for_failures(events):
    check.list_param(events, "events", of_type=DagsterEvent)
    for event in events:
        if event.event_type_value == "STEP_FAILURE":
            raise AirflowException("step failed with error: %s" %
                                   event.event_specific_data.error.to_string())
Example #2
 def resp_check(_):
     raise AirflowException('AirflowException raised here!')
Example #3
    def create_instance(
        self,
        location: str,
        instance_id: str,
        instance: Union[Dict, Instance],
        project_id: str,
        retry: Optional[Retry] = None,
        timeout: Optional[float] = None,
        metadata: Optional[Sequence[Tuple[str, str]]] = None,
    ):
        """
        Creates a Redis instance based on the specified tier and memory size.

        By default, the instance is accessible from the project's `default network
        <https://cloud.google.com/compute/docs/networks-and-firewalls#networks>`__.

        :param location: The location of the Cloud Memorystore instance (for example europe-west1)
        :type location: str
        :param instance_id: Required. The logical name of the Redis instance in the customer project with the
            following restrictions:

            -  Must contain only lowercase letters, numbers, and hyphens.
            -  Must start with a letter.
            -  Must be between 1-40 characters.
            -  Must end with a number or a letter.
            -  Must be unique within the customer project / location
        :type instance_id: str
        :param instance: Required. A Redis [Instance] resource

            If a dict is provided, it must be of the same form as the protobuf message
            :class:`~google.cloud.redis_v1.types.Instance`
        :type instance: Union[Dict, google.cloud.redis_v1.types.Instance]
        :param project_id: Project ID of the project that contains the instance. If set
            to None or missing, the default project_id from the Google Cloud connection is used.
        :type project_id: str
        :param retry: A retry object used to retry requests. If ``None`` is specified, requests will not be
            retried.
        :type retry: google.api_core.retry.Retry
        :param timeout: The amount of time, in seconds, to wait for the request to complete. Note that if
            ``retry`` is specified, the timeout applies to each individual attempt.
        :type timeout: float
        :param metadata: Additional metadata that is provided to the method.
        :type metadata: Sequence[Tuple[str, str]]
        """
        client = self.get_conn()
        parent = CloudRedisClient.location_path(project_id, location)
        instance_name = CloudRedisClient.instance_path(project_id, location, instance_id)
        try:
            instance = client.get_instance(
                name=instance_name, retry=retry, timeout=timeout, metadata=metadata
            )
            self.log.info("Instance exists. Skipping creation.")
            return instance
        except NotFound:
            self.log.info("Instance not exists.")

        if isinstance(instance, dict):
            instance = ParseDict(instance, Instance())
        elif not isinstance(instance, Instance):
            raise AirflowException("instance is not instance of Instance type or python dict")

        self._append_label(instance, "airflow-version", "v" + version.version)

        result = client.create_instance(
            parent=parent,
            instance_id=instance_id,
            instance=instance,
            retry=retry,
            timeout=timeout,
            metadata=metadata,
        )
        result.result()
        self.log.info("Instance created.")
        return client.get_instance(name=instance_name, retry=retry, timeout=timeout, metadata=metadata)
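A hedged usage sketch of the method above. The hook class and connection id are assumptions (the signature matches the Google provider's CloudMemorystoreHook), and the instance body, project and location are placeholders.

from airflow.providers.google.cloud.hooks.cloud_memorystore import CloudMemorystoreHook

# Assumed hook class and connection id; instance body, project and location are placeholders.
hook = CloudMemorystoreHook(gcp_conn_id="google_cloud_default")
redis_instance = hook.create_instance(
    location="europe-west1",
    instance_id="my-redis-instance",
    instance={"tier": "BASIC", "memory_size_gb": 1},
    project_id="my-gcp-project",
)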
Example #4
 def _validate_inputs(self):
     if self.project_id == '':
         raise AirflowException("The required parameter 'project_id' is empty")
     if not self.instance_id:
         raise AirflowException("The required parameter 'instance_id' " "is empty or None")
Example #5
 def is_terminated(self):
     if self.result_state not in APPLICATION_GATEWAY_JOB_STATES:
         raise AirflowException(
             'Some problem happened while running the spark job (state: {}). '
             'Please check with APPLICATION Gateway Team'.format(self.result_state))
     return self.result_state in ('TASK_KILLED', 'TASK_FAILED')
Example #6
    def execute(self, context):
        """
        Execute the bash command in a temporary directory
        which will be cleaned afterwards
        """
        self.log.info("Tmp dir root location: \n %s", gettempdir())

        self.lineage_data = self.bash_command
        # Create a temporary directory
        with TemporaryDirectory(prefix='airflowtmp') as tmp_dir:
            # Create a temporary file
            with NamedTemporaryFile(dir=tmp_dir, prefix=self.task_id) as f:
                # Write the bash command to the temporary file
                f.write(bytes(self.bash_command, 'utf_8'))
                f.flush()
                # Get the name of the temporary file
                fname = f.name
                script_location = os.path.abspath(fname)
                self.log.info("Temporary script location: %s", script_location)

                if USE_WINDOWS:
                    pre_exec = None
                else:

                    def pre_exec():
                        # Restore default signal disposition and invoke setsid
                        # SIG_DFL: the default signal handler
                        for sig in ('SIGPIPE', 'SIGXFZ', 'SIGXFSZ'):
                            if hasattr(signal, sig):
                                signal.signal(getattr(signal, sig),
                                              signal.SIG_DFL)
                        # Create a new session with no controlling terminal.
                        # This detaches the process from the terminal, process group and session inherited from the parent.
                        os.setsid()

                self.log.info("Running command: %s", self.bash_command)
                # Run the temporary bash script
                sp = Popen(['bash', fname],
                           stdout=PIPE,
                           stderr=STDOUT,
                           cwd=tmp_dir,
                           env=self.env,
                           preexec_fn=pre_exec)
                # Keep a handle to the bash process
                self.sp = sp
                # Stream the output of the bash process
                self.log.info("Output:")
                line = ''
                for line in iter(sp.stdout.readline, b''):
                    line = line.decode(self.output_encoding).rstrip()
                    self.log.info(line)
                sp.wait()
                # Log the return code of the bash process
                self.log.info("Command exited with return code %s",
                              sp.returncode)
                # Raise if the script failed
                if sp.returncode:
                    raise AirflowException("Bash command failed")

        if self.xcom_push_flag:
            return line
Example #7
 def handler(signum, frame):
     raise AirflowException(f"Timeout {timeout}s reached")
Example #8
    def _execute(self):
        """
        Runs a dag for a specified date range.
        """
        session = settings.Session()

        start_date = self.bf_start_date
        end_date = self.bf_end_date

        # picklin'
        pickle_id = None
        if not self.donot_pickle and self.executor.__class__ not in (
                executors.LocalExecutor, executors.SequentialExecutor):
            pickle = models.DagPickle(self.dag)
            session.add(pickle)
            session.commit()
            pickle_id = pickle.id

        executor = self.executor
        executor.start()
        executor_fails = Counter()

        # Build a list of all instances to run
        tasks_to_run = {}
        failed = set()
        succeeded = set()
        started = set()
        skipped = set()
        not_ready = set()
        deadlocked = set()

        for task in self.dag.tasks:
            if (not self.include_adhoc) and task.adhoc:
                continue

            start_date = start_date or task.start_date
            end_date = end_date or task.end_date or datetime.now()
            for dttm in self.dag.date_range(start_date, end_date=end_date):
                ti = models.TaskInstance(task, dttm)
                tasks_to_run[ti.key] = ti
                session.merge(ti)
        session.commit()

        # Triggering what is ready to get triggered
        while tasks_to_run and not deadlocked:
            not_ready.clear()
            for key, ti in list(tasks_to_run.items()):

                ti.refresh_from_db()
                ignore_depends_on_past = (self.ignore_first_depends_on_past
                                          and ti.execution_date
                                          == (start_date or ti.start_date))

                # The task was already marked successful or skipped by a
                # different Job. Don't rerun it.
                if key not in started:
                    if ti.state == State.SUCCESS:
                        succeeded.add(key)
                        tasks_to_run.pop(key)
                        continue
                    elif ti.state == State.SKIPPED:
                        skipped.add(key)
                        tasks_to_run.pop(key)
                        continue

                # Is the task runnable? -- then run it
                if ti.is_queueable(
                        include_queued=True,
                        ignore_depends_on_past=ignore_depends_on_past,
                        flag_upstream_failed=True):
                    self.logger.debug('Sending {} to executor'.format(ti))
                    executor.queue_task_instance(
                        ti,
                        mark_success=self.mark_success,
                        pickle_id=pickle_id,
                        ignore_dependencies=self.ignore_dependencies,
                        ignore_depends_on_past=ignore_depends_on_past,
                        pool=self.pool)
                    started.add(key)

                # Mark the task as not ready to run
                elif ti.state in (State.NONE, State.UPSTREAM_FAILED):
                    self.logger.debug('Added {} to not_ready'.format(ti))
                    not_ready.add(key)

            self.heartbeat()
            executor.heartbeat()

            # If the set of tasks that aren't ready ever equals the set of
            # tasks to run, then the backfill is deadlocked
            if not_ready and not_ready == set(tasks_to_run):
                deadlocked.update(tasks_to_run.values())
                tasks_to_run.clear()

            # Reacting to events
            for key, state in list(executor.get_event_buffer().items()):
                dag_id, task_id, execution_date = key
                if key not in tasks_to_run:
                    continue
                ti = tasks_to_run[key]
                ti.refresh_from_db()

                # executor reports failure
                if state == State.FAILED:

                    # task reports running
                    if ti.state == State.RUNNING:
                        msg = ('Executor reports that task instance {} failed '
                               'although the task says it is running.'.format(
                                   key))
                        self.logger.error(msg)
                        ti.handle_failure(msg)
                        tasks_to_run.pop(key)

                    # task reports skipped
                    elif ti.state == State.SKIPPED:
                        self.logger.error("Skipping {} ".format(key))
                        skipped.add(key)
                        tasks_to_run.pop(key)

                    # anything else is a failure
                    else:
                        self.logger.error(
                            "Task instance {} failed".format(key))
                        failed.add(key)
                        tasks_to_run.pop(key)

                # executor reports success
                elif state == State.SUCCESS:

                    # task reports success
                    if ti.state == State.SUCCESS:
                        self.logger.info(
                            'Task instance {} succeeded'.format(key))
                        succeeded.add(key)
                        tasks_to_run.pop(key)

                    # task reports failure
                    elif ti.state == State.FAILED:
                        self.logger.error(
                            "Task instance {} failed".format(key))
                        failed.add(key)
                        tasks_to_run.pop(key)

                    # task reports skipped
                    elif ti.state == State.SKIPPED:
                        self.logger.info(
                            "Task instance {} skipped".format(key))
                        skipped.add(key)
                        tasks_to_run.pop(key)

                    # this probably won't ever be triggered
                    elif ti in not_ready:
                        self.logger.info(
                            "{} wasn't expected to run, but it did".format(ti))

                    # executor reports success but task does not - this is weird
                    elif ti.state not in (State.SUCCESS, State.QUEUED,
                                          State.UP_FOR_RETRY):
                        self.logger.error(
                            "The airflow run command failed "
                            "at reporting an error. This should not occur "
                            "in normal circumstances. Task state is '{}',"
                            "reported state is '{}'. TI is {}"
                            "".format(ti.state, state, ti))

                        # if the executor fails 3 or more times, stop trying to
                        # run the task
                        executor_fails[key] += 1
                        if executor_fails[key] >= 3:
                            msg = (
                                'The airflow run command failed to report an '
                                'error for task {} three or more times. The '
                                'task is being marked as failed. This is very '
                                'unusual and probably means that an error is '
                                'taking place before the task even '
                                'starts.'.format(key))
                            self.logger.error(msg)
                            ti.handle_failure(msg)
                            tasks_to_run.pop(key)

            msg = ' | '.join([
                "[backfill progress]", "waiting: {0}", "succeeded: {1}",
                "kicked_off: {2}", "failed: {3}", "skipped: {4}",
                "deadlocked: {5}"
            ]).format(len(tasks_to_run), len(succeeded), len(started),
                      len(failed), len(skipped), len(deadlocked))
            self.logger.info(msg)

        executor.end()
        session.close()

        err = ''
        if failed:
            err += ("---------------------------------------------------\n"
                    "Some task instances failed:\n{}\n".format(failed))
        if deadlocked:
            err += ('---------------------------------------------------\n'
                    'BackfillJob is deadlocked.')
            deadlocked_depends_on_past = any(
                t.are_dependencies_met() != t.are_dependencies_met(
                    ignore_depends_on_past=True) for t in deadlocked)
            if deadlocked_depends_on_past:
                err += (
                    'Some of the deadlocked tasks were unable to run because '
                    'of "depends_on_past" relationships. Try running the '
                    'backfill with the option '
                    '"ignore_first_depends_on_past=True" or passing "-I" at '
                    'the command line.')
            err += ' These tasks were unable to run:\n{}\n'.format(deadlocked)
        if err:
            raise AirflowException(err)

        self.logger.info("Backfill done. Exiting.")
Example #9
    def __init__(
        self,
        *,
        external_dag_id: str,
        external_task_id: Optional[str] = None,
        external_task_ids: Optional[Collection[str]] = None,
        allowed_states: Optional[Iterable[str]] = None,
        failed_states: Optional[Iterable[str]] = None,
        execution_delta: Optional[datetime.timedelta] = None,
        execution_date_fn: Optional[Callable] = None,
        check_existence: bool = False,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.allowed_states = list(allowed_states) if allowed_states else [
            State.SUCCESS
        ]
        self.failed_states = list(failed_states) if failed_states else []

        total_states = set(self.allowed_states + self.failed_states)

        if set(self.failed_states).intersection(set(self.allowed_states)):
            raise AirflowException(
                f"Duplicate values provided as allowed "
                f"`{self.allowed_states}` and failed states `{self.failed_states}`"
            )

        if external_task_id is not None and external_task_ids is not None:
            raise ValueError(
                'Only one of `external_task_id` or `external_task_ids` may '
                'be provided to ExternalTaskSensor; not both.')

        if external_task_id is not None:
            external_task_ids = [external_task_id]

        if external_task_ids:
            if not total_states <= set(State.task_states):
                raise ValueError(
                    f'Valid values for `allowed_states` and `failed_states` '
                    f'when `external_task_id` or `external_task_ids` is not `None`: {State.task_states}'
                )
            if len(external_task_ids) > len(set(external_task_ids)):
                raise ValueError(
                    'Duplicate task_ids passed in external_task_ids parameter')
        elif not total_states <= set(State.dag_states):
            raise ValueError(
                f'Valid values for `allowed_states` and `failed_states` '
                f'when `external_task_id` is `None`: {State.dag_states}')

        if execution_delta is not None and execution_date_fn is not None:
            raise ValueError(
                'Only one of `execution_delta` or `execution_date_fn` may '
                'be provided to ExternalTaskSensor; not both.')

        self.execution_delta = execution_delta
        self.execution_date_fn = execution_date_fn
        self.external_dag_id = external_dag_id
        self.external_task_id = external_task_id
        self.external_task_ids = external_task_ids
        self.check_existence = check_existence
        self._has_checked_existence = False
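A hedged sketch of the duplicate-states check above: passing a state in both allowed_states and failed_states makes the constructor raise AirflowException. The import paths assume Airflow 2.x and the ids are placeholders.

from airflow.exceptions import AirflowException
from airflow.sensors.external_task import ExternalTaskSensor
from airflow.utils.state import State

try:
    ExternalTaskSensor(
        task_id="wait_for_other_dag",          # placeholder task id
        external_dag_id="other_dag",           # placeholder DAG id
        allowed_states=[State.SUCCESS],
        failed_states=[State.SUCCESS],         # overlaps with allowed_states
    )
except AirflowException as err:
    print(f"Rejected: {err}")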
Example #10
    def update_parameters(
        self,
        update_mask: Union[Dict, cloud_memcache.field_mask.FieldMask],
        parameters: Union[Dict, cloud_memcache.MemcacheParameters],
        project_id: str,
        location: str,
        instance_id: Optional[str] = None,
        retry: Optional[Retry] = None,
        timeout: Optional[float] = None,
        metadata: Optional[Sequence[Tuple[str, str]]] = None,
    ):
        """
        Updates the defined Memcached parameters for an existing instance. This method only stages
        the parameters; it must be followed by ``apply_parameters`` to apply them to the nodes of
        the Memcached instance.

        :param update_mask: Required. Mask of fields to update.
            If a dict is provided, it must be of the same form as the protobuf message
            :class:`~google.cloud.memcache_v1beta2.types.cloud_memcache.field_mask.FieldMask`
        :type update_mask:
            Union[Dict, google.cloud.memcache_v1beta2.types.cloud_memcache.field_mask.FieldMask]
        :param parameters: The parameters to apply to the instance.
            If a dict is provided, it must be of the same form as the protobuf message
            :class:`~google.cloud.memcache_v1beta2.types.cloud_memcache.MemcacheParameters`
        :type parameters: Union[Dict, google.cloud.memcache_v1beta2.types.cloud_memcache.MemcacheParameters]
        :param location: The location of the Cloud Memorystore instance (for example europe-west1)
        :type location: str
        :param instance_id: The logical name of the Memcached instance in the customer project.
        :type instance_id: str
        :param project_id: Project ID of the project that contains the instance. If set
            to None or missing, the default project_id from the Google Cloud connection is used.
        :type project_id: str
        :param retry: A retry object used to retry requests. If ``None`` is specified, requests will not be
            retried.
        :type retry: google.api_core.retry.Retry
        :param timeout: The amount of time, in seconds, to wait for the request to complete. Note that if
            ``retry`` is specified, the timeout applies to each individual attempt.
        :type timeout: float
        :param metadata: Additional metadata that is provided to the method.
        :type metadata: Sequence[Tuple[str, str]]
        """
        client = self.get_conn()
        metadata = metadata or ()

        if isinstance(parameters, dict):
            parameters = cloud_memcache.MemcacheParameters(parameters)
        elif not isinstance(parameters, cloud_memcache.MemcacheParameters):
            raise AirflowException(
                "parameters is not an instance of MemcacheParameters or a Python dict"
            )

        name = CloudMemcacheClient.instance_path(project_id, location,
                                                 instance_id)
        self.log.info("Staging update to instance: %s", instance_id)
        result = client.update_parameters(
            name=name,
            update_mask=update_mask,
            parameters=parameters,
            retry=retry,
            timeout=timeout,
            metadata=metadata,
        )
        result.result()
        self.log.info("Update staged for instance: %s", instance_id)
Example #11
 def _validate_inputs(self):
     for attr_name in self.REQUIRED_ATTRIBUTES:
         if not getattr(self, attr_name):
             raise AirflowException('Empty parameter: {}'.format(attr_name))
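The loop above is a generic validation pattern. A self-contained sketch with a hypothetical class shows how an empty required attribute is reported.

from airflow.exceptions import AirflowException


class TransferJob:  # hypothetical class reusing the same validation pattern
    REQUIRED_ATTRIBUTES = ("source_path", "destination_path")

    def __init__(self, source_path, destination_path):
        self.source_path = source_path
        self.destination_path = destination_path
        self._validate_inputs()

    def _validate_inputs(self):
        for attr_name in self.REQUIRED_ATTRIBUTES:
            if not getattr(self, attr_name):
                raise AirflowException('Empty parameter: {}'.format(attr_name))


TransferJob("/data/in", "/data/out")  # passes validation
TransferJob("/data/in", "")           # raises AirflowException: Empty parameter: destination_path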
Example #12
    def copy_object(
        self,
        source_bucket_key: str,
        dest_bucket_key: str,
        source_bucket_name: Optional[str] = None,
        dest_bucket_name: Optional[str] = None,
        source_version_id: Optional[str] = None,
        acl_policy: Optional[str] = None,
    ) -> None:
        """
        Creates a copy of an object that is already stored in S3.

        Note: the S3 connection used here needs to have access to both
        source and destination bucket/key.

        :param source_bucket_key: The key of the source object.

            It can be either full s3:// style url or relative path from root level.

            When it's specified as a full s3:// url, please omit source_bucket_name.
        :type source_bucket_key: str
        :param dest_bucket_key: The key of the object to copy to.

            The convention to specify `dest_bucket_key` is the same
            as `source_bucket_key`.
        :type dest_bucket_key: str
        :param source_bucket_name: Name of the S3 bucket where the source object is in.

            It should be omitted when `source_bucket_key` is provided as a full s3:// url.
        :type source_bucket_name: str
        :param dest_bucket_name: Name of the S3 bucket to where the object is copied.

            It should be omitted when `dest_bucket_key` is provided as a full s3:// url.
        :type dest_bucket_name: str
        :param source_version_id: Version ID of the source object (OPTIONAL)
        :type source_version_id: str
        :param acl_policy: The string to specify the canned ACL policy for the
            object to be copied which is private by default.
        :type acl_policy: str
        """
        acl_policy = acl_policy or 'private'

        if dest_bucket_name is None:
            dest_bucket_name, dest_bucket_key = self.parse_s3_url(
                dest_bucket_key)
        else:
            parsed_url = urlparse(dest_bucket_key)
            if parsed_url.scheme != '' or parsed_url.netloc != '':
                raise AirflowException(
                    'If dest_bucket_name is provided, ' +
                    'dest_bucket_key should be relative path ' +
                    'from root level, rather than a full s3:// url')

        if source_bucket_name is None:
            source_bucket_name, source_bucket_key = self.parse_s3_url(
                source_bucket_key)
        else:
            parsed_url = urlparse(source_bucket_key)
            if parsed_url.scheme != '' or parsed_url.netloc != '':
                raise AirflowException(
                    'If source_bucket_name is provided, ' +
                    'source_bucket_key should be relative path ' +
                    'from root level, rather than a full s3:// url')

        copy_source = {
            'Bucket': source_bucket_name,
            'Key': source_bucket_key,
            'VersionId': source_version_id
        }
        response = self.get_conn().copy_object(Bucket=dest_bucket_name,
                                               Key=dest_bucket_key,
                                               CopySource=copy_source,
                                               ACL=acl_policy)
        return response
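A hedged usage sketch of the method above, assuming it is S3Hook.copy_object from the Amazon provider and that an aws_default connection exists. The two calls mirror the docstring: full s3:// URLs versus explicit bucket names with relative keys.

from airflow.providers.amazon.aws.hooks.s3 import S3Hook

hook = S3Hook(aws_conn_id="aws_default")  # assumed hook class and connection id

# Full s3:// URLs: the bucket name parameters must be omitted.
hook.copy_object(
    source_bucket_key="s3://source-bucket/path/to/object",
    dest_bucket_key="s3://dest-bucket/path/to/object",
)

# Equivalent call with relative keys and explicit bucket names.
hook.copy_object(
    source_bucket_key="path/to/object",
    dest_bucket_key="path/to/object",
    source_bucket_name="source-bucket",
    dest_bucket_name="dest-bucket",
)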
Example #13
    def execute(self, context):
        try:
            if self.ssh_conn_id:
                if self.ssh_hook and isinstance(self.ssh_hook, SSHHook):
                    self.log.info(
                        "ssh_conn_id is ignored when ssh_hook is provided.")
                else:
                    self.log.info("ssh_hook is not provided or invalid. " +
                                  "Trying ssh_conn_id to create SSHHook.")
                    self.ssh_hook = SSHHook(ssh_conn_id=self.ssh_conn_id,
                                            timeout=self.timeout)

            if not self.ssh_hook:
                raise AirflowException(
                    "Cannot operate without ssh_hook or ssh_conn_id.")

            if self.remote_host is not None:
                self.log.info(
                    "remote_host is provided explicitly. " +
                    "It will replace the remote_host which was defined " +
                    "in ssh_hook or predefined in connection of ssh_conn_id.")
                self.ssh_hook.remote_host = self.remote_host

            if not self.command:
                raise AirflowException("SSH command not specified. Aborting.")

            with self.ssh_hook.get_conn() as ssh_client:
                # Auto apply tty when it's required in case of sudo
                get_pty = False
                if self.command.startswith('sudo'):
                    get_pty = True

                # set timeout taken as params
                stdin, stdout, stderr = ssh_client.exec_command(
                    command=self.command,
                    get_pty=get_pty,
                    timeout=self.timeout)
                # get channels
                channel = stdout.channel

                # closing stdin
                stdin.close()
                channel.shutdown_write()

                agg_stdout = b''
                agg_stderr = b''

                # capture any initial output in case channel is closed already
                stdout_buffer_length = len(stdout.channel.in_buffer)

                if stdout_buffer_length > 0:
                    agg_stdout += stdout.channel.recv(stdout_buffer_length)

                # read from both stdout and stderr
                while not channel.closed or \
                        channel.recv_ready() or \
                        channel.recv_stderr_ready():
                    readq, _, _ = select([channel], [], [], self.timeout)
                    for c in readq:
                        if c.recv_ready():
                            line = stdout.channel.recv(len(c.in_buffer))
                            agg_stdout += line
                            self.log.info(line.decode('utf-8').strip('\n'))
                        if c.recv_stderr_ready():
                            line = stderr.channel.recv_stderr(
                                len(c.in_stderr_buffer))
                            agg_stderr += line
                            self.log.warning(line.decode('utf-8').strip('\n'))
                    if stdout.channel.exit_status_ready()\
                            and not stderr.channel.recv_stderr_ready()\
                            and not stdout.channel.recv_ready():
                        stdout.channel.shutdown_read()
                        stdout.channel.close()
                        break

                stdout.close()
                stderr.close()

                exit_status = stdout.channel.recv_exit_status()
                if exit_status == 0:
                    enable_pickling = configuration.conf.getboolean(
                        'core', 'enable_xcom_pickling')
                    if enable_pickling:
                        return agg_stdout
                    else:
                        return b64encode(agg_stdout).decode('utf-8')

                else:
                    error_msg = agg_stderr.decode('utf-8')
                    raise AirflowException(
                        "error running cmd: {0}, error: {1}".format(
                            self.command, error_msg))

        except Exception as e:
            raise AirflowException("SSH operator error: {0}".format(str(e)))

        return True
Example #14
 def securestring(value: str):
     if not native:
         raise AirflowException(
             "Filter 'securestring' not applicable to non-native "
             "templating environment")
     return TaggedValue("SS", value)
Example #15
    def execute(self, context):
        try:
            from dagster_graphql.client.mutations import (
                DagsterGraphQLClientError,
                handle_execution_errors,
                handle_execute_plan_result_raw,
            )

        except ImportError:
            raise AirflowException(
                'To use the DagsterDockerOperator, dagster and dagster_graphql must be installed '
                'in your Airflow environment.')

        if 'run_id' in self.params:
            self._run_id = self.params['run_id']
        elif 'dag_run' in context and context['dag_run'] is not None:
            self._run_id = context['dag_run'].run_id

        try:
            if self.instance:
                run = self.instance.register_managed_run(
                    pipeline_name=self.pipeline_name,
                    run_id=self.run_id,
                    run_config=self.run_config,
                    mode=self.mode,
                    solids_to_execute=None,
                    step_keys_to_execute=None,
                    tags=None,
                    root_run_id=None,
                    parent_run_id=None,
                    pipeline_snapshot=self.pipeline_snapshot,
                    execution_plan_snapshot=self.execution_plan_snapshot,
                    parent_pipeline_snapshot=self.parent_pipeline_snapshot,
                )

            raw_res = super(DagsterDockerOperator, self).execute(context)
            self.log.info('Finished executing container.')

            res = parse_raw_log_lines(raw_res)

            try:
                handle_execution_errors(res, 'executePlan')
            except DagsterGraphQLClientError as err:
                if self.instance:
                    self.instance.report_engine_event(
                        str(err),
                        run,
                        EngineEventData.engine_error(
                            serializable_error_info_from_exc_info(
                                sys.exc_info())),
                        self.__class__,
                    )
                raise

            events = handle_execute_plan_result_raw(res)

            if self.instance:
                for event in events:
                    self.instance.handle_new_event(event)

            events = [e.dagster_event for e in events]
            check_events_for_failures(events)
            check_events_for_skips(events)

            return events

        finally:
            self._run_id = None
Example #16
    def create_bucket(self,
                      bucket_name,
                      storage_class='MULTI_REGIONAL',
                      location='US',
                      project_id=None,
                      labels=None
                      ):
        """
        Creates a new bucket. Google Cloud Storage uses a flat namespace, so
        you can't create a bucket with a name that is already in use.

        .. seealso::
            For more information, see Bucket Naming Guidelines:
            https://cloud.google.com/storage/docs/bucketnaming.html#requirements

        :param bucket_name: The name of the bucket.
        :type bucket_name: str
        :param storage_class: This defines how objects in the bucket are stored
            and determines the SLA and the cost of storage. Values include

            - ``MULTI_REGIONAL``
            - ``REGIONAL``
            - ``STANDARD``
            - ``NEARLINE``
            - ``COLDLINE``.
            If this value is not specified when the bucket is
            created, it will default to STANDARD.
        :type storage_class: str
        :param location: The location of the bucket.
            Object data for objects in the bucket resides in physical storage
            within this region. Defaults to US.

            .. seealso::
                https://developers.google.com/storage/docs/bucket-locations

        :type location: str
        :param project_id: The ID of the GCP Project.
        :type project_id: str
        :param labels: User-provided labels, in key/value pairs.
        :type labels: dict
        :return: If successful, it returns the ``id`` of the bucket.
        """

        project_id = project_id if project_id is not None else self.project_id
        storage_classes = [
            'MULTI_REGIONAL',
            'REGIONAL',
            'NEARLINE',
            'COLDLINE',
            'STANDARD',  # alias for MULTI_REGIONAL/REGIONAL, based on location
        ]

        self.log.info('Creating Bucket: %s; Location: %s; Storage Class: %s',
                      bucket_name, location, storage_class)
        if storage_class not in storage_classes:
            raise ValueError(
                'Invalid value ({}) passed to storage_class. Value should be '
                'one of {}'.format(storage_class, storage_classes))

        if not re.match('[a-zA-Z0-9]+', bucket_name[0]):
            raise ValueError('Bucket names must start with a number or letter.')

        if not re.match('[a-zA-Z0-9]+', bucket_name[-1]):
            raise ValueError('Bucket names must end with a number or letter.')

        service = self.get_conn()
        bucket_resource = {
            'name': bucket_name,
            'location': location,
            'storageClass': storage_class
        }

        self.log.info('The Default Project ID is %s', self.project_id)

        if labels is not None:
            bucket_resource['labels'] = labels

        try:
            response = service.buckets().insert(
                project=project_id,
                body=bucket_resource
            ).execute()

            self.log.info('Bucket: %s created successfully.', bucket_name)

            return response['id']

        except errors.HttpError as ex:
            raise AirflowException(
                'Bucket creation failed. Error was: {}'.format(ex.content)
            )
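A hedged usage sketch of the hook method above; the hook class and connection id follow the Airflow 1.x contrib convention, and the bucket, project and labels are placeholders.

from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook  # assumed Airflow 1.x import path

hook = GoogleCloudStorageHook(google_cloud_storage_conn_id="google_cloud_default")
bucket_id = hook.create_bucket(
    bucket_name="my-example-bucket",   # placeholder bucket name
    storage_class="REGIONAL",
    location="europe-west1",
    project_id="my-gcp-project",       # placeholder project
    labels={"env": "dev"},
)
print(bucket_id)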
Example #17
    def execute(self, context):
        '''Modified only to use the get_host_tmp_dir helper.'''
        self.log.info('Starting docker container from image %s', self.image)

        tls_config = self.__get_tls_config()
        if self.docker_conn_id:
            self.cli = self.get_hook().get_conn()
        else:
            self.cli = APIClient(base_url=self.docker_url,
                                 version=self.api_version,
                                 tls=tls_config)

        if self.force_pull or len(self.cli.images(name=self.image)) == 0:
            self.log.info('Pulling docker image %s', self.image)
            for l in self.cli.pull(self.image, stream=True):
                output = seven.json.loads(l.decode('utf-8').strip())
                if 'status' in output:
                    self.log.info("%s", output['status'])

        with self.get_host_tmp_dir() as host_tmp_dir:
            self.environment['AIRFLOW_TMP_DIR'] = self.tmp_dir
            self.volumes.append('{0}:{1}'.format(host_tmp_dir, self.tmp_dir))

            self.container = self.cli.create_container(
                command=self.get_command(),
                environment=self.environment,
                host_config=self.cli.create_host_config(
                    auto_remove=self.auto_remove,
                    binds=self.volumes,
                    network_mode=self.network_mode,
                    shm_size=self.shm_size,
                    dns=self.dns,
                    dns_search=self.dns_search,
                    cpu_shares=int(round(self.cpus * 1024)),
                    mem_limit=self.mem_limit,
                ),
                image=self.image,
                user=self.user,
                working_dir=self.working_dir,
            )
            self.cli.start(self.container['Id'])

            res = []
            line = ''
            for new_line in self.cli.logs(container=self.container['Id'],
                                          stream=True):
                line = new_line.strip()
                if hasattr(line, 'decode'):
                    line = line.decode('utf-8')
                self.log.info(line)
                res.append(line)

            result = self.cli.wait(self.container['Id'])
            if result['StatusCode'] != 0:
                raise AirflowException(
                    'docker container failed with result: {result} and logs: {logs}'
                    .format(result=repr(result), logs='\n'.join(res)))

            if self.xcom_push_flag:
                # Try to avoid any kind of race condition?
                return res if self.xcom_all else str(line)
Example #18
    def check_uuids(**kwargs):
        print('dag_run conf follows:')
        pprint(kwargs['dag_run'].conf)

        try:
            assert_json_matches_schema(kwargs['dag_run'].conf,
                                       'launch_multi_metadata_schema.yml')
        except AssertionError as e:
            print('invalid metadata follows:')
            pprint(kwargs['dag_run'].conf)
            raise

        uuid_l = kwargs['dag_run'].conf['uuid_list']
        collection_type = kwargs['dag_run'].conf['collection_type']
        filtered_uuid_l = []
        filtered_path_l = []
        filtered_data_types = []
        for uuid in uuid_l:
            print(f'Starting uuid {uuid}')
            my_callable = lambda **kwargs: uuid
            ds_rslt = utils.pythonop_get_dataset_state(
                dataset_uuid_callable=my_callable,
                http_conn_id='ingest_api_connection',
                **kwargs)
            if not ds_rslt:
                raise AirflowException(f'Invalid uuid/doi for group: {uuid}')
            print('ds_rslt:')
            pprint(ds_rslt)

            for key in [
                    'status', 'uuid', 'data_types', 'local_directory_full_path'
            ]:
                assert key in ds_rslt, f"Dataset status for {uuid} has no {key}"

            if not ds_rslt['status'] in ['QA', 'Published']:
                raise AirflowException(f'Dataset {uuid} is not QA or better')

            dt = ds_rslt['data_types']
            if isinstance(dt, str) and dt.startswith('[') and dt.endswith(']'):
                dt = ast.literal_eval(dt)
            print(f'parsed dt: {dt}')
            if isinstance(dt, list):
                if dt:
                    if len(dt) == 1:
                        filtered_data_types.append(dt[0])
                    else:
                        filtered_data_types.append(tuple(dt))
                else:
                    raise AirflowException(
                        f'Dataset data_types for {uuid} is empty')
            else:
                filtered_data_types.append(dt)

            lz_path = ds_rslt['local_directory_full_path']
            filtered_path_l.append(lz_path)
            filtered_uuid_l.append(ds_rslt['uuid'])
            print(f'Finished uuid {uuid}')
        print(f'filtered data types: {filtered_data_types}')
        print(f'filtered paths: {filtered_path_l}')
        print(f'filtered uuids: {filtered_uuid_l}')
        kwargs['ti'].xcom_push(key='collectiontype', value=collection_type)
        kwargs['ti'].xcom_push(key='assay_type', value=filtered_data_types)
        kwargs['ti'].xcom_push(key='lz_paths', value=filtered_path_l)
        kwargs['ti'].xcom_push(key='uuids', value=filtered_uuid_l)
Example #19
    def run_cli(self, hql, schema=None, verbose=True, hive_conf=None):
        """
        Run an hql statement using the hive cli. If hive_conf is specified
        it should be a dict and the entries will be set as key/value pairs
        in HiveConf


        :param hive_conf: if specified these key value pairs will be passed
            to hive as ``-hiveconf "key"="value"``. Note that they will be
            passed after the ``hive_cli_params`` and thus will override
            whatever values are specified in the database.
        :type hive_conf: dict

        >>> hh = HiveCliHook()
        >>> result = hh.run_cli("USE airflow;")
        >>> ("OK" in result)
        True
        """
        conn = self.conn
        schema = schema or conn.schema
        if schema:
            hql = "USE {schema};\n{hql}".format(**locals())

        with TemporaryDirectory(prefix='airflow_hiveop_') as tmp_dir:
            with NamedTemporaryFile(dir=tmp_dir) as f:
                f.write(hql.encode('UTF-8'))
                f.flush()
                hive_cmd = self._prepare_cli_cmd()
                hive_conf_params = self._prepare_hiveconf(hive_conf)
                if self.mapred_queue:
                    hive_conf_params.extend([
                        '-hiveconf',
                        'mapreduce.job.queuename={}'.format(self.mapred_queue)
                    ])

                if self.mapred_queue_priority:
                    hive_conf_params.extend([
                        '-hiveconf', 'mapreduce.job.priority={}'.format(
                            self.mapred_queue_priority)
                    ])

                if self.mapred_job_name:
                    hive_conf_params.extend([
                        '-hiveconf',
                        'mapred.job.name={}'.format(self.mapred_job_name)
                    ])

                hive_cmd.extend(hive_conf_params)
                hive_cmd.extend(['-f', f.name])

                if verbose:
                    self.log.info(" ".join(hive_cmd))
                sp = subprocess.Popen(hive_cmd,
                                      stdout=subprocess.PIPE,
                                      stderr=subprocess.STDOUT,
                                      cwd=tmp_dir)
                self.sp = sp
                stdout = ''
                while True:
                    line = sp.stdout.readline()
                    if not line:
                        break
                    stdout += line.decode('UTF-8')
                    if verbose:
                        self.log.info(line.decode('UTF-8').strip())
                sp.wait()

                if sp.returncode:
                    raise AirflowException(stdout)

                return stdout
Example #20
    def __init__(
            self,  # pylint: disable=too-many-arguments,too-many-locals
            *,
            namespace: Optional[str] = None,
            image: Optional[str] = None,
            name: Optional[str] = None,
            cmds: Optional[List[str]] = None,
            arguments: Optional[List[str]] = None,
            ports: Optional[List[Port]] = None,
            volume_mounts: Optional[List[VolumeMount]] = None,
            volumes: Optional[List[Volume]] = None,
            env_vars: Optional[Dict] = None,
            secrets: Optional[List[Secret]] = None,
            in_cluster: Optional[bool] = None,
            cluster_context: Optional[str] = None,
            labels: Optional[Dict] = None,
            reattach_on_restart: bool = True,
            startup_timeout_seconds: int = 120,
            get_logs: bool = True,
            image_pull_policy: str = 'IfNotPresent',
            annotations: Optional[Dict] = None,
            resources: Optional[Dict] = None,
            affinity: Optional[Dict] = None,
            config_file: Optional[str] = None,
            node_selectors: Optional[Dict] = None,
            image_pull_secrets: Optional[str] = None,
            service_account_name: str = 'default',
            is_delete_operator_pod: bool = False,
            hostnetwork: bool = False,
            tolerations: Optional[List] = None,
            configmaps: Optional[List] = None,
            security_context: Optional[Dict] = None,
            pod_runtime_info_envs: Optional[List[PodRuntimeInfoEnv]] = None,
            dnspolicy: Optional[str] = None,
            schedulername: Optional[str] = None,
            full_pod_spec: Optional[k8s.V1Pod] = None,
            init_containers: Optional[List[k8s.V1Container]] = None,
            log_events_on_failure: bool = False,
            do_xcom_push: bool = False,
            pod_template_file: Optional[str] = None,
            priority_class_name: Optional[str] = None,
            **kwargs):
        if kwargs.get('xcom_push') is not None:
            raise AirflowException(
                "'xcom_push' was deprecated, use 'do_xcom_push' instead")
        super().__init__(resources=None, **kwargs)

        self.pod = None
        self.do_xcom_push = do_xcom_push
        self.image = image
        self.namespace = namespace
        self.cmds = cmds or []
        self.arguments = arguments or []
        self.labels = labels or {}
        self.startup_timeout_seconds = startup_timeout_seconds
        self.env_vars = env_vars or {}
        self.ports = ports or []
        self.volume_mounts = volume_mounts or []
        self.volumes = volumes or []
        self.secrets = secrets or []
        self.in_cluster = in_cluster
        self.cluster_context = cluster_context
        self.reattach_on_restart = reattach_on_restart
        self.get_logs = get_logs
        self.image_pull_policy = image_pull_policy
        self.node_selectors = node_selectors or {}
        self.annotations = annotations or {}
        self.affinity = affinity or {}
        self.resources = self._set_resources(resources)
        self.config_file = config_file
        self.image_pull_secrets = image_pull_secrets
        self.service_account_name = service_account_name
        self.is_delete_operator_pod = is_delete_operator_pod
        self.hostnetwork = hostnetwork
        self.tolerations = tolerations or []
        self.configmaps = configmaps or []
        self.security_context = security_context or {}
        self.pod_runtime_info_envs = pod_runtime_info_envs or []
        self.dnspolicy = dnspolicy
        self.schedulername = schedulername
        self.full_pod_spec = full_pod_spec
        self.init_containers = init_containers or []
        self.log_events_on_failure = log_events_on_failure
        self.priority_class_name = priority_class_name
        self.pod_template_file = pod_template_file
        self.name = self._set_name(name)
Example #21
    def execute(self, context: "Context") -> List[str]:
        # Define intervals and prefixes.
        try:
            timespan_start = context["data_interval_start"]
            timespan_end = context["data_interval_end"]
        except KeyError:  # Data interval context variables are only available in Airflow 2.2+
            timespan_start = timezone.coerce_datetime(context["execution_date"])
            timespan_end = timezone.coerce_datetime(context["dag"].following_schedule(timespan_start))

        if timespan_end is None:  # Only possible in Airflow before 2.2.
            self.log.warning("No following schedule found, setting timespan end to max %s", timespan_end)
            timespan_end = timezone.coerce_datetime(DateTime.max)
        elif timespan_start >= timespan_end:  # Airflow 2.2 sets start == end for non-periodic schedules.
            self.log.warning("DAG schedule not periodic, setting timespan end to max %s", timespan_end)
            timespan_end = timezone.coerce_datetime(DateTime.max)

        timespan_start = timespan_start.in_timezone(timezone.utc)
        timespan_end = timespan_end.in_timezone(timezone.utc)

        source_prefix_interp = GCSTimeSpanFileTransformOperator.interpolate_prefix(
            self.source_prefix,
            timespan_start,
        )
        destination_prefix_interp = GCSTimeSpanFileTransformOperator.interpolate_prefix(
            self.destination_prefix,
            timespan_start,
        )

        source_hook = GCSHook(
            gcp_conn_id=self.source_gcp_conn_id,
            impersonation_chain=self.source_impersonation_chain,
        )
        destination_hook = GCSHook(
            gcp_conn_id=self.destination_gcp_conn_id,
            impersonation_chain=self.destination_impersonation_chain,
        )

        # Fetch list of files.
        blobs_to_transform = source_hook.list_by_timespan(
            bucket_name=self.source_bucket,
            prefix=source_prefix_interp,
            timespan_start=timespan_start,
            timespan_end=timespan_end,
        )

        with TemporaryDirectory() as temp_input_dir, TemporaryDirectory() as temp_output_dir:
            temp_input_dir_path = Path(temp_input_dir)
            temp_output_dir_path = Path(temp_output_dir)

            # TODO: download in parallel.
            for blob_to_transform in blobs_to_transform:
                destination_file = temp_input_dir_path / blob_to_transform
                destination_file.parent.mkdir(parents=True, exist_ok=True)
                try:
                    source_hook.download(
                        bucket_name=self.source_bucket,
                        object_name=blob_to_transform,
                        filename=str(destination_file),
                        chunk_size=self.chunk_size,
                        num_max_attempts=self.download_num_attempts,
                    )
                except GoogleCloudError:
                    if self.download_continue_on_fail:
                        continue
                    raise

            self.log.info("Starting the transformation")
            cmd = [self.transform_script] if isinstance(self.transform_script, str) else self.transform_script
            cmd += [
                str(temp_input_dir_path),
                str(temp_output_dir_path),
                timespan_start.replace(microsecond=0).isoformat(),
                timespan_end.replace(microsecond=0).isoformat(),
            ]
            with subprocess.Popen(
                args=cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True
            ) as process:
                self.log.info("Process output:")
                if process.stdout:
                    for line in iter(process.stdout.readline, b''):
                        self.log.info(line.decode(self.output_encoding).rstrip())

                process.wait()
                if process.returncode:
                    raise AirflowException(f"Transform script failed: {process.returncode}")

            self.log.info("Transformation succeeded. Output temporarily located at %s", temp_output_dir_path)

            files_uploaded = []

            # TODO: upload in parallel.
            for upload_file in temp_output_dir_path.glob("**/*"):
                if upload_file.is_dir():
                    continue

                upload_file_name = str(upload_file.relative_to(temp_output_dir_path))

                if self.destination_prefix is not None:
                    upload_file_name = f"{destination_prefix_interp}/{upload_file_name}"

                self.log.info("Uploading file %s to %s", upload_file, upload_file_name)

                try:
                    destination_hook.upload(
                        bucket_name=self.destination_bucket,
                        object_name=upload_file_name,
                        filename=str(upload_file),
                        chunk_size=self.chunk_size,
                        num_max_attempts=self.upload_num_attempts,
                    )
                    files_uploaded.append(str(upload_file_name))
                except GoogleCloudError:
                    if self.upload_continue_on_fail:
                        continue
                    raise

            return files_uploaded
Example #22
    def deserialize_operator(cls, encoded_op: Dict[str, Any]) -> BaseOperator:
        """Deserializes an operator from a JSON object."""
        op = SerializedBaseOperator(task_id=encoded_op['task_id'])

        if "label" not in encoded_op:
            # Handle deserialization of old data before the introduction of TaskGroup
            encoded_op["label"] = encoded_op["task_id"]

        # Extra Operator Links defined in Plugins
        op_extra_links_from_plugin = {}

        # We don't want to load Extra Operator links in Scheduler
        if cls._load_operator_extra_links:
            from airflow import plugins_manager

            plugins_manager.initialize_extra_operators_links_plugins()

            if plugins_manager.operator_extra_links is None:
                raise AirflowException("Can not load plugins")

            for ope in plugins_manager.operator_extra_links:
                for operator in ope.operators:
                    if (operator.__name__ == encoded_op["_task_type"] and
                            operator.__module__ == encoded_op["_task_module"]):
                        op_extra_links_from_plugin.update({ope.name: ope})

            # If OperatorLinks are defined in Plugins but not in the Operator that is being Serialized
            # set the Operator links attribute
            # The case for "If OperatorLinks are defined in the operator that is being Serialized"
            # is handled in the deserialization loop where it matches k == "_operator_extra_links"
            if op_extra_links_from_plugin and "_operator_extra_links" not in encoded_op:
                setattr(op, "operator_extra_links",
                        list(op_extra_links_from_plugin.values()))

        for k, v in encoded_op.items():

            if k == "_downstream_task_ids":
                v = set(v)
            elif k == "subdag":
                v = SerializedDAG.deserialize_dag(v)
            elif k in {
                    "retry_delay", "execution_timeout", "sla",
                    "max_retry_delay"
            }:
                v = cls._deserialize_timedelta(v)
            elif k in encoded_op["template_fields"]:
                pass
            elif k.endswith("_date"):
                v = cls._deserialize_datetime(v)
            elif k == "_operator_extra_links":
                if cls._load_operator_extra_links:
                    op_predefined_extra_links = cls._deserialize_operator_extra_links(
                        v)

                    # If OperatorLinks with the same name exists, Links via Plugin have higher precedence
                    op_predefined_extra_links.update(
                        op_extra_links_from_plugin)
                else:
                    op_predefined_extra_links = {}

                v = list(op_predefined_extra_links.values())
                k = "operator_extra_links"

            elif k == "deps":
                v = cls._deserialize_deps(v)
            elif k == "params":
                v = cls._deserialize_params_dict(v)
            elif k in cls._decorated_fields or k not in op.get_serialized_fields(
            ):
                v = cls._deserialize(v)
            # else use v as it is

            setattr(op, k, v)

        for k in op.get_serialized_fields() - encoded_op.keys(
        ) - cls._CONSTRUCTOR_PARAMS.keys():
            setattr(op, k, None)

        # Set all the template_field to None that were not present in Serialized JSON
        for field in op.template_fields:
            if not hasattr(op, field):
                setattr(op, field, None)

        # Used to determine if an Operator is inherited from DummyOperator
        setattr(op, "_is_dummy", bool(encoded_op.get("_is_dummy", False)))

        return op
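For context, a minimal round-trip sketch of how this deserializer is typically exercised. It assumes an Airflow 2.x environment where the imports below resolve; the task itself is illustrative.

from airflow.operators.dummy import DummyOperator
from airflow.serialization.serialized_objects import SerializedBaseOperator

# Serialize a concrete operator into a JSON-safe dict, then rebuild a
# SerializedBaseOperator "shell" carrying the scheduler-relevant fields.
task = DummyOperator(task_id="example_task", retries=3)
encoded = SerializedBaseOperator.serialize_operator(task)
restored = SerializedBaseOperator.deserialize_operator(encoded)

assert restored.task_id == "example_task"
assert restored.retries == 3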
Exemple #23
0
    def execute(self, context):

        self.log.info('Preparing Singularity container %s', self.image)
        self.cli = Client

        if not self.command:
            raise AirflowException('You must define a command.')

        # Pull the container if asked, and ensure not a binary file
        if self.force_pull and not os.path.exists(self.image):
            self.log.info('Pulling container %s', self.image)
            image = self.cli.pull(self.image, stream=True, pull_folder=self.pull_folder)

            # A streamed pull returns a list: the image path plus the streamed output lines
            if isinstance(image, list):
                lines = image.pop()
                image = image[0]
                for line in lines:
                    self.log.info(line)

            # Update the image to be a filepath on the system
            self.image = image

        # Prepare list of binds
        for bind in self.volumes:
            self.options = self.options + ['--bind', bind]

        # Does the user want a custom working directory?
        if self.working_dir is not None:
            self.options = self.options + ['--workdir', self.working_dir]

        # Export environment before instance is run
        for enkey, envar in self.environment.items():
            self.log.debug('Exporting %s=%s', enkey, envar)
            os.putenv(enkey, envar)
            os.environ[enkey] = envar

        # Create a container instance
        self.log.debug('Options include: %s', self.options)
        self.instance = self.cli.instance(self.image,
                                          options=self.options,
                                          args=self.start_command,
                                          start=False)

        self.instance.start()
        self.log.info(self.instance.cmd)
        self.log.info('Created instance %s from %s', self.instance, self.image)

        self.log.info('Running command %s', self._get_command())
        self.cli.quiet = True
        result = self.cli.execute(self.instance,
                                  self._get_command(),
                                  return_result=True)

        # Stop the instance
        self.log.info('Stopping instance %s', self.instance)
        self.instance.stop()

        if self.auto_remove and os.path.exists(self.image):
            shutil.rmtree(self.image)

        # If the container failed, raise the exception
        if result['return_code'] != 0:
            message = result['message']
            raise AirflowException(f'Singularity failed: {message}')

        self.log.info('Output from command %s', result['message'])
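A hedged usage sketch for the operator whose execute() is shown above. The import path assumes the Airflow Singularity provider package, and the constructor arguments simply mirror the attributes execute() reads; both may differ slightly per version.

from datetime import datetime

from airflow import DAG
from airflow.providers.singularity.operators.singularity import SingularityOperator

with DAG(dag_id="singularity_example",
         start_date=datetime(2021, 1, 1),
         schedule_interval=None) as dag:
    run_in_container = SingularityOperator(
        task_id="run_hello",
        image="docker://alpine:3.13",        # pulled when force_pull=True and not on disk
        command="echo hello",
        volumes=["/tmp/data:/data"],         # each entry becomes a --bind option
        environment={"GREETING": "hello"},   # exported before the instance starts
        force_pull=True,
        auto_remove=True,
    )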
Exemple #24
0
    @classmethod
    def _deserialize_operator_extra_links(
            cls, encoded_op_links: list) -> Dict[str, BaseOperatorLink]:
        """
        Deserialize Operator Links if their classes are built in or registered in Airflow Plugins.
        If an OperatorLink class cannot be resolved, an error is logged and an empty dict is returned.

        :param encoded_op_links: Serialized Operator Link
        :return: De-Serialized Operator Link
        """
        from airflow import plugins_manager

        plugins_manager.initialize_extra_operators_links_plugins()

        if plugins_manager.registered_operator_link_classes is None:
            raise AirflowException("Can't load plugins")
        op_predefined_extra_links = {}

        for _operator_links_source in encoded_op_links:
            # Get the key, value pair as Tuple where key is OperatorLink ClassName
            # and value is the dictionary containing the arguments passed to the OperatorLink
            #
            # Example of a single iteration:
            #
            #   _operator_links_source =
            #   {
            #       'airflow.providers.google.cloud.operators.bigquery.BigQueryConsoleIndexableLink': {
            #           'index': 0
            #       }
            #   },
            #
            #   list(_operator_links_source.items()) =
            #   [
            #       (
            #           'airflow.providers.google.cloud.operators.bigquery.BigQueryConsoleIndexableLink',
            #           {'index': 0}
            #       )
            #   ]
            #
            #   list(_operator_links_source.items())[0] =
            #   (
            #       'airflow.providers.google.cloud.operators.bigquery.BigQueryConsoleIndexableLink',
            #       {
            #           'index': 0
            #       }
            #   )

            _operator_link_class_path, data = list(
                _operator_links_source.items())[0]
            if _operator_link_class_path in get_operator_extra_links():
                single_op_link_class = import_string(_operator_link_class_path)
            elif _operator_link_class_path in plugins_manager.registered_operator_link_classes:
                single_op_link_class = plugins_manager.registered_operator_link_classes[
                    _operator_link_class_path]
            else:
                log.error("Operator Link class %r not registered",
                          _operator_link_class_path)
                return {}

            op_predefined_extra_link: BaseOperatorLink = cattr.structure(
                data, single_op_link_class)

            op_predefined_extra_links.update(
                {op_predefined_extra_link.name: op_predefined_extra_link})

        return op_predefined_extra_links
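For reference, a hedged sketch of how an operator link class ends up in plugins_manager.registered_operator_link_classes in the first place: it is exposed through an AirflowPlugin. The class name and URL below are illustrative.

from airflow.models.baseoperator import BaseOperatorLink
from airflow.plugins_manager import AirflowPlugin


class DashboardLink(BaseOperatorLink):
    """Illustrative extra link pointing at an external dashboard."""

    name = "Dashboard"

    def get_link(self, operator, dttm):
        # Rendered by the webserver as an extra button on the task instance view.
        return f"https://dashboard.example.com/{operator.task_id}?ts={dttm.isoformat()}"


class DashboardLinkPlugin(AirflowPlugin):
    name = "dashboard_link_plugin"
    operator_extra_links = [DashboardLink()]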
Exemple #25
0
 def resp_check(_, execution_date):
     if execution_date == DEFAULT_DATE:
         return True
     raise AirflowException('AirflowException raised here!')
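A hedged sketch of how such a response_check callback is attached to an HttpSensor. The single-argument form below works across versions; the two-argument callback above additionally relies on the sensor forwarding task context (such as execution_date) to the callable. The import path assumes the HTTP provider package, and the sensor would normally be declared inside a DAG.

from airflow.providers.http.sensors.http import HttpSensor


def ready_check(response):
    # Keep poking until the endpoint reports readiness.
    return response.json().get("status") == "ready"


wait_for_api = HttpSensor(
    task_id="wait_for_api",
    http_conn_id="http_default",
    endpoint="health",
    response_check=ready_check,
    poke_interval=30,
    timeout=600,
)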
Exemple #26
0
    @staticmethod
    @provide_session
    def slots_stats(
        *,
        lock_rows: bool = False,
        session: Session = None,
    ) -> Dict[str, PoolStats]:
        """
        Get Pool stats (Number of Running, Queued, Open & Total tasks)

        If ``lock_rows`` is True, and the database engine in use supports the ``NOWAIT`` syntax, then a
        non-blocking lock will be attempted -- if the lock is not available then SQLAlchemy will throw an
        OperationalError.

        :param lock_rows: Whether to attempt a row-level lock on all the Pool rows returned
        :param session: SQLAlchemy ORM Session
        """
        from airflow.models.taskinstance import TaskInstance  # Avoid circular import

        pools: Dict[str, PoolStats] = {}

        query = session.query(Pool.pool, Pool.slots)

        if lock_rows:
            query = with_row_locks(query, **nowait(session))

        pool_rows: Iterable[Tuple[str, int]] = query.all()
        for (pool_name, total_slots) in pool_rows:
            pools[pool_name] = PoolStats(total=total_slots,
                                         running=0,
                                         queued=0,
                                         open=0)

        state_count_by_pool = (
            session.query(TaskInstance.pool, TaskInstance.state, func.count())
            .filter(TaskInstance.state.in_(list(EXECUTION_STATES)))
            .group_by(TaskInstance.pool, TaskInstance.state)
        ).all()

        # calculate queued and running metrics
        count: int
        for (pool_name, state, count) in state_count_by_pool:
            stats_dict: Optional[PoolStats] = pools.get(pool_name)
            if not stats_dict:
                continue
            # TypedDict key must be a string literal, so we use if-statements to set value
            if state == "running":
                stats_dict["running"] = count
            elif state == "queued":
                stats_dict["queued"] = count
            else:
                raise AirflowException(
                    f"Unexpected state. Expected values: {EXECUTION_STATES}.")

        # calculate open metric
        for pool_name, stats_dict in pools.items():
            if stats_dict["total"] == -1:
                # -1 means infinite
                stats_dict["open"] = -1
            else:
                stats_dict["open"] = stats_dict["total"] - stats_dict[
                    "running"] - stats_dict["queued"]

        return pools
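A hedged usage sketch for the method above, assuming it lives on airflow.models.pool.Pool and that a session helper such as create_session is available (as in Airflow 2.x):

from airflow.models.pool import Pool
from airflow.utils.session import create_session

with create_session() as session:
    stats = Pool.slots_stats(session=session)
    for pool_name, pool_stats in stats.items():
        # PoolStats carries the total/running/queued/open counters computed above.
        print(pool_name, pool_stats["total"], pool_stats["running"],
              pool_stats["queued"], pool_stats["open"])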
Exemple #27
0
 def get_hook(self):
     if self.conn_type == 'mysql':
         from airflow.hooks.mysql_hook import MySqlHook
         return MySqlHook(mysql_conn_id=self.conn_id)
     elif self.conn_type == 'google_cloud_platform':
         from airflow.gcp.hooks.bigquery import BigQueryHook
         return BigQueryHook(bigquery_conn_id=self.conn_id)
     elif self.conn_type == 'postgres':
         from airflow.hooks.postgres_hook import PostgresHook
         return PostgresHook(postgres_conn_id=self.conn_id)
     elif self.conn_type == 'pig_cli':
         from airflow.hooks.pig_hook import PigCliHook
         return PigCliHook(pig_conn_id=self.conn_id)
     elif self.conn_type == 'hive_cli':
         from airflow.hooks.hive_hooks import HiveCliHook
         return HiveCliHook(hive_cli_conn_id=self.conn_id)
     elif self.conn_type == 'presto':
         from airflow.hooks.presto_hook import PrestoHook
         return PrestoHook(presto_conn_id=self.conn_id)
     elif self.conn_type == 'hiveserver2':
         from airflow.hooks.hive_hooks import HiveServer2Hook
         return HiveServer2Hook(hiveserver2_conn_id=self.conn_id)
     elif self.conn_type == 'sqlite':
         from airflow.hooks.sqlite_hook import SqliteHook
         return SqliteHook(sqlite_conn_id=self.conn_id)
     elif self.conn_type == 'jdbc':
         from airflow.hooks.jdbc_hook import JdbcHook
         return JdbcHook(jdbc_conn_id=self.conn_id)
     elif self.conn_type == 'mssql':
         from airflow.hooks.mssql_hook import MsSqlHook
         return MsSqlHook(mssql_conn_id=self.conn_id)
     elif self.conn_type == 'oracle':
         from airflow.hooks.oracle_hook import OracleHook
         return OracleHook(oracle_conn_id=self.conn_id)
     elif self.conn_type == 'vertica':
         from airflow.contrib.hooks.vertica_hook import VerticaHook
         return VerticaHook(vertica_conn_id=self.conn_id)
     elif self.conn_type == 'cloudant':
         from airflow.contrib.hooks.cloudant_hook import CloudantHook
         return CloudantHook(cloudant_conn_id=self.conn_id)
     elif self.conn_type == 'jira':
         from airflow.contrib.hooks.jira_hook import JiraHook
         return JiraHook(jira_conn_id=self.conn_id)
     elif self.conn_type == 'redis':
         from airflow.contrib.hooks.redis_hook import RedisHook
         return RedisHook(redis_conn_id=self.conn_id)
     elif self.conn_type == 'wasb':
         from airflow.contrib.hooks.wasb_hook import WasbHook
         return WasbHook(wasb_conn_id=self.conn_id)
     elif self.conn_type == 'docker':
         from airflow.hooks.docker_hook import DockerHook
         return DockerHook(docker_conn_id=self.conn_id)
     elif self.conn_type == 'azure_data_lake':
         from airflow.contrib.hooks.azure_data_lake_hook import AzureDataLakeHook
         return AzureDataLakeHook(azure_data_lake_conn_id=self.conn_id)
     elif self.conn_type == 'azure_cosmos':
         from airflow.contrib.hooks.azure_cosmos_hook import AzureCosmosDBHook
         return AzureCosmosDBHook(azure_cosmos_conn_id=self.conn_id)
     elif self.conn_type == 'cassandra':
         from airflow.contrib.hooks.cassandra_hook import CassandraHook
         return CassandraHook(cassandra_conn_id=self.conn_id)
     elif self.conn_type == 'mongo':
         from airflow.contrib.hooks.mongo_hook import MongoHook
         return MongoHook(conn_id=self.conn_id)
     elif self.conn_type == 'gcpcloudsql':
         from airflow.gcp.hooks.cloud_sql import CloudSqlDatabaseHook
         return CloudSqlDatabaseHook(gcp_cloudsql_conn_id=self.conn_id)
     elif self.conn_type == 'grpc':
         from airflow.contrib.hooks.grpc_hook import GrpcHook
         return GrpcHook(grpc_conn_id=self.conn_id)
     raise AirflowException("Unknown hook type {}".format(self.conn_type))
Exemple #28
0
 def _get_required_param(name):
     """Extract required parameter from extra JSON, raise exception if not found"""
     value = conn.extra_dejson.get(name)
     if not value:
         raise AirflowException(f'Extra connection option is missing required parameter: `{name}`')
     return value
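A hedged sketch of the closure variable this helper depends on: conn is an Airflow Connection whose extra field carries JSON, exposed as a dict via extra_dejson. The connection values below are illustrative.

from airflow.models.connection import Connection

conn = Connection(conn_id="my_grpc", conn_type="grpc",
                  extra='{"auth_type": "NO_AUTH"}')

print(conn.extra_dejson.get("auth_type"))   # "NO_AUTH"
print(conn.extra_dejson.get("scopes"))      # None, so _get_required_param would raise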
Exemple #29
0
    def update_instance(
        self,
        update_mask: Union[Dict, FieldMask],
        instance: Union[Dict, Instance],
        project_id: str,
        location: Optional[str] = None,
        instance_id: Optional[str] = None,
        retry: Optional[Retry] = None,
        timeout: Optional[float] = None,
        metadata: Optional[Sequence[Tuple[str, str]]] = None,
    ):
        """
        Updates the metadata and configuration of a specific Redis instance.

        :param update_mask: Required. Mask of fields to update. At least one path must be supplied in this
            field. The elements of the repeated paths field may only include these fields from ``Instance``:

            -  ``displayName``
            -  ``labels``
            -  ``memorySizeGb``
            -  ``redisConfig``

            If a dict is provided, it must be of the same form as the protobuf message
            :class:`~google.cloud.redis_v1.types.FieldMask`
        :type update_mask: Union[Dict, google.cloud.redis_v1.types.FieldMask]
        :param instance: Required. Update description. Only fields specified in ``update_mask`` are updated.

            If a dict is provided, it must be of the same form as the protobuf message
            :class:`~google.cloud.redis_v1.types.Instance`
        :type instance: Union[Dict, google.cloud.redis_v1.types.Instance]
        :param location: The location of the Cloud Memorystore instance (for example europe-west1)
        :type location: str
        :param instance_id: The logical name of the Redis instance in the customer project.
        :type instance_id: str
        :param project_id: Project ID of the project that contains the instance. If set
            to None or missing, the default project_id from the Google Cloud connection is used.
        :type project_id: str
        :param retry: A retry object used to retry requests. If ``None`` is specified, requests will not be
            retried.
        :type retry: google.api_core.retry.Retry
        :param timeout: The amount of time, in seconds, to wait for the request to complete. Note that if
            ``retry`` is specified, the timeout applies to each individual attempt.
        :type timeout: float
        :param metadata: Additional metadata that is provided to the method.
        :type metadata: Sequence[Tuple[str, str]]
        """
        client = self.get_conn()

        if isinstance(instance, dict):
            instance = ParseDict(instance, Instance())
        elif not isinstance(instance, Instance):
            raise AirflowException("instance is not instance of Instance type or python dict")

        if location and instance_id:
            name = CloudRedisClient.instance_path(project_id, location, instance_id)
            instance.name = name

        self.log.info("Updating instances: %s", instance.name)
        result = client.update_instance(
            update_mask=update_mask, instance=instance, retry=retry, timeout=timeout, metadata=metadata
        )
        result.result()
        self.log.info("Instance updated: %s", instance.name)
Exemple #30
0
    def _sync_dag_view_permissions(self, dag_id, access_control):
        """Set the access policy on the given DAG's ViewModel.

        :param dag_id: the ID of the DAG whose permissions should be updated
        :type dag_id: str
        :param access_control: a dict where each key is a rolename and
            each value is a set() of permission names (e.g.
            {'can_dag_read'})
        :type access_control: dict
        """
        def _get_or_create_dag_permission(perm_name):
            dag_perm = self.find_permission_view_menu(perm_name, dag_id)
            if not dag_perm:
                self.log.info(
                    "Creating new permission '%s' on view '%s'",
                    perm_name, dag_id
                )
                dag_perm = self.add_permission_view_menu(perm_name, dag_id)

            return dag_perm

        def _revoke_stale_permissions(dag_view):
            existing_dag_perms = self.find_permissions_view_menu(dag_view)
            for perm in existing_dag_perms:
                non_admin_roles = [role for role in perm.role
                                   if role.name != 'Admin']
                for role in non_admin_roles:
                    target_perms_for_role = access_control.get(role.name, {})
                    if perm.permission.name not in target_perms_for_role:
                        self.log.info(
                            "Revoking '%s' on DAG '%s' for role '%s'",
                            perm.permission, dag_id, role.name
                        )
                        self.del_permission_role(role, perm)

        dag_view = self.find_view_menu(dag_id)
        if dag_view:
            _revoke_stale_permissions(dag_view)

        for rolename, perms in access_control.items():
            role = self.find_role(rolename)
            if not role:
                raise AirflowException(
                    "The access_control mapping for DAG '{}' includes a role "
                    "named '{}', but that role does not exist".format(
                        dag_id,
                        rolename))

            perms = set(perms)
            invalid_perms = perms - self.DAG_PERMS
            if invalid_perms:
                raise AirflowException(
                    "The access_control map for DAG '{}' includes the following "
                    "invalid permissions: {}; The set of valid permissions "
                    "is: {}".format(dag_id,
                                    (perms - self.DAG_PERMS),
                                    self.DAG_PERMS))

            for perm_name in perms:
                dag_perm = _get_or_create_dag_permission(perm_name)
                self.add_permission_role(role, dag_perm)
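A hedged sketch of the access_control mapping this method consumes, as it is typically declared on a DAG (role name mapped to a set of DAG-level permission names; the role and DAG names below are illustrative):

from datetime import datetime

from airflow import DAG

dag = DAG(
    dag_id="payments_pipeline",
    start_date=datetime(2021, 1, 1),
    access_control={
        "analyst": {"can_dag_read"},
        "data-eng": {"can_dag_read", "can_dag_edit"},
    },
)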