Example No. 1
 def _cleanup(self):
     """
     Prune failed dependencies older than DEPENDENCY_FAILURE_COOLDOWN seconds.
     Limit the disk usage of the dependencies (both the bundle files and the serialized state file size).
     Delete the oldest failed dependencies first, then the oldest finished dependencies.
     Don't touch downloading dependencies.
     """
     self._prune_failed_dependencies()
     # Hold all the locks while checking usage (fast if no cleanup is needed; otherwise make sure nothing is corrupted mid-cleanup)
     while True:
         with self._global_lock:
             self._acquire_all_locks()
             bytes_used = sum(dep_state.size_bytes
                              for dep_state in self._dependencies.values())
             serialized_length = len(
                 codalab.worker.pyjson.dumps(self._dependencies))
             if (bytes_used > self._max_cache_size_bytes
                     or serialized_length >
                     DependencyManager.MAX_SERIALIZED_LEN):
                 logger.debug(
                     '%d dependencies in cache, disk usage: %s (max %s), serialized size: %s (max %s)',
                     len(self._dependencies),
                     size_str(bytes_used),
                     size_str(self._max_cache_size_bytes),
                     size_str(serialized_length),
                     DependencyManager.MAX_SERIALIZED_LEN,
                 )
                 ready_deps = {
                     dep_key: dep_state
                     for dep_key, dep_state in self._dependencies.items()
                     if dep_state.stage == DependencyStage.READY
                     and not dep_state.dependents
                 }
                 failed_deps = {
                     dep_key: dep_state
                     for dep_key, dep_state in self._dependencies.items()
                     if dep_state.stage == DependencyStage.FAILED
                 }
                 if failed_deps:
                     dep_key_to_remove = min(
                         failed_deps.items(),
                         key=lambda dep: dep[1].last_used)[0]
                 elif ready_deps:
                     dep_key_to_remove = min(
                         ready_deps.items(),
                         key=lambda dep: dep[1].last_used)[0]
                 else:
                     logger.info(
                         'Dependency quota full but there are only downloading dependencies, not cleaning up until downloads are over'
                     )
                     self._release_all_locks()
                     break
                 if dep_key_to_remove:
                     self._delete_dependency(dep_key_to_remove)
                 self._release_all_locks()
             else:
                 self._release_all_locks()
                 break
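
The docstring above references _prune_failed_dependencies without showing its body. Below is a minimal sketch of what such a pruning pass could look like; the DEPENDENCY_FAILURE_COOLDOWN value and the exact locking discipline are assumptions based on the surrounding code, not the actual CodaLab implementation.

import time

DEPENDENCY_FAILURE_COOLDOWN = 10 * 60  # assumed cooldown, in seconds

def _prune_failed_dependencies(self):
    # Hypothetical sketch: drop FAILED dependencies whose last_used timestamp
    # is older than the cooldown, mirroring the locking used in _cleanup above.
    with self._global_lock:
        self._acquire_all_locks()
        now = time.time()
        for dep_key, dep_state in list(self._dependencies.items()):
            if (dep_state.stage == DependencyStage.FAILED
                    and now - dep_state.last_used > DEPENDENCY_FAILURE_COOLDOWN):
                self._delete_dependency(dep_key)
        self._release_all_locks()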
Example No. 2
        def check_resource_utilization(run_state: RunState):
            logger.info(
                f'Checking resource utilization for bundle. uuid: {run_state.bundle.uuid}'
            )
            cpu_usage, memory_usage = docker_utils.get_container_stats_with_docker_stats(
                run_state.container)
            run_state = run_state._replace(cpu_usage=cpu_usage,
                                           memory_usage=memory_usage)

            kill_messages = []

            run_stats = docker_utils.get_container_stats(run_state.container)

            run_state = run_state._replace(max_memory=max(
                run_state.max_memory, run_stats.get('memory', 0)))
            run_state = run_state._replace(
                disk_utilization=self.disk_utilization[
                    run_state.bundle.uuid]['disk_utilization'])

            container_time_total = docker_utils.get_container_running_time(
                run_state.container)
            run_state = run_state._replace(
                container_time_total=container_time_total,
                container_time_user=run_stats.get(
                    'container_time_user', run_state.container_time_user),
                container_time_system=run_stats.get(
                    'container_time_system', run_state.container_time_system),
            )

            if run_state.resources.time and container_time_total > run_state.resources.time:
                kill_messages.append(
                    'Time limit exceeded. (Container uptime %s > time limit %s)'
                    % (duration_str(container_time_total),
                       duration_str(run_state.resources.time)))

            if run_state.max_memory > run_state.resources.memory or run_state.exitcode == 137:
                kill_messages.append('Memory limit %s exceeded.' %
                                     size_str(run_state.resources.memory))

            if run_state.resources.disk and run_state.disk_utilization > run_state.resources.disk:
                kill_messages.append('Disk limit %sb exceeded.' %
                                     size_str(run_state.resources.disk))

            if kill_messages:
                run_state = run_state._replace(
                    kill_message=' '.join(kill_messages), is_killed=True)
            return run_state
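
A callback like check_resource_utilization is normally invoked on a timer while the container runs. The loop below is only an illustrative sketch: the RunStage.RUNNING check, the fixed polling interval, and acting on is_killed here are assumptions rather than the worker's actual scheduling logic.

import time

def monitor_run(run_state, poll_seconds=5):
    # Hypothetical polling loop, not the actual worker scheduler.
    while run_state.stage == RunStage.RUNNING:
        run_state = check_resource_utilization(run_state)
        if run_state.is_killed:
            # The real worker would transition the bundle toward CLEANING_UP here.
            break
        time.sleep(poll_seconds)
    return run_state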
Example No. 3
def apply_func(func, arg):
    """
    Apply post-processing function |func| to |arg|.
    |func| is a string representing a list of functions (which are to be
    applied to |arg| in succession).  Each function is either:
    - 'duration', 'date', 'size' for special formatting
    - '%...' for sprintf-style formatting
    - s/.../... for regular expression substitution
    - [a:b] for taking substrings
    """
    FUNC_DELIM = ' | '
    if isinstance(arg, tuple):
        # tuples are (bundle_uuid, genpath) which have not been fleshed out
        return arg + (func,)
    try:
        if func is None:
            return arg
        # String encoding of a function: size s/a/b
        for f in func.split(FUNC_DELIM):
            if f == 'date':
                arg = formatting.date_str(float(arg)) if arg is not None else None
            elif f == 'duration':
                arg = formatting.duration_str(float(arg)) if arg is not None else None
            elif f == 'size':
                arg = formatting.size_str(float(arg)) if arg is not None else None
            elif f.startswith('%'):
                arg = (f % float(arg)) if arg is not None else None
            elif f.startswith('s/'):  # regular expression: s/<old string>/<new string>
                esc_slash = '_ESC_SLASH_'  # Assume this doesn't occur in s
                # Preserve escaped characters: \/
                tokens = f.replace('\\/', esc_slash).split('/')
                if len(tokens) != 3:
                    return '<invalid regex: %s>' % f
                s = tokens[1].replace(esc_slash, '/')
                t = tokens[2].replace(esc_slash, '/')
                arg = re.sub(s, t, arg)
            elif f.startswith('['):  # substring
            m = re.match(r'\[(.*):(.*)\]', f)
                if m:
                    start = int(m.group(1) or 0)
                    end = int(m.group(2) or len(arg))
                    arg = arg[start:end]
                else:
                    return '<invalid function: %s>' % f
            elif f.startswith('add '):
                # 'add k v' checks if arg is a dictionary and updates it with arg[k] = v
                if isinstance(arg, dict):
                    k, v = f.split(' ')[1:]
                    arg[k] = v
                else:
                    return 'arg (%s) not a dictionary' % type(arg)
            elif f.startswith('key '):
                # 'key k' converts arg into a dictionary where arg[k] = arg
                arg = {f.split(' ')[1]: arg}
            else:
                return '<invalid function: %s>' % f
        return arg
    except:
        # Applying the function failed, so just return the arg.
        return arg
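
As a quick illustration of the pipe-delimited mini-language described in the docstring, the calls below exercise the sprintf, regex-substitution, and substring functions, which rely only on the standard library (the 'size', 'date', and 'duration' functions additionally need CodaLab's formatting module). The expected results in the comments assume the implementation above.

apply_func('%.2f', '3.14159')           # -> '3.14'
apply_func('s/foo/bar', 'foofix')       # -> 'barfix'
apply_func('[0:4]', 'abcdefgh')         # -> 'abcd'
apply_func('%.1f | s/\\./,', '2.718')   # chained: format to '2.7', then swap '.' for ',' -> '2,7'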
Example No. 4
 def get_target_info(self, target, depth=1):
     info = _call_with_retries(lambda: self.client.get_target_info(target, depth))
     contents = info.get('contents')
     # Render the sizes
     if contents:
         for item in contents:
             if 'size' in item:
                 item['size_str'] = formatting.size_str(item['size'])
     return info
Example No. 5
 def get_target_info(self, target, depth=1):
     info = _call_with_retries(
         lambda: self.client.get_target_info(target, depth))
     contents = info.get('contents')
     # Render the sizes
     if contents:
         for item in contents:
             if 'size' in item:
                 item['size_str'] = formatting.size_str(item['size'])
     return info
Example No. 6
        def check_resource_utilization(run_state):
            kill_messages = []

            run_stats = docker_utils.get_container_stats(run_state.container)

            container_time_total = docker_utils.get_container_running_time(run_state.container)
            run_state = run_state._replace(
                container_time_total=container_time_total,
                container_time_user=run_stats.get(
                    'container_time_user', run_state.container_time_user
                ),
                container_time_system=run_stats.get(
                    'container_time_system', run_state.container_time_system
                ),
            )

            run_state = run_state._replace(
                max_memory=max(run_state.max_memory, run_stats.get('memory', 0))
            )

            run_state = check_disk_utilization(run_state)

            if run_state.resources.time and container_time_total > run_state.resources.time:
                kill_messages.append(
                    'Time limit exceeded. (Container uptime %s > time limit %s)'
                    % (duration_str(container_time_total), duration_str(run_state.resources.time))
                )

            if run_state.max_memory > run_state.resources.memory or run_state.exitcode == 137:
                kill_messages.append(
                    'Memory limit %s exceeded.' % size_str(run_state.resources.memory)
                )

            if run_state.resources.disk and run_state.disk_utilization > run_state.resources.disk:
                kill_messages.append(
                    'Disk limit %sb exceeded.' % size_str(run_state.resources.disk)
                )

            if kill_messages:
                run_state = run_state._replace(kill_message=' '.join(kill_messages), is_killed=True)
            return run_state
Example No. 7
 def update_state_and_check_killed(bytes_downloaded):
     """
     Callback for the bundle service client: updates the dependency state and
     raises DownloadAbortedException if the download has been killed by the dependency manager.
     """
     with self._dependency_locks[dependency_state.dependency_key]:
         state = self._dependencies[dependency_state.dependency_key]
         if state.killed:
             raise DownloadAbortedException("Aborted by user")
         self._dependencies[dependency_state.dependency_key] = state._replace(
             size_bytes=bytes_downloaded,
             message="Downloading dependency: %s downloaded"
             % size_str(bytes_downloaded),
         )
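
This nested callback is handed to whatever routine streams the dependency's bytes from the bundle service. The wiring below is purely illustrative: download_dependency, its signature, and the chunked read are assumptions, not the actual CodaLab download helper.

def download_dependency(fileobj, target_path, progress_callback):
    # Hypothetical helper: copy the stream to disk, reporting progress as it goes.
    bytes_downloaded = 0
    with open(target_path, 'wb') as out:
        for chunk in iter(lambda: fileobj.read(1 << 20), b''):
            out.write(chunk)
            bytes_downloaded += len(chunk)
            # Raises DownloadAbortedException if the dependency was killed.
            progress_callback(bytes_downloaded)

# download_dependency(source_fileobj, dependency_path, update_state_and_check_killed)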
Example No. 8
def apply_func(func, arg):
    '''
    Apply post-processing function |func| to |arg|.
    |func| is a string representing a list of functions (which are to be
    applied to |arg| in succession).  Each function is either:
    - 'duration', 'date', 'size' for special formatting
    - '%...' for sprintf-style formatting
    - s/... for regular expression substitution
    - [a:b] for taking substrings
    '''
    FUNC_DELIM = ' | '
    if isinstance(arg, tuple):
        # tuples are (bundle_uuid, genpath) which have not been fleshed out
        return arg + (func,)
    try:
        if func is None:
            return arg
        # String encoding of a function: size s/a/b
        for f in func.split(FUNC_DELIM):
            if f == 'date':
                arg = formatting.date_str(arg)
            elif f == 'duration':
                arg = formatting.duration_str(float(arg)) if arg is not None else ''
            elif f == 'size':
                arg = formatting.size_str(arg)
            elif f.startswith('%'):
                arg = (f % float(arg)) if arg is not None else ''
            elif f.startswith('s/'):  # regular expression
                _, s, t = f.split("/")
                arg = re.sub(s, t, arg)
            elif f.startswith('['):  # substring
                m = re.match(r'\[(.*):(.*)\]', f)
                if m:
                    start = int(m.group(1) or 0)
                    end = int(m.group(2) or len(arg))
                    arg = arg[start:end]
                else:
                    return '<invalid function: %s>' % f
            else:
                return '<invalid function: %s>' % f
        return arg
    except:
        # Can't apply the function, so just return the arg.
        return arg
Example No. 9
 def get_top_level_contents(self, target):
     info = self.client.get_target_info(target, 1)
     if info is not None and info['type'] == 'directory':
         for item in info['contents']:
             item['size_str'] = formatting.size_str(item['size'])
     return info
Example No. 10
    def _cleanup(self):
        """
        Prune failed dependencies older than DEPENDENCY_FAILURE_COOLDOWN seconds.
        Limit the disk usage of the dependencies (both the bundle files and the serialized state file size).
        Delete the oldest failed dependencies first, then the oldest finished dependencies.
        Don't touch downloading dependencies.
        """
        self._prune_failed_dependencies()

        while True:
            with self._state_lock:
                try:
                    dependencies, paths = self._fetch_state()
                except (ValueError, EnvironmentError):
                    # Do nothing if an error is thrown while reading from the state file
                    logging.exception(
                        "Error reading from state file when cleaning up dependencies. Trying again..."
                    )
                    continue

                bytes_used = sum(dep_state.size_bytes for dep_state in dependencies.values())
                serialized_length = len(codalab.worker.pyjson.dumps(dependencies))
                if (
                    bytes_used > self._max_cache_size_bytes
                    or serialized_length > DependencyManager.MAX_SERIALIZED_LEN
                ):
                    logger.debug(
                        '%d dependencies, disk usage: %s (max %s), serialized size: %s (max %s)',
                        len(dependencies),
                        size_str(bytes_used),
                        size_str(self._max_cache_size_bytes),
                        size_str(serialized_length),
                        DependencyManager.MAX_SERIALIZED_LEN,
                    )
                    ready_deps = {
                        dep_key: dep_state
                        for dep_key, dep_state in dependencies.items()
                        if dep_state.stage == DependencyStage.READY and not dep_state.dependents
                    }
                    failed_deps = {
                        dep_key: dep_state
                        for dep_key, dep_state in dependencies.items()
                        if dep_state.stage == DependencyStage.FAILED
                    }

                    if failed_deps:
                        dep_key_to_remove = min(
                            failed_deps.items(), key=lambda dep: dep[1].last_used
                        )[0]
                    elif ready_deps:
                        dep_key_to_remove = min(
                            ready_deps.items(), key=lambda dep: dep[1].last_used
                        )[0]
                    else:
                        logger.info(
                            'Dependency quota full but there are only downloading dependencies, not cleaning up '
                            'until downloads are over.'
                        )
                        break
                    if dep_key_to_remove:
                        self._delete_dependency(dep_key_to_remove, dependencies, paths)
                        self._commit_state(dependencies, paths)
                else:
                    break
Example No. 11
 def get_top_level_contents(self, target):
     info = self.get_target_info(target, 1)
     if info is not None and info['type'] == 'directory':
         for item in info['contents']:
             item['size_str'] = formatting.size_str(item['size'])
     return info
Example No. 12
 def progress_callback(bytes_uploaded):
     run_status = 'Uploading results: %s done (archived size)' % size_str(
         bytes_uploaded)
     self.uploading[run_state.bundle.uuid]['run_status'] = run_status
     return True
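
A callback of this shape is normally handed to the routine that streams the archived results to the bundle service; its True return value conventionally tells the uploader to keep going. The wiring below is purely illustrative: upload_results and its parameters are hypothetical names, not the actual CodaLab upload API.

def upload_results(archive_fileobj, send_chunk, progress_callback):
    # Hypothetical uploader: stream the archive in chunks and report progress.
    bytes_uploaded = 0
    for chunk in iter(lambda: archive_fileobj.read(1 << 20), b''):
        send_chunk(chunk)
        bytes_uploaded += len(chunk)
        if not progress_callback(bytes_uploaded):
            break  # a False return would mean "abort the upload"

# upload_results(archive, send_chunk_to_bundle_service, progress_callback)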
Example No. 13
    def _transition_from_PREPARING(self, run_state):
        """
        1- Request the docker image from the docker image manager
            - if the image has failed, move to the CLEANING_UP state
        2- Request the dependencies from the dependency manager
            - if any have failed, move to the CLEANING_UP state
        3- If all dependencies and docker image are ready:
            - Set up the local filesystem for the run
            - Create symlinks to dependencies
            - Allocate resources and prepare the docker container
            - Start the docker container
        4- If all is successful, move to RUNNING state
        """
        def mount_dependency(dependency, shared_file_system):
            if not shared_file_system:
                # Set up symlinks for the content at dependency path
                Path(dependency.child_path).parent.mkdir(parents=True,
                                                         exist_ok=True)
                os.symlink(dependency.docker_path, dependency.child_path)
            # The following will be converted into a Docker volume binding like:
            #   dependency_path:docker_dependency_path:ro
            docker_dependencies.append(
                (dependency.parent_path, dependency.docker_path))

        if run_state.is_killed or run_state.is_restaged:
            log_bundle_transition(
                bundle_uuid=run_state.bundle.uuid,
                previous_stage=run_state.stage,
                next_stage=RunStage.CLEANING_UP,
                reason=
                f'the bundle was {"killed" if run_state.is_killed else "restaged"}',
            )
            return run_state._replace(stage=RunStage.CLEANING_UP)

        # Check CPU and GPU availability
        try:
            cpuset, gpuset = self.assign_cpu_and_gpu_sets_fn(
                run_state.resources.cpus, run_state.resources.gpus)
        except Exception as e:
            message = "Unexpectedly unable to assign enough resources to bundle {}: {}".format(
                run_state.bundle.uuid, str(e))
            logger.error(message)
            logger.error(traceback.format_exc())
            return run_state._replace(run_status=message)

        dependencies_ready = True
        status_messages = []

        if not self.shared_file_system:
            # No need to download dependencies if we're in the shared FS,
            # since they're already in our FS
            for dep in run_state.bundle.dependencies:
                dep_key = DependencyKey(dep.parent_uuid, dep.parent_path)
                dependency_state = self.dependency_manager.get(
                    run_state.bundle.uuid, dep_key)
                if dependency_state.stage == DependencyStage.DOWNLOADING:
                    status_messages.append(
                        'Downloading dependency %s: %s done (archived size)' %
                        (dep.child_path, size_str(
                            dependency_state.size_bytes)))
                    dependencies_ready = False
                elif dependency_state.stage == DependencyStage.FAILED:
                    # Failed to download dependency; -> CLEANING_UP
                    log_bundle_transition(
                        bundle_uuid=run_state.bundle.uuid,
                        previous_stage=run_state.stage,
                        next_stage=RunStage.CLEANING_UP,
                        reason=
                        f'Dependency has failed for this bundle. Dependency child uuid: {dep.child_uuid}. Dependency child path: {dep.child_path}',
                    )
                    return run_state._replace(
                        stage=RunStage.CLEANING_UP,
                        failure_message='Failed to download dependency %s: %s'
                        % (dep.child_path, dependency_state.message),
                    )

        # get the docker image
        docker_image = run_state.resources.docker_image
        image_state = self.image_manager.get(docker_image)
        if image_state.stage == DependencyStage.DOWNLOADING:
            status_messages.append('Pulling docker image %s %s' %
                                   (docker_image, image_state.message))
            dependencies_ready = False
        elif image_state.stage == DependencyStage.FAILED:
            # Failed to pull image; -> CLEANING_UP
            message = 'Failed to download Docker image: %s' % image_state.message
            logger.error(message)
            return run_state._replace(stage=RunStage.CLEANING_UP,
                                      failure_message=message)

        # stop proceeding if dependency and image downloads aren't all done
        if not dependencies_ready:
            status_message = status_messages.pop()
            if status_messages:
                status_message += "(and downloading %d other dependencies and docker images)" % len(
                    status_messages)
            logger.info(
                f'bundle is not ready yet. uuid: {run_state.bundle.uuid}. status message: {status_message}'
            )
            return run_state._replace(run_status=status_message)

        # All dependencies ready! Set up directories, symlinks and container. Start container.
        # 1) Set up a directory to store the bundle.
        if self.shared_file_system:
            if not os.path.exists(run_state.bundle_path):
                if run_state.bundle_dir_wait_num_tries == 0:
                    message = (
                        "Bundle directory cannot be found on the shared filesystem. "
                        "Please ensure the shared fileystem between the server and "
                        "your worker is mounted properly or contact your administrators."
                    )
                    log_bundle_transition(
                        bundle_uuid=run_state.bundle.uuid,
                        previous_stage=run_state.stage,
                        next_stage=RunStage.CLEANING_UP,
                        reason=
                        "Bundle directory cannot be found on the shared filesystem.",
                    )
                    return run_state._replace(stage=RunStage.CLEANING_UP,
                                              failure_message=message)
                next_bundle_dir_wait_num_tries = run_state.bundle_dir_wait_num_tries - 1
                logger.info(
                    f'Waiting for bundle directory to be created by the server, uuid: {run_state.bundle.uuid}, bundle_dir_wait_num_tries: {next_bundle_dir_wait_num_tries}'
                )
                return run_state._replace(
                    run_status=
                    "Waiting for bundle directory to be created by the server",
                    bundle_dir_wait_num_tries=next_bundle_dir_wait_num_tries,
                )
        else:
            remove_path(run_state.bundle_path)
            os.makedirs(run_state.bundle_path)

        # 2) Set up symlinks
        docker_dependencies = []
        docker_dependencies_path = (
            RunStateMachine._ROOT + run_state.bundle.uuid +
            ('_dependencies' if not self.shared_file_system else ''))

        for dep in run_state.bundle.dependencies:
            full_child_path = os.path.normpath(
                os.path.join(run_state.bundle_path, dep.child_path))
            to_mount = []
            dependency_path = self._get_dependency_path(run_state, dep)

            if dep.child_path == RunStateMachine._CURRENT_DIRECTORY:
                # Mount all the content of the dependency_path to the top-level of the bundle
                for child in os.listdir(dependency_path):
                    child_path = os.path.normpath(
                        os.path.join(run_state.bundle_path, child))
                    to_mount.append(
                        DependencyToMount(
                            docker_path=os.path.join(docker_dependencies_path,
                                                     child),
                            child_path=child_path,
                            parent_path=os.path.join(dependency_path, child),
                        ))
                    run_state = run_state._replace(
                        paths_to_remove=(run_state.paths_to_remove or []) +
                        [child_path])
            else:
                to_mount.append(
                    DependencyToMount(
                        docker_path=os.path.join(docker_dependencies_path,
                                                 dep.child_path),
                        child_path=full_child_path,
                        parent_path=dependency_path,
                    ))

                first_element_of_path = Path(dep.child_path).parts[0]
                if first_element_of_path == RunStateMachine._ROOT:
                    run_state = run_state._replace(
                        paths_to_remove=(run_state.paths_to_remove or []) +
                        [full_child_path])
                else:
                    # child_path can be a nested path, so later remove everything from the first element of the path
                    path_to_remove = os.path.join(run_state.bundle_path,
                                                  first_element_of_path)
                    run_state = run_state._replace(
                        paths_to_remove=(run_state.paths_to_remove or []) +
                        [path_to_remove])
            for dependency in to_mount:
                try:
                    mount_dependency(dependency, self.shared_file_system)
                except OSError as e:
                    log_bundle_transition(
                        bundle_uuid=run_state.bundle.uuid,
                        previous_stage=run_state.stage,
                        next_stage=RunStage.CLEANING_UP,
                        reason=str(e.__class__),
                        level=logging.ERROR,
                    )
                    return run_state._replace(stage=RunStage.CLEANING_UP,
                                              failure_message=str(e))

        if run_state.resources.network:
            docker_network = self.docker_network_external.name
        else:
            docker_network = self.docker_network_internal.name

        # 3) Start container
        try:
            container = docker_utils.start_bundle_container(
                run_state.bundle_path,
                run_state.bundle.uuid,
                docker_dependencies,
                run_state.bundle.command,
                run_state.resources.docker_image,
                network=docker_network,
                cpuset=cpuset,
                gpuset=gpuset,
                memory_bytes=run_state.resources.memory,
                runtime=self.docker_runtime,
            )
            self.worker_docker_network.connect(container)
        except docker_utils.DockerUserErrorException as e:
            message = 'Cannot start Docker container: {}'.format(e)
            log_bundle_transition(
                bundle_uuid=run_state.bundle.uuid,
                previous_stage=run_state.stage,
                next_stage=RunStage.CLEANING_UP,
                reason='Cannot start Docker container.',
                level=logging.ERROR,
            )
            return run_state._replace(stage=RunStage.CLEANING_UP,
                                      failure_message=message)
        except Exception as e:
            message = 'Cannot start container: {}'.format(e)
            logger.error(message)
            logger.error(traceback.format_exc())
            raise

        return run_state._replace(
            stage=RunStage.RUNNING,
            run_status='Running job in container',
            container_id=container.id,
            container=container,
            docker_image=image_state.digest,
            has_contents=True,
            cpuset=cpuset,
            gpuset=gpuset,
        )
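
Transition methods like _transition_from_PREPARING are typically dispatched from a per-stage table. The dispatcher below is only an illustrative sketch; the sibling handler names are assumptions, not the worker's actual state-machine plumbing.

def process(self, run_state):
    # Hypothetical dispatcher: look up the handler for the current stage and apply it.
    transitions = {
        RunStage.PREPARING: self._transition_from_PREPARING,
        RunStage.RUNNING: self._transition_from_RUNNING,          # assumed sibling handler
        RunStage.CLEANING_UP: self._transition_from_CLEANING_UP,  # assumed sibling handler
    }
    handler = transitions.get(run_state.stage)
    return handler(run_state) if handler else run_state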
Example No. 14
    def get(self, image_spec):
        """
        Always request the newest docker image from Docker Hub if it's not already being downloaded by another thread,
        and return the current download status (READY, FAILED, or DOWNLOADING).
        Depending on the state of the requested image:
        1. If it's not available on the platform, we download the image and return DOWNLOADING status.
        2. If another thread is actively downloading it, we return DOWNLOADING status.
        3. If another thread was downloading it but not active by the time the request was sent, we return the following status:
            * READY if the image was downloaded successfully.
            * FAILED if the image wasn't able to be downloaded due to any reason.
        :param image_spec: Repo image_spec of docker image being requested
        :returns: A DockerAvailabilityState object with the state of the docker image
        """
        def image_availability_state(image_spec, success_message,
                                     failure_message):
            """
            Try to get the image specified by image_spec from host machine.
            Return ImageAvailabilityState.
            """
            try:
                image = self._docker.images.get(image_spec)
                digests = image.attrs.get('RepoDigests', [image_spec])
                digest = digests[0] if len(digests) > 0 else None
                new_timestamp = str(time.time())
                image.tag(self.CACHE_TAG, tag=new_timestamp)
                for tag in image.tags:
                    tag_label, timestamp = tag.split(":")
                    # remove any other timestamp but not the current one
                    if tag_label == self.CACHE_TAG and timestamp != new_timestamp:
                        try:
                            self._docker.images.remove(tag)
                        except docker.errors.NotFound as err:
                            # It's possible that we get a 404 not found error here when removing the image,
                            # since another worker on the same system has already done so. We just
                            # ignore this 404, since any extraneous tags will be removed during the next iteration.
                            logger.warning(
                                "Attempted to remove image %s from cache, but image was not found: %s",
                                tag,
                                err,
                            )

                return ImageAvailabilityState(digest=digest,
                                              stage=DependencyStage.READY,
                                              message=success_message)
            except Exception as ex:
                if using_sentry():
                    capture_exception()
                return ImageAvailabilityState(digest=None,
                                              stage=DependencyStage.FAILED,
                                              message=failure_message % ex)

        if ':' not in image_spec:
            # Both digests and repo:tag kind of specs include the : character. The only case without it is when
            # a repo is specified without a tag (like 'latest')
            # When this is the case, different images API methods act differently:
            # - pull pulls all tags of the image
            # - get tries to get `latest` by default
            # That means if someone requests a docker image without a tag, and the image does not have a latest
            # tag pushed to Dockerhub, pull will succeed since it will pull all other tags, but later get calls
            # will fail since the `latest` tag won't be found on the system.
            # We don't want to assume what tag the user wanted so we want the pull step to fail if no tag is specified
            # and there's no latest tag on dockerhub.
            # Hence, we append the ':latest' tag to the image spec here, at the very beginning, if no tag was specified.
            image_spec += ':latest'
        try:
            if image_spec in self._downloading:
                with self._downloading[image_spec]['lock']:
                    if self._downloading[image_spec].is_alive():
                        return ImageAvailabilityState(
                            digest=None,
                            stage=DependencyStage.DOWNLOADING,
                            message=self._downloading[image_spec]['status'],
                        )
                    else:
                        if self._downloading[image_spec]['success']:
                            status = image_availability_state(
                                image_spec,
                                success_message='Image ready',
                                failure_message=
                                'Image {} was downloaded successfully, '
                                'but it cannot be found locally due to unhandled error %s'
                                .format(image_spec),
                            )
                        else:
                            status = image_availability_state(
                                image_spec,
                                success_message=
                                'Image {} can not be downloaded from DockerHub '
                                'but it is found locally'.format(image_spec),
                                failure_message=self._downloading[image_spec]
                                ['message'] + ": %s",
                            )
                        self._downloading.remove(image_spec)
                        return status
            else:

                def download():
                    logger.debug('Downloading Docker image %s', image_spec)
                    try:
                        self._docker.images.pull(image_spec)
                        logger.debug('Download for Docker image %s complete',
                                     image_spec)
                        self._downloading[image_spec]['success'] = True
                        self._downloading[image_spec][
                            'message'] = "Downloading image"
                    except (docker.errors.APIError,
                            docker.errors.ImageNotFound) as ex:
                        logger.debug('Download for Docker image %s failed: %s',
                                     image_spec, ex)
                        self._downloading[image_spec]['success'] = False
                        self._downloading[image_spec][
                            'message'] = "Can't download image: {}".format(ex)

                # Check docker image size before pulling from Docker Hub.
                # Do not download images larger than self._max_image_size
                # Download images if size cannot be obtained
                if self._max_image_size:
                    try:
                        image_size_bytes = docker_utils.get_image_size_without_pulling(
                            image_spec)
                        if image_size_bytes is None:
                            failure_msg = (
                                "Unable to find Docker image: {} from Docker HTTP Rest API V2. "
                                "Skipping Docker image size precheck.".format(
                                    image_spec))
                            logger.info(failure_msg)
                        elif image_size_bytes > self._max_image_size:
                            failure_msg = (
                                "The size of " + image_spec +
                                ": {} exceeds the maximum image size allowed {}."
                                .format(size_str(image_size_bytes),
                                        size_str(self._max_image_size)))
                            return ImageAvailabilityState(
                                digest=None,
                                stage=DependencyStage.FAILED,
                                message=failure_msg)
                    except Exception as ex:
                        failure_msg = "Cannot fetch image size before pulling Docker image: {} from Docker Hub: {}.".format(
                            image_spec, ex)
                        logger.error(failure_msg)
                        return ImageAvailabilityState(
                            digest=None,
                            stage=DependencyStage.FAILED,
                            message=failure_msg)

                self._downloading.add_if_new(
                    image_spec, threading.Thread(target=download, args=[]))
                return ImageAvailabilityState(
                    digest=None,
                    stage=DependencyStage.DOWNLOADING,
                    message=self._downloading[image_spec]['status'],
                )
        except Exception as ex:
            return ImageAvailabilityState(digest=None,
                                          stage=DependencyStage.FAILED,
                                          message=str(ex))
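
Given the state machine described in the get() docstring, callers can simply poll until the image leaves the DOWNLOADING stage. The helper below is a minimal illustrative sketch; the manager variable, image spec, and polling interval are assumptions.

import time

def wait_for_image(image_manager, image_spec, poll_seconds=2):
    # Hypothetical helper: poll until the image is READY or FAILED.
    while True:
        state = image_manager.get(image_spec)
        if state.stage in (DependencyStage.READY, DependencyStage.FAILED):
            return state
        time.sleep(poll_seconds)

# state = wait_for_image(docker_image_manager, 'codalab/default-cpu:latest')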
Example No. 15
 def _serialize(self, value, attr, obj):
     return formatting.size_str(value)
Example No. 16
    def _transition_from_PREPARING(self, run_state):
        """
        1- Request the docker image from the docker image manager
            - if the image has failed, move to the CLEANING_UP state
        2- Request the dependencies from the dependency manager
            - if any have failed, move to the CLEANING_UP state
        3- If all dependencies and docker image are ready:
            - Set up the local filesystem for the run
            - Create symlinks to dependencies
            - Allocate resources and prepare the docker container
            - Start the docker container
        4- If all is successful, move to RUNNING state
        """
        if run_state.is_killed:
            return run_state._replace(stage=RunStage.CLEANING_UP)

        dependencies_ready = True
        status_messages = []

        if not self.shared_file_system:
            # No need to download dependencies if we're in the shared FS since they're already in our FS
            for dep_key, dep in run_state.bundle.dependencies.items():
                dependency_state = self.dependency_manager.get(
                    run_state.bundle.uuid, dep_key)
                if dependency_state.stage == DependencyStage.DOWNLOADING:
                    status_messages.append(
                        'Downloading dependency %s: %s done (archived size)' %
                        (dep.child_path, size_str(
                            dependency_state.size_bytes)))
                    dependencies_ready = False
                elif dependency_state.stage == DependencyStage.FAILED:
                    # Failed to download dependency; -> CLEANING_UP
                    return run_state._replace(
                        stage=RunStage.CLEANING_UP,
                        failure_message='Failed to download dependency %s: %s'
                        % (dep.child_path, dependency_state.message),
                    )

        # get the docker image
        docker_image = run_state.resources.docker_image
        image_state = self.docker_image_manager.get(docker_image)
        if image_state.stage == DependencyStage.DOWNLOADING:
            status_messages.append('Pulling docker image: ' +
                                   (image_state.message or docker_image or ""))
            dependencies_ready = False
        elif image_state.stage == DependencyStage.FAILED:
            # Failed to pull image; -> CLEANING_UP
            message = 'Failed to download Docker image: %s' % image_state.message
            logger.error(message)
            return run_state._replace(stage=RunStage.CLEANING_UP,
                                      failure_message=message)

        # stop proceeding if dependency and image downloads aren't all done
        if not dependencies_ready:
            status_message = status_messages.pop()
            if status_messages:
                status_message += "(and downloading %d other dependencies and docker images)" % len(
                    status_messages)
            return run_state._replace(run_status=status_message)

        # All dependencies ready! Set up directories, symlinks and container. Start container.
        # 1) Set up a directory to store the bundle.
        if self.shared_file_system:
            if not os.path.exists(run_state.bundle_path):
                if run_state.bundle_dir_wait_num_tries == 0:
                    message = (
                        "Bundle directory cannot be found on the shared filesystem. "
                        "Please ensure the shared fileystem between the server and "
                        "your worker is mounted properly or contact your administrators."
                    )
                    logger.error(message)
                    return run_state._replace(stage=RunStage.CLEANING_UP,
                                              failure_message=message)
                return run_state._replace(
                    run_status=
                    "Waiting for bundle directory to be created by the server",
                    bundle_dir_wait_num_tries=run_state.bundle_dir_wait_num_tries - 1,
                )
        else:
            remove_path(run_state.bundle_path)
            os.mkdir(run_state.bundle_path)

        # 2) Set up symlinks
        docker_dependencies = []
        docker_dependencies_path = (
            '/' + run_state.bundle.uuid +
            ('_dependencies' if not self.shared_file_system else ''))
        for dep_key, dep in run_state.bundle.dependencies.items():
            full_child_path = os.path.normpath(
                os.path.join(run_state.bundle_path, dep.child_path))
            if not full_child_path.startswith(run_state.bundle_path):
                # Dependencies should end up inside their bundles (i.e., prevent relative paths like
                # ".." from escaping the parent bundle).
                message = 'Invalid key for dependency: %s' % (dep.child_path)
                logger.error(message)
                return run_state._replace(stage=RunStage.CLEANING_UP,
                                          failure_message=message)
            docker_dependency_path = os.path.join(docker_dependencies_path,
                                                  dep.child_path)
            if self.shared_file_system:
                # On a shared FS, we know where the dep is stored and can get the contents directly
                dependency_path = os.path.realpath(
                    os.path.join(dep.location, dep.parent_path))
            else:
                # On a dependency_manager setup ask the manager where the dependency is
                dependency_path = os.path.join(
                    self.dependency_manager.dependencies_dir,
                    self.dependency_manager.get(run_state.bundle.uuid,
                                                dep_key).path,
                )
                os.symlink(docker_dependency_path, full_child_path)
            # These are turned into docker volume bindings like:
            #   dependency_path:docker_dependency_path:ro
            docker_dependencies.append(
                (dependency_path, docker_dependency_path))

        # 3) Set up container
        if run_state.resources.network:
            docker_network = self.docker_network_external.name
        else:
            docker_network = self.docker_network_internal.name

        try:
            cpuset, gpuset = self.assign_cpu_and_gpu_sets_fn(
                run_state.resources.cpus, run_state.resources.gpus)
        except Exception as e:
            message = "Cannot assign enough resources: %s" % str(e)
            logger.error(message)
            logger.error(traceback.format_exc())
            return run_state._replace(run_status=message)

        # 4) Start container
        try:
            container = docker_utils.start_bundle_container(
                run_state.bundle_path,
                run_state.bundle.uuid,
                docker_dependencies,
                run_state.bundle.command,
                run_state.resources.docker_image,
                network=docker_network,
                cpuset=cpuset,
                gpuset=gpuset,
                memory_bytes=run_state.resources.memory,
                runtime=self.docker_runtime,
            )
            self.worker_docker_network.connect(container)
        except Exception as e:
            message = 'Cannot start Docker container: {}'.format(e)
            logger.error(message)
            logger.error(traceback.format_exc())
            raise

        return run_state._replace(
            stage=RunStage.RUNNING,
            run_status='Running job in Docker container',
            container_id=container.id,
            container=container,
            docker_image=image_state.digest,
            has_contents=True,
            cpuset=cpuset,
            gpuset=gpuset,
        )
Example No. 17
def apply_func(func, arg):
    """
    Apply post-processing function |func| to |arg|.
    |func| is a string representing a list of functions (which are to be
    applied to |arg| in succession).  Each function is either:
    - 'duration', 'date', 'size' for special formatting
    - '%...' for sprintf-style formatting
    - s/.../... for regular expression substitution
    - [a:b] for taking substrings
    """
    FUNC_DELIM = ' | '
    if isinstance(arg, tuple):
        # tuples are (bundle_uuid, genpath) which have not been fleshed out
        return arg + (func, )
    try:
        if func is None:
            return arg
        # String encoding of a function: size s/a/b
        for f in func.split(FUNC_DELIM):
            if f == 'str':
                arg = str(arg)
            elif f == 'date':
                arg = formatting.date_str(
                    float(arg)) if arg is not None else None
            elif f == 'duration':
                arg = formatting.duration_str(
                    float(arg)) if arg is not None else None
            elif f == 'size':
                arg = formatting.size_str(
                    float(arg)) if arg is not None else None
            elif f.startswith('%'):
                arg = (f % float(arg)) if arg is not None else None
            elif f.startswith(
                    's/'):  # regular expression: s/<old string>/<new string>
                esc_slash = '_ESC_SLASH_'  # Assume this doesn't occur in s
                # Preserve escaped characters: \/
                tokens = f.replace('\\/', esc_slash).split('/')
                if len(tokens) != 3:
                    return '<invalid regex: %s>' % f
                s = tokens[1].replace(esc_slash, '/')
                t = tokens[2].replace(esc_slash, '/')
                arg = re.sub(s, t, arg)
            elif f.startswith('['):  # substring
                m = re.match(r'\[(.*):(.*)\]', f)
                if m:
                    start = int(m.group(1) or 0)
                    end = int(m.group(2) or len(arg))
                    arg = arg[start:end]
                else:
                    return '<invalid function: %s>' % f
            elif f.startswith('add '):
                # 'add k v' checks if arg is a dictionary and updates it with arg[k] = v
                if isinstance(arg, dict):
                    k, v = f.split(' ')[1:]
                    arg[k] = v
                else:
                    return 'arg (%s) not a dictionary' % type(arg)
            elif f.startswith('key '):
                # 'key k' converts arg into a dictionary where arg[k] = arg
                arg = {f.split(' ')[1]: arg}
            else:
                return '<invalid function: %s>' % f
        return arg
    except:
        # Applying the function failed, so just return the arg.
        return arg
Example No. 18
    def get(self, image_spec: str) -> ImageAvailabilityState:
        """
        Always request the newest image from the cloud if it's not already being downloaded by another thread,
        and return the current download status (READY, FAILED, or DOWNLOADING).
        Depending on the state of the requested image:
        1. If it's not available on the platform, we download the image and return DOWNLOADING status.
        2. If another thread is actively downloading it, we return DOWNLOADING status.
        3. If another thread was downloading it but not active by the time the request was sent, we return the following status:
            * READY if the image was downloaded successfully.
            * FAILED if the image wasn't able to be downloaded due to any reason.
        Args:
            image_spec: the image that the requester needs.
                The caller will need to determine the type of image they need before calling this function.
                It is usually safe to prefix the image with the type of image.
                For example, the docker image go would be docker://go

        Returns:
            ImageAvailabilityState of the image requested.
        """
        try:
            if image_spec in self._downloading:
                with self._downloading[image_spec]['lock']:
                    if self._downloading[image_spec].is_alive():
                        return ImageAvailabilityState(
                            digest=None,
                            stage=DependencyStage.DOWNLOADING,
                            message=self._downloading[image_spec]['status'],
                        )
                    else:
                        if self._downloading[image_spec]['success']:
                            status = self._image_availability_state(
                                image_spec,
                                success_message='Image ready',
                                failure_message=
                                'Image {} was downloaded successfully, '
                                'but it cannot be found locally due to unhandled error %s'
                                .format(image_spec),
                            )
                        else:
                            status = self._image_availability_state(
                                image_spec,
                                success_message=
                                'Image {} can not be downloaded from the cloud '
                                'but it is found locally'.format(image_spec),
                                failure_message=self._downloading[image_spec]
                                ['message'] + ": %s",
                            )
                        self._downloading.remove(image_spec)
                        return status
            else:
                if self._max_image_size:
                    try:
                        try:
                            image_size_bytes = self._image_size_without_pulling(
                                image_spec)
                        except NotImplementedError:
                            failure_msg = (
                                "Could not query size of {} from container runtime hub. "
                                "Skipping size precheck.".format(image_spec))
                            logger.info(failure_msg)
                            image_size_bytes = 0
                        if image_size_bytes > self._max_image_size:
                            failure_msg = (
                                "The size of " + image_spec +
                                ": {} exceeds the maximum image size allowed {}."
                                .format(size_str(image_size_bytes),
                                        size_str(self._max_image_size)))
                            logger.error(failure_msg)
                            return ImageAvailabilityState(
                                digest=None,
                                stage=DependencyStage.FAILED,
                                message=failure_msg)
                    except Exception as ex:
                        failure_msg = "Cannot fetch image size before pulling Docker image: {} from Docker Hub: {}.".format(
                            image_spec, ex)
                        logger.error(failure_msg)
                        return ImageAvailabilityState(
                            digest=None,
                            stage=DependencyStage.FAILED,
                            message=failure_msg)

            self._downloading.add_if_new(
                image_spec,
                threading.Thread(target=self._download, args=[image_spec]))
            return ImageAvailabilityState(
                digest=None,
                stage=DependencyStage.DOWNLOADING,
                message=self._downloading[image_spec]['status'],
            )
        except Exception as ex:
            logger.error(ex)
            return ImageAvailabilityState(digest=None,
                                          stage=DependencyStage.FAILED,
                                          message=str(ex))