def manage_reference_downloads_cache(self):
     command.make_dirs(PURGE_SENTINEL_DIR)
     command.touch(PURGE_SENTINEL)
     time.sleep(
         3
     )  # 1 should be enough for the sentinel to predate all current stage downloads
     present_set, missing_set = self.references_roll_call()
     total_set = present_set | missing_set
     if total_set:
         present = len(present_set)
         missing = len(missing_set)
         total = len(total_set)
         log.write(
             f"Reference download cache: {present} of {total} large files ({100 * present / total:3.1f} percent) already exist locally."
         )
         structured_report = {
             "summary_stats": {
                 "cache_requests": total,
                 "cache_hits": present,
                 "cache_misses": missing,
                 "cache_hit_rate": present / total
             },
             "per_request_stats": {
                 f: "hit" if f in present_set else "miss"
                 for f in sorted(total_set)
             }
         }
         log.log_event("Reference downloads cache efficiency report",
                       values=structured_report)
     idseq_dag.util.s3.make_space(
     )  # make sure to touch this stage's files before deleting LRU ones
Exemple #2
0
def fetch_from_s3(
        src,  # pylint: disable=dangerous-default-value
        dst,
        auto_unzip=DEFAULT_AUTO_UNZIP,
        auto_untar=DEFAULT_AUTO_UNTAR,
        allow_s3mi=DEFAULT_ALLOW_S3MI,
        okay_if_missing=False,
        is_reference=False,
        touch_only=False,
        mutex=TraceLock("fetch_from_s3", multiprocessing.RLock()),
        locks={}):
    """Fetch a file from S3 if needed, using either s3mi or aws cp.

    IT IS NOT SAFE TO CALL THIS FUNCTION FROM MULTIPLE PROCESSES.
    It is totally fine to call it from multiple threads (it is designed for that).

    When is_reference=True, "dst" must be an existing directory.

    If src does not exist or there is a failure fetching it, the function returns None,
    without raising an exception.  If the download is successful, it returns the path
    to the downloaded file or folder.  If the download already exists, it is touched
    to update its timestamp.

    When touch_only=True, if the destination does not already exist, the function
    simply returns None (as if the download failed).  If the destination does exist,
    it is touched as usual.  This is useful in implementing an LRU cache policy.

    An exception is raised only if there is a coding error or equivalent problem,
    not if src simply doesn't exist.
    """
    # FIXME: this is a compatibility hack so we can replace this function
    #   We are removing ad-hoc s3 downloads from within steps and converting
    #   additional_files to wdl inputs. These files will be transparently
    #   downloaded by miniwdl. miniwdl will also handle the caching that
    #   is currently done here. This hack bypasses the s3 download if the
    #   source is already a local file, and returns the source (which is
    #   a local file path). This way, when we change the additional_files
    #   to inputs we can provide the local file path to the step instead
    #   of the s3 path and seamlessly transition without a coordinated
    #   change between idseq-dag and the idseq monorepo.
    if not src.startswith("s3://"):
        log.write(
            f"fetch_from_s3 is skipping download because source: {src} does not start with s3://"
        )
        if not os.path.isfile(src):
            return None
        if auto_untar and src.endswith(".tar"):
            dst = src[:-4]
            if not os.path.isdir(dst):
                command.make_dirs(dst + ".untarring")
                script = 'tar xvf "${src}" -C "${tmp_destdir}"'
                named_args = {"src": src, "tmp_destdir": dst + ".untarring"}
                command.execute(
                    command_patterns.ShellScriptCommand(script=script,
                                                        named_args=named_args))
                command.rename(dst + ".untarring/" + os.path.basename(dst),
                               dst)
            return dst
        return src

    # Do not be mislead by the multiprocessing.RLock() above -- that just means it won't deadlock
    # if called from multiple processes but does not mean the behaivior will be correct.  It will
    # be incorrect, because the locks dict (cointaining per-file locks) cannot be shared across
    # processes, the way it can be shared across threads.

    if is_reference:
        assert config[
            "REF_DIR"], "The is_reference code path becomes available only after initializing gloabal config['REF_DIR']"

    if os.path.exists(dst) and os.path.isdir(dst):
        dirname, basename = os.path.split(src)
        if is_reference or os.path.abspath(dst).startswith(config["REF_DIR"]):
            # Downloads to the reference dir are persisted from job to job, so we must include
            # version information from the full s3 path.
            #
            # The final destination for s3://path/to/source.db will look like /mnt/ref/s3__path__to/source.db
            # The final destination for s3://path/to/myarchive.tar will look like /mnt/ref/s3__path__to/myarchive/...
            #
            # We considered some other alternatives, for example /mnt/ref/s3__path__to__source.db, but unfortunately,
            # some tools incorporate the base name of their database input into the output filenames, so any approach
            # that changes the basename causes problems downstream.  An example such tool is srst2.
            is_reference = True
            if dirname.startswith("s3://"):
                dirname = dirname.replace("s3://", "s3__", 1)
            # If dirname contains slashes, it has to be flattened to single level.
            dirname = dirname.replace("/", "__")
            dst = os.path.join(dst, dirname, basename)
        else:
            dst = os.path.join(dst, basename)
    else:
        assert not is_reference, f"When fetching references, dst must be an existing directory: {dst}"

    unzip = ""
    if auto_unzip:
        file_without_ext, ext = os.path.splitext(dst)
        if ext in ZIP_EXTENSIONS:
            unzip = " | " + ZIP_EXTENSIONS[
                ext]  # this command will be used to decompress stdin to stdout
            dst = file_without_ext  # remove file extension from dst
    untar = auto_untar and dst.lower().endswith(".tar")
    if untar:
        dst = dst[:-4]  # Remove .tar

    # Downloads are staged under tmp_destdir.  Only after a download completes successfully it is moved to dst.
    destdir = os.path.dirname(dst)
    tmp_destdir = os.path.join(destdir, "tmp_downloads")
    tmp_dst = os.path.join(tmp_destdir, os.path.basename(dst))

    abspath = os.path.abspath(dst)
    with mutex:
        if abspath not in locks:
            locks[abspath] = TraceLock(f"fetch_from_s3: {abspath}",
                                       multiprocessing.RLock())
        destination_lock = locks[abspath]

    # shouldn't happen and makes it impossible to ensure that any dst that exists is complete and correct.
    assert tmp_dst != dst, f"Problematic use of fetch_from_s3 with tmp_dst==dst=='{dst}'"

    with destination_lock:
        # This check is a bit imperfect when untarring... unless you follow the discipline that
        # all contents of file foo.tar are under directory foo/... (which we do follow in IDseq)
        if os.path.exists(dst):
            command.touch(dst)
            return dst

        if touch_only:
            return None

        for (kind, ddir) in [("destinaiton", destdir),
                             ("temporary download", tmp_destdir)]:
            try:
                if ddir:
                    command.make_dirs(ddir)
            except OSError as e:
                # It's okay if the parent directory already exists, but all other
                # errors fail the download.
                if e.errno != errno.EEXIST:
                    log.write(f"Error in creating {kind} directory.")
                    return None

        with IOSTREAM:
            try:
                if allow_s3mi:
                    wait_start = time.time()
                    allow_s3mi = S3MI_SEM.acquire(timeout=MAX_S3MI_WAIT)
                    wait_duration = time.time() - wait_start
                    if not allow_s3mi:
                        log.write(
                            f"Failed to acquire S3MI semaphore after waiting {wait_duration} seconds for {src}."
                        )
                    elif wait_duration >= 5:
                        log.write(
                            f"Waited {wait_duration} seconds to acquire S3MI semaphore for {src}."
                        )

                if untar:
                    write_dst = r''' | tar xvf - -C "${tmp_destdir}";'''
                    named_args = {'tmp_destdir': tmp_destdir}
                else:
                    write_dst = r''' > "${tmp_dst}";'''
                    named_args = {'tmp_dst': tmp_dst}
                command_params = f"{unzip} {write_dst}"

                named_args.update({'src': src})

                try_cli = not allow_s3mi
                if allow_s3mi:
                    if os.path.exists(tmp_dst):
                        command.remove_rf(tmp_dst)
                    try:
                        command.execute(
                            command_patterns.ShellScriptCommand(
                                script=
                                r'set -o pipefail; s3mi cat --quiet "${src}" '
                                + command_params,
                                named_args=named_args))
                    except subprocess.CalledProcessError:
                        try_cli = not okay_if_missing
                        allow_s3mi = False
                        S3MI_SEM.release()
                        if try_cli:
                            log.write(
                                "Failed to download with s3mi. Trying with aws s3 cp..."
                            )
                        else:
                            raise
                if try_cli:
                    if os.path.exists(tmp_dst):
                        command.remove_rf(tmp_dst)
                    if okay_if_missing:
                        script = r'set -o pipefail; aws s3 cp --quiet "${src}" - ' + command_params
                    else:
                        script = r'set -o pipefail; aws s3 cp --only-show-errors "${src}" - ' + command_params
                    command.execute(
                        command_patterns.ShellScriptCommand(
                            script=script,
                            named_args=named_args,
                            env=dict(os.environ, **refreshed_credentials())))
                # Move staged download into final location.  Leave this last, so it only happens if no exception has occurred.
                # By this point we have already asserted that tmp_dst != dst.
                command.rename(tmp_dst, dst)
                return dst
            except BaseException as e:  # Deliberately super broad to make doubly certain that dst will be removed if there has been any exception
                if os.path.exists(dst):
                    command.remove_rf(dst)
                if not isinstance(e, subprocess.CalledProcessError):
                    # Coding error of some sort.  Best not hide it.
                    raise
                if okay_if_missing:
                    # We presume.
                    log.write("File most likely does not exist in S3.")
                else:
                    log.write("Failed to fetch file from S3.")
                return None
            finally:
                if allow_s3mi:
                    S3MI_SEM.release()
                if os.path.exists(
                        tmp_dst
                ):  # by this point we have asserted that tmp_dst != dst (and that assert may have failed, but so be it)
                    command.remove_rf(tmp_dst)