Code Example #1
    def __call__(archive, annex=None,
                 add_archive_leading_dir=False,
                 strip_leading_dirs=False, leading_dirs_depth=None, leading_dirs_consider=None,
                 use_current_dir=False,
                 delete=False, key=False, exclude=None, rename=None, existing='fail',
                 annex_options=None, copy=False, commit=True, allow_dirty=False,
                 stats=None, drop_after=False, delete_after=False):
        """
        Returns
        -------
        annex
        """
        if exclude:
            exclude = assure_tuple_or_list(exclude)
        if rename:
            rename = assure_tuple_or_list(rename)

        # TODO: we may eventually want to ask the user whether they want to
        # convert their git repo into an annex
        archive_path = archive
        pwd = getpwd()
        if annex is None:
            annex = get_repo_instance(pwd, class_=AnnexRepo)
            if not isabs(archive):
                # if not absolute -- interpret as relative to the working directory
                archive_path = normpath(opj(realpath(pwd), archive))
                # abspath(archive) is not "good" since dereferences links in the path
                # archive_path = abspath(archive)
        elif not isabs(archive):
            # if we are given an annex, then assume that given path is within annex, not
            # relative to PWD
            archive_path = opj(annex.path, archive)
        annex_path = annex.path

        # _rpath below should depict paths relative to the top of the annex
        archive_rpath = relpath(
            archive_path,
            # Use `get_dataset_root` to avoid resolving the leading path. If no
            # repo is found, downstream code will raise FileNotInRepositoryError.
            get_dataset_root(archive_path) or ".")

        if archive in annex.untracked_files:
            raise RuntimeError(
                "The archive is not under annex yet. You should run 'datalad "
                "add {}' first".format(archive))

        if not allow_dirty and annex.dirty:
            # already saved me once ;)
            raise RuntimeError("You better commit all the changes and untracked files first")

        if not key:
            # we were given a file which must exist
            if not exists(archive_path):
                raise ValueError("Archive {} does not exist".format(archive))
            # TODO: support adding archives content from outside the annex/repo
            origin = 'archive'
            key = annex.get_file_key(archive_rpath)
            archive_dir = dirname(archive_path)
        else:
            origin = 'key'
            key = archive
            archive_dir = None  # We must not have anything to do with the location under .git/annex

        archive_basename = file_basename(archive)

        if not key:
            # TODO: allow for it to be under git???  how to reference then?
            raise NotImplementedError(
                "Provided file %s is not under annex.  We don't support yet adding everything "
                "straight to git" % archive
            )

        # are we in a subdirectory of the repository?
        pwd_under_annex = commonprefix([pwd, annex_path]) == annex_path
        # if so, we should add content under that subdirectory,
        # i.e. use the path relative to the repo top
        if use_current_dir:
            # if outside -- extract to the top of repo
            extract_rpath = relpath(pwd, annex_path) \
                if pwd_under_annex \
                else None
        else:
            extract_rpath = relpath(archive_dir, annex_path)

        # relpath might return '.' as the relative path to curdir, which then normalize_paths
        # would take as instructions to really go from cwd, so we need to sanitize
        if extract_rpath == curdir:
            extract_rpath = None  # no special relpath from top of the repo

        # and from now on operate on the key or wherever the content is available "canonically"
        try:
            key_rpath = annex.get_contentlocation(key)  # , relative_to_top=True)
        except Exception:
            raise RuntimeError("Content of %s seems to be N/A.  Fetch it first" % key)

        # now we simply need to go through every file in that archive and add it
        lgr.info("Adding content of the archive %s into annex %s", archive, annex)

        from datalad.customremotes.archives import ArchiveAnnexCustomRemote
        # TODO: shouldn't we be able just to pass existing AnnexRepo instance?
        # TODO: we will use persistent cache so we could just (ab)use possibly extracted archive
        annexarchive = ArchiveAnnexCustomRemote(path=annex_path, persistent_cache=True)
        # We will move extracted content, so it must not already exist
        annexarchive.cache.allow_existing = True
        earchive = annexarchive.cache[key_rpath]

        # TODO: check if maybe it was already added
        if ARCHIVES_SPECIAL_REMOTE not in annex.get_remotes():
            init_datalad_remote(annex, ARCHIVES_SPECIAL_REMOTE, autoenable=True)
        else:
            lgr.debug("Special remote {} already exists".format(ARCHIVES_SPECIAL_REMOTE))

        precommitted = False
        delete_after_rpath = None
        try:
            old_always_commit = annex.always_commit
            # When faking dates, batch mode is disabled, so we want to always
            # commit.
            annex.always_commit = annex.fake_dates_enabled

            if annex_options:
                if isinstance(annex_options, str):
                    annex_options = split_cmdline(annex_options)

            leading_dir = earchive.get_leading_directory(
                depth=leading_dirs_depth, exclude=exclude, consider=leading_dirs_consider) \
                if strip_leading_dirs else None
            leading_dir_len = len(leading_dir) + len(opsep) if leading_dir else 0

            # we need to create a temporary directory at the top level which would later be
            # removed
            prefix_dir = basename(tempfile.mkdtemp(prefix=".datalad", dir=annex_path)) \
                if delete_after \
                else None

            # dedicated stats which would be added to passed in (if any)
            outside_stats = stats
            stats = ActivityStats()

            for extracted_file in earchive.get_extracted_files():
                stats.files += 1
                extracted_path = opj(earchive.path, extracted_file)

                if islink(extracted_path):
                    link_path = realpath(extracted_path)
                    if not exists(link_path):  # TODO: config  addarchive.symlink-broken='skip'
                        lgr.warning("Path %s points to non-existing file %s" % (extracted_path, link_path))
                        stats.skipped += 1
                        continue
                        # TODO: check if points outside of the archive -- warning and skip

                # preliminary target name which might get modified by renames
                target_file_orig = target_file = extracted_file

                # strip leading dirs
                target_file = target_file[leading_dir_len:]

                if add_archive_leading_dir:
                    target_file = opj(archive_basename, target_file)

                if rename:
                    target_file = apply_replacement_rules(rename, target_file)

                # continue to next iteration if extracted_file in excluded
                if exclude:
                    try:  # we need to continue the outer loop from inside the inner loop
                        for regexp in exclude:
                            if re.search(regexp, extracted_file):
                                lgr.debug(
                                    "Skipping {extracted_file} since contains {regexp} pattern".format(**locals()))
                                stats.skipped += 1
                                raise StopIteration
                    except StopIteration:
                        continue

                if prefix_dir:
                    target_file = opj(prefix_dir, target_file)
                    # but also allow for it in the orig
                    target_file_orig = opj(prefix_dir, target_file_orig)

                target_file_path_orig = opj(annex.path, target_file_orig)

                url = annexarchive.get_file_url(archive_key=key, file=extracted_file, size=os.stat(extracted_path).st_size)

                # lgr.debug("mv {extracted_path} {target_file}. URL: {url}".format(**locals()))

                target_file_path = opj(extract_rpath, target_file) \
                    if extract_rpath else target_file

                target_file_path = opj(annex.path, target_file_path)

                if lexists(target_file_path):
                    handle_existing = True
                    if md5sum(target_file_path) == md5sum(extracted_path):
                        if not annex.is_under_annex(extracted_path):
                            # if under annex -- it must have the same content,
                            # so we should just possibly add a new extra URL;
                            # but if under git -- we cannot/should not do
                            # anything about it ATM
                            if existing != 'overwrite':
                                continue
                        else:
                            handle_existing = False
                    if not handle_existing:
                        pass  # nothing... just to avoid additional indentation
                    elif existing == 'fail':
                        raise RuntimeError(
                            "File {} already exists, but new (?) file {} was instructed "
                            "to be placed there while overwrite=False".format(
                                target_file_path, extracted_file)
                        )
                    elif existing == 'overwrite':
                        stats.overwritten += 1
                        # to make sure it doesn't conflict -- might have been a tree
                        rmtree(target_file_path)
                    else:
                        target_file_path_orig_ = target_file_path

                        # To keep extension intact -- operate on the base of the filename
                        p, fn = os.path.split(target_file_path)
                        ends_with_dot = fn.endswith('.')
                        fn_base, fn_ext = file_basename(fn, return_ext=True)

                        if existing == 'archive-suffix':
                            fn_base += '-%s' % archive_basename
                        elif existing == 'numeric-suffix':
                            pass  # archive-suffix will have the same logic
                        else:
                            raise ValueError(existing)
                        # keep incrementing index in the suffix until file doesn't collide
                        suf, i = '', 0
                        while True:
                            target_file_path_new = opj(p, fn_base + suf + ('.' if (fn_ext or ends_with_dot) else '') + fn_ext)
                            if not lexists(target_file_path_new):
                                break
                            lgr.debug("File %s already exists" % target_file_path_new)
                            i += 1
                            suf = '.%d' % i
                        target_file_path = target_file_path_new
                        lgr.debug("Original file %s will be saved into %s"
                                  % (target_file_path_orig_, target_file_path))
                        # TODO: should we reserve smth like
                        # stats.clobbed += 1

                if target_file_path != target_file_path_orig:
                    stats.renamed += 1

                #target_path = opj(getpwd(), target_file)
                if copy:
                    raise NotImplementedError("Not yet copying from 'persistent' cache")
                else:
                    # os.renames(extracted_path, target_path)
                    # addurl implementation relying on annex'es addurl below would actually copy
                    pass

                lgr.debug("Adding %s to annex pointing to %s and with options %r",
                          target_file_path, url, annex_options)

                out_json = annex.add_url_to_file(
                    target_file_path,
                    url, options=annex_options,
                    batch=True)

                if 'key' in out_json and out_json['key'] is not None:  # annex.is_under_annex(target_file, batch=True):
                    # due to http://git-annex.branchable.com/bugs/annex_drop_is_not___34__in_effect__34___for_load_which_was___34__addurl_--batch__34__ed_but_not_yet_committed/?updated
                    # we need to maintain a list of those to be dropped files
                    if drop_after:
                        annex.drop_key(out_json['key'], batch=True)
                        stats.dropped += 1
                    stats.add_annex += 1
                else:
                    lgr.debug("File {} was added to git, not adding url".format(target_file_path))
                    stats.add_git += 1

                if delete_after:
                    # delayed removal so it doesn't interfere with batched processes, since any
                    # pure git action invokes precommit which closes batched processes. But we do want to count
                    stats.removed += 1

                # # chaining 3 annex commands, 2 of which not batched -- less efficient but more bullet proof etc
                # annex.add(target_path, options=annex_options)
                # # above action might add to git or to annex
                # if annex.file_has_content(target_path):
                #     # if not --  it was added to git, if in annex, it is present and output is True
                #     annex.add_url_to_file(target_file, url, options=['--relaxed'], batch=True)
                #     stats.add_annex += 1
                # else:
                #     lgr.debug("File {} was added to git, not adding url".format(target_file))
                #     stats.add_git += 1
                # # TODO: actually check if it is anyhow different from a previous version. If not
                # # then it wasn't really added

                del target_file  # Done with target_file -- just to have clear end of the loop

            if delete and archive and origin != 'key':
                lgr.debug("Removing the original archive {}".format(archive))
                # force=True since it might sometimes still be staged and fail
                annex.remove(archive_rpath, force=True)

            lgr.info("Finished adding %s: %s" % (archive, stats.as_str(mode='line')))

            if outside_stats:
                outside_stats += stats
            if delete_after:
                # force since not committed. r=True for -r (passed into git call
                # to recurse)
                delete_after_rpath = opj(extract_rpath, prefix_dir) if extract_rpath else prefix_dir
                lgr.debug(
                    "Removing extracted and annexed files under %s",
                    delete_after_rpath
                )
                annex.remove(delete_after_rpath, r=True, force=True)
            if commit:
                commit_stats = outside_stats if outside_stats else stats
                annex.precommit()  # so batched ones close and files become annex symlinks etc
                precommitted = True
                if any(r.get('state', None) != 'clean'
                       for p, r in annex.status(untracked='no').items()):
                    annex.commit(
                        "Added content extracted from %s %s\n\n%s" %
                        (origin, archive_rpath, commit_stats.as_str(mode='full')),
                        _datalad_msg=True
                    )
                    commit_stats.reset()
        finally:
            # since we batched addurl, we should close those batched processes
            # if we haven't done so yet.  explicitly checked to avoid any possible
            # "double-action"
            if not precommitted:
                annex.precommit()

            if delete_after_rpath:
                delete_after_path = opj(annex_path, delete_after_rpath)
                if exists(delete_after_path):  # should not be there
                    # but for paranoid yoh
                    lgr.warning(
                        "Removing temporary directory under which extracted "
                        "files were annexed and should have been removed: %s",
                        delete_after_path)
                    rmtree(delete_after_path)

            annex.always_commit = old_always_commit
            # remove what is left and/or everything upon failure
            earchive.clean(force=True)

        return annex
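
This first variant returns the annex instance directly. A minimal usage sketch, assuming the command is exposed through datalad.api as add_archive_content (the archive name and option values below are illustrative, not taken from the code):

    # Hypothetical invocation; 'data.tar.gz' must already be saved to the annex.
    from datalad.api import add_archive_content

    annex = add_archive_content(
        'data.tar.gz',
        strip_leading_dirs=True,    # drop the top-level directory inside the archive
        existing='archive-suffix',  # rename collisions using the archive's basename
        delete=False,               # keep the original archive in the dataset
    )
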
Code Example #2
    def __call__(
            archive,
            *,
            dataset=None,
            annex=None,
            add_archive_leading_dir=False,
            strip_leading_dirs=False,
            leading_dirs_depth=None,
            leading_dirs_consider=None,
            use_current_dir=False,
            delete=False,
            key=False,
            exclude=None,
            rename=None,
            existing='fail',
            annex_options=None,
            copy=False,
            commit=True,
            allow_dirty=False,
            stats=None,
            drop_after=False,
            delete_after=False):

        if exclude:
            exclude = ensure_tuple_or_list(exclude)
        if rename:
            rename = ensure_tuple_or_list(rename)
        ds = require_dataset(dataset,
                             check_installed=True,
                             purpose='add-archive-content')

        # set up common params for result records
        res_kwargs = {
            'action': 'add-archive-content',
            'logger': lgr,
        }

        if not isinstance(ds.repo, AnnexRepo):
            yield get_status_dict(
                ds=ds,
                status='impossible',
                message="Can't operate in a pure Git repository",
                **res_kwargs
            )
            return
        if annex:
            warnings.warn(
                "datalad add_archive_content's `annex` parameter is "
                "deprecated and will be removed in a future release. "
                "Use the 'dataset' parameter instead.",
                DeprecationWarning)
        annex = ds.repo
        # get the archive path relative from the ds root
        archive_path = resolve_path(archive, ds=dataset)
        # let Status decide whether we can act on the given file
        for s in ds.status(
                path=archive_path,
                on_failure='ignore',
                result_renderer='disabled'):
            if s['status'] == 'error':
                if 'path not underneath the reference dataset %s' in s['message']:
                    yield get_status_dict(
                        ds=ds,
                        status='impossible',
                        message='Can not add archive outside of the dataset',
                        **res_kwargs)
                    return
                # status errored & we haven't anticipated the cause. Bubble up
                yield s
                return
            elif s['state'] == 'untracked':
                # we can't act on an untracked file
                message = (
                    "Can not add an untracked archive. "
                    "Run 'datalad save {}'".format(archive)
                )
                yield get_status_dict(
                           ds=ds,
                           status='impossible',
                           message=message,
                           **res_kwargs)
                return

        if not allow_dirty and annex.dirty:
            # error out here if the dataset contains untracked changes
            yield get_status_dict(
                ds=ds,
                status='impossible',
                message=(
                    'clean dataset required. '
                    'Use `datalad status` to inspect unsaved changes'),
                **res_kwargs
            )
            return

        # ensure the archive exists; status doesn't error on a non-existing file
        if not key and not lexists(archive_path):
            yield get_status_dict(
                ds=ds,
                status='impossible',
                message='No such file: {}'.format(archive_path),
                **res_kwargs
            )
            return

        if not key:
            check_path = archive_path.relative_to(ds.pathobj)
            # TODO: support adding archives content from outside the annex/repo
            origin = 'archive'
            # query the key via get_file_annexinfo (cf. #6104)
            key = annex.get_file_annexinfo(check_path)['key']
            if not key:
                raise RuntimeError(
                    f"Archive must be an annexed file in {ds}")
            archive_dir = Path(archive_path).parent
        else:
            origin = 'key'
            key = archive
            # We must not have anything to do with the location under .git/annex
            archive_dir = None
            # instead, we will go from the current directory
            use_current_dir = True

        archive_basename = file_basename(archive)

        if not key:
            # if we didn't manage to get a key, the file must be in Git
            raise NotImplementedError(
                "Provided file %s does not seem to be under annex control. "
                "We don't support adding everything straight to Git" % archive
            )

        # figure out our location
        pwd = getpwd()
        # does the archive sit at the root of the repository?
        pwd_in_root = annex.path == archive_dir
        # if not, content should be added under the corresponding subdirectory,
        # using the path relative to the repo top
        if use_current_dir:
            # extract the archive under the current directory, not the directory
            # where the archive is located
            extract_rpath = Path(pwd).relative_to(ds.path) \
                if not pwd_in_root \
                else None
        else:
            extract_rpath = archive_dir.relative_to(ds.path)

        # relative_to might return '.' as the relative path to curdir, which then
        # normalize_paths would take as instructions to really go from cwd, so we
        # need to sanitize; compare against a Path, since extract_rpath is one
        if extract_rpath == Path(curdir):
            extract_rpath = None

        try:
            key_rpath = annex.get_contentlocation(key)
        except Exception:
            # the only probable reason for this to fail is that there is no
            # content present
            raise RuntimeError(
                "Content of %s seems to be N/A.  Fetch it first" % key
            )

        # now we simply need to go through every file in that archive and add it
        lgr.info(
            "Adding content of the archive %s into annex %s", archive, annex
        )

        from datalad.customremotes.archives import ArchiveAnnexCustomRemote

        # TODO: shouldn't we be able just to pass existing AnnexRepo instance?
        # TODO: we will use persistent cache so we could just (ab)use possibly extracted archive
        # OK, let's ignore that the following class is actually a special
        # remote implementation, and use it only to work with its cache
        annexarchive = ArchiveAnnexCustomRemote(annex=None,
                                                path=annex.path,
                                                persistent_cache=True)
        # We will move extracted content, so it must not already exist
        annexarchive.cache.allow_existing = True
        earchive = annexarchive.cache[key_rpath]
        # make sure there is an enabled datalad-archives special remote
        ensure_datalad_remote(ds.repo, remote=ARCHIVES_SPECIAL_REMOTE,
                              autoenable=True)

        precommitted = False
        old_always_commit = annex.always_commit
        # batch mode is disabled when faking dates, so we want to always commit
        annex.always_commit = annex.fake_dates_enabled
        if annex_options:
            if isinstance(annex_options, str):
                annex_options = split_cmdline(annex_options)
        delete_after_rpath = None

        prefix_dir = basename(tempfile.mkdtemp(prefix=".datalad",
                                               dir=annex.path)) \
            if delete_after \
            else None

        # dedicated stats which would be added to passed in (if any)
        outside_stats = stats
        stats = ActivityStats()

        try:
            # keep track of extracted files for progress bar logging
            file_counter = 0
            # iterate over all files in the archive
            extracted_files = list(earchive.get_extracted_files())
            # start a progress bar for extraction
            pbar_id = f'add-archive-{archive_path}'
            log_progress(
                lgr.info, pbar_id, 'Extracting archive',
                label="Extracting archive",
                unit=' Files',
                total=len(extracted_files),
                noninteractive_level=logging.INFO)
            for extracted_file in extracted_files:
                file_counter += 1
                files_left = len(extracted_files) - file_counter
                log_progress(
                    lgr.info, pbar_id,
                    "Files to extract %i ", files_left,
                    update=1,
                    increment=True,
                    noninteractive_level=logging.DEBUG)
                stats.files += 1
                extracted_path = Path(earchive.path) / Path(extracted_file)

                if extracted_path.is_symlink():
                    link_path = str(extracted_path.resolve())
                    if not exists(link_path):
                        # TODO: config  addarchive.symlink-broken='skip'
                        lgr.warning(
                            "Path %s points to non-existing file %s" %
                            (extracted_path, link_path)
                        )
                        stats.skipped += 1
                        continue
                        # TODO: check if points outside of archive - warn & skip

                url = annexarchive.get_file_url(
                    archive_key=key,
                    file=extracted_file,
                    size=os.stat(extracted_path).st_size)

                # preliminary target name which might get modified by renames
                target_file_orig = target_file = Path(extracted_file)

                # stream archives would not have had the original filename
                # information in them, so would be extracted under a name
                # derived from their annex key.
                # Provide ad-hoc handling for such cases
                if (len(extracted_files) == 1
                        and Path(archive).suffix in ('.xz', '.gz', '.lzma')
                        and Path(key_rpath).name.startswith(
                            Path(extracted_file).name)):
                    # take archive's name without extension for filename & place
                    # where it was originally extracted
                    target_file = \
                        Path(extracted_file).parent / Path(archive).stem

                if strip_leading_dirs:
                    leading_dir = earchive.get_leading_directory(
                        depth=leading_dirs_depth, exclude=exclude,
                        consider=leading_dirs_consider)
                    leading_dir_len = \
                        len(leading_dir) + len(opsep) if leading_dir else 0
                    target_file = str(target_file)[leading_dir_len:]

                if add_archive_leading_dir:
                    # place extracted content under a directory corresponding to
                    # the archive name with suffix stripped.
                    target_file = Path(archive_basename) / target_file

                if rename:
                    target_file = apply_replacement_rules(rename,
                                                          str(target_file))

                # continue to next iteration if extracted_file in excluded
                if exclude:
                    try:  # we need to continue the outer loop from inside the inner loop
                        for regexp in exclude:
                            if re.search(regexp, extracted_file):
                                lgr.debug(
                                    "Skipping {extracted_file} since contains "
                                    "{regexp} pattern".format(**locals()))
                                stats.skipped += 1
                                raise StopIteration
                    except StopIteration:
                        continue

                if delete_after:
                    # place target file in a temporary directory
                    target_file = Path(prefix_dir) / Path(target_file)
                    # but also allow for it in the orig
                    target_file_orig = Path(prefix_dir) / Path(target_file_orig)

                target_file_path_orig = annex.pathobj / target_file_orig

                # If we were invoked in a subdirectory, patch together the
                # correct path
                target_file_path = extract_rpath / target_file \
                    if extract_rpath else target_file
                target_file_path = annex.pathobj / target_file_path

                # when the file already exists...
                if lexists(target_file_path):
                    handle_existing = True
                    if md5sum(str(target_file_path)) == \
                            md5sum(str(extracted_path)):
                        if not annex.is_under_annex(str(extracted_path)):
                            # if under annex -- it must have the same content,
                            # so we should just possibly add a new extra URL;
                            # but if under git -- we cannot/should not do
                            # anything about it ATM
                            if existing != 'overwrite':
                                continue
                        else:
                            handle_existing = False
                    if not handle_existing:
                        pass  # nothing... just to avoid additional indentation
                    elif existing == 'fail':
                        message = \
                            "{} exists, but would be overwritten by new file " \
                            "{}. Consider adjusting --existing".format(
                                target_file_path, extracted_file)
                        yield get_status_dict(
                            ds=ds,
                            status='error',
                            message=message,
                            **res_kwargs)
                        return
                    elif existing == 'overwrite':
                        stats.overwritten += 1
                        # to make sure it doesn't conflict -- might have been a
                        # tree
                        rmtree(target_file_path)
                    else:
                        # an elaborate dance to piece together new archive names
                        target_file_path_orig_ = target_file_path

                        # To keep extension intact -- operate on the base of the
                        # filename
                        p, fn = os.path.split(target_file_path)
                        ends_with_dot = fn.endswith('.')
                        fn_base, fn_ext = file_basename(fn, return_ext=True)

                        if existing == 'archive-suffix':
                            fn_base += '-%s' % archive_basename
                        elif existing == 'numeric-suffix':
                            pass  # archive-suffix will have the same logic
                        else:
                            # we shouldn't get here, argparse should catch a
                            # non-existing value for --existing right away
                            raise ValueError(existing)
                        # keep incrementing index in the suffix until file
                        # doesn't collide
                        suf, i = '', 0
                        while True:
                            connector = '.' if (fn_ext or ends_with_dot) else ''
                            fname = fn_base + suf + connector + fn_ext
                            target_file_path_new = Path(p) / fname
                            if not lexists(target_file_path_new):
                                # we found a file name that is not yet taken
                                break
                            lgr.debug("Iteration %i of file name finding. "
                                      "File %s already exists", i,
                                      target_file_path_new)
                            i += 1
                            suf = '.%d' % i
                        target_file_path = target_file_path_new
                        lgr.debug("Original file %s will be saved into %s"
                                  % (target_file_path_orig_, target_file_path))
                        # TODO: should we reserve smth like
                        # stats.clobbed += 1

                if target_file_path != target_file_path_orig:
                    stats.renamed += 1

                if copy:
                    raise NotImplementedError(
                        "Not yet copying from 'persistent' cache"
                    )

                lgr.debug("Adding %s to annex pointing to %s and with options "
                          "%r", target_file_path, url, annex_options)

                out_json = annex.add_url_to_file(
                    target_file_path,
                    url, options=annex_options,
                    batch=True)

                if 'key' in out_json and out_json['key'] is not None:
                    # annex.is_under_annex(target_file, batch=True):
                    # due to http://git-annex.branchable.com/bugs/annex_drop_is_not___34__in_effect__34___for_load_which_was___34__addurl_--batch__34__ed_but_not_yet_committed/?updated
                    # we need to maintain a list of those to be dropped files
                    if drop_after:
                        # drop extracted files after adding to annex
                        annex.drop_key(out_json['key'], batch=True)
                        stats.dropped += 1
                    stats.add_annex += 1
                else:
                    lgr.debug("File {} was added to git, not adding url".format(
                        target_file_path))
                    stats.add_git += 1

                if delete_after:
                    # we count the removal here, but don't yet perform it
                    # to not interfere with batched processes - any pure Git
                    # action invokes precommit which closes batched processes.
                    stats.removed += 1

                # Done with target_file -- just to have clear end of the loop
                del target_file

            if delete and archive and origin != 'key':
                lgr.debug("Removing the original archive {}".format(archive))
                # force=True since it might sometimes still be staged and fail
                annex.remove(str(archive_path), force=True)

            lgr.info("Finished adding %s: %s", archive, stats.as_str(mode='line'))

            if outside_stats:
                outside_stats += stats
            if delete_after:
                # force since not committed. r=True for -r (passed into git call
                # to recurse)
                delete_after_rpath = opj(extract_rpath, prefix_dir) \
                    if extract_rpath else prefix_dir
                delete_after_rpath = resolve_path(delete_after_rpath,
                                                  ds=dataset)
                lgr.debug(
                    "Removing extracted and annexed files under %s",
                    delete_after_rpath
                )
                annex.remove(str(delete_after_rpath), r=True, force=True)
            if commit:
                archive_rpath = archive_path.relative_to(ds.path)
                commit_stats = outside_stats if outside_stats else stats
                # so batched ones close and files become annex symlinks etc
                annex.precommit()
                precommitted = True
                if any(r.get('state', None) != 'clean'
                       for p, r in annex.status(untracked='no').items()):
                    annex.commit(
                        "Added content extracted from %s %s\n\n%s" %
                        (origin, archive_rpath,
                         commit_stats.as_str(mode='full')),
                        _datalad_msg=True
                    )
                    commit_stats.reset()
            else:
                # don't commit upon completion
                pass
        finally:
            # take down the progress bar
            log_progress(
                lgr.info, pbar_id,
                'Finished extraction',
                noninteractive_level=logging.INFO)
            # since we batched addurl, we should close those batched processes
            # if we haven't done so yet.  explicitly checked to avoid any possible
            # "double-action"
            if not precommitted:
                annex.precommit()

            if delete_after_rpath:
                delete_after_path = opj(annex.path, delete_after_rpath)
                delete_after_rpath = resolve_path(delete_after_rpath,
                                                  ds=dataset)
                if exists(delete_after_path):  # should not be there
                    # but for paranoid yoh
                    lgr.warning(
                        "Removing temporary directory under which extracted "
                        "files were annexed and should have been removed: %s",
                        delete_after_path)
                    rmtree(delete_after_path)

            annex.always_commit = old_always_commit
            # remove what is left and/or everything upon failure
            earchive.clean(force=True)
            # remove tempfile directories (not cleaned up automatically):
            if prefix_dir is not None and lexists(prefix_dir):
                os.rmdir(prefix_dir)
        yield get_status_dict(
            ds=ds,
            status='ok',
            **res_kwargs)
        return annex
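
Unlike Code Example #1, this variant is a generator that yields result records (status dicts). A sketch of consuming those records, again assuming exposure via datalad.api (paths are illustrative; when run through the full datalad API, results may instead be collected into a list by its result handling):

    # Hypothetical, generator-style consumption of the command above.
    from datalad.api import add_archive_content

    for res in add_archive_content(
            'data.tar.gz',
            dataset='/path/to/dataset',  # illustrative dataset location
            strip_leading_dirs=True):
        # each record carries at least 'action' and 'status', usually 'message'
        print(res['status'], res.get('message', ''))
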
Code Example #3
    def __call__(archive,
                 annex=None,
                 strip_leading_dirs=False,
                 leading_dirs_depth=None,
                 leading_dirs_consider=None,
                 delete=False,
                 key=False,
                 exclude=None,
                 rename=None,
                 existing='fail',
                 annex_options=None,
                 copy=False,
                 commit=True,
                 allow_dirty=False,
                 stats=None,
                 drop_after=False,
                 delete_after=False):
        """
        Returns
        -------
        annex
        """
        if exclude:
            exclude = assure_tuple_or_list(exclude)
        if rename:
            rename = assure_tuple_or_list(rename)

        # TODO: we may eventually want to ask the user whether they want to
        # convert their git repo into an annex
        archive_path = archive
        if annex is None:
            annex = get_repo_instance(class_=AnnexRepo)
            if not isabs(archive):
                # if not absolute -- interpret as relative to the current directory
                archive_path = relpath(abspath(archive), annex.path)
        elif not isabs(archive):
            # if we are given an annex, then assume that given path is within annex, not
            # relative to PWD
            archive_path = opj(annex.path, archive)

        # TODO: somewhat too cruel -- maybe an option or something...
        if not allow_dirty and annex.dirty:
            # already saved me once ;)
            raise RuntimeError(
                "You better commit all the changes and untracked files first")

        # are we in a subdirectory of the repository? then we should add content under that
        # subdirectory,
        # get the path relative to the repo top
        extract_relpath = relpath(getpwd(), annex.path) \
            if commonprefix([realpath(getpwd()), annex.path]) == annex.path \
            else None

        if not key:
            # we were given a file which must exist
            if not exists(opj(annex.path, archive_path)):
                raise ValueError("Archive {} does not exist".format(archive))
            # TODO: support adding archives content from outside the annex/repo
            origin = archive
            key = annex.get_file_key(archive_path)
        else:
            origin = key

        if not key:
            # TODO: allow for it to be under git???  how to reference then?
            raise NotImplementedError(
                "Provided file is not under annex.  We don't support yet adding everything "
                "straight to git")

        # and from now on operate on the key or wherever the content is available "canonically"
        try:
            key_path = annex.get_contentlocation(key)  # , relative_to_top=True)
        except Exception:
            raise RuntimeError(
                "Content of %s seems to be N/A.  Fetch it first" % key)

        #key_path = opj(reltop, key_path)
        # now we simply need to go through every file in that archive and add it
        lgr.info("Adding content of the archive %s into annex %s", archive,
                 annex)

        from datalad.customremotes.archives import ArchiveAnnexCustomRemote
        # TODO: shouldn't we be able just to pass existing AnnexRepo instance?
        # TODO: we will use persistent cache so we could just (ab)use possibly extracted archive
        annexarchive = ArchiveAnnexCustomRemote(path=annex.path,
                                                persistent_cache=True)
        # We will move extracted content, so it must not already exist
        annexarchive.cache.allow_existing = True
        earchive = annexarchive.cache[key_path]

        # TODO: check if maybe it was already added
        if ARCHIVES_SPECIAL_REMOTE not in annex.git_get_remotes():
            lgr.debug(
                "Adding new special remote {}".format(ARCHIVES_SPECIAL_REMOTE))
            annex.annex_initremote(ARCHIVES_SPECIAL_REMOTE, [
                'encryption=none', 'type=external',
                'externaltype=%s' % ARCHIVES_SPECIAL_REMOTE, 'autoenable=true'
            ])
        else:
            lgr.debug("Special remote {} already exists".format(
                ARCHIVES_SPECIAL_REMOTE))

        try:
            old_always_commit = annex.always_commit
            annex.always_commit = False
            keys_to_drop = []

            if annex_options:
                if isinstance(annex_options, string_types):
                    annex_options = shlex.split(annex_options)

            leading_dir = earchive.get_leading_directory(
                    depth=leading_dirs_depth, exclude=exclude, consider=leading_dirs_consider) \
                if strip_leading_dirs else None
            leading_dir_len = len(leading_dir) + len(
                opsep) if leading_dir else 0

            # we need to create a temporary directory at the top level which would later be
            # removed
            prefix_dir = basename(tempfile.mkdtemp(prefix=".datalad", dir=annex.path)) \
                if delete_after \
                else None

            # dedicated stats which would be added to passed in (if any)
            outside_stats = stats
            stats = ActivityStats()

            for extracted_file in earchive.get_extracted_files():
                stats.files += 1
                extracted_path = opj(earchive.path, extracted_file)

                if islink(extracted_path):
                    link_path = realpath(extracted_path)
                    if not exists(link_path):
                        # TODO: config  addarchive.symlink-broken='skip'
                        lgr.warning("Path %s points to non-existing file %s" %
                                    (extracted_path, link_path))
                        stats.skipped += 1
                        continue
                    # TODO: check if points outside of the archive -- warning and skip

                # preliminary target name which might get modified by renames
                target_file_orig = target_file = extracted_file

                target_file = target_file[leading_dir_len:]

                if rename:
                    target_file = apply_replacement_rules(rename, target_file)

                if exclude:
                    try:  # we need to continue the outer loop from inside the inner loop
                        for regexp in exclude:
                            if re.search(regexp, target_file):
                                lgr.debug(
                                    "Skipping {target_file} since contains {regexp} pattern"
                                    .format(**locals()))
                                stats.skipped += 1
                                raise StopIteration
                    except StopIteration:
                        continue

                if prefix_dir:
                    target_file = opj(prefix_dir, target_file)

                url = annexarchive.get_file_url(
                    archive_key=key,
                    file=extracted_file,
                    size=os.stat(extracted_path).st_size)

                # lgr.debug("mv {extracted_path} {target_file}. URL: {url}".format(**locals()))

                if lexists(target_file):
                    if md5sum(target_file) == md5sum(extracted_path):
                        # must have the same content; we should just possibly add a new extra URL
                        pass
                    elif existing == 'fail':
                        raise RuntimeError(
                            "File {} already exists, but new (?) file {} was instructed "
                            "to be placed there while overwrite=False".format(
                                target_file, extracted_file))
                    elif existing == 'overwrite':
                        stats.overwritten += 1
                        # to make sure it doesn't conflict -- might have been a tree
                        rmtree(target_file)
                    else:
                        target_file_orig_ = target_file

                        # To keep extension intact -- operate on the base of the filename
                        p, fn = os.path.split(target_file)
                        ends_with_dot = fn.endswith('.')
                        fn_base, fn_ext = file_basename(fn, return_ext=True)

                        if existing == 'archive-suffix':
                            fn_base += '-%s' % file_basename(origin)
                        elif existing == 'numeric-suffix':
                            pass  # archive-suffix will have the same logic
                        else:
                            raise ValueError(existing)
                        # keep incrementing index in the suffix until file doesn't collide
                        suf, i = '', 0
                        while True:
                            target_file_new = opj(
                                p,
                                fn_base + suf
                                + ('.' if (fn_ext or ends_with_dot) else '')
                                + fn_ext)
                            if not lexists(target_file_new):
                                break
                            lgr.debug("File %s already exists" %
                                      target_file_new)
                            i += 1
                            suf = '.%d' % i
                        target_file = target_file_new
                        lgr.debug("Original file %s will be saved into %s" %
                                  (target_file_orig_, target_file))
                        # TODO: should we reserve smth like
                        # stats.clobbed += 1

                if target_file != target_file_orig:
                    stats.renamed += 1

                #target_path = opj(getpwd(), target_file)
                if copy:
                    raise NotImplementedError(
                        "Not yet copying from 'persistent' cache")
                else:
                    # os.renames(extracted_path, target_path)
                    # addurl implementation relying on annex'es addurl below would actually copy
                    pass

                lgr.debug(
                    "Adding %s to annex pointing to %s and with options %r",
                    target_file, url, annex_options)

                target_file_gitpath = opj(
                    extract_relpath,
                    target_file) if extract_relpath else target_file
                out_json = annex.annex_addurl_to_file(target_file_gitpath,
                                                      url,
                                                      options=annex_options,
                                                      batch=True)

                if 'key' in out_json:  # annex.is_under_annex(target_file, batch=True):
                    # due to http://git-annex.branchable.com/bugs/annex_drop_is_not___34__in_effect__34___for_load_which_was___34__addurl_--batch__34__ed_but_not_yet_committed/?updated
                    # we need to maintain a list of those to be dropped files
                    if drop_after:
                        keys_to_drop.append(out_json['key'])
                    stats.add_annex += 1
                else:
                    lgr.debug(
                        "File {} was added to git, not adding url".format(
                            target_file))
                    stats.add_git += 1

                if delete_after:
                    # forcing since it is only staged, not yet committed
                    annex.remove(target_file_gitpath,
                                 force=True)  # TODO: batch!
                    stats.removed += 1

                # # chaining 3 annex commands, 2 of which not batched -- less efficient but more bullet proof etc
                # annex.annex_add(target_path, options=annex_options)
                # # above action might add to git or to annex
                # if annex.file_has_content(target_path):
                #     # if not --  it was added to git, if in annex, it is present and output is True
                #     annex.annex_addurl_to_file(target_file, url, options=['--relaxed'], batch=True)
                #     stats.add_annex += 1
                # else:
                #     lgr.debug("File {} was added to git, not adding url".format(target_file))
                #     stats.add_git += 1
                # # TODO: actually check if it is anyhow different from a previous version. If not
                # # then it wasn't really added

                del target_file  # Done with target_file -- just to have clear end of the loop

            if delete and archive:
                lgr.debug("Removing the original archive {}".format(archive))
                # force=True since it might sometimes still be staged and fail
                annex.remove(archive_path, force=True)

            lgr.info("Finished adding %s: %s" %
                     (archive, stats.as_str(mode='line')))

            if outside_stats:
                outside_stats += stats
            if commit:
                commit_stats = outside_stats if outside_stats else stats
                annex.commit("Added content extracted from %s\n\n%s" %
                             (origin, commit_stats.as_str(mode='full')))
                commit_stats.reset()
        finally:
            # since we batched addurl, we should close those batched processes
            if delete_after:
                prefix_path = opj(annex.path, prefix_dir)
                if exists(prefix_path):  # probably would always be there
                    lgr.info(
                        "Removing temporary directory under which extracted files were annexed: %s",
                        prefix_path)
                    rmtree(prefix_path)

            annex.precommit()
            if keys_to_drop:
                # since we know that keys should be retrievable, we use --force, as
                # there is no batching atm and it would be expensive
                annex.annex_drop(keys_to_drop, options=['--force'], key=True)
                stats.dropped += len(keys_to_drop)
                annex.precommit()  # might need clean up etc again

            annex.always_commit = old_always_commit
            # remove what is left and/or everything upon failure
            earchive.clean(force=True)

        return annex
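
All four variants share the same collision-handling loop for existing='numeric-suffix' and existing='archive-suffix'. The sketch below isolates that logic as a standalone helper; it approximates file_basename with os.path.splitext, and the helper name find_nonconflicting_name is ours, for illustration only:

    import os.path as op

    def find_nonconflicting_name(path, archive_basename=None):
        # Mimic the suffix-search loop from the examples above: keep the
        # extension intact and append '-<archive>' and/or '.<i>' until the
        # candidate name no longer collides with an existing file.
        p, fn = op.split(path)
        ends_with_dot = fn.endswith('.')
        fn_base, fn_ext = op.splitext(fn)  # approximation of file_basename
        fn_ext = fn_ext.lstrip('.')
        if archive_basename:  # 'archive-suffix' mode
            fn_base += '-%s' % archive_basename
        suf, i = '', 0
        while True:
            candidate = op.join(
                p, fn_base + suf + ('.' if (fn_ext or ends_with_dot) else '') + fn_ext)
            if not op.lexists(candidate):
                return candidate
            i += 1
            suf = '.%d' % i

    # e.g. find_nonconflicting_name('out/data.txt', archive_basename='arch')
    # -> 'out/data-arch.txt', or 'out/data-arch.1.txt' if that already exists
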
Code Example #4
    def __call__(archive, annex=None,
                 add_archive_leading_dir=False,
                 strip_leading_dirs=False, leading_dirs_depth=None, leading_dirs_consider=None,
                 use_current_dir=False,
                 delete=False, key=False, exclude=None, rename=None, existing='fail',
                 annex_options=None, copy=False, commit=True, allow_dirty=False,
                 stats=None, drop_after=False, delete_after=False):
        """
        Returns
        -------
        annex
        """
        if exclude:
            exclude = assure_tuple_or_list(exclude)
        if rename:
            rename = assure_tuple_or_list(rename)

        # TODO: we may eventually want to ask the user whether they want to
        # convert their git repo into an annex
        archive_path = archive
        pwd = getpwd()
        if annex is None:
            annex = get_repo_instance(pwd, class_=AnnexRepo)
            if not isabs(archive):
                # if not absolute -- interpret as relative to the working directory
                archive_path = normpath(opj(realpath(pwd), archive))
                # abspath(archive) is not "good" since dereferences links in the path
                # archive_path = abspath(archive)
        elif not isabs(archive):
            # if we are given an annex, then assume that given path is within annex, not
            # relative to PWD
            archive_path = opj(annex.path, archive)
        annex_path = annex.path

        # _rpath below should depict paths relative to the top of the annex
        archive_rpath = relpath(archive_path, annex_path)

        # TODO: somewhat too cruel -- maybe an option or something...
        if not allow_dirty and annex.dirty:
            # already saved me once ;)
            raise RuntimeError("You better commit all the changes and untracked files first")

        if not key:
            # we were given a file which must exist
            if not exists(archive_path):
                raise ValueError("Archive {} does not exist".format(archive))
            # TODO: support adding archives content from outside the annex/repo
            origin = 'archive'
            key = annex.get_file_key(archive_rpath)
            archive_dir = dirname(archive_path)
        else:
            origin = 'key'
            key = archive
            archive_dir = None  # We must not have anything to do with the location under .git/annex

        archive_basename = file_basename(archive)

        if not key:
            # TODO: allow for it to be under git???  how to reference then?
            raise NotImplementedError(
                "Provided file %s is not under annex.  We don't support yet adding everything "
                "straight to git" % archive
            )

        # are we in a subdirectory of the repository?
        pwd_under_annex = commonprefix([pwd, annex_path]) == annex_path
        # if so, we should add content under that subdirectory,
        # i.e. use the path relative to the repo top
        if use_current_dir:
            # if outside -- extract to the top of repo
            extract_rpath = relpath(pwd, annex_path) \
                if pwd_under_annex \
                else None
        else:
            extract_rpath = relpath(archive_dir, annex_path)

        # relpath might return '.' for the current directory, which normalize_paths
        # would then treat as an instruction to operate from cwd, so sanitize it
        if extract_rpath == curdir:
            extract_rpath = None  # no special relpath from top of the repo

        # from now on, operate on the key, or wherever its content is available "canonically"
        try:
            key_rpath = annex.get_contentlocation(key)  # , relative_to_top=True)
        except Exception:
            raise RuntimeError("Content of %s seems to be unavailable.  Fetch it first" % key)

        # now we simply need to go through every file in that archive and add it
        lgr.info("Adding content of the archive %s into annex %s", archive, annex)

        from datalad.customremotes.archives import ArchiveAnnexCustomRemote
        # TODO: shouldn't we be able just to pass existing AnnexRepo instance?
        # TODO: we will use persistent cache so we could just (ab)use possibly extracted archive
        annexarchive = ArchiveAnnexCustomRemote(path=annex_path, persistent_cache=True)
        # extracted content will be moved out of the cache, so it is fine
        # if the cache entry already exists
        annexarchive.cache.allow_existing = True
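        # obtain a handle to the extracted archive content in the persistent cache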
        earchive = annexarchive.cache[key_rpath]

        # TODO: check if may be it was already added
        if ARCHIVES_SPECIAL_REMOTE not in annex.get_remotes():
            lgr.debug("Adding new special remote {}".format(ARCHIVES_SPECIAL_REMOTE))
            annex.init_remote(
                ARCHIVES_SPECIAL_REMOTE,
                ['encryption=none', 'type=external', 'externaltype=%s' % ARCHIVES_SPECIAL_REMOTE,
                 'autoenable=true'])
        else:
            lgr.debug("Special remote {} already exists".format(ARCHIVES_SPECIAL_REMOTE))

        try:
            old_always_commit = annex.always_commit
            annex.always_commit = False
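            # batch the additions; a single commit happens at the end (see annex.commit below)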

            if annex_options:
                if isinstance(annex_options, string_types):
                    annex_options = shlex.split(annex_options)

            leading_dir = earchive.get_leading_directory(
                depth=leading_dirs_depth, exclude=exclude, consider=leading_dirs_consider) \
                if strip_leading_dirs else None
            leading_dir_len = len(leading_dir) + len(opsep) if leading_dir else 0

            # we need to create a temporary directory at the top level which would later be
            # removed
            prefix_dir = basename(tempfile.mkdtemp(prefix=".datalad", dir=annex_path)) \
                if delete_after \
                else None
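            # with delete_after, files are first annexed under this hidden prefix
            # so that they (and the prefix) can be removed after URL registration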

            # dedicated stats which will be merged into the passed-in stats (if any)
            outside_stats = stats
            stats = ActivityStats()

            for extracted_file in earchive.get_extracted_files():
                stats.files += 1
                extracted_path = opj(earchive.path, extracted_file)

                if islink(extracted_path):
                    link_path = realpath(extracted_path)
                    if not exists(link_path):  # TODO: config  addarchive.symlink-broken='skip'
                        lgr.warning("Path %s points to non-existing file %s",
                                    extracted_path, link_path)
                        stats.skipped += 1
                        continue
                    # TODO: check if the link points outside of the archive -- warn and skip

                # preliminary target name which might get modified by renames
                target_file_orig = target_file = extracted_file

                # strip leading dirs
                target_file = target_file[leading_dir_len:]

                if add_archive_leading_dir:
                    target_file = opj(archive_basename, target_file)

                if rename:
                    target_file = apply_replacement_rules(rename, target_file)

                # skip this file if it matches any exclude pattern
                if exclude:
                    matched = next(
                        (regexp for regexp in exclude
                         if re.search(regexp, extracted_file)),
                        None)
                    if matched is not None:
                        lgr.debug("Skipping %s since it contains %s pattern",
                                  extracted_file, matched)
                        stats.skipped += 1
                        continue

                if prefix_dir:
                    target_file = opj(prefix_dir, target_file)

                url = annexarchive.get_file_url(
                    archive_key=key, file=extracted_file,
                    size=os.stat(extracted_path).st_size)
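                # the URL uses the dl+archive: scheme, resolved by the archives special remote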

                # lgr.debug("mv {extracted_path} {target_file}. URL: {url}".format(**locals()))

                if lexists(target_file):
                    if md5sum(target_file) == md5sum(extracted_path):
                        # same content -- just add a possibly new extra URL below
                        pass
                    elif existing == 'fail':
                        raise RuntimeError(
                            "File {} already exists, but new (?) file {} was instructed "
                            "to be placed there while existing == 'fail'".format(
                                target_file, extracted_file))
                    elif existing == 'overwrite':
                        stats.overwritten += 1
                        # remove the existing target to avoid conflicts -- it might have been a tree
                        rmtree(target_file)
                    else:
                        target_file_orig_ = target_file

                        # To keep extension intact -- operate on the base of the filename
                        p, fn = os.path.split(target_file)
                        ends_with_dot = fn.endswith('.')
                        fn_base, fn_ext = file_basename(fn, return_ext=True)

                        if existing == 'archive-suffix':
                            fn_base += '-%s' % archive_basename
                        elif existing == 'numeric-suffix':
                            pass  # the numeric suffix is appended by the loop below
                        else:
                            raise ValueError(existing)
                        # keep incrementing index in the suffix until file doesn't collide
                        suf, i = '', 0
                        while True:
                            target_file_new = opj(p, fn_base + suf + ('.' if (fn_ext or ends_with_dot) else '') + fn_ext)
                            if not lexists(target_file_new):
                                break
                            lgr.debug("File %s already exists" % target_file_new)
                            i += 1
                            suf = '.%d' % i
                        target_file = target_file_new
                        lgr.debug("Original file %s will be saved into %s"
                                  % (target_file_orig_, target_file))
                        # TODO: should we reserve something like
                        # stats.clobbered += 1

                if target_file != target_file_orig:
                    stats.renamed += 1

                #target_path = opj(getpwd(), target_file)
                if copy:
                    raise NotImplementedError("Not yet copying from 'persistent' cache")
                else:
                    # os.renames(extracted_path, target_path)
                    # addurl implementation relying on annex'es addurl below would actually copy
                    pass

                lgr.debug("Adding %s to annex pointing to %s and with options %r",
                          target_file, url, annex_options)

                target_file_rpath = opj(extract_rpath, target_file) if extract_rpath else target_file
                out_json = annex.add_url_to_file(
                    target_file_rpath,
                    url, options=annex_options,
                    batch=True)
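                # out_json carries the resulting annex key; if the file went to git, no key is reported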

                if 'key' in out_json and out_json['key'] is not None:  # annex.is_under_annex(target_file, batch=True):
                    # due to http://git-annex.branchable.com/bugs/annex_drop_is_not___34__in_effect__34___for_load_which_was___34__addurl_--batch__34__ed_but_not_yet_committed/?updated
                    # we need to maintain a list of those to be dropped files
                    if drop_after:
                        annex.drop_key(out_json['key'], batch=True)
                        stats.dropped += 1
                    stats.add_annex += 1
                else:
                    lgr.debug("File {} was added to git, not adding url".format(target_file))
                    stats.add_git += 1

                if delete_after:
                    # forcing since it is only staged, not yet committed
                    annex.remove(target_file_rpath, force=True)  # TODO: batch!
                    stats.removed += 1

                # # chaining 3 annex commands, 2 of which not batched -- less efficient but more bullet proof etc
                # annex.add(target_path, options=annex_options)
                # # above action might add to git or to annex
                # if annex.file_has_content(target_path):
                #     # if not --  it was added to git, if in annex, it is present and output is True
                #     annex.add_url_to_file(target_file, url, options=['--relaxed'], batch=True)
                #     stats.add_annex += 1
                # else:
                #     lgr.debug("File {} was added to git, not adding url".format(target_file))
                #     stats.add_git += 1
                # # TODO: actually check if it is anyhow different from a previous version. If not
                # # then it wasn't really added

                del target_file  # Done with target_file -- just to have clear end of the loop

            if delete and archive and origin != 'key':
                lgr.debug("Removing the original archive {}".format(archive))
                # force=True since some times might still be staged and fail
                annex.remove(archive_rpath, force=True)

            lgr.info("Finished adding %s: %s" % (archive, stats.as_str(mode='line')))

            if outside_stats:
                outside_stats += stats
            if commit:
                commit_stats = outside_stats if outside_stats else stats
                annex.commit(
                    "Added content extracted from %s %s\n\n%s" % (origin, archive, commit_stats.as_str(mode='full')),
                    _datalad_msg=True
                )
                commit_stats.reset()
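                # the stats were recorded in the commit message above, so reset them
                # to avoid double counting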
        finally:
            # since we batched addurl, we should close those batched processes
            annex.precommit()

            if delete_after:
                prefix_path = opj(annex_path, prefix_dir)
                if exists(prefix_path):  # probably would always be there
                    lgr.info("Removing temporary directory under which extracted files were annexed: %s",
                             prefix_path)
                    rmtree(prefix_path)

            annex.always_commit = old_always_commit
            # remove what is left and/or everything upon failure
            earchive.clean(force=True)

        return annex
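
A minimal usage sketch for context. Assumptions (not confirmed by the listing itself): this __call__ backs datalad's add-archive-content command, datalad.api.add_archive_content exposes the parameters from the signature above, and the archive name is purely illustrative.

    # a sketch under the assumptions above, not a definitive invocation --
    # the import path and available options may differ across datalad versions
    from datalad.api import add_archive_content

    # register the files of an already-annexed archive, stripping its common
    # leading directory and dropping local copies once their URLs are recorded
    add_archive_content(
        '1.tar.gz',                 # hypothetical archive path inside the repo
        strip_leading_dirs=True,    # remove the archive's leading directory
        existing='archive-suffix',  # resolve name collisions using the archive name
        drop_after=True,            # drop content after registering its URL
    )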