Ejemplo n.º 1
0
    def __call__(login,
                 cmd,
                 *,
                 port=None,
                 ipv4=False,
                 ipv6=False,
                 options=None,
                 no_stdin=False):
        lgr.debug(
            "sshrun invoked: login=%r, cmd=%r, port=%r, options=%r, "
            "ipv4=%r, ipv6=%r, no_stdin=%r", login, cmd, port, options, ipv4,
            ipv6, no_stdin)
        # Perspective workarounds for git-annex invocation, see
        # https://github.com/datalad/datalad/issues/1456#issuecomment-292641319

        if cmd.startswith("'") and cmd.endswith("'"):
            lgr.debug(
                "Detected additional level of quotations in %r so performing "
                "command line splitting", cmd)
            # there is an additional layer of quotes
            # Let's strip them off by splitting the command
            cmd_ = split_cmdline(cmd)
            if len(cmd_) != 1:
                raise RuntimeError(
                    "Obtained more or less than a single argument after "
                    "command line splitting: %s" % repr(cmd_))
            cmd = cmd_[0]
        sshurl = 'ssh://{}{}'.format(login, ':{}'.format(port) if port else '')

        if ipv4 and ipv6:
            raise ValueError("Cannot force both IPv4 and IPv6")
        elif ipv4:
            force_ip = 4
        elif ipv6:
            force_ip = 6
        else:
            force_ip = None

        ssh = ssh_manager.get_connection(sshurl, force_ip=force_ip)
        # use an empty temp file as stdin if none shall be connected
        stdin_ = tempfile.TemporaryFile() if no_stdin else sys.stdin
        try:
            out, err = ssh(cmd,
                           stdin=stdin_,
                           log_output=False,
                           options=options)
        finally:
            if no_stdin:
                stdin_.close()
        os.write(1, out.encode('UTF-8'))
        os.write(2, err.encode('UTF-8'))
Ejemplo n.º 2
0
def test_splitjoin_cmdline():
    # Do full round trip on a number of tricky samples
    for args in (
        ['cmd', '-o1', 'simple'],
        ['c o', r'\m', ''],
        ['c o', ' '],
    ):
        cmdline = join_cmdline(args)
        assert isinstance(cmdline, str)
        eq_(split_cmdline(cmdline), args)
    # assure that there is no needless quoting
    if on_windows:
        # in quote_cmdlinearg we always quote on Windows
        eq_(join_cmdline(['abc', 'def']), '"abc" "def"')
    else:
        eq_(join_cmdline(['abc', 'def']), 'abc def')
Ejemplo n.º 3
0
def test_runner_dry(tempfile):

    dry = DryRunProtocol()
    runner = Runner(protocol=dry)

    # test dry command call
    cmd = 'echo Testing äöü東 dry run > %s' % tempfile
    with swallow_logs(new_level=5) as cml:
        ret = runner.run(cmd)
        cml.assert_logged("{DryRunProtocol} Running: %s" % cmd, regex=False)
    assert_equal(("DRY", "DRY"), ret,
                 "Output of dry run (%s): %s" % (cmd, ret))
    assert_equal(split_cmdline(cmd), dry[0]['command'])
    assert_false(os.path.exists(tempfile))

    # test dry python function call
    output = runner.call(os.path.join, 'foo', 'bar')
    assert_is(None, output, "Dry call of: os.path.join, 'foo', 'bar' "
                            "returned: %s" % output)
    assert_in('join', dry[1]['command'][0])
    assert_equal("args=('foo', 'bar')", dry[1]['command'][1])
Ejemplo n.º 4
0
    def __call__(
            spec=None,
            dataset=None,
            discover=False,
            help_proc=False):
        if not spec and not discover:
            raise InsufficientArgumentsError('requires at least a procedure name')
        if help_proc and not spec:
            raise InsufficientArgumentsError('requires a procedure name')

        try:
            ds = require_dataset(
                dataset, check_installed=False,
                purpose='run a procedure')
        except NoDatasetFound:
            ds = None

        if discover:
            # specific path of procedures that were already reported
            reported = set()
            # specific names of procedure for which an active one has been
            # found
            active = set()
            for m, cmd_name, cmd_tmpl, cmd_help in \
                    _get_procedure_implementation('*', ds=ds):
                if m in reported:
                    continue
                ex = _guess_exec(m)
                # configured template (call-format string) takes precedence:
                if cmd_tmpl:
                    ex['template'] = cmd_tmpl
                if ex['state'] is None:
                    # doesn't seem like a match
                    lgr.debug("%s does not look like a procedure, ignored.", m)
                    continue
                state = 'overridden' if cmd_name in active else ex['state']
                message = ex['type'] if ex['type'] else 'unknown type'
                message += ' ({})'.format(state) if state != 'executable' else ''
                res = get_status_dict(
                    action='discover_procedure',
                    path=m,
                    type='file',
                    logger=lgr,
                    refds=ds.path if ds else None,
                    status='ok',
                    state=state,
                    procedure_name=cmd_name,
                    procedure_type=ex['type'],
                    procedure_callfmt=ex['template'],
                    procedure_help=cmd_help,
                    message=message)
                reported.add(m)
                if state == 'executable':
                    active.add(cmd_name)
                yield res
            return

        if not isinstance(spec, (tuple, list)):
            # maybe coming from config
            spec = split_cmdline(spec)
        name = spec[0]
        args = spec[1:]

        try:
            # get the first match an run with it
            procedure_file, cmd_name, cmd_tmpl, cmd_help = \
                next(_get_procedure_implementation(name, ds=ds))
        except StopIteration:
            res = get_status_dict(
                    action='run_procedure',
                    # TODO: Default renderer requires a key "path" to exist.
                    # Doesn't make a lot of sense in this case
                    path=name,
                    logger=lgr,
                    refds=ds.path if ds else None,
                    status='impossible',
                    message="Cannot find procedure with name '%s'" % name)
            yield res
            return

        ex = _guess_exec(procedure_file)
        # configured template (call-format string) takes precedence:
        if cmd_tmpl:
            ex['template'] = cmd_tmpl

        if help_proc:
            if cmd_help:
                res = get_status_dict(
                        action='procedure_help',
                        path=procedure_file,
                        type='file',
                        logger=lgr,
                        refds=ds.path if ds else None,
                        status='ok',
                        state=ex['state'],
                        procedure_name=cmd_name,
                        procedure_type=ex['type'],
                        procedure_callfmt=ex['template'],
                        message=cmd_help)
            else:
                res = get_status_dict(
                        action='procedure_help',
                        path=procedure_file,
                        type='file',
                        logger=lgr,
                        refds=ds.path if ds else None,
                        status='impossible',
                        state=ex['state'],
                        procedure_name=cmd_name,
                        procedure_type=ex['type'],
                        procedure_callfmt=ex['template'],
                        message="No help available for '%s'" % name)

            yield res
            return

        if not ex['template']:
            raise ValueError("No idea how to execute procedure %s. "
                             "Missing 'execute' permissions?" % procedure_file)

        cmd = ex['template'].format(
            script=quote_cmdlinearg(procedure_file),
            ds=quote_cmdlinearg(ds.path) if ds else '',
            args=(u' '.join(quote_cmdlinearg(a) for a in args) if args else ''))
        lgr.info(u"Running procedure %s", name)
        lgr.debug(u'Full procedure command: %r', cmd)
        for r in Run.__call__(
                cmd=cmd,
                dataset=ds,
                explicit=True,
                inputs=None,
                outputs=None,
                # pass through here
                on_failure='ignore',
                return_type='generator'
        ):
            yield r
Ejemplo n.º 5
0
    def __call__(spec=None, *, dataset=None, discover=False, help_proc=False):
        if not spec and not discover:
            raise InsufficientArgumentsError(
                'requires at least a procedure name')
        if help_proc and not spec:
            raise InsufficientArgumentsError('requires a procedure name')

        try:
            ds = require_dataset(dataset,
                                 check_installed=False,
                                 purpose='run a procedure')
        except NoDatasetFound:
            ds = None

        if discover:
            # specific path of procedures that were already reported
            reported = set()
            # specific names of procedure for which an active one has been
            # found
            active = set()
            for m, cmd_name, cmd_tmpl, cmd_help in \
                    _get_procedure_implementation('*', ds=ds):
                if m in reported:
                    continue
                ex = _guess_exec(m)
                # configured template (call-format string) takes precedence:
                if cmd_tmpl:
                    ex['template'] = cmd_tmpl
                if ex['state'] is None:
                    # doesn't seem like a match
                    lgr.debug("%s does not look like a procedure, ignored.", m)
                    continue
                state = 'overridden' if cmd_name in active else ex['state']
                message = ex['type'] if ex['type'] else 'unknown type'
                message += ' ({})'.format(
                    state) if state != 'executable' else ''
                res = get_status_dict(action='discover_procedure',
                                      path=m,
                                      type='file',
                                      logger=lgr,
                                      refds=ds.path if ds else None,
                                      status='ok',
                                      state=state,
                                      procedure_name=cmd_name,
                                      procedure_type=ex['type'],
                                      procedure_callfmt=ex['template'],
                                      procedure_help=cmd_help,
                                      message=message)
                reported.add(m)
                if state == 'executable':
                    active.add(cmd_name)
                yield res
            return

        if isinstance(spec, dict):
            # Skip getting procedure implementation if called with a
            # dictionary (presumably coming from --discover)
            procedure_file = spec['path']
            cmd_name = spec['procedure_name']
            cmd_tmpl = spec['procedure_callfmt']
            cmd_help = spec['procedure_help']

            name = cmd_name
            args = []

        else:

            if not isinstance(spec, (tuple, list)):
                # maybe coming from config
                spec = split_cmdline(spec)
            name = spec[0]
            args = spec[1:]

            try:
                # get the first match an run with it
                procedure_file, cmd_name, cmd_tmpl, cmd_help = \
                    next(_get_procedure_implementation(name, ds=ds))
            except StopIteration:
                raise ValueError("Cannot find procedure with name '%s'" % name)

        ex = _guess_exec(procedure_file)
        # configured template (call-format string) takes precedence:
        if cmd_tmpl:
            ex['template'] = cmd_tmpl

        if help_proc:
            if cmd_help:
                res = get_status_dict(action='procedure_help',
                                      path=procedure_file,
                                      type='file',
                                      logger=lgr,
                                      refds=ds.path if ds else None,
                                      status='ok',
                                      state=ex['state'],
                                      procedure_name=cmd_name,
                                      procedure_type=ex['type'],
                                      procedure_callfmt=ex['template'],
                                      message=cmd_help)
            else:
                res = get_status_dict(action='procedure_help',
                                      path=procedure_file,
                                      type='file',
                                      logger=lgr,
                                      refds=ds.path if ds else None,
                                      status='impossible',
                                      state=ex['state'],
                                      procedure_name=cmd_name,
                                      procedure_type=ex['type'],
                                      procedure_callfmt=ex['template'],
                                      message="No help available for '%s'" %
                                      name)

            yield res
            return

        if not ex['template']:
            raise ValueError("No idea how to execute procedure %s. "
                             "Missing 'execute' permissions?" % procedure_file)

        cmd = ex['template'].format(
            script=guard_for_format(quote_cmdlinearg(procedure_file)),
            ds=guard_for_format(quote_cmdlinearg(ds.path)) if ds else '',
            args=join_cmdline(args) if args else '')
        lgr.info(u"Running procedure %s", name)
        lgr.debug(u'Full procedure command: %r', cmd)
        for r in Run.__call__(
                cmd=cmd,
                dataset=ds,
                explicit=True,
                inputs=None,
                outputs=None,
                # pass through here
                on_failure='ignore',
                return_type='generator',
                result_renderer='disabled'):
            yield r

        if ds:
            # the procedure ran and we have to anticipate that it might have
            # changed the dataset config, so we need to trigger an unforced
            # reload.
            # we have to do this despite "being done here", because
            # run_procedure() runs in the same process and reuses dataset (config
            # manager) instances, and the next interaction with a dataset should
            # be able to count on an up-to-date config
            ds.config.reload()
Ejemplo n.º 6
0
    def __call__(
            archive,
            *,
            dataset=None,
            annex=None,
            add_archive_leading_dir=False,
            strip_leading_dirs=False,
            leading_dirs_depth=None,
            leading_dirs_consider=None,
            use_current_dir=False,
            delete=False,
            key=False,
            exclude=None,
            rename=None,
            existing='fail',
            annex_options=None,
            copy=False,
            commit=True,
            allow_dirty=False,
            stats=None,
            drop_after=False,
            delete_after=False):

        if exclude:
            exclude = ensure_tuple_or_list(exclude)
        if rename:
            rename = ensure_tuple_or_list(rename)
        ds = require_dataset(dataset,
                             check_installed=True,
                             purpose='add-archive-content')

        # set up common params for result records
        res_kwargs = {
            'action': 'add-archive-content',
            'logger': lgr,
        }

        if not isinstance(ds.repo, AnnexRepo):
            yield get_status_dict(
                ds=ds,
                status='impossible',
                message="Can't operate in a pure Git repository",
                **res_kwargs
            )
            return
        if annex:
            warnings.warn(
                "datalad add_archive_content's `annex` parameter is "
                "deprecated and will be removed in a future release. "
                "Use the 'dataset' parameter instead.",
                DeprecationWarning)
        annex = ds.repo
        # get the archive path relative from the ds root
        archive_path = resolve_path(archive, ds=dataset)
        # let Status decide whether we can act on the given file
        for s in ds.status(
                path=archive_path,
                on_failure='ignore',
                result_renderer='disabled'):
            if s['status'] == 'error':
                if 'path not underneath the reference dataset %s' in s['message']:
                    yield get_status_dict(
                        ds=ds,
                        status='impossible',
                        message='Can not add archive outside of the dataset',
                        **res_kwargs)
                    return
                # status errored & we haven't anticipated the cause. Bubble up
                yield s
                return
            elif s['state'] == 'untracked':
                # we can't act on an untracked file
                message = (
                    "Can not add an untracked archive. "
                    "Run 'datalad save {}'".format(archive)
                )
                yield get_status_dict(
                           ds=ds,
                           status='impossible',
                           message=message,
                           **res_kwargs)
                return

        if not allow_dirty and annex.dirty:
            # error out here if the dataset contains untracked changes
            yield get_status_dict(
                ds=ds,
                status='impossible',
                message=(
                    'clean dataset required. '
                    'Use `datalad status` to inspect unsaved changes'),
                **res_kwargs
            )
            return

        # ensure the archive exists, status doesn't error on a non-existing file
        if not key and not lexists(archive_path):
            yield get_status_dict(
                ds=ds,
                status='impossible',
                message=(
                    'No such file: {}'.format(archive_path),
                ),
                **res_kwargs
            )
            return

        if not key:
            check_path = archive_path.relative_to(ds.pathobj)
            # TODO: support adding archives content from outside the annex/repo
            origin = 'archive'
            # can become get_file_annexinfo once #6104 is merged
            key = annex.get_file_annexinfo(check_path)['key']
            if not key:
                raise RuntimeError(
                    f"Archive must be an annexed file in {ds}")
            archive_dir = Path(archive_path).parent
        else:
            origin = 'key'
            key = archive
            # We must not have anything to do with the location under .git/annex
            archive_dir = None
            # instead, we will go from the current directory
            use_current_dir = True

        archive_basename = file_basename(archive)

        if not key:
            # if we didn't manage to get a key, the file must be in Git
            raise NotImplementedError(
                "Provided file %s does not seem to be under annex control. "
                "We don't support adding everything straight to Git" % archive
            )

        # figure out our location
        pwd = getpwd()
        # are we in a subdirectory of the repository?
        pwd_in_root = annex.path == archive_dir
        # then we should add content under that subdirectory,
        # get the path relative to the repo top
        if use_current_dir:
            # extract the archive under the current directory, not the directory
            # where the archive is located
            extract_rpath = Path(pwd).relative_to(ds.path) \
                if not pwd_in_root \
                else None
        else:
            extract_rpath = archive_dir.relative_to(ds.path)

        # relpath might return '.' as the relative path to curdir, which then normalize_paths
        # would take as instructions to really go from cwd, so we need to sanitize
        if extract_rpath == curdir:
            extract_rpath = None

        try:
            key_rpath = annex.get_contentlocation(key)
        except:
            # the only probable reason for this to fail is that there is no
            # content present
            raise RuntimeError(
                "Content of %s seems to be N/A.  Fetch it first" % key
            )

        # now we simply need to go through every file in that archive and
        lgr.info(
            "Adding content of the archive %s into annex %s", archive, annex
        )

        from datalad.customremotes.archives import ArchiveAnnexCustomRemote

        # TODO: shouldn't we be able just to pass existing AnnexRepo instance?
        # TODO: we will use persistent cache so we could just (ab)use possibly extracted archive
        # OK, let's ignore that the following class is actually a special
        # remote implementation, and use it only to work with its cache
        annexarchive = ArchiveAnnexCustomRemote(annex=None,
                                                path=annex.path,
                                                persistent_cache=True)
        # We will move extracted content so it must not exist prior running
        annexarchive.cache.allow_existing = True
        earchive = annexarchive.cache[key_rpath]
        # make sure there is an enabled datalad-archives special remote
        ensure_datalad_remote(ds.repo, remote=ARCHIVES_SPECIAL_REMOTE,
                              autoenable=True)

        precommitted = False
        old_always_commit = annex.always_commit
        # batch mode is disabled when faking dates, we want to always commit
        annex.always_commit = annex.fake_dates_enabled
        if annex_options:
            if isinstance(annex_options, str):
                annex_options = split_cmdline(annex_options)
        delete_after_rpath = None

        prefix_dir = basename(tempfile.mkdtemp(prefix=".datalad",
                                               dir=annex.path)) \
            if delete_after \
            else None

        # dedicated stats which would be added to passed in (if any)
        outside_stats = stats
        stats = ActivityStats()

        try:
            # keep track of extracted files for progress bar logging
            file_counter = 0
            # iterative over all files in the archive
            extracted_files = list(earchive.get_extracted_files())
            # start a progress bar for extraction
            pbar_id = f'add-archive-{archive_path}'
            log_progress(
                lgr.info, pbar_id, 'Extracting archive',
                label="Extracting archive",
                unit=' Files',
                total = len(extracted_files),
                noninteractive_level = logging.INFO)
            for extracted_file in extracted_files:
                file_counter += 1
                files_left = len(extracted_files) - file_counter
                log_progress(
                    lgr.info, pbar_id,
                    "Files to extract %i ", files_left,
                    update=1,
                    increment=True,
                    noninteractive_level=logging.DEBUG)
                stats.files += 1
                extracted_path = Path(earchive.path) / Path(extracted_file)

                if extracted_path.is_symlink():
                    link_path = str(extracted_path.resolve())
                    if not exists(link_path):
                        # TODO: config  addarchive.symlink-broken='skip'
                        lgr.warning(
                            "Path %s points to non-existing file %s" %
                            (extracted_path, link_path)
                        )
                        stats.skipped += 1
                        continue
                        # TODO: check if points outside of archive - warn & skip

                url = annexarchive.get_file_url(
                    archive_key=key,
                    file=extracted_file,
                    size=os.stat(extracted_path).st_size)

                # preliminary target name which might get modified by renames
                target_file_orig = target_file = Path(extracted_file)

                # stream archives would not have had the original filename
                # information in them, so would be extracted under a name
                # derived from their annex key.
                # Provide ad-hoc handling for such cases
                if (len(extracted_files) == 1 and
                    Path(archive).suffix in ('.xz', '.gz', '.lzma') and
                        Path(key_rpath).name.startswith(Path(
                            extracted_file).name)):
                    # take archive's name without extension for filename & place
                    # where it was originally extracted
                    target_file = \
                        Path(extracted_file).parent / Path(archive).stem

                if strip_leading_dirs:
                    leading_dir = earchive.get_leading_directory(
                        depth=leading_dirs_depth, exclude=exclude,
                        consider=leading_dirs_consider)
                    leading_dir_len = \
                        len(leading_dir) + len(opsep) if leading_dir else 0
                    target_file = str(target_file)[leading_dir_len:]

                if add_archive_leading_dir:
                    # place extracted content under a directory corresponding to
                    # the archive name with suffix stripped.
                    target_file = Path(archive_basename) / target_file

                if rename:
                    target_file = apply_replacement_rules(rename,
                                                          str(target_file))

                # continue to next iteration if extracted_file in excluded
                if exclude:
                    try:  # since we need to skip outside loop from inside loop
                        for regexp in exclude:
                            if re.search(regexp, extracted_file):
                                lgr.debug(
                                    "Skipping {extracted_file} since contains "
                                    "{regexp} pattern".format(**locals()))
                                stats.skipped += 1
                                raise StopIteration
                    except StopIteration:
                        continue

                if delete_after:
                    # place target file in a temporary directory
                    target_file = Path(prefix_dir) / Path(target_file)
                    # but also allow for it in the orig
                    target_file_orig = Path(prefix_dir) / Path(target_file_orig)

                target_file_path_orig = annex.pathobj / target_file_orig

                # If we were invoked in a subdirectory, patch together the
                # correct path
                target_file_path = extract_rpath / target_file \
                    if extract_rpath else target_file
                target_file_path = annex.pathobj / target_file_path

                # when the file already exists...
                if lexists(target_file_path):
                    handle_existing = True
                    if md5sum(str(target_file_path)) == \
                            md5sum(str(extracted_path)):
                        if not annex.is_under_annex(str(extracted_path)):
                            # if under annex -- must be having the same content,
                            # we should just add possibly a new extra URL
                            # but if under git -- we cannot/should not do
                            # anything about it ATM
                            if existing != 'overwrite':
                                continue
                        else:
                            handle_existing = False
                    if not handle_existing:
                        pass  # nothing... just to avoid additional indentation
                    elif existing == 'fail':
                        message = \
                            "{} exists, but would be overwritten by new file " \
                            "{}. Consider adjusting --existing".format\
                            (target_file_path, extracted_file)
                        yield get_status_dict(
                            ds=ds,
                            status='error',
                            message=message,
                            **res_kwargs)
                        return
                    elif existing == 'overwrite':
                        stats.overwritten += 1
                        # to make sure it doesn't conflict -- might have been a
                        # tree
                        rmtree(target_file_path)
                    else:
                        # an elaborate dance to piece together new archive names
                        target_file_path_orig_ = target_file_path

                        # To keep extension intact -- operate on the base of the
                        # filename
                        p, fn = os.path.split(target_file_path)
                        ends_with_dot = fn.endswith('.')
                        fn_base, fn_ext = file_basename(fn, return_ext=True)

                        if existing == 'archive-suffix':
                            fn_base += '-%s' % archive_basename
                        elif existing == 'numeric-suffix':
                            pass  # archive-suffix will have the same logic
                        else:
                            # we shouldn't get here, argparse should catch a
                            # non-existing value for --existing right away
                            raise ValueError(existing)
                        # keep incrementing index in the suffix until file
                        # doesn't collide
                        suf, i = '', 0
                        while True:
                            connector = \
                                ('.' if (fn_ext or ends_with_dot) else '')
                            file = fn_base + suf + connector + fn_ext
                            target_file_path_new =  \
                                Path(p) / Path(file)
                            if not lexists(target_file_path_new):
                                # we found a file name that is not yet taken
                                break
                            lgr.debug("Iteration %i of file name finding. "
                                      "File %s already exists", i,
                                      target_file_path_new)
                            i += 1
                            suf = '.%d' % i
                        target_file_path = target_file_path_new
                        lgr.debug("Original file %s will be saved into %s"
                                  % (target_file_path_orig_, target_file_path))
                        # TODO: should we reserve smth like
                        # stats.clobbed += 1

                if target_file_path != target_file_path_orig:
                    stats.renamed += 1

                if copy:
                    raise NotImplementedError(
                        "Not yet copying from 'persistent' cache"
                    )

                lgr.debug("Adding %s to annex pointing to %s and with options "
                          "%r", target_file_path, url, annex_options)

                out_json = annex.add_url_to_file(
                    target_file_path,
                    url, options=annex_options,
                    batch=True)

                if 'key' in out_json and out_json['key'] is not None:
                    # annex.is_under_annex(target_file, batch=True):
                    # due to http://git-annex.branchable.com/bugs/annex_drop_is_not___34__in_effect__34___for_load_which_was___34__addurl_--batch__34__ed_but_not_yet_committed/?updated
                    # we need to maintain a list of those to be dropped files
                    if drop_after:
                        # drop extracted files after adding to annex
                        annex.drop_key(out_json['key'], batch=True)
                        stats.dropped += 1
                    stats.add_annex += 1
                else:
                    lgr.debug("File {} was added to git, not adding url".format(
                        target_file_path))
                    stats.add_git += 1

                if delete_after:
                    # we count the removal here, but don't yet perform it
                    # to not interfer with batched processes - any pure Git
                    # action invokes precommit which closes batched processes.
                    stats.removed += 1

                # Done with target_file -- just to have clear end of the loop
                del target_file

            if delete and archive and origin != 'key':
                lgr.debug("Removing the original archive {}".format(archive))
                # force=True since some times might still be staged and fail
                annex.remove(str(archive_path), force=True)

            lgr.info("Finished adding %s: %s", archive, stats.as_str(mode='line'))

            if outside_stats:
                outside_stats += stats
            if delete_after:
                # force since not committed. r=True for -r (passed into git call
                # to recurse)
                delete_after_rpath = opj(extract_rpath, prefix_dir) \
                    if extract_rpath else prefix_dir
                delete_after_rpath = resolve_path(delete_after_rpath,
                                                  ds=dataset)
                lgr.debug(
                    "Removing extracted and annexed files under %s",
                    delete_after_rpath
                )
                annex.remove(str(delete_after_rpath), r=True, force=True)
            if commit:
                archive_rpath = archive_path.relative_to(ds.path)
                commit_stats = outside_stats if outside_stats else stats
                # so batched ones close and files become annex symlinks etc
                annex.precommit()
                precommitted = True
                if any(r.get('state', None) != 'clean'
                       for p, r in annex.status(untracked='no').items()):
                    annex.commit(
                        "Added content extracted from %s %s\n\n%s" %
                        (origin, archive_rpath,
                         commit_stats.as_str(mode='full')),
                        _datalad_msg=True
                    )
                    commit_stats.reset()
            else:
                # don't commit upon completion
                pass
        finally:
            # take down the progress bar
            log_progress(
                lgr.info, pbar_id,
                'Finished extraction',
                noninteractive_level=logging.INFO)
            # since we batched addurl, we should close those batched processes
            # if haven't done yet.  explicitly checked to avoid any possible
            # "double-action"
            if not precommitted:
                annex.precommit()

            if delete_after_rpath:
                delete_after_path = opj(annex.path, delete_after_rpath)
                delete_after_rpath = resolve_path(delete_after_rpath,
                                                  ds=dataset)
                if exists(delete_after_path):  # should not be there
                    # but for paranoid yoh
                    lgr.warning(
                        "Removing temporary directory under which extracted "
                        "files were annexed and should have been removed: %s",
                        delete_after_path)
                    rmtree(delete_after_path)

            annex.always_commit = old_always_commit
            # remove what is left and/or everything upon failure
            earchive.clean(force=True)
            # remove tempfile directories (not cleaned up automatically):
            if prefix_dir is not None and lexists(prefix_dir):
                os.rmdir(prefix_dir)
        yield get_status_dict(
            ds=ds,
            status='ok',
            **res_kwargs)
        return annex