Example #1
def check_ensure_datalad_remote_maybe_enable(autoenable, path):
    """Check that _ensure_datalad_remote makes the 'datalad' remote available in a clone."""
    path = Path(path)
    ds_a = Dataset(path / "a").create(force=True)
    init_datalad_remote(ds_a.repo,
                        DATALAD_SPECIAL_REMOTE,
                        autoenable=autoenable)

    ds_b = clone(source=ds_a.path, path=path / "b")
    repo = ds_b.repo
    if not autoenable:
        assert_not_in("datalad", repo.get_remotes())
    _ensure_datalad_remote(repo)
    assert_in("datalad", repo.get_remotes())
Example #2
    def __init__(self, path=None, puke_if_exists=True):
        if not path:
            path = tempfile.mktemp(**get_tempfile_kwargs({}, prefix='testrepo'))
            # to be removed upon teardown
            _TEMP_PATHS_GENERATED.append(path)
        if puke_if_exists and exists(path):
            raise RuntimeError("Directory %s for test repo already exist" % path)
        # swallow logs so we don't print all those about crippled FS etc
        with swallow_logs():
            self.repo = self.REPO_CLASS(path)
            # For additional testing that our datalad remote does not interfere
            # and manages to handle all http urls and requests:
            if self.REPO_CLASS is AnnexRepo and \
                    os.environ.get('DATALAD_TESTS_DATALADREMOTE'):
                init_datalad_remote(self.repo, 'datalad', autoenable=True)

        self._created = False
Example #3
def _ensure_datalad_remote(repo):
    """Initialize and enable datalad special remote if it isn't already."""
    dl_remote = None
    for info in repo.get_special_remotes().values():
        if info["externaltype"] == "datalad":
            dl_remote = info["name"]
            break

    if not dl_remote:
        from datalad.consts import DATALAD_SPECIAL_REMOTE
        from datalad.customremotes.base import init_datalad_remote

        init_datalad_remote(repo, DATALAD_SPECIAL_REMOTE, autoenable=True)
    elif repo.is_special_annex_remote(dl_remote, check_if_known=False):
        lgr.debug("datalad special remote '%s' is already enabled", dl_remote)
    else:
        lgr.debug("datalad special remote '%s' found. Enabling", dl_remote)
        repo.enable_remote(dl_remote)
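
A minimal usage sketch, not part of the quoted source: the helper above would typically run against a freshly cloned dataset before fetching content that is only reachable through the datalad special remote (the URL and file path below are placeholders).

from datalad.api import clone

# Hypothetical example: clone a dataset and make sure the special remote is usable.
ds = clone(source="https://example.com/dataset.git", path="/tmp/ds-clone")
_ensure_datalad_remote(ds.repo)   # initialize or enable the 'datalad' remote if missing
ds.get("some/annexed/file.dat")   # content served by the special remote can now be fetched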
Example #4
    def __init__(self, path=None, puke_if_exists=True):
        if not path:
            from .utils import get_tempfile_kwargs
            path = tempfile.mktemp(**get_tempfile_kwargs({}, prefix='testrepo'))
            # to be removed upon teardown
            _TEMP_PATHS_GENERATED.append(path)
        if puke_if_exists and exists(path):
            raise RuntimeError("Directory %s for test repo already exist" % path)
        # swallow logs so we don't print all those about crippled FS etc
        with swallow_logs():
            self.repo = self.REPO_CLASS(path)
            # For additional testing that our datalad remote does not interfere
            # and manages to handle all http urls and requests:
            if self.REPO_CLASS is AnnexRepo and \
                    os.environ.get('DATALAD_TESTS_DATALADREMOTE'):
                init_datalad_remote(self.repo, 'datalad', autoenable=True)

        self._created = False
Example #5
def test_download_docker_blob(path=None):
    from datalad.consts import (
        DATALAD_SPECIAL_REMOTE,
        DATALAD_SPECIAL_REMOTES_UUIDS,
    )
    from datalad.customremotes.base import init_datalad_remote

    with patch_config({"datalad.repo.backend": "SHA256E"}):
        ds = Dataset(path).create()
    ds_repo = ds.repo
    init_datalad_remote(ds_repo, DATALAD_SPECIAL_REMOTE)

    id_ = "f0b02e9d092d905d0d87a8455a1ae3e9bb47b4aa3dc125125ca5cd10d6441c9f"
    outfile = ds_repo.pathobj / "blob"
    url = "https://registry-1.docker.io/v2/library/busybox/blobs/sha256:" + id_
    ds.download_url(urls=[url], path=str(outfile))

    annex_info = ds.repo.get_content_annexinfo(paths=[outfile], init=None)
    eq_(id_, annex_info[outfile]["keyname"])
    assert_in(DATALAD_SPECIAL_REMOTES_UUIDS[DATALAD_SPECIAL_REMOTE],
              ds_repo.whereis([str(outfile)])[0])
Example #6
    def __call__(archive, annex=None,
                 add_archive_leading_dir=False,
                 strip_leading_dirs=False, leading_dirs_depth=None, leading_dirs_consider=None,
                 use_current_dir=False,
                 delete=False, key=False, exclude=None, rename=None, existing='fail',
                 annex_options=None, copy=False, commit=True, allow_dirty=False,
                 stats=None, drop_after=False, delete_after=False):
        """
        Returns
        -------
        annex
        """
        if exclude:
            exclude = assure_tuple_or_list(exclude)
        if rename:
            rename = assure_tuple_or_list(rename)

        # TODO: we could ask the user whether they want to convert
        # their git repo into an annex
        archive_path = archive
        pwd = getpwd()
        if annex is None:
            annex = get_repo_instance(pwd, class_=AnnexRepo)
            if not isabs(archive):
                # if not absolute -- interpret as relative to the working directory:
                archive_path = normpath(opj(realpath(pwd), archive))
                # abspath(archive) is not "good" since dereferences links in the path
                # archive_path = abspath(archive)
        elif not isabs(archive):
            # if we are given an annex, then assume that given path is within annex, not
            # relative to PWD
            archive_path = opj(annex.path, archive)
        annex_path = annex.path

        # _rpath below should depict paths relative to the top of the annex
        archive_rpath = relpath(
            archive_path,
            # Use `get_dataset_root` to avoid resolving the leading path. If no
            # repo is found, downstream code will raise FileNotInRepositoryError.
            get_dataset_root(archive_path) or ".")

        if archive in annex.untracked_files:
            raise RuntimeError(
                "The archive is not under annex yet. You should run 'datalad "
                "add {}' first".format(archive))

        if not allow_dirty and annex.dirty:
            # already saved me once ;)
            raise RuntimeError("You better commit all the changes and untracked files first")

        if not key:
            # we were given a file which must exist
            if not exists(archive_path):
                raise ValueError("Archive {} does not exist".format(archive))
            # TODO: support adding archives content from outside the annex/repo
            origin = 'archive'
            key = annex.get_file_key(archive_rpath)
            archive_dir = dirname(archive_path)
        else:
            origin = 'key'
            key = archive
            archive_dir = None  # We must not have anything to do with the location under .git/annex

        archive_basename = file_basename(archive)

        if not key:
            # TODO: allow for it to be under git???  how to reference then?
            raise NotImplementedError(
                "Provided file %s is not under annex.  We don't support yet adding everything "
                "straight to git" % archive
            )

        # are we in a subdirectory of the repository?
        pwd_under_annex = commonprefix([pwd, annex_path]) == annex_path
        #  then we should add content under that
        # subdirectory,
        # get the path relative to the repo top
        if use_current_dir:
            # if outside -- extract to the top of repo
            extract_rpath = relpath(pwd, annex_path) \
                if pwd_under_annex \
                else None
        else:
            extract_rpath = relpath(archive_dir, annex_path)

        # relpath might return '.' as the relative path to curdir, which then normalize_paths
        # would take as instructions to really go from cwd, so we need to sanitize
        if extract_rpath == curdir:
            extract_rpath = None  # no special relpath from top of the repo

        # and operate from now on with the key or wherever the content is available "canonically"
        try:
            key_rpath = annex.get_contentlocation(key)  # , relative_to_top=True)
        except Exception:
            raise RuntimeError("Content of %s seems to be N/A.  Fetch it first" % key)

        # now we simply need to go through every file in that archive and add it to the annex
        lgr.info("Adding content of the archive %s into annex %s", archive, annex)

        from datalad.customremotes.archives import ArchiveAnnexCustomRemote
        # TODO: shouldn't we be able just to pass existing AnnexRepo instance?
        # TODO: we will use persistent cache so we could just (ab)use possibly extracted archive
        annexarchive = ArchiveAnnexCustomRemote(path=annex_path, persistent_cache=True)
        # We will move extracted content so it must not exist prior running
        annexarchive.cache.allow_existing = True
        earchive = annexarchive.cache[key_rpath]

        # TODO: check if may be it was already added
        if ARCHIVES_SPECIAL_REMOTE not in annex.get_remotes():
            init_datalad_remote(annex, ARCHIVES_SPECIAL_REMOTE, autoenable=True)
        else:
            lgr.debug("Special remote {} already exists".format(ARCHIVES_SPECIAL_REMOTE))

        precommitted = False
        delete_after_rpath = None
        try:
            old_always_commit = annex.always_commit
            # When faking dates, batch mode is disabled, so we want to always
            # commit.
            annex.always_commit = annex.fake_dates_enabled

            if annex_options:
                if isinstance(annex_options, str):
                    annex_options = split_cmdline(annex_options)

            leading_dir = earchive.get_leading_directory(
                depth=leading_dirs_depth, exclude=exclude, consider=leading_dirs_consider) \
                if strip_leading_dirs else None
            leading_dir_len = len(leading_dir) + len(opsep) if leading_dir else 0

            # we need to create a temporary directory at the top level which would later be
            # removed
            prefix_dir = basename(tempfile.mktemp(prefix=".datalad", dir=annex_path)) \
                if delete_after \
                else None

            # dedicated stats which would be added to passed in (if any)
            outside_stats = stats
            stats = ActivityStats()

            for extracted_file in earchive.get_extracted_files():
                stats.files += 1
                extracted_path = opj(earchive.path, extracted_file)

                if islink(extracted_path):
                    link_path = realpath(extracted_path)
                    if not exists(link_path):  # TODO: config  addarchive.symlink-broken='skip'
                        lgr.warning("Path %s points to non-existing file %s" % (extracted_path, link_path))
                        stats.skipped += 1
                        continue
                        # TODO: check if points outside of the archive -- warning and skip

                # preliminary target name which might get modified by renames
                target_file_orig = target_file = extracted_file

                # strip leading dirs
                target_file = target_file[leading_dir_len:]

                if add_archive_leading_dir:
                    target_file = opj(archive_basename, target_file)

                if rename:
                    target_file = apply_replacement_rules(rename, target_file)

                # continue to next iteration if extracted_file in excluded
                if exclude:
                    try:  # since we need to skip outside loop from inside loop
                        for regexp in exclude:
                            if re.search(regexp, extracted_file):
                                lgr.debug(
                                    "Skipping {extracted_file} since it contains the {regexp} pattern".format(**locals()))
                                stats.skipped += 1
                                raise StopIteration
                    except StopIteration:
                        continue

                if prefix_dir:
                    target_file = opj(prefix_dir, target_file)
                    # but also allow for it in the orig
                    target_file_orig = opj(prefix_dir, target_file_orig)

                target_file_path_orig = opj(annex.path, target_file_orig)

                url = annexarchive.get_file_url(archive_key=key, file=extracted_file, size=os.stat(extracted_path).st_size)

                # lgr.debug("mv {extracted_path} {target_file}. URL: {url}".format(**locals()))

                target_file_path = opj(extract_rpath, target_file) \
                    if extract_rpath else target_file

                target_file_path = opj(annex.path, target_file_path)

                if lexists(target_file_path):
                    handle_existing = True
                    if md5sum(target_file_path) == md5sum(extracted_path):
                        if not annex.is_under_annex(extracted_path):
                            # if under annex -- must be having the same content,
                            # we should just add possibly a new extra URL
                            # but if under git -- we cannot/should not do
                            # anything about it ATM
                            if existing != 'overwrite':
                                continue
                        else:
                            handle_existing = False
                    if not handle_existing:
                        pass  # nothing... just to avoid additional indentation
                    elif existing == 'fail':
                        raise RuntimeError(
                            "File {} already exists, but new (?) file {} was instructed "
                            "to be placed there while overwrite=False".format(
                                target_file_path, extracted_file)
                        )
                    elif existing == 'overwrite':
                        stats.overwritten += 1
                        # to make sure it doesn't conflict -- might have been a tree
                        rmtree(target_file_path)
                    else:
                        target_file_path_orig_ = target_file_path

                        # To keep extension intact -- operate on the base of the filename
                        p, fn = os.path.split(target_file_path)
                        ends_with_dot = fn.endswith('.')
                        fn_base, fn_ext = file_basename(fn, return_ext=True)

                        if existing == 'archive-suffix':
                            fn_base += '-%s' % archive_basename
                        elif existing == 'numeric-suffix':
                            pass  # archive-suffix will have the same logic
                        else:
                            raise ValueError(existing)
                        # keep incrementing index in the suffix until file doesn't collide
                        suf, i = '', 0
                        while True:
                            target_file_path_new = opj(p, fn_base + suf + ('.' if (fn_ext or ends_with_dot) else '') + fn_ext)
                            if not lexists(target_file_path_new):
                                break
                            lgr.debug("File %s already exists" % target_file_path_new)
                            i += 1
                            suf = '.%d' % i
                        target_file_path = target_file_path_new
                        lgr.debug("Original file %s will be saved into %s"
                                  % (target_file_path_orig_, target_file_path))
                        # TODO: should we reserve smth like
                        # stats.clobbed += 1

                if target_file_path != target_file_path_orig:
                    stats.renamed += 1

                #target_path = opj(getpwd(), target_file)
                if copy:
                    raise NotImplementedError("Not yet copying from 'persistent' cache")
                else:
                    # os.renames(extracted_path, target_path)
                    # addurl implementation relying on annex'es addurl below would actually copy
                    pass

                lgr.debug("Adding %s to annex pointing to %s and with options %r",
                          target_file_path, url, annex_options)

                out_json = annex.add_url_to_file(
                    target_file_path,
                    url, options=annex_options,
                    batch=True)

                if 'key' in out_json and out_json['key'] is not None:  # annex.is_under_annex(target_file, batch=True):
                    # due to http://git-annex.branchable.com/bugs/annex_drop_is_not___34__in_effect__34___for_load_which_was___34__addurl_--batch__34__ed_but_not_yet_committed/?updated
                    # we need to maintain a list of those to be dropped files
                    if drop_after:
                        annex.drop_key(out_json['key'], batch=True)
                        stats.dropped += 1
                    stats.add_annex += 1
                else:
                    lgr.debug("File {} was added to git, not adding url".format(target_file_path))
                    stats.add_git += 1

                if delete_after:
                    # delayed removal so it doesn't interfere with batched processes, since any pure
                    # git action invokes precommit which closes batched processes; but we still want to count it
                    stats.removed += 1

                # # chaining 3 annex commands, 2 of which not batched -- less efficient but more bullet proof etc
                # annex.add(target_path, options=annex_options)
                # # above action might add to git or to annex
                # if annex.file_has_content(target_path):
                #     # if not --  it was added to git, if in annex, it is present and output is True
                #     annex.add_url_to_file(target_file, url, options=['--relaxed'], batch=True)
                #     stats.add_annex += 1
                # else:
                #     lgr.debug("File {} was added to git, not adding url".format(target_file))
                #     stats.add_git += 1
                # # TODO: actually check if it is anyhow different from a previous version. If not
                # # then it wasn't really added

                del target_file  # Done with target_file -- just to have clear end of the loop

            if delete and archive and origin != 'key':
                lgr.debug("Removing the original archive {}".format(archive))
                # force=True since sometimes it might still be staged and fail
                annex.remove(archive_rpath, force=True)

            lgr.info("Finished adding %s: %s" % (archive, stats.as_str(mode='line')))

            if outside_stats:
                outside_stats += stats
            if delete_after:
                # force since not committed. r=True for -r (passed into git call
                # to recurse)
                delete_after_rpath = opj(extract_rpath, prefix_dir) if extract_rpath else prefix_dir
                lgr.debug(
                    "Removing extracted and annexed files under %s",
                    delete_after_rpath
                )
                annex.remove(delete_after_rpath, r=True, force=True)
            if commit:
                commit_stats = outside_stats if outside_stats else stats
                annex.precommit()  # so batched ones close and files become annex symlinks etc
                precommitted = True
                if any(r.get('state', None) != 'clean'
                       for p, r in annex.status(untracked='no').items()):
                    annex.commit(
                        "Added content extracted from %s %s\n\n%s" %
                        (origin, archive_rpath, commit_stats.as_str(mode='full')),
                        _datalad_msg=True
                    )
                    commit_stats.reset()
        finally:
            # since we batched addurl, we should close those batched processes
            # if we haven't done so yet.  Explicitly checked to avoid any possible
            # "double-action"
            if not precommitted:
                annex.precommit()

            if delete_after_rpath:
                delete_after_path = opj(annex_path, delete_after_rpath)
                if exists(delete_after_path):  # should not be there
                    # but for paranoid yoh
                    lgr.warning(
                        "Removing temporary directory under which extracted "
                        "files were annexed and should have been removed: %s",
                        delete_after_path)
                    rmtree(delete_after_path)

            annex.always_commit = old_always_commit
            # remove what is left and/or everything upon failure
            earchive.clean(force=True)

        return annex
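
For orientation, a hedged sketch of invoking this command through datalad's Python API; the archive path and option values below are illustrative, not taken from the source above.

import datalad.api as dl

# Hypothetical call: the archive must already be annexed and committed,
# as enforced by the checks at the top of __call__ above.
dl.add_archive_content(
    'archives/data.tar.gz',
    strip_leading_dirs=True,    # drop a common leading directory from extracted paths
    existing='archive-suffix',  # disambiguate name collisions using the archive's name
    delete=False,               # keep the original archive in the annex
)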
Example #7
    def __call__(archive, annex=None,
                 add_archive_leading_dir=False,
                 strip_leading_dirs=False, leading_dirs_depth=None, leading_dirs_consider=None,
                 use_current_dir=False,
                 delete=False, key=False, exclude=None, rename=None, existing='fail',
                 annex_options=None, copy=False, commit=True, allow_dirty=False,
                 stats=None, drop_after=False, delete_after=False):
        """
        Returns
        -------
        annex
        """
        if exclude:
            exclude = assure_tuple_or_list(exclude)
        if rename:
            rename = assure_tuple_or_list(rename)

        # TODO: we could ask the user whether they want to convert
        # their git repo into an annex
        archive_path = archive
        pwd = getpwd()
        if annex is None:
            annex = get_repo_instance(pwd, class_=AnnexRepo)
            if not isabs(archive):
                # if not absolute -- interpret as relative to the working directory:
                archive_path = normpath(opj(realpath(pwd), archive))
                # abspath(archive) is not "good" since dereferences links in the path
                # archive_path = abspath(archive)
        elif not isabs(archive):
            # if we are given an annex, then assume that given path is within annex, not
            # relative to PWD
            archive_path = opj(annex.path, archive)
        annex_path = annex.path

        # _rpath below should depict paths relative to the top of the annex
        archive_rpath = relpath(
            archive_path,
            # Use `get_dataset_root` to avoid resolving the leading path. If no
            # repo is found, downstream code will raise FileNotInRepositoryError.
            get_dataset_root(archive_path) or ".")

        if archive in annex.untracked_files:
            raise RuntimeError(
                "The archive is not under annex yet. You should run 'datalad "
                "add {}' first".format(archive))

        # TODO: somewhat too cruel -- maybe make this an option or similar...
        if not allow_dirty and annex.dirty:
            # already saved me once ;)
            raise RuntimeError("You better commit all the changes and untracked files first")

        if not key:
            # we were given a file which must exist
            if not exists(archive_path):
                raise ValueError("Archive {} does not exist".format(archive))
            # TODO: support adding archives content from outside the annex/repo
            origin = 'archive'
            key = annex.get_file_key(archive_rpath)
            archive_dir = dirname(archive_path)
        else:
            origin = 'key'
            key = archive
            archive_dir = None  # We must not have anything to do with the location under .git/annex

        archive_basename = file_basename(archive)

        if not key:
            # TODO: allow for it to be under git???  how to reference then?
            raise NotImplementedError(
                "Provided file %s is not under annex.  We don't support yet adding everything "
                "straight to git" % archive
            )

        # are we in a subdirectory of the repository?
        pwd_under_annex = commonprefix([pwd, annex_path]) == annex_path
        #  then we should add content under that
        # subdirectory,
        # get the path relative to the repo top
        if use_current_dir:
            # if outside -- extract to the top of repo
            extract_rpath = relpath(pwd, annex_path) \
                if pwd_under_annex \
                else None
        else:
            extract_rpath = relpath(archive_dir, annex_path)

        # relpath might return '.' as the relative path to curdir, which then normalize_paths
        # would take as instructions to really go from cwd, so we need to sanitize
        if extract_rpath == curdir:
            extract_rpath = None  # no special relpath from top of the repo

        # and operate from now on with the key or wherever the content is available "canonically"
        try:
            key_rpath = annex.get_contentlocation(key)  # , relative_to_top=True)
        except Exception:
            raise RuntimeError("Content of %s seems to be N/A.  Fetch it first" % key)

        # now we simply need to go through every file in that archive and add it to the annex
        lgr.info("Adding content of the archive %s into annex %s", archive, annex)

        from datalad.customremotes.archives import ArchiveAnnexCustomRemote
        # TODO: shouldn't we be able just to pass existing AnnexRepo instance?
        # TODO: we will use persistent cache so we could just (ab)use possibly extracted archive
        annexarchive = ArchiveAnnexCustomRemote(path=annex_path, persistent_cache=True)
        # We will move extracted content so it must not exist prior running
        annexarchive.cache.allow_existing = True
        earchive = annexarchive.cache[key_rpath]

        # TODO: check if may be it was already added
        if ARCHIVES_SPECIAL_REMOTE not in annex.get_remotes():
            init_datalad_remote(annex, ARCHIVES_SPECIAL_REMOTE, autoenable=True)
        else:
            lgr.debug("Special remote {} already exists".format(ARCHIVES_SPECIAL_REMOTE))

        precommitted = False
        delete_after_rpath = None
        try:
            old_always_commit = annex.always_commit
            # When faking dates, batch mode is disabled, so we want to always
            # commit.
            annex.always_commit = annex.fake_dates_enabled

            if annex_options:
                if isinstance(annex_options, string_types):
                    annex_options = shlex.split(annex_options)

            leading_dir = earchive.get_leading_directory(
                depth=leading_dirs_depth, exclude=exclude, consider=leading_dirs_consider) \
                if strip_leading_dirs else None
            leading_dir_len = len(leading_dir) + len(opsep) if leading_dir else 0

            # we need to create a temporary directory at the top level which would later be
            # removed
            prefix_dir = basename(tempfile.mktemp(prefix=".datalad", dir=annex_path)) \
                if delete_after \
                else None

            # dedicated stats which would be added to passed in (if any)
            outside_stats = stats
            stats = ActivityStats()

            for extracted_file in earchive.get_extracted_files():
                stats.files += 1
                extracted_path = opj(earchive.path, extracted_file)

                if islink(extracted_path):
                    link_path = realpath(extracted_path)
                    if not exists(link_path):  # TODO: config  addarchive.symlink-broken='skip'
                        lgr.warning("Path %s points to non-existing file %s" % (extracted_path, link_path))
                        stats.skipped += 1
                        continue
                        # TODO: check if points outside of the archive -- warning and skip

                # preliminary target name which might get modified by renames
                target_file_orig = target_file = extracted_file

                # strip leading dirs
                target_file = target_file[leading_dir_len:]

                if add_archive_leading_dir:
                    target_file = opj(archive_basename, target_file)

                if rename:
                    target_file = apply_replacement_rules(rename, target_file)

                # continue to next iteration if extracted_file in excluded
                if exclude:
                    try:  # since we need to skip outside loop from inside loop
                        for regexp in exclude:
                            if re.search(regexp, extracted_file):
                                lgr.debug(
                                    "Skipping {extracted_file} since it contains the {regexp} pattern".format(**locals()))
                                stats.skipped += 1
                                raise StopIteration
                    except StopIteration:
                        continue

                if prefix_dir:
                    target_file = opj(prefix_dir, target_file)
                    # but also allow for it in the orig
                    target_file_orig = opj(prefix_dir, target_file_orig)

                target_file_path_orig = opj(annex.path, target_file_orig)

                url = annexarchive.get_file_url(archive_key=key, file=extracted_file, size=os.stat(extracted_path).st_size)

                # lgr.debug("mv {extracted_path} {target_file}. URL: {url}".format(**locals()))

                target_file_path = opj(extract_rpath, target_file) \
                    if extract_rpath else target_file

                target_file_path = opj(annex.path, target_file_path)

                if lexists(target_file_path):
                    handle_existing = True
                    if md5sum(target_file_path) == md5sum(extracted_path):
                        if not annex.is_under_annex(extracted_path):
                            # if under annex -- must be having the same content,
                            # we should just add possibly a new extra URL
                            # but if under git -- we cannot/should not do
                            # anything about it ATM
                            if existing != 'overwrite':
                                continue
                        else:
                            handle_existing = False
                    if not handle_existing:
                        pass  # nothing... just to avoid additional indentation
                    elif existing == 'fail':
                        raise RuntimeError(
                            "File {} already exists, but new (?) file {} was instructed "
                            "to be placed there while overwrite=False".format(
                                target_file_path, extracted_file)
                        )
                    elif existing == 'overwrite':
                        stats.overwritten += 1
                        # to make sure it doesn't conflict -- might have been a tree
                        rmtree(target_file_path)
                    else:
                        target_file_path_orig_ = target_file_path

                        # To keep extension intact -- operate on the base of the filename
                        p, fn = os.path.split(target_file_path)
                        ends_with_dot = fn.endswith('.')
                        fn_base, fn_ext = file_basename(fn, return_ext=True)

                        if existing == 'archive-suffix':
                            fn_base += '-%s' % archive_basename
                        elif existing == 'numeric-suffix':
                            pass  # archive-suffix will have the same logic
                        else:
                            raise ValueError(existing)
                        # keep incrementing index in the suffix until file doesn't collide
                        suf, i = '', 0
                        while True:
                            target_file_path_new = opj(p, fn_base + suf + ('.' if (fn_ext or ends_with_dot) else '') + fn_ext)
                            if not lexists(target_file_path_new):
                                break
                            lgr.debug("File %s already exists" % target_file_path_new)
                            i += 1
                            suf = '.%d' % i
                        target_file_path = target_file_path_new
                        lgr.debug("Original file %s will be saved into %s"
                                  % (target_file_path_orig_, target_file_path))
                        # TODO: should we reserve smth like
                        # stats.clobbed += 1

                if target_file_path != target_file_path_orig:
                    stats.renamed += 1

                #target_path = opj(getpwd(), target_file)
                if copy:
                    raise NotImplementedError("Not yet copying from 'persistent' cache")
                else:
                    # os.renames(extracted_path, target_path)
                    # addurl implementation relying on annex'es addurl below would actually copy
                    pass

                lgr.debug("Adding %s to annex pointing to %s and with options %r",
                          target_file_path, url, annex_options)

                out_json = annex.add_url_to_file(
                    target_file_path,
                    url, options=annex_options,
                    batch=True)

                if 'key' in out_json and out_json['key'] is not None:  # annex.is_under_annex(target_file, batch=True):
                    # due to http://git-annex.branchable.com/bugs/annex_drop_is_not___34__in_effect__34___for_load_which_was___34__addurl_--batch__34__ed_but_not_yet_committed/?updated
                    # we need to maintain a list of those to be dropped files
                    if drop_after:
                        annex.drop_key(out_json['key'], batch=True)
                        stats.dropped += 1
                    stats.add_annex += 1
                else:
                    lgr.debug("File {} was added to git, not adding url".format(target_file_path))
                    stats.add_git += 1

                if delete_after:
                    # delayed removal so it doesn't interfere with batched processes, since any pure
                    # git action invokes precommit which closes batched processes; but we still want to count it
                    stats.removed += 1

                # # chaining 3 annex commands, 2 of which not batched -- less efficient but more bullet proof etc
                # annex.add(target_path, options=annex_options)
                # # above action might add to git or to annex
                # if annex.file_has_content(target_path):
                #     # if not --  it was added to git, if in annex, it is present and output is True
                #     annex.add_url_to_file(target_file, url, options=['--relaxed'], batch=True)
                #     stats.add_annex += 1
                # else:
                #     lgr.debug("File {} was added to git, not adding url".format(target_file))
                #     stats.add_git += 1
                # # TODO: actually check if it is anyhow different from a previous version. If not
                # # then it wasn't really added

                del target_file  # Done with target_file -- just to have clear end of the loop

            if delete and archive and origin != 'key':
                lgr.debug("Removing the original archive {}".format(archive))
                # force=True since sometimes it might still be staged and fail
                annex.remove(archive_rpath, force=True)

            lgr.info("Finished adding %s: %s" % (archive, stats.as_str(mode='line')))

            if outside_stats:
                outside_stats += stats
            if delete_after:
                # force since not committed. r=True for -r (passed into git call
                # to recurse)
                delete_after_rpath = opj(extract_rpath, prefix_dir) if extract_rpath else prefix_dir
                lgr.debug(
                    "Removing extracted and annexed files under %s",
                    delete_after_rpath
                )
                annex.remove(delete_after_rpath, r=True, force=True)
            if commit:
                commit_stats = outside_stats if outside_stats else stats
                annex.precommit()  # so batched ones close and files become annex symlinks etc
                precommitted = True
                if annex.is_dirty(untracked_files=False):
                    annex.commit(
                        "Added content extracted from %s %s\n\n%s" %
                        (origin, archive_rpath, commit_stats.as_str(mode='full')),
                        _datalad_msg=True
                    )
                    commit_stats.reset()
        finally:
            # since we batched addurl, we should close those batched processes
            # if we haven't done so yet.  Explicitly checked to avoid any possible
            # "double-action"
            if not precommitted:
                annex.precommit()

            if delete_after_rpath:
                delete_after_path = opj(annex_path, delete_after_rpath)
                if exists(delete_after_path):  # should not be there
                    # but for paranoid yoh
                    lgr.warning(
                        "Removing temporary directory under which extracted "
                        "files were annexed and should have been removed: %s",
                        delete_after_path)
                    rmtree(delete_after_path)

            annex.always_commit = old_always_commit
            # remove what is left and/or everything upon failure
            earchive.clean(force=True)

        return annex
#!/usr/bin/env python3

import os.path as op
import sys
import xml.dom.minidom

import datalad.api as dl
from datalad.support.network import download_url

ds = dl.Dataset(op.dirname(op.dirname(op.realpath(__file__))))

if 'datalad' not in ds.repo.get_remotes():
    from datalad.customremotes.base import init_datalad_remote
    init_datalad_remote(ds.repo, 'datalad', autoenable=True)

# doc = xml.dom.minidom.parse('/tmp/outi-7T.xml')
topurl = 'https://db.humanconnectome.org/data/archive/projects/HCP_Resources/resources/7T_Movies/'
doc = xml.dom.minidom.parseString(download_url(topurl))

files = [{f: e.getAttribute(f)
          for f in ('ID', 'URI', 'digest', 'name')}
         for e in doc.getElementsByTagName("cat:entry")]
# from pprint import pprint
# pprint(files)
added = list(
    ds.addurls(files, topurl + 'files/{URI}', '{URI}', fast=False, save=False))
print(f"Processed {len(added)} entries")