Example #1
def test_merge_follow_parentds_subdataset_other_branch(path):
    path = Path(path)
    ds_src = Dataset(path / "source").create()
    on_adjusted = ds_src.repo.is_managed_branch()
    ds_src_subds = ds_src.create("subds")
    ds_clone = install(source=ds_src.path,
                       path=path / "clone",
                       recursive=True,
                       result_xfm="datasets")
    ds_clone_subds = Dataset(ds_clone.pathobj / "subds")

    ds_src_subds.repo.call_git(["checkout", "-b", "other"])
    (ds_src_subds.pathobj / "foo").write_text("foo content")
    ds_src.save(recursive=True)
    assert_repo_status(ds_src.path)

    res = ds_clone.update(merge=True,
                          follow="parentds",
                          recursive=True,
                          on_failure="ignore")
    if on_adjusted:
        # Our git-annex-sync-based approach on adjusted branches is
        # incompatible with follow='parentds'.
        assert_in_results(res, action="update", status="impossible")
        return
    else:
        assert_in_results(res, action="update", status="ok")
    eq_(ds_clone.repo.get_hexsha(), ds_src.repo.get_hexsha())
    ok_(ds_clone_subds.repo.is_under_annex("foo"))

    (ds_src_subds.pathobj / "bar").write_text("bar content")
    ds_src.save(recursive=True)
    ds_clone_subds.repo.checkout(DEFAULT_BRANCH, options=["-bnew"])
    ds_clone.update(merge=True, follow="parentds", recursive=True)
    if not on_adjusted:
        eq_(ds_clone.repo.get_hexsha(), ds_src.repo.get_hexsha())
Example #2
def _describe_credentials():
    import keyring
    from keyring.util import platform_

    def describe_keyring_backend(be):
        be_repr = repr(be)
        return be.name if 'object at 0' in be_repr else be_repr.strip('<>')

    # might later add information on non-keyring credentials gh-4981
    props = {}

    active_keyring = keyring.get_keyring()
    krp = {
        'config_file': Path(platform_.config_root(), 'keyringrc.cfg'),
        'data_root': platform_.data_root(),
        'active_backends': [
            describe_keyring_backend(be)
            for be in getattr(active_keyring, 'backends', [active_keyring])
        ],
    }
    props.update(
        keyring=krp,
    )
    return props
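
The helper above only reports whatever backend the keyring library says is active. As a quick way to see what that looks like on a given machine, here is a minimal stand-alone sketch using the same keyring calls, assuming the keyring package is installed:

import keyring

# Ask keyring for the active backend, just as _describe_credentials() does.
active = keyring.get_keyring()
be_repr = repr(active)
# Fall back to the backend's name if repr() only yields an object address.
print(active.name if 'object at 0' in be_repr else be_repr.strip('<>'))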
Example #3
def test_local_path_target_dir(path):
    path = Path(path)
    ds_main = Dataset(path / "main").create()

    ds_main.create_sibling(name="abspath-targetdir",
                           sshurl=str(path / "a"),
                           target_dir="tdir")
    ok_((path / "a" / "tdir").exists())

    ds_main.create_sibling(name="relpath-bound-targetdir",
                           sshurl=os.path.relpath(str(path / "b"),
                                                  ds_main.path),
                           target_dir="tdir")
    ok_((path / "b" / "tdir").exists())

    with chpwd(path):
        create_sibling(dataset=ds_main.path,
                       name="relpath-unbound-targetdir",
                       sshurl="c",
                       target_dir="tdir")
    ok_((path / "c" / "tdir").exists())

    ds_main.create("subds")

    ds_main.create_sibling(name="rec-plain-targetdir",
                           recursive=True,
                           sshurl=str(path / "d"),
                           target_dir="tdir")
    ok_((path / "d" / "tdir" / "subds").exists())

    ds_main.create_sibling(name="rec-template-targetdir",
                           recursive=True,
                           sshurl=str(path / "e"),
                           target_dir="d%RELNAME")
    ok_((path / "e" / "d").exists())
    ok_((path / "e" / "d-subds").exists())
Example #4
def check_save_dotfiles(to_git, save_path, path):
    # Note: Take relpath to work with Travis "TMPDIR=/var/tmp/sym\ link" run.
    paths = [Path(op.relpath(op.join(root, fname), path))
             for root, _, fnames in os.walk(op.join(path, save_path or ""))
             for fname in fnames]
    ok_(paths)
    ds = Dataset(path).create(force=True)
    if not to_git and ds.repo.is_managed_branch():
        ver = ds.repo.git_annex_version
        if "8" < ver < "8.20200309":
            # git-annex's 1978a2420 (2020-03-09) fixed a bug where
            # annexed dotfiles could switch when annex.dotfiles=true
            # was not set in .git/config or git-annex:config.log.
            ds.repo.config.set("annex.dotfiles", "true",
                               where="local", reload=True)
        elif ver < "8" and save_path is None:
            raise SkipTest("Fails with annex version below v8.*")

    ds.save(save_path, to_git=to_git)
    if save_path is None:
        assert_repo_status(ds.path)
    repo = ds.repo
    annexinfo = repo.get_content_annexinfo()

    def _check(fn, p):
        fn("key", annexinfo[repo.pathobj / p], p)

    if to_git:
        def check(p):
            _check(assert_not_in, p)
    else:
        def check(p):
            _check(assert_in, p)

    for path in paths:
        check(path)
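
To see what the os.walk-based collection at the top of check_save_dotfiles produces, here is a small self-contained sketch run against a throwaway directory (file names invented for illustration; output shown for POSIX path separators):

import os
import os.path as op
import tempfile
from pathlib import Path

# Throwaway directory with a couple of dotfiles, purely for illustration.
path = tempfile.mkdtemp()
(Path(path) / ".dot").write_text("x")
(Path(path) / "sub").mkdir()
(Path(path) / "sub" / ".hidden").write_text("y")

# Same comprehension as in the test: paths relative to the dataset root.
paths = [Path(op.relpath(op.join(root, fname), path))
         for root, _, fnames in os.walk(path)
         for fname in fnames]
print(sorted(str(p) for p in paths))  # e.g. ['.dot', 'sub/.hidden'] on POSIX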
Example #5
def test_gitannex(osf_id, dspath):
    from datalad.cmd import (
        GitRunner,
        WitlessRunner
    )
    dspath = Path(dspath)

    ds = Dataset(dspath).create()

    # add remote parameters here
    init_remote_opts = ["project={}".format(osf_id)]

    # add special remote
    init_opts = common_init_opts + init_remote_opts
    ds.repo.init_remote('osfproject', options=init_opts)

    # run git-annex-testremote
    # Note that we don't want to capture output. If something goes wrong, we
    # want to see it in the test build's output log.
    WitlessRunner(
        cwd=dspath,
        env=GitRunner.get_git_environ_adjusted()).run(
            ['git', 'annex', 'testremote', 'osfproject', "--fast"]
    )
Example #6
def check_save_dotfiles(to_git, save_path, path):
    # Note: Take relpath to work with Travis "TMPDIR=/var/tmp/sym\ link" run.
    paths = [
        Path(op.relpath(op.join(root, fname), path))
        for root, _, fnames in os.walk(op.join(path, save_path or ""))
        for fname in fnames
    ]
    ok_(paths)
    ds = Dataset(path).create(force=True)
    if not to_git and ds.repo.is_managed_branch():
        if not ds.repo._check_version_kludges("has-include-dotfiles"):
            # FIXME(annex.dotfiles)
            ds.repo.config.set("annex.dotfiles",
                               "true",
                               where="local",
                               reload=True)
    ds.save(save_path, to_git=to_git)
    if save_path is None:
        assert_repo_status(ds.path)
    repo = ds.repo
    annexinfo = repo.get_content_annexinfo()

    def _check(fn, p):
        fn("key", annexinfo[repo.pathobj / p], p)

    if to_git:

        def check(p):
            _check(assert_not_in, p)
    else:

        def check(p):
            _check(assert_in, p)

    for path in paths:
        check(path)
Example #7
    def __init__(self, path):
        # A lock to prevent multiple threads performing write operations in parallel
        self._write_lock = threading.Lock()

        # Note that the following three path objects are used often and
        # therefore are stored for performance. Path object creation comes with
        # a cost. Most notably, this is used for validity checking of the
        # repository.
        self.pathobj = Path(path)
        self.dot_git = _get_dot_git(self.pathobj, ok_missing=True)
        self._valid_git_test_path = self.dot_git / 'HEAD'

        self._cfg = None
        self._git_runner = GitWitlessRunner(cwd=self.pathobj)

        self.__fake_dates_enabled = None

        # Finally, register a finalizer (instead of having a __del__ method).
        # This will be called by garbage collection as well as "atexit". By
        # keeping the reference here, we can also call it explicitly.
        # Note that we can pass required attributes to the finalizer, but not
        # `self` itself. That would create an additional reference to the
        # object and thereby prevent it from being collected at all.
        self._finalizer = finalize(self, GitRepo._cleanup, self.pathobj)
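
The closing comment describes the weakref.finalize pattern: register a cleanup callback with exactly the attributes it needs instead of defining __del__, and never pass self as a callback argument. A minimal stand-alone sketch of that pattern (class name and path invented for illustration):

from weakref import finalize

class Resource:
    def __init__(self, path):
        self.path = path
        # Pass only what _cleanup needs -- handing it `self` would keep the
        # object alive and defeat garbage collection.
        self._finalizer = finalize(self, Resource._cleanup, self.path)

    @staticmethod
    def _cleanup(path):
        print("cleaning up", path)

r = Resource("/tmp/example")
del r  # prints "cleaning up /tmp/example" now, or at interpreter exit at the latest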
Example #8
def test_diff_rsync_syntax(path):
    # three nested datasets
    ds = Dataset(path).create()
    subds = ds.create('sub')
    subsubds = subds.create(Path('subdir', 'deep'))
    justtop = ds.diff(fr=PRE_INIT_COMMIT_SHA, path='sub', result_renderer=None)
    # we only get a single result, the subdataset in question
    assert_result_count(justtop, 1)
    assert_result_count(justtop, 1, type='dataset', path=subds.path)
    # now with "peak inside the dataset" syntax
    inside = ds.diff(fr=PRE_INIT_COMMIT_SHA,
                     path='sub' + os.sep,
                     result_renderer=None)
    # we get both subdatasets, but nothing else inside the nested one
    assert_result_count(inside, 2, type='dataset')
    assert_result_count(inside, 1, type='dataset', path=subds.path)
    assert_result_count(inside, 1, type='dataset', path=subsubds.path)
    assert_result_count(inside, 0, type='file', parentds=subsubds.path)
    # if we point to the subdir in 'sub' the reporting wrt the subsubds
    # doesn't change. It is merely a path constraint within the queried
    # subds, but because the subsubds is still underneath it, nothing changes
    inside_subdir = ds.diff(fr=PRE_INIT_COMMIT_SHA,
                            path=op.join('sub', 'subdir'),
                            result_renderer=None)
    assert_result_count(inside_subdir, 2, type='dataset')
    assert_result_count(inside_subdir, 1, type='dataset', path=subds.path)
    assert_result_count(inside_subdir, 1, type='dataset', path=subsubds.path)
    assert_result_count(inside_subdir, 0, type='file', parentds=subsubds.path)
    # but the rest is different (e.g. all the stuff in .datalad is gone)
    neq_(inside, inside_subdir)
    # just for completeness, we get more when going full recursive
    rec = ds.diff(fr=PRE_INIT_COMMIT_SHA,
                  recursive=True,
                  path='sub' + os.sep,
                  result_renderer=None)
    assert (len(inside) < len(rec))
Example #9
    def __call__(self, dataset, refcommit, process_type, status):
        # shortcut
        ds = dataset

        repo = ds.repo  # OPT: .repo could be relatively expensive
        if not isinstance(repo, AnnexRepo):
            # nothing to be done
            return

        if process_type not in ('all', 'content'):
            return

        # no progress bar, we are only making a one-shot call to
        # annex, the rest is pretty much instantaneous

        # limit query to paths that are annexed
        query_paths = [
            # go relative to minimize cmdline footprint of annex call
            text_type(Path(s['path']).relative_to(ds.pathobj))
            for s in status
            # anything that looks like an annexed file
            if s.get('type', None) == 'file' \
            and s.get('key', None) is not None
        ]

        log_progress(
            lgr.info,
            'extractorannex',
            'Start annex metadata extraction from %s',
            ds,
            total=len(query_paths),
            label='Annex metadata extraction',
            unit=' Files',
        )
        for fpath, meta in repo.get_metadata(
                query_paths,
                # no timestamps, we are describing the status quo
                timestamps=False,
                # because we have filtered the query to only contained
                # annexed files, we can use batch mode and deal with
                # many files
                batch=True):
            log_progress(lgr.info,
                         'extractorannex',
                         'Extracted annex metadata from %s',
                         fpath,
                         update=1,
                         increment=True)
            meta = {
                k: v[0] if isinstance(v, list) and len(v) == 1 else v
                for k, v in meta.items()
            }
            if not meta:
                # only talk about files that actually carry metadata
                continue
            yield dict(
                # git annex reports the path in POSIX conventions
                path=PurePosixPath(fpath),
                metadata=meta,
                type='file',
                status='ok',
            )
        log_progress(
            lgr.info,
            'extractorannex',
            'Finished annex metadata extraction from %s',
            ds,
        )
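
The query_paths comprehension above restricts the metadata query to annexed files and converts paths to be relative to the dataset root. A stand-alone sketch of just that filtering step, with a fabricated status list and dataset path:

from pathlib import Path

ds_path = Path("/tmp/ds")  # hypothetical dataset root
status = [  # fabricated status records, shaped like what the extractor receives
    {'path': '/tmp/ds/annexed.dat', 'type': 'file', 'key': 'MD5E-s3--abc.dat'},
    {'path': '/tmp/ds/in-git.txt', 'type': 'file', 'key': None},
    {'path': '/tmp/ds/subdir', 'type': 'directory'},
]
query_paths = [
    # relative paths keep the eventual annex command line short
    str(Path(s['path']).relative_to(ds_path))
    for s in status
    if s.get('type') == 'file' and s.get('key') is not None
]
print(query_paths)  # -> ['annexed.dat']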
Example #10
def check_merge_follow_parentds_subdataset_detached(on_adjusted, path):
    # Note: For the adjusted case, this is not much more than a smoke test that
    # on an adjusted branch we fail sensibly. The resulting state is not easy
    # to reason about nor desirable.
    path = Path(path)
    # $path/source/s0/s1
    # The additional dataset level is to gain some confidence that this works
    # for nested datasets.
    ds_src = Dataset(path / "source").create()
    if ds_src.repo.is_managed_branch():
        if not on_adjusted:
            raise SkipTest("System only supports adjusted branches. "
                           "Skipping non-adjusted test")
    ds_src_s0 = ds_src.create("s0")
    ds_src_s1 = ds_src_s0.create("s1")
    ds_src.save(recursive=True)
    if on_adjusted:
        # Note: We adjust after creating all the datasets above to avoid a bug
        # fixed in git-annex 7.20191024, specifically bbdeb1a1a (sync: Fix
        # crash when there are submodules and an adjusted branch is checked
        # out, 2019-10-23).
        for ds in [ds_src, ds_src_s0, ds_src_s1]:
            _adjust(ds.repo)
        ds_src.save(recursive=True)
    assert_repo_status(ds_src.path)

    ds_clone = install(source=ds_src.path,
                       path=path / "clone",
                       recursive=True,
                       result_xfm="datasets")
    ds_clone_s1 = Dataset(ds_clone.pathobj / "s0" / "s1")

    ds_src_s1.repo.checkout(DEFAULT_BRANCH + "^0")
    (ds_src_s1.pathobj / "foo").write_text("foo content")
    ds_src.save(recursive=True)
    assert_repo_status(ds_src.path)

    res = ds_clone.update(merge=True,
                          recursive=True,
                          follow="parentds",
                          on_failure="ignore")
    if on_adjusted:
        # The top-level update is okay because there is no parent revision to
        # update to.
        assert_in_results(res,
                          status="ok",
                          path=ds_clone.path,
                          action="update")
        # The subdataset, on the other hand, is impossible.
        assert_in_results(res,
                          status="impossible",
                          path=ds_clone_s1.path,
                          action="update")
        return
    assert_repo_status(ds_clone.path)

    # We brought in the revision and got to the same state as the remote.
    # Blind saving here without bringing in the current subdataset revision
    # would have resulted in a new commit in ds_clone that reverts the
    # last subdataset ID recorded in ds_src.
    eq_(ds_clone.repo.get_hexsha(), ds_src.repo.get_hexsha())

    # Record a revision in the parent and then move HEAD away from it so that
    # the explicit revision fetch fails.
    (ds_src_s1.pathobj / "bar").write_text("bar content")
    ds_src.save(recursive=True)
    ds_src_s1.repo.checkout(DEFAULT_BRANCH)
    # This is the default, but just in case:
    ds_src_s1.repo.config.set("uploadpack.allowAnySHA1InWant",
                              "false",
                              where="local")
    # Configure the fetcher to use v0 because Git defaults to v2 as of
    # v2.26.0, which allows fetching unadvertised objects regardless
    # of the value of uploadpack.allowAnySHA1InWant.
    ds_clone_s1.repo.config.set("protocol.version", "0", where="local")
    res = ds_clone.update(merge=True,
                          recursive=True,
                          follow="parentds",
                          on_failure="ignore")
    # The fetch with the explicit ref fails because it isn't advertised.
    assert_in_results(res,
                      status="impossible",
                      path=ds_clone_s1.path,
                      action="update")

    # Back to the detached head.
    ds_src_s1.repo.checkout("HEAD@{1}")
    # Set up a case where update() will not resolve the sibling.
    ds_clone_s1.repo.call_git(["branch", "--unset-upstream"])
    ds_clone_s1.config.reload(force=True)
    ds_clone_s1.repo.call_git(["remote", "add", "other", ds_src_s1.path])
    res = ds_clone.update(recursive=True,
                          follow="parentds",
                          on_failure="ignore")
    # In this case, update() won't abort if we call with merge=False, but
    # it does if the revision wasn't brought down in the `fetch(all_=True)`
    # call.
    assert_in_results(res,
                      status="impossible",
                      path=ds_clone_s1.path,
                      action="update")
Example #11
def _test_bare_git_version_2(host, dspath, store):
    # Similarly to test_bare_git_version_1, this should ensure a bare git repo
    # at the store location for a dataset doesn't conflict with the ORA remote.
    # Note: Usability of git remote by annex depends on dataset layout version
    #       (dirhashlower vs. -mixed).
    #       For version 2 (mixed) upload via ORA and consumption via git should
    #       work. But not the other way around, since git-annex uses
    #       dirhashlower with bare repos.

    ds_path = Path(dspath)
    store = Path(store)
    ds = Dataset(ds_path).create()
    populate_dataset(ds)
    ds.save()

    bare_repo_path, _, _ = get_layout_locations(1, store, ds.id)
    # Use git to make sure the remote end is what git thinks a bare clone of it
    # should look like
    subprocess.run([
        'git', 'clone', '--bare',
        quote_cmdlinearg(str(dspath)),
        quote_cmdlinearg(str(bare_repo_path))
    ])

    if host:
        url = "ria+ssh://{host}{path}".format(host=host, path=store)
    else:
        url = "ria+{}".format(store.as_uri())
    init_opts = common_init_opts + ['url={}'.format(url)]
    # set up store:
    io = SSHRemoteIO(host) if host else LocalIO()
    create_store(io, store, '1')
    # set up the dataset location, too.
    # Note: Dataset layout version 2 (dirhash mixed):
    create_ds_in_store(io, store, ds.id, '2', '1')

    # Now, let's have the bare repo as a git remote
    git_url = "ssh://{host}{path}".format(host=host, path=bare_repo_path) \
        if host else bare_repo_path.as_uri()
    ds.repo.add_remote('bare-git', git_url)
    ds.repo.enable_remote('bare-git')
    # and the ORA remote in addition:
    ds.repo.init_remote('ora-remote', options=init_opts)
    # upload keys via ORA:
    ds.repo.copy_to('.', 'ora-remote')
    # bare-git doesn't know yet:
    eq_(len(ds.repo.whereis('one.txt')), 2)
    # fsck to make availability known
    assert_status('ok', [
        annexjson2result(r, ds)
        for r in ds.repo.fsck(remote='bare-git', fast=True)
    ])
    eq_(len(ds.repo.whereis('one.txt')), 3)
    ds.drop('.')
    eq_(len(ds.repo.whereis('one.txt')), 2)
    # actually consumable via git remote:
    ds.repo.call_annex(['move', 'one.txt', '--from', 'bare-git'])
    eq_(len(ds.repo.whereis('one.txt')), 2)
    # now, move back via git - shouldn't be consumable via ORA
    ds.repo.call_annex(['move', 'one.txt', '--to', 'bare-git'])
    # fsck to make availability known, but there's nothing from POV of ORA:
    fsck_res = [
        annexjson2result(r, ds)
        for r in ds.repo.fsck(remote='ora-remote', fast=True)
    ]
    assert_result_count(fsck_res,
                        1,
                        status='error',
                        message='** Based on the location log, one.txt\n'
                        '** was expected to be present, '
                        'but its content is missing.')
    assert_result_count(fsck_res, 1, status='ok')
    eq_(len(fsck_res), 2)
    eq_(len(ds.repo.whereis('one.txt')), 1)
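
Several of these tests build the RIA URL in one of two forms: ria+ssh://<host><path> when a host is given, or ria+ plus a file URI for a local store. A tiny illustration with made-up values (POSIX paths assumed, since as_uri() needs an absolute path):

from pathlib import Path

store = Path("/tmp/ria-store")   # made-up local store path
host = "example.com"             # made-up SSH host

# With a host, the store path is appended to an ssh URL ...
print("ria+ssh://{host}{path}".format(host=host, path=store))
# ... without one, the store's file:// URI is used directly.
print("ria+{}".format(store.as_uri()))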
Example #12
def postclonecfg_ria(ds, props):
    """Configure a dataset freshly cloned from a RIA store"""
    repo = ds.repo
    # RIA uses hashdir mixed; copying data to it via git-annex (if cloned via
    # ssh) would make it see a bare repo and establish a hashdir lower annex
    # object tree.
    # Moreover, we want the ORA remote to receive all data for the store, so its
    # objects could be moved into archives (the main point of a RIA store).
    RIA_REMOTE_NAME = 'origin'  # don't hardcode everywhere
    ds.config.set(
        'remote.{}.annex-ignore'.format(RIA_REMOTE_NAME), 'true',
        where='local')

    # chances are that if this dataset came from a RIA store, its subdatasets
    # may live there too. Place a subdataset source candidate config that makes
    # get probe this RIA store when obtaining subdatasets
    ds.config.set(
        # we use the label 'origin' for this candidate in order to not have to
        # generate a complicated name from the actual source specification.
        # we pick a cost of 200 to sort it before datalad's default candidates
        # for non-RIA URLs, because they prioritize hierarchical layouts that
        # cannot be found in a RIA store
        'datalad.get.subdataset-source-candidate-200origin',
        # use the entire original URL, up to the fragment + plus dataset ID
        # placeholder, this should make things work with any store setup we
        # support (paths, ports, ...)
        props['source'].split('#', maxsplit=1)[0] + '#{id}',
        where='local')

    # setup publication dependency, if a corresponding special remote exists
    # and was enabled (there could be RIA stores that actually only have repos)
    # make this function be a generator
    ora_remotes = [s for s in ds.siblings('query', result_renderer='disabled')
                   if s.get('annex-externaltype') == 'ora']
    if not ora_remotes and any(
            r.get('externaltype') == 'ora'
            for r in (repo.get_special_remotes().values()
                      if hasattr(repo, 'get_special_remotes')
                      else [])):
        # No ORA remote was autoenabled, but the configuration knows about at
        # least one. Let's check origin's config for datalad.ora-remote.uuid as
        # stored by create-sibling-ria and try enabling that one.
        lgr.debug("Found no autoenabled ORA special remote. Trying to look it "
                  "up in source config ...")

        # First figure whether we cloned via SSH, HTTP or local path and then
        # get that config file the same way:
        config_content = None
        scheme = props['giturl'].split(':', 1)[0]
        if scheme in ['http', 'https']:
            try:
                config_content = download_url(
                    "{}{}config".format(
                        props['giturl'],
                        '/' if not props['giturl'].endswith('/') else ''))
            except DownloadError as e:
                lgr.debug("Failed to get config file from source:\n%s",
                          exc_str(e))
        elif scheme == 'ssh':
            # TODO: switch the following to proper command abstraction:
            # SSHRemoteIO ignores the path part ATM. No remote CWD! (To be
            # changed with command abstractions). So we need to get that part to
            # have a valid path to origin's config file:
            cfg_path = PurePosixPath(URL(props['giturl']).path) / 'config'
            op = SSHRemoteIO(props['giturl'])
            try:
                config_content = op.read_file(cfg_path)
            except RIARemoteError as e:
                lgr.debug("Failed to get config file from source: %s",
                          exc_str(e))

        elif scheme == 'file':
            # TODO: switch the following to proper command abstraction:
            op = LocalIO()
            cfg_path = Path(URL(props['giturl']).localpath) / 'config'
            try:
                config_content = op.read_file(cfg_path)
            except (RIARemoteError, OSError) as e:
                lgr.debug("Failed to get config file from source: %s",
                          exc_str(e))
        else:
            lgr.debug("Unknown URL-Scheme %s in %s. Can handle SSH, HTTP or "
                      "FILE scheme URLs.", scheme, props['source'])

        # 3. And read it
        org_uuid = None
        if config_content:
            # TODO: We might be able to spare the saving to a file.
            #       "git config -f -" is not explicitly documented but happens
            #       to work and would read from stdin. Make sure we know this
            #       works for required git versions and on all platforms.
            with make_tempfile(content=config_content) as cfg_file:
                runner = GitWitlessRunner()
                try:
                    result = runner.run(
                        ['git', 'config', '-f', cfg_file,
                         'datalad.ora-remote.uuid'],
                        protocol=StdOutCapture
                    )
                    org_uuid = result['stdout'].strip()
                except CommandError as e:
                    # doesn't contain what we are looking for
                    lgr.debug("Found no UUID for ORA special remote at "
                              "'%s' (%s)", RIA_REMOTE_NAME, exc_str(e))

        # Now, enable it. If annex-init didn't fail to enable it as stored, we
        # wouldn't end up here, so enable with store URL as suggested by the URL
        # we cloned from.
        if org_uuid:
            srs = repo.get_special_remotes()
            if org_uuid in srs.keys():
                # TODO: - Double-check autoenable value and only do this when
                #         true?
                #       - What if still fails? -> Annex shouldn't change config
                #         in that case

                # we only need the store:
                new_url = props['source'].split('#')[0]
                try:
                    repo.enable_remote(srs[org_uuid]['name'],
                                       options=['url={}'.format(new_url)]
                                       )
                    lgr.info("Reconfigured %s for %s",
                             srs[org_uuid]['name'], new_url)
                    # update ora_remotes for considering publication dependency
                    # below
                    ora_remotes = [s for s in
                                   ds.siblings('query',
                                               result_renderer='disabled')
                                   if s.get('annex-externaltype', None) ==
                                   'ora']
                except CommandError as e:
                    lgr.debug("Failed to reconfigure ORA special remote: %s",
                              exc_str(e))
            else:
                lgr.debug("Unknown ORA special remote uuid at '%s': %s",
                          RIA_REMOTE_NAME, org_uuid)
    if ora_remotes:
        if len(ora_remotes) == 1:
            yield from ds.siblings('configure',
                                   name=RIA_REMOTE_NAME,
                                   publish_depends=ora_remotes[0]['name'],
                                   result_filter=None,
                                   result_renderer='disabled')
        else:
            lgr.warning("Found multiple ORA remotes. Couldn't decide which "
                        "publishing to 'origin' should depend on: %s. Consider "
                        "running 'datalad siblings configure -s origin "
                        "--publish-depends ORAREMOTENAME' to set publication "
                        "dependency manually.",
                        [r['name'] for r in ora_remotes])
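
The subdataset source candidate configured above keeps only the store part of the original URL and appends an '{id}' placeholder. A small illustration of that split, with an invented RIA URL:

# Invented clone source: a RIA URL with the dataset ID in the fragment.
source = "ria+ssh://store.example.com/data/store#6c976c54-7e85-11e6-904b-002590f97d84"

# Drop everything from '#' on and append the '{id}' placeholder, as done
# for 'datalad.get.subdataset-source-candidate-200origin' above.
candidate = source.split('#', maxsplit=1)[0] + '#{id}'
print(candidate)  # -> ria+ssh://store.example.com/data/store#{id}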
Example #13
def _test_initremote_basic(host, ds_path, store, link):

    ds_path = Path(ds_path)
    store = Path(store)
    link = Path(link)
    ds = Dataset(ds_path).create()
    populate_dataset(ds)
    ds.save()

    if host:
        url = "ria+ssh://{host}{path}".format(host=host,
                                              path=store)
    else:
        url = "ria+{}".format(store.as_uri())
    init_opts = common_init_opts + ['url={}'.format(url)]

    # fails on non-existing storage location
    assert_raises(CommandError,
                  ds.repo.init_remote, 'ria-remote', options=init_opts)
    # Doesn't actually create a remote if it fails
    assert_not_in('ria-remote',
                  [cfg['name']
                   for uuid, cfg in ds.repo.get_special_remotes().items()]
                  )

    # fails on non-RIA URL
    assert_raises(CommandError, ds.repo.init_remote, 'ria-remote',
                  options=common_init_opts + ['url={}'.format(store.as_uri())]
                  )
    # Doesn't actually create a remote if it fails
    assert_not_in('ria-remote',
                  [cfg['name']
                   for uuid, cfg in ds.repo.get_special_remotes().items()]
                  )

    # set up store:
    io = SSHRemoteIO(host) if host else LocalIO()
    create_store(io, store, '1')
    # still fails, since ds isn't setup in the store
    assert_raises(CommandError,
                  ds.repo.init_remote, 'ria-remote', options=init_opts)
    # Doesn't actually create a remote if it fails
    assert_not_in('ria-remote',
                  [cfg['name']
                   for uuid, cfg in ds.repo.get_special_remotes().items()]
                  )
    # set up the dataset as well
    create_ds_in_store(io, store, ds.id, '2', '1')
    # now should work
    ds.repo.init_remote('ria-remote', options=init_opts)
    assert_in('ria-remote',
              [cfg['name']
               for uuid, cfg in ds.repo.get_special_remotes().items()]
              )
    assert_repo_status(ds.path)
    # git-annex:remote.log should have:
    #   - url
    #   - common_init_opts
    #   - archive_id (which equals ds id)
    remote_log = ds.repo.call_git(['cat-file', 'blob', 'git-annex:remote.log'],
                                  read_only=True)
    assert_in("url={}".format(url), remote_log)
    [assert_in(c, remote_log) for c in common_init_opts]
    assert_in("archive-id={}".format(ds.id), remote_log)

    # re-configure with invalid URL should fail:
    assert_raises(
        CommandError,
        ds.repo.call_annex,
        ['enableremote', 'ria-remote'] + common_init_opts + [
            'url=ria+file:///non-existing'])
    # but re-configure with valid URL should work
    if has_symlink_capability():
        link.symlink_to(store)
        new_url = 'ria+{}'.format(link.as_uri())
        ds.repo.call_annex(
            ['enableremote', 'ria-remote'] + common_init_opts + [
                'url={}'.format(new_url)])
        # git-annex:remote.log should have:
        #   - url
        #   - common_init_opts
        #   - archive_id (which equals ds id)
        remote_log = ds.repo.call_git(['cat-file', 'blob',
                                       'git-annex:remote.log'],
                                      read_only=True)
        assert_in("url={}".format(new_url), remote_log)
        [assert_in(c, remote_log) for c in common_init_opts]
        assert_in("archive-id={}".format(ds.id), remote_log)

    # we can deal with --sameas, which leads to a special remote not having a
    # 'name' property, but only a 'sameas-name'. See gh-4259
    try:
        ds.repo.init_remote('ora2',
                            options=init_opts + ['--sameas', 'ria-remote'])
    except CommandError as e:
        if 'Invalid option `--sameas' in e.stderr:
            # annex too old - doesn't know --sameas
            pass
        else:
            raise 
Example #14
def _test_remote_layout(host, dspath, store, archiv_store):

    dspath = Path(dspath)
    store = Path(store)
    archiv_store = Path(archiv_store)
    ds = Dataset(dspath).create()
    populate_dataset(ds)
    ds.save()
    assert_repo_status(ds.path)

    # set up store:
    io = SSHRemoteIO(host) if host else LocalIO()
    if host:
        store_url = "ria+ssh://{host}{path}".format(host=host,
                                                    path=store)
        arch_url = "ria+ssh://{host}{path}".format(host=host,
                                                   path=archiv_store)
    else:
        store_url = "ria+{}".format(store.as_uri())
        arch_url = "ria+{}".format(archiv_store.as_uri())

    create_store(io, store, '1')

    # TODO: Re-establish test for version 1
    # version 2: dirhash
    create_ds_in_store(io, store, ds.id, '2', '1')

    # add special remote
    init_opts = common_init_opts + ['url={}'.format(store_url)]
    ds.repo.init_remote('store', options=init_opts)

    # copy files into the RIA store
    ds.repo.copy_to('.', 'store')

    # we should see the exact same annex object tree
    dsgit_dir, archive_dir, dsobj_dir = \
        get_layout_locations(1, store, ds.id)
    store_objects = get_all_files(dsobj_dir)
    local_objects = get_all_files(ds.pathobj / '.git' / 'annex' / 'objects')
    assert_equal(len(store_objects), 2)

    if not ds.repo.is_managed_branch():
        # with managed branches the local repo uses hashdirlower instead
        # TODO: However, with dataset layout version 1 this should therefore
        #       work on adjusted branch the same way
        # TODO: Wonder whether export-archive-ora should account for that and
        #       rehash according to target layout.
        assert_equal(sorted([p for p in store_objects]),
                     sorted([p for p in local_objects])
                     )

        if not io.get_7z():
            raise SkipTest("No 7z available in RIA store")

        # we can simply pack up the content of the remote into a
        # 7z archive and place it in the right location to get a functional
        # archive remote

        create_store(io, archiv_store, '1')
        create_ds_in_store(io, archiv_store, ds.id, '2', '1')

        whereis = ds.repo.whereis('one.txt')
        dsgit_dir, archive_dir, dsobj_dir = \
            get_layout_locations(1, archiv_store, ds.id)
        ds.export_archive_ora(archive_dir / 'archive.7z')
        init_opts = common_init_opts + ['url={}'.format(arch_url)]
        ds.repo.init_remote('archive', options=init_opts)
        # now fsck the new remote to get the new special remote indexed
        ds.repo.fsck(remote='archive', fast=True)
        assert_equal(len(ds.repo.whereis('one.txt')), len(whereis) + 1)
Example #15
def _create_sibling_ria(ds, url, name, ria_remote, ria_remote_name, existing,
                        shared, group, post_update_hook, res_kwargs):
    # be safe across datasets
    res_kwargs = res_kwargs.copy()

    # parse target URL
    try:
        ssh_host, base_path = verify_ria_url(url, ds.config)
    except ValueError as e:
        yield get_status_dict(status='error', message=str(e), **res_kwargs)
        return

    base_path = Path(base_path)

    git_url = decode_source_spec(
        # append dataset id to url and use magic from clone-helper:
        url + '#{}'.format(ds.id),
        cfg=ds.config)['giturl']
    # go for a v1 layout
    repo_path, _, _ = get_layout_locations(1, base_path, ds.id)

    ds_siblings = [r['name'] for r in ds.siblings(result_renderer=None)]
    # Figure whether we are supposed to skip this very dataset
    if existing == 'skip' and (name in ds_siblings or
                               (ria_remote_name
                                and ria_remote_name in ds_siblings)):
        yield get_status_dict(status='notneeded',
                              message="Skipped on existing sibling",
                              **res_kwargs)
        # if we skip here, nothing else can change that decision further
        # down
        return

    # we might learn that some processing (remote repo creation) is
    # not desired
    skip = False

    lgr.info("create sibling{} '{}'{} ...".format(
        's' if ria_remote_name else '',
        name,
        " and '{}'".format(ria_remote_name) if ria_remote_name else '',
    ))
    if ssh_host:
        from datalad import ssh_manager
        ssh = ssh_manager.get_connection(ssh_host,
                                         use_remote_annex_bundle=False)
        ssh.open()

    # determine layout locations
    if ria_remote:
        lgr.debug('init special remote {}'.format(ria_remote_name))
        ria_remote_options = [
            'type=external', 'externaltype=ria', 'encryption=none',
            'autoenable=true', 'url={}'.format(url)
        ]
        try:
            ds.repo.init_remote(ria_remote_name, options=ria_remote_options)
        except CommandError as e:
            if existing in ['replace', 'reconfigure'] \
                    and 'git-annex: There is already a special remote' \
                    in e.stderr:
                # run enableremote instead
                lgr.debug(
                    "special remote '%s' already exists. "
                    "Run enableremote instead.", ria_remote_name)
                # TODO: Use AnnexRepo.enable_remote (which needs to get
                #       `options` first)
                cmd = ['git', 'annex', 'enableremote', ria_remote_name
                       ] + ria_remote_options
                subprocess.run(cmd, cwd=quote_cmdlinearg(ds.repo.path))
            else:
                yield get_status_dict(
                    status='error',
                    message="initremote failed.\nstdout: %s\nstderr: %s" %
                    (e.stdout, e.stderr),
                    **res_kwargs)
                return

        # 1. create remote object store:
        # Note: All it actually takes is to trigger the special
        # remote's `prepare` method once.
        # ATM trying to achieve that by invoking a minimal fsck.
        # TODO: - It's probably faster to actually talk to the special
        #         remote (i.e. pretending to be annex and use
        #         the protocol to send PREPARE)
        #       - Alternatively we can create the remote directory and
        #         ria version file directly, but this means
        #         code duplication that then needs to be kept in sync
        #         with ria-remote implementation.
        #       - this leads to the third option: Have that creation
        #         routine importable and callable from
        #         ria-remote package without the need to actually
        #         instantiate a RIARemote object
        lgr.debug("initializing object store")
        ds.repo.fsck(remote=ria_remote_name,
                     fast=True,
                     annex_options=['--exclude=*/*'])
    else:
        # with no special remote we currently need to create the
        # required directories
        # TODO: This should be cleaner once we have access to the
        #       special remote's RemoteIO classes without
        #       talking via annex
        if ssh_host:
            try:
                stdout, stderr = ssh('test -e {repo}'.format(
                    repo=quote_cmdlinearg(str(repo_path))))
                exists = True
            except CommandError as e:
                exists = False
            if exists:
                if existing == 'skip':
                    # 1. not rendered by default
                    # 2. message doesn't show up in ultimate result
                    #    record as shown by -f json_pp
                    yield get_status_dict(status='notneeded',
                                          message="Skipped on existing remote "
                                          "directory {}".format(repo_path),
                                          **res_kwargs)
                    skip = True
                elif existing in ['error', 'reconfigure']:
                    yield get_status_dict(
                        status='error',
                        message="remote directory {} already "
                        "exists.".format(repo_path),
                        **res_kwargs)
                    return
                elif existing == 'replace':
                    ssh('chmod u+w -R {}'.format(
                        quote_cmdlinearg(str(repo_path))))
                    ssh('rm -rf {}'.format(quote_cmdlinearg(str(repo_path))))
            if not skip:
                ssh('mkdir -p {}'.format(quote_cmdlinearg(str(repo_path))))
        else:
            if repo_path.exists():
                if existing == 'skip':
                    skip = True
                elif existing in ['error', 'reconfigure']:
                    yield get_status_dict(
                        status='error',
                        message="remote directory {} already "
                        "exists.".format(repo_path),
                        **res_kwargs)
                    return
                elif existing == 'replace':
                    rmtree(repo_path)
            if not skip:
                repo_path.mkdir(parents=True)

    # Note, that this could have changed since last tested due to existing
    # remote dir
    if skip:
        return

    # 2. create a bare repository in-store:

    lgr.debug("init bare repository")
    # TODO: we should prob. check whether it's there already. How?
    # Note: like the special remote itself, we assume local FS if no
    # SSH host is specified
    disabled_hook = repo_path / 'hooks' / 'post-update.sample'
    enabled_hook = repo_path / 'hooks' / 'post-update'

    if group:
        chgrp_cmd = "chgrp -R {} {}".format(quote_cmdlinearg(str(group)),
                                            quote_cmdlinearg(str(repo_path)))

    if ssh_host:
        ssh('cd {rootdir} && git init --bare{shared}'.format(
            rootdir=quote_cmdlinearg(str(repo_path)),
            shared=" --shared='{}'".format(quote_cmdlinearg(shared))
            if shared else ''))
        if post_update_hook:
            ssh('mv {} {}'.format(quote_cmdlinearg(str(disabled_hook)),
                                  quote_cmdlinearg(str(enabled_hook))))

        if group:
            # Either the repository existed before or a new directory was
            # created for it; set its group to the desired one, if one was
            # provided, with the same chgrp
            ssh(chgrp_cmd)
    else:
        GitRepo(repo_path,
                create=True,
                bare=True,
                shared=" --shared='{}'".format(quote_cmdlinearg(shared))
                if shared else None)
        if post_update_hook:
            disabled_hook.rename(enabled_hook)
        if group:
            # TODO: do we need a cwd here?
            subprocess.run(chgrp_cmd, cwd=quote_cmdlinearg(ds.path))

    # add a git remote to the bare repository
    # Note: needs annex-ignore! Otherwise we might push into default
    # annex/object tree instead of directory type tree with dirhash
    # lower. This in turn would be an issue, if we want to pack the
    # entire thing into an archive. Special remote will then not be
    # able to access content in the "wrong" place within the archive
    lgr.debug("set up git remote")
    # TODO:
    # - This siblings call results in "[WARNING] Failed to determine
    #   if datastore carries annex."
    #   (see https://github.com/datalad/datalad/issues/4028)
    #   => for now have annex-ignore configured before. Possibly allow
    #      configure/add to include that option
    #      - additionally there's
    #        https://github.com/datalad/datalad/issues/3989,
    #        where datalad-siblings might hang forever
    if name in ds_siblings:
        # otherwise we should have skipped or failed before
        assert existing in ['replace', 'reconfigure']
    ds.config.set("remote.{}.annex-ignore".format(name),
                  value="true",
                  where="local")
    ds.siblings(
        'configure',
        name=name,
        url=git_url if ssh_host else str(repo_path),
        recursive=False,
        # Note, that this should be None if ria_remote was not set
        publish_depends=ria_remote_name,
        result_renderer=None,
        # Note, that otherwise a subsequent publish will report
        # "notneeded".
        fetch=True)

    yield get_status_dict(
        status='ok',
        **res_kwargs,
    )
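
For the SSH branch above, the bare repository is created by assembling a small shell command string. Here is a rough sketch of that string construction, using shlex.quote as a stand-in for quote_cmdlinearg and invented path/shared values:

from shlex import quote

repo_path = "/store/6c9/76c547e8511e6904b002590f97d84"  # invented store-side path
shared = "group"                                        # invented --shared value

cmd = 'cd {rootdir} && git init --bare{shared}'.format(
    rootdir=quote(str(repo_path)),
    shared=" --shared='{}'".format(quote(shared)) if shared else '')
print(cmd)  # -> cd /store/6c9/76c547e8511e6904b002590f97d84 && git init --bare --shared='group'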
Example #16
    def ctrl_path(self):
        with open(bogus_socket, "w") as f:
            f.write("whatever")
        return Path(bogus_socket)
Example #17
    def __call__(*,
                 dataset=None,
                 what=None,
                 dry_run=False,
                 recursive=False,
                 recursion_limit=None):

        ds = require_dataset(dataset,
                             purpose="report on cleanable locations"
                             if dry_run else "clean dataset")
        res_kwargs = dict(action='clean [dry-run]' if dry_run else 'clean',
                          logger=lgr,
                          refds=ds.path)
        for wds in itertools.chain(
            [ds],
                ds.subdatasets(state='present',
                               recursive=recursive,
                               recursion_limit=recursion_limit,
                               return_type='generator',
                               result_renderer='disabled',
                               result_xfm='datasets') if recursive else []):
            d = wds.pathobj
            gitdir = wds.repo.dot_git
            DIRS_PLURAL = ("directory", "directories")
            FILES_PLURAL = ("file", "files")
            discover_or_remove = "Discovered" if dry_run else "Removed"

            for dirpath, flag, msg, sing_pl in [
                (Path(ARCHIVES_TEMP_DIR), "cached-archives",
                 "temporary archive", DIRS_PLURAL),
                (Path(ANNEX_TEMP_DIR), "annex-tmp", "temporary annex",
                 FILES_PLURAL),
                (Path(ANNEX_TRANSFER_DIR), "annex-transfer",
                 "annex temporary transfer", DIRS_PLURAL),
                (gitdir / Path(SEARCH_INDEX_DOTGITDIR), 'search-index',
                 "metadata search index", FILES_PLURAL),
            ]:
                topdir = wds.pathobj / dirpath
                lgr.debug("Considering to clean %s:%s", d, dirpath)
                if not ((what is None) or (flag in what)):
                    yield get_status_dict(path=str(topdir),
                                          status='notneeded',
                                          type='directory',
                                          **res_kwargs)
                    continue

                paths = [p for p in topdir.glob('*')]
                if not paths:
                    if not topdir.exists():
                        yield get_status_dict(path=str(topdir),
                                              status='notneeded',
                                              type='directory',
                                              **res_kwargs)
                        continue
                    else:
                        # we empty topdir only
                        message = ("%s empty %s directory", discover_or_remove,
                                   msg)
                else:
                    pl = len(paths) > 1
                    message = ("%s %d %s %s: %s", discover_or_remove,
                               len(paths), msg, sing_pl[int(pl)], ", ".join(
                                   sorted([
                                       str(p.relative_to(topdir))
                                       for p in paths if p != topdir
                                   ])))

                if not dry_run:
                    rmtree(str(topdir))

                yield get_status_dict(path=str(topdir),
                                      status='ok',
                                      type='directory',
                                      message=message,
                                      **res_kwargs)
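
The message tuples above follow the lazy logging convention: a %-format string plus its arguments, with the singular/plural word picked from a two-element tuple. A stand-alone sketch with invented values:

DIRS_PLURAL = ("directory", "directories")
discover_or_remove = "Removed"          # would be "Discovered" on --dry-run
msg = "temporary archive"
paths = ["a7fe81", "b3c901", "ff0012"]  # invented leftover entries

pl = len(paths) > 1
message = ("%s %d %s %s: %s", discover_or_remove, len(paths), msg,
           DIRS_PLURAL[int(pl)], ", ".join(sorted(paths)))
# The result record carries the tuple as-is; rendering expands it roughly like this:
print(message[0] % message[1:])
# -> Removed 3 temporary archive directories: a7fe81, b3c901, ff0012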
Example #18
def test_pathlib_unicode():
    eq_(str(Path("a")), u"a")
    eq_(str(Path(u"β")), u"β")
Example #19
def _test_version_check(host, dspath, store):

    dspath = Path(dspath)
    store = Path(store)

    ds = Dataset(dspath).create()
    populate_dataset(ds)
    ds.save()
    assert_repo_status(ds.path)

    # set up store:
    io = SSHRemoteIO(host) if host else LocalIO()
    if host:
        store_url = "ria+ssh://{host}{path}".format(host=host,
                                                    path=store)
    else:
        store_url = "ria+{}".format(store.as_uri())

    create_store(io, store, '1')

    # TODO: Re-establish test for version 1
    # version 2: dirhash
    create_ds_in_store(io, store, ds.id, '2', '1')

    # add special remote
    init_opts = common_init_opts + ['url={}'.format(store_url)]
    ds.repo.init_remote('store', options=init_opts)
    ds.repo.copy_to('.', 'store')

    # check version files
    remote_ds_tree_version_file = store / 'ria-layout-version'
    dsgit_dir, archive_dir, dsobj_dir = \
        get_layout_locations(1, store, ds.id)
    remote_obj_tree_version_file = dsgit_dir / 'ria-layout-version'

    assert_true(remote_ds_tree_version_file.exists())
    assert_true(remote_obj_tree_version_file.exists())

    with open(str(remote_ds_tree_version_file), 'r') as f:
        assert_equal(f.read().strip(), '1')
    with open(str(remote_obj_tree_version_file), 'r') as f:
        assert_equal(f.read().strip(), '2')

    # Accessing the remote should not yield any output regarding versioning,
    # since it's the "correct" version. Note that "fsck" is an arbitrary choice.
    # We need just something to talk to the special remote.
    with swallow_logs(new_level=logging.INFO) as cml:
        ds.repo.fsck(remote='store', fast=True)
        # TODO: For some reason didn't get cml.assert_logged to assert
        #       "nothing was logged"
        assert not cml.out

    # Now fake-change the version
    with open(str(remote_obj_tree_version_file), 'w') as f:
        f.write('X\n')

    # Now we should see a message about it
    with swallow_logs(new_level=logging.INFO) as cml:
        ds.repo.fsck(remote='store', fast=True)
        cml.assert_logged(level="INFO",
                          msg="Remote object tree reports version X",
                          regex=False)

    # reading still works:
    ds.drop('.')
    assert_status('ok', ds.get('.'))

    # but writing doesn't:
    with open(str(Path(ds.path) / 'new_file'), 'w') as f:
        f.write("arbitrary addition")
    ds.save(message="Add a new_file")

    # TODO: use self.annex.error in special remote and see whether we get an
    #       actual error result
    assert_raises(CommandError,
                  ds.repo.copy_to, 'new_file', 'store')

    # However, we can force it by configuration
    ds.config.add("annex.ora-remote.store.force-write", "true", where='local')
    ds.repo.copy_to('new_file', 'store')
Example #20
    def __call__(url,
                 name,
                 dataset=None,
                 storage_name=None,
                 post_update_hook=False,
                 shared=None,
                 group=None,
                 storage_sibling=True,
                 existing='error',
                 trust_level=None,
                 recursive=False,
                 recursion_limit=None,
                 disable_storage__=None,
                 ):
        if disable_storage__ is not None:
            import warnings
            warnings.warn("datalad-create-sibling-ria --no-storage-sibling "
                          "is deprecated, use --storage-sibling off instead.",
                          DeprecationWarning)
            # recode to new setup
            disable_storage__ = None
            storage_sibling = False

        if storage_sibling == 'only' and storage_name:
            lgr.warning(
                "Sibling name will be used for storage sibling in "
                "storage-sibling-only mode, but a storage sibling name "
                "was provided"
            )

        ds = require_dataset(
            dataset, check_installed=True, purpose='create sibling RIA')
        res_kwargs = dict(
            ds=ds,
            action="create-sibling-ria",
            logger=lgr,
        )

        # parse target URL
        try:
            ssh_host, base_path, rewritten_url = verify_ria_url(url, ds.config)
        except ValueError as e:
            yield get_status_dict(
                status='error',
                message=str(e),
                **res_kwargs
            )
            return

        if ds.repo.get_hexsha() is None or ds.id is None:
            raise RuntimeError(
                "Repository at {} is not a DataLad dataset, "
                "run 'datalad create [--force]' first.".format(ds.path))

        if not storage_sibling and storage_name:
            lgr.warning(
                "Storage sibling setup disabled, but a storage sibling name "
                "was provided"
            )

        if storage_sibling and not storage_name:
            storage_name = "{}-storage".format(name)

        if storage_sibling and name == storage_name:
            # leads to unresolvable, circular dependency with publish-depends
            raise ValueError("sibling names must not be equal")

        if not isinstance(url, str):
            raise TypeError("url is not a string, but %s" % type(url))

        # Query existing siblings upfront in order to fail early on
        # existing=='error', since setting up a misconfiguration (particularly
        # of special remotes), only to have it fail in a subdataset later on
        # with that config, can be quite painful.
        # TODO: messages - this is "create-sibling". Don't confuse existence of
        #       local remotes with existence of the actual remote sibling
        #       in wording
        if existing == 'error':
            # in recursive mode this check could take a substantial amount of
            # time: employ a progress bar (or rather a counter, because we don't
            # know the total in advance)
            pbar_id = 'check-siblings-{}'.format(id(ds))
            log_progress(
                lgr.info, pbar_id,
                'Start checking pre-existing sibling configuration %s', ds,
                label='Query siblings',
                unit=' Siblings',
            )
            # even if we have to fail, let's report all conflicting siblings
            # in subdatasets
            failed = False
            for r in ds.siblings(result_renderer=None,
                                 recursive=recursive,
                                 recursion_limit=recursion_limit):
                log_progress(
                    lgr.info, pbar_id,
                    'Discovered sibling %s in dataset at %s',
                    r['name'], r['path'],
                    update=1,
                    increment=True)
                if not r['type'] == 'sibling' or r['status'] != 'ok':
                    # this is an internal status query that has no consequence
                    # for the outside world. Be silent unless something useful
                    # can be said
                    #yield r
                    continue
                if r['name'] == name:
                    res = get_status_dict(
                        status='error',
                        message="a sibling '{}' is already configured in "
                        "dataset {}".format(name, r['path']),
                        **res_kwargs,
                    )
                    failed = True
                    yield res
                    continue
                if storage_name and r['name'] == storage_name:
                    res = get_status_dict(
                        status='error',
                        message="a sibling '{}' is already configured in "
                        "dataset {}".format(storage_name, r['path']),
                        **res_kwargs,
                    )
                    failed = True
                    yield res
                    continue
            log_progress(
                lgr.info, pbar_id,
                'Finished checking pre-existing sibling configuration %s', ds,
            )
            if failed:
                return

        # TODO: - URL parsing + store creation needs to be RF'ed based on
        #         command abstractions
        #       - more generally consider store creation a dedicated command or
        #         option
        # Note: URL parsing is done twice ATM (for top-level ds). This can't be
        # reduced to a single instance, since rewriting the url based on config
        # could be different for subdatasets.

        create_store(SSHRemoteIO(ssh_host) if ssh_host else LocalIO(),
                     Path(base_path),
                     '1')

        yield from _create_sibling_ria(
            ds,
            url,
            name,
            storage_sibling,
            storage_name,
            existing,
            shared,
            group,
            post_update_hook,
            trust_level,
            res_kwargs)

        if recursive:
            # Note: subdatasets can be treated independently, so go full
            # recursion when querying for them and _no_recursion with the
            # actual call. Theoretically this can be parallelized.

            for subds in ds.subdatasets(fulfilled=True,
                                        recursive=True,
                                        recursion_limit=recursion_limit,
                                        result_xfm='datasets'):
                yield from _create_sibling_ria(
                    subds,
                    url,
                    name,
                    storage_sibling,
                    storage_name,
                    existing,
                    shared,
                    group,
                    post_update_hook,
                    trust_level,
                    res_kwargs)
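
A minimal usage sketch of the public API that the command code above implements, assuming the bound method and keyword names mirror `datalad create-sibling-ria`; the dataset path, store URL, and sibling name are hypothetical.

from datalad.api import Dataset

ds = Dataset("/tmp/myds")  # hypothetical dataset
ds.create_sibling_ria(
    "ria+ssh://example.org/srv/ria-store",  # hypothetical RIA store URL
    name="ria-backup",                      # git sibling to be configured
    existing="error",                       # fail early on conflicting siblings
    recursive=True,                         # also process installed subdatasets
)
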
Beispiel #21
0
def clone_dataset(
        srcs,
        destds,
        reckless=None,
        description=None,
        result_props=None,
        cfg=None):
    """Internal helper to perform cloning without sanity checks (assumed done)

    This helper does not handle any saving of subdataset modification or adding
    in a superdataset.

    Parameters
    ----------
    srcs : list
      Any suitable clone source specifications (paths, URLs)
    destds : Dataset
      Dataset instance for the clone destination
    reckless : {None, 'auto', 'ephemeral', 'shared-...'}, optional
      Mode switch to put cloned dataset into unsafe/throw-away configurations, i.e.
      sacrifice data safety for performance or resource footprint. When None
      and `cfg` is specified, use the value of `datalad.clone.reckless`.
    description : str, optional
      Location description for the annex of the dataset clone (if there is any).
    result_props : dict, optional
      Default properties for any yielded result, passed on to get_status_dict().
    cfg : ConfigManager, optional
      Configuration for parent dataset. This will be queried instead
      of the global DataLad configuration.

    Yields
    ------
    dict
      DataLad result records
    """
    if not result_props:
        # in case the caller had no specific idea on how results should look
        # like, provide sensible defaults
        result_props = dict(
            action='install',
            logger=lgr,
            ds=destds,
        )

    if reckless is None and cfg:
        # if reckless is not explicitly given, but we operate on a
        # superdataset, query whether it has been instructed to operate
        # in a reckless mode, and inherit it for the coming clone
        reckless = cfg.get('datalad.clone.reckless', None)

    dest_path = destds.pathobj

    # decode all source candidate specifications
    candidate_sources = [decode_source_spec(s, cfg=cfg) for s in srcs]

    # now expand the candidate sources with additional variants of the decoded
    # giturl, while duplicating the other properties in the additional records
    # for simplicity. The hope is to overcome a few corner cases and be more
    # robust than git clone
    candidate_sources = [
        dict(props, giturl=s) for props in candidate_sources
        for s in _get_flexible_source_candidates(props['giturl'])
    ]

    # important test! based on this, `rmtree` will happen below after a failed clone
    dest_path_existed = dest_path.exists()
    if dest_path_existed and any(dest_path.iterdir()):
        if destds.is_installed():
            # check if dest was cloned from the given source before:
            # query where it was actually installed from
            track_name, track_url = _get_tracking_source(destds)
            try:
                # this will get us track_url in system native path conventions,
                # whenever it is a path (and not a URL)
                # this is needed to match it to any potentially incoming local
                # source path in the 'notneeded' test below
                track_path = str(Path(track_url))
            except Exception:
                # this should never happen, because Path() will let any non-path stringification
                # pass through unmodified, but we do not want any potential crash due to
                # pathlib behavior changes
                lgr.debug("Unexpected behavior of pathlib!")
                track_path = None
            for cand in candidate_sources:
                src = cand['giturl']
                if track_url == src \
                        or (not is_url(track_url)
                            and get_local_file_url(track_url, compatibility='git') == src) \
                        or track_path == expanduser(src):
                    yield get_status_dict(
                        status='notneeded',
                        message=("dataset %s was already cloned from '%s'",
                                 destds,
                                 src),
                        **result_props)
                    return
        # anything else is an error
        yield get_status_dict(
            status='error',
            message='target path already exists and is not empty, refusing to clone into it',
            **result_props)
        return

    log_progress(
        lgr.info,
        'cloneds',
        'Cloning dataset to %s', destds,
        total=len(candidate_sources),
        label='Clone attempt',
        unit=' Candidate locations',
    )
    error_msgs = OrderedDict()  # accumulate all error messages, keyed by URL
    for cand in candidate_sources:
        log_progress(
            lgr.info,
            'cloneds',
            'Attempting to clone from %s to %s', cand['giturl'], dest_path,
            update=1,
            increment=True)

        clone_opts = {}

        if cand.get('version', None):
            clone_opts['branch'] = cand['version']
        try:
            # TODO for now GitRepo.clone() cannot handle Path instances, and PY35
            # doesn't make it happen seamlessly
            GitRepo.clone(
                path=str(dest_path),
                url=cand['giturl'],
                clone_options=clone_opts,
                create=True)

        except CommandError as e:
            e_stderr = e.stderr

            error_msgs[cand['giturl']] = e
            lgr.debug("Failed to clone from URL: %s (%s)",
                      cand['giturl'], exc_str(e))
            if dest_path.exists():
                lgr.debug("Wiping out unsuccessful clone attempt at: %s",
                          dest_path)
                # We must not just rmtree since it might be curdir etc
                # we should remove all files/directories under it
                # TODO stringification can be removed once pathlib compatible
                # or if PY35 is no longer supported
                rmtree(str(dest_path), children_only=dest_path_existed)

            if e_stderr and 'could not create work tree' in e_stderr.lower():
                # this cannot be fixed by trying another URL
                re_match = re.match(r".*fatal: (.*)$", e_stderr,
                                    flags=re.MULTILINE | re.DOTALL)
                # cancel progress bar
                log_progress(
                    lgr.info,
                    'cloneds',
                    'Completed clone attempts for %s', destds
                )
                yield get_status_dict(
                    status='error',
                    message=re_match.group(1).strip()
                    if re_match else "stderr: " + e_stderr,
                    **result_props)
                return
            # next candidate
            continue

        result_props['source'] = cand
        # do not bother with other sources if succeeded
        break

    log_progress(
        lgr.info,
        'cloneds',
        'Completed clone attempts for %s', destds
    )

    if not destds.is_installed():
        if len(error_msgs):
            if all(not e.stdout and not e.stderr for e in error_msgs.values()):
                # there is nothing we can learn from the actual exception,
                # the exit code is uninformative, the command is predictable
                error_msg = "Failed to clone from all attempted sources: %s"
                error_args = list(error_msgs.keys())
            else:
                error_msg = "Failed to clone from any candidate source URL. " \
                            "Encountered errors per each url were:\n- %s"
                error_args = '\n- '.join(
                    '{}\n  {}'.format(url, exc_str(exc))
                    for url, exc in error_msgs.items()
                )
        else:
            # yoh: Not sure if we ever get here but I felt that there could
            #      be a case when this might happen and original error would
            #      not be sufficient to troubleshoot what is going on.
            error_msg = "Awkward error -- we failed to clone properly. " \
                        "Although no errors were encountered, target " \
                        "dataset at %s seems to be not fully installed. " \
                        "The 'succesful' source was: %s"
            error_args = (destds.path, cand['giturl'])
        yield get_status_dict(
            status='error',
            message=(error_msg, error_args),
            **result_props)
        return

    if not cand.get("version"):
        postclone_check_head(destds)

    # act on --reckless=shared-...
    # must happen prior git-annex-init, where we can cheaply alter the repo
    # setup through safe re-init'ing
    if reckless and reckless.startswith('shared-'):
        lgr.debug('Reinit %s to enable shared access permissions', destds)
        destds.repo.call_git(['init', '--shared={}'.format(reckless[7:])])

    yield from postclonecfg_annexdataset(
        destds,
        reckless,
        description)

    # perform any post-processing that needs to know details of the clone
    # source
    if result_props['source']['type'] == 'ria':
        yield from postclonecfg_ria(destds, result_props['source'])

    if reckless:
        # store the reckless setting in the dataset to make it
        # known to later clones of subdatasets via get()
        destds.config.set(
            'datalad.clone.reckless', reckless,
            where='local',
            reload=True)

    # yield successful clone of the base dataset now, as any possible
    # subdataset clone down below will not alter the Git-state of the
    # parent
    yield get_status_dict(status='ok', **result_props)
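
A hedged sketch of driving this generator directly; the import path is assumed from the DataLad source layout, and the source URL and destination path are hypothetical. Ordinarily this helper is reached through `datalad clone`, which performs the sanity checks mentioned in the docstring.

from datalad.distribution.dataset import Dataset
from datalad.core.distributed.clone import clone_dataset  # assumed import path

destds = Dataset("/tmp/clone-target")  # hypothetical, must not exist or be empty
for res in clone_dataset(["https://example.com/repo.git"], destds):
    print(res["status"], res.get("message"))
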
Beispiel #22
0
def _create_sibling_ria(
        ds,
        url,
        name,
        storage_sibling,
        storage_name,
        existing,
        shared,
        group,
        post_update_hook,
        trust_level,
        res_kwargs):
    # be safe across datasets
    res_kwargs = res_kwargs.copy()
    # update dataset
    res_kwargs['ds'] = ds

    if not isinstance(ds.repo, AnnexRepo):
        # No point in dealing with a special remote when there's no annex.
        # Note, that in recursive invocations this might only apply to some of
        # the datasets. Therefore dealing with it here rather than one level up.
        lgr.debug("No annex at %s. Ignoring special remote options.", ds.path)
        storage_sibling = False
        storage_name = None

    # parse target URL
    try:
        ssh_host, base_path, rewritten_url = verify_ria_url(url, ds.config)
    except ValueError as e:
        yield get_status_dict(
            status='error',
            message=str(e),
            **res_kwargs
        )
        return

    base_path = Path(base_path)

    git_url = decode_source_spec(
        # append dataset id to url and use magic from clone-helper:
        url + '#{}'.format(ds.id),
        cfg=ds.config
    )['giturl']
    # determine layout locations; go for a v1 layout
    repo_path, _, _ = get_layout_locations(1, base_path, ds.id)

    ds_siblings = [r['name'] for r in ds.siblings(result_renderer=None)]
    # Figure whether we are supposed to skip this very dataset
    if existing == 'skip' and (
            name in ds_siblings or (
                storage_name and storage_name in ds_siblings)):
        yield get_status_dict(
            status='notneeded',
            message="Skipped on existing sibling",
            **res_kwargs
        )
        # if we skip here, nothing else can change that decision further
        # down
        return

    # figure whether we need to skip or error due to an existing target repo before
    # we try to init a special remote.
    if ssh_host:
        from datalad import ssh_manager
        ssh = ssh_manager.get_connection(
            ssh_host,
            use_remote_annex_bundle=False)
        ssh.open()

    if existing in ['skip', 'error']:
        config_path = repo_path / 'config'
        # No .git -- if it's an existing repo in a RIA store it should be a
        # bare repo.
        # Theoretically we could have additional checks for whether we have
        # an empty repo dir or a non-bare repo or whatever else.
        if ssh_host:
            try:
                ssh('[ -e {p} ]'.format(p=quote_cmdlinearg(str(config_path))))
                exists = True
            except CommandError:
                exists = False
        else:
            exists = config_path.exists()

        if exists:
            if existing == 'skip':
                # 1. not rendered by default
                # 2. message doesn't show up in ultimate result
                #    record as shown by -f json_pp
                yield get_status_dict(
                    status='notneeded',
                    message="Skipped on existing remote "
                            "directory {}".format(repo_path),
                    **res_kwargs
                )
                return
            else:  # existing == 'error'
                yield get_status_dict(
                    status='error',
                    message="remote directory {} already "
                            "exists.".format(repo_path),
                    **res_kwargs
                )
                return

    if storage_sibling == 'only':
        lgr.info("create storage sibling '{}' ...".format(name))
    else:
        lgr.info("create sibling{} '{}'{} ...".format(
            's' if storage_name else '',
            name,
            " and '{}'".format(storage_name) if storage_name else '',
        ))
    create_ds_in_store(SSHRemoteIO(ssh_host) if ssh_host else LocalIO(),
                       base_path, ds.id, '2', '1')
    if storage_sibling:
        # we are using the main `name`, if the only thing we are creating
        # is the storage sibling
        srname = name if storage_sibling == 'only' else storage_name

        lgr.debug('init special remote {}'.format(srname))
        special_remote_options = [
            'type=external',
            'externaltype=ora',
            'encryption=none',
            'autoenable=true',
            'url={}'.format(url)]
        try:
            ds.repo.init_remote(
                srname,
                options=special_remote_options)
        except CommandError as e:
            if existing == 'reconfigure' \
                    and 'git-annex: There is already a special remote' \
                    in e.stderr:
                # run enableremote instead
                lgr.debug(
                    "special remote '%s' already exists. "
                    "Run enableremote instead.",
                    srname)
                # TODO: Use AnnexRepo.enable_remote (which needs to get
                #       `options` first)
                ds.repo.call_annex([
                    'enableremote',
                    srname] + special_remote_options)
            else:
                yield get_status_dict(
                    status='error',
                    message="initremote failed.\nstdout: %s\nstderr: %s"
                    % (e.stdout, e.stderr),
                    **res_kwargs
                )
                return

        if trust_level:
            ds.repo.call_annex([trust_level, srname])
        # get uuid for use in bare repo's config
        uuid = ds.config.get("remote.{}.annex-uuid".format(srname))

    if storage_sibling == 'only':
        # we can stop here, the rest of the function is about setting up
        # the git remote part of the sibling
        yield get_status_dict(
            status='ok',
            **res_kwargs,
        )
        return

    # 2. create a bare repository in-store:

    lgr.debug("init bare repository")
    # TODO: we should prob. check whether it's there already. How?
    # Note: like the special remote itself, we assume local FS if no
    # SSH host is specified
    disabled_hook = repo_path / 'hooks' / 'post-update.sample'
    enabled_hook = repo_path / 'hooks' / 'post-update'

    if group:
        chgrp_cmd = "chgrp -R {} {}".format(
            quote_cmdlinearg(str(group)),
            quote_cmdlinearg(str(repo_path)))

    if ssh_host:
        ssh('cd {rootdir} && git init --bare{shared}'.format(
            rootdir=quote_cmdlinearg(str(repo_path)),
            shared=" --shared='{}'".format(
                quote_cmdlinearg(shared)) if shared else ''
        ))

        if storage_sibling:
            # write special remote's uuid into git-config, so clone can tell
            # which one it is supposed to be and enable it even with a
            # fallback URL
            ssh("cd {rootdir} && git config datalad.ora-remote.uuid {uuid}"
                "".format(rootdir=quote_cmdlinearg(str(repo_path)),
                          uuid=uuid))

        if post_update_hook:
            ssh('mv {} {}'.format(quote_cmdlinearg(str(disabled_hook)),
                                  quote_cmdlinearg(str(enabled_hook))))

        if group:
            # Either the repository existed before or a new directory was
            # created for it; set its group to the desired one with the
            # chgrp command prepared above
            ssh(chgrp_cmd)
    else:
        gr = GitRepo(repo_path, create=True, bare=True,
                     shared=shared if shared else None)
        if storage_sibling:
            # write special remote's uuid into git-config, so clone can tell
            # which one it is supposed to be and enable it even with a
            # fallback URL
            gr.config.add("datalad.ora-remote.uuid", uuid, where='local')

        if post_update_hook:
            disabled_hook.rename(enabled_hook)
        if group:
            # TODO: do we need a cwd here?
            # chgrp_cmd is a shell command line, so run it through a shell;
            # cwd must be a plain (unquoted) path
            subprocess.run(chgrp_cmd, cwd=ds.path, shell=True)

    # add a git remote to the bare repository
    # Note: needs annex-ignore! Otherwise we might push into dirhash
    # lower annex/object tree instead of mixed, since it's a bare
    # repo. This in turn would be an issue, if we want to pack the
    # entire thing into an archive. Special remote will then not be
    # able to access content in the "wrong" place within the archive
    lgr.debug("set up git remote")
    if name in ds_siblings:
        # otherwise we should have skipped or failed before
        assert existing == 'reconfigure'
    ds.config.set(
        "remote.{}.annex-ignore".format(name),
        value="true",
        where="local")
    ds.siblings(
        'configure',
        name=name,
        url=git_url
        if ssh_host
        else str(repo_path),
        recursive=False,
        # Note, that this should be None if storage_sibling was not set
        publish_depends=storage_name,
        result_renderer=None,
        # Note, that otherwise a subsequent publish will report
        # "notneeded".
        fetch=True
    )

    yield get_status_dict(
        status='ok',
        **res_kwargs,
    )
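
Because the git sibling is configured with `publish_depends` pointing at the storage sibling, a later push to it should also transfer annexed content to the ORA special remote. A hedged sketch with a hypothetical dataset path and sibling name:

from datalad.api import Dataset

ds = Dataset("/tmp/myds")              # hypothetical dataset
ds.push(to="ria-backup", data="auto")  # sibling created by a prior create-sibling-ria call
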
Beispiel #23
0
def postclonecfg_annexdataset(ds, reckless, description=None):
    """If ds "knows annex" -- annex init it, set into reckless etc

    Provides additional tune-up to what is possibly an annex repo, e.g.
    "enables" reckless mode, sets up description
    """
    # in any case check whether we need to annex-init the installed thing:
    if not knows_annex(ds.path):
        # not for us
        return

    # init annex when traces of a remote annex can be detected
    if reckless == 'auto':
        lgr.debug(
            "Instruct annex to hardlink content in %s from local "
            "sources, if possible (reckless)", ds.path)
        ds.config.set(
            'annex.hardlink', 'true', where='local', reload=True)

    lgr.debug("Initializing annex repo at %s", ds.path)
    # Note, that we cannot enforce annex-init via AnnexRepo().
    # If such an instance already exists, its __init__ will not be executed.
    # Therefore do quick test once we have an object and decide whether to call
    # its _init().
    #
    # Additionally, call init if we need to add a description (see #1403),
    # since AnnexRepo.__init__ can only do it with create=True
    repo = AnnexRepo(ds.path, init=True)
    if not repo.is_initialized() or description:
        repo._init(description=description)
    if reckless == 'auto' or (reckless and reckless.startswith('shared-')):
        repo.call_annex(['untrust', 'here'])

    elif reckless == 'ephemeral':
        # with ephemeral we declare 'here' as 'dead' right away, whenever
        # we symlink origin's annex, since availability from 'here' should
        # not be propagated for an ephemeral clone when we publish back to
        # origin.
        # This will cause stuff like this for a locally present annexed file:
        # % git annex whereis d1
        # whereis d1 (0 copies) failed
        # BUT this works:
        # % git annex find . --not --in here
        # % git annex find . --in here
        # d1

        # we don't want annex copy-to origin
        ds.config.set(
            'remote.origin.annex-ignore', 'true',
            where='local')

        ds.repo.set_remote_dead('here')

        if check_symlink_capability(ds.repo.dot_git / 'dl_link_test',
                                    ds.repo.dot_git / 'dl_target_test'):
            # symlink the annex to avoid needless copies in an ephemeral clone
            annex_dir = ds.repo.dot_git / 'annex'
            origin_annex_url = ds.config.get("remote.origin.url", None)
            origin_git_path = None
            if origin_annex_url:
                try:
                    # Deal with file:// scheme URLs as well as plain paths.
                    # If origin isn't local, we have nothing to do.
                    origin_git_path = Path(RI(origin_annex_url).localpath)

                    # we are local; check for a bare repo first to not mess w/
                    # the path
                    if GitRepo(origin_git_path, create=False).bare:
                        # origin is a bare repo -> use path as is
                        pass
                    elif origin_git_path.name != '.git':
                        origin_git_path /= '.git'
                except ValueError:
                    # Note, that accessing localpath on a non-local RI throws
                    # ValueError rather than resulting in an AttributeError.
                    # TODO: Warning level okay or is info level sufficient?
                    # Note, that setting annex-dead is independent of
                    # symlinking .git/annex. It might still make sense to
                    # have an ephemeral clone that doesn't propagate its avail.
                    # info. Therefore don't fail altogether.
                    lgr.warning("reckless=ephemeral mode: origin doesn't seem "
                                "local: %s\nno symlinks being used",
                                origin_annex_url)
            if origin_git_path:
                # TODO make sure that we do not delete any unique data
                rmtree(str(annex_dir)) \
                    if not annex_dir.is_symlink() else annex_dir.unlink()
                annex_dir.symlink_to(origin_git_path / 'annex',
                                     target_is_directory=True)
        else:
            # TODO: What level? + note that annex-dead is independent
            lgr.warning("reckless=ephemeral mode: Unable to create symlinks on "
                        "this file system.")

    srs = {True: [], False: []}  # special remotes by "autoenable" key
    remote_uuids = None  # might be necessary to discover known UUIDs

    repo_config = repo.config
    # Note: The purpose of this function is to inform the user. So if something
    # looks misconfigured, we'll warn and move on to the next item.
    for uuid, config in repo.get_special_remotes().items():
        sr_name = config.get('name', None)
        if sr_name is None:
            lgr.warning(
                'Ignoring special remote %s because it does not have a name. '
                'Known information: %s',
                uuid, config)
            continue
        sr_autoenable = config.get('autoenable', False)
        try:
            sr_autoenable = ensure_bool(sr_autoenable)
        except ValueError:
            lgr.warning(
                'Failed to process "autoenable" value %r for sibling %s in '
                'dataset %s as bool. '
                'You might need to enable it later manually and/or fix it up to'
                ' avoid this message in the future.',
                sr_autoenable, sr_name, ds.path)
            continue

        # If it looks like a type=git special remote, make sure we have up to
        # date information. See gh-2897.
        if sr_autoenable and repo_config.get("remote.{}.fetch".format(sr_name)):
            try:
                repo.fetch(remote=sr_name)
            except CommandError as exc:
                lgr.warning("Failed to fetch type=git special remote %s: %s",
                            sr_name, exc_str(exc))

        # determine whether there is a registered remote with matching UUID
        if uuid:
            if remote_uuids is None:
                remote_uuids = {
                    # Check annex-config-uuid first. For sameas annex remotes,
                    # this will point to the UUID for the configuration (i.e.
                    # the key returned by get_special_remotes) rather than the
                    # shared UUID.
                    (repo_config.get('remote.%s.annex-config-uuid' % r) or
                     repo_config.get('remote.%s.annex-uuid' % r))
                    for r in repo.get_remotes()
                }
            if uuid not in remote_uuids:
                srs[sr_autoenable].append(sr_name)

    if srs[True]:
        lgr.debug(
            "configuration for %s %s added because of autoenable,"
            " but no UUIDs for them yet known for dataset %s",
            # since we are only at debug level, we could call things their
            # proper names
            single_or_plural("special remote",
                             "special remotes", len(srs[True]), True),
            ", ".join(srs[True]),
            ds.path
        )

    if srs[False]:
        # if has no auto-enable special remotes
        lgr.info(
            'access to %s %s not auto-enabled, enable with:\n'
            '\t\tdatalad siblings -d "%s" enable -s %s',
            # but since humans might read it, we better confuse them with our
            # own terms!
            single_or_plural("dataset sibling",
                             "dataset siblings", len(srs[False]), True),
            ", ".join(srs[False]),
            ds.path,
            srs[False][0] if len(srs[False]) == 1 else "SIBLING",
        )

    # we have just cloned the repo, so it has 'origin', configure any
    # reachable origin of origins
    yield from configure_origins(ds, ds)
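
The info message above points users at `datalad siblings enable` for special remotes that were not auto-enabled. A hedged sketch of the equivalent Python call, with a hypothetical clone path and sibling name:

from datalad.api import Dataset

ds = Dataset("/tmp/myclone")              # hypothetical clone location
ds.siblings("enable", name="my-storage")  # hypothetical special remote name
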
Beispiel #24
0
def test_unlock_gh_5456(path=None):
    path = Path(path)
    unrelated_super = Dataset(path).create(annex=False, force=True)
    ds = Dataset(path / 'subdir' / 'sub').create()
    ds.unlock('.')
Beispiel #25
0
def _test_bare_git_version_1(host, dspath, store):
    # This test should take a dataset and create a bare repository at the remote
    # end from it.
    # Given that it is placed correctly within a tree of datasets, that remote
    # thing should then be usable as an ora-remote as well as a git-type
    # remote.
    # Note: Usability of git remote by annex depends on dataset layout version
    #       (dirhashlower vs. -mixed).
    #       For version 1 (lower) upload and consumption should be
    #       interchangeable. It doesn't matter which remote is used for what
    #       direction.
    ds_path = Path(dspath)
    store = Path(store)
    ds = Dataset(ds_path).create()
    populate_dataset(ds)
    ds.save()

    bare_repo_path, _, _ = get_layout_locations(1, store, ds.id)
    # Use git to make sure the remote end is what git thinks a bare clone of it
    # should look like
    subprocess.run([
        'git', 'clone', '--bare',
        quote_cmdlinearg(str(dspath)),
        quote_cmdlinearg(str(bare_repo_path))
    ])

    if host:
        url = "ria+ssh://{host}{path}".format(host=host, path=store)
    else:
        url = "ria+{}".format(store.as_uri())
    init_opts = common_init_opts + ['url={}'.format(url)]
    # set up store:
    io = SSHRemoteIO(host) if host else LocalIO()
    create_store(io, store, '1')
    # set up the dataset location, too.
    # Note: Dataset layout version 1 (dirhash lower):
    create_ds_in_store(io, store, ds.id, '1', '1')

    # Now, let's have the bare repo as a git remote and use it with annex
    git_url = "ssh://{host}{path}".format(host=host, path=bare_repo_path) \
        if host else bare_repo_path.as_uri()
    ds.repo.add_remote('bare-git', git_url)
    ds.repo.enable_remote('bare-git')

    # copy files to the remote
    ds.repo.copy_to('.', 'bare-git')
    eq_(len(ds.repo.whereis('one.txt')), 2)

    # now we can drop all content locally, reobtain it, and survive an
    # fsck
    ds.drop('.')
    ds.get('.')
    assert_status('ok', [annexjson2result(r, ds) for r in ds.repo.fsck()])

    # Now, add the ora remote:
    ds.repo.init_remote('ora-remote', options=init_opts)
    # fsck to make availability known
    assert_status('ok', [
        annexjson2result(r, ds)
        for r in ds.repo.fsck(remote='ora-remote', fast=True)
    ])
    eq_(len(ds.repo.whereis('one.txt')), 3)

    # Now move content from git-remote to local and see it not being available
    # via bare-git anymore.
    ds.repo.call_annex(['move', '--all', '--from=bare-git'])
    # ora-remote doesn't know yet:
    eq_(len(ds.repo.whereis('one.txt')), 2)

    # But after fsck it does:
    fsck_res = [
        annexjson2result(r, ds)
        for r in ds.repo.fsck(remote='ora-remote', fast=True)
    ]
    assert_result_count(fsck_res,
                        1,
                        status='error',
                        message='** Based on the location log, one.txt\n'
                        '** was expected to be present, '
                        'but its content is missing.')
    assert_result_count(fsck_res,
                        1,
                        status='error',
                        message='** Based on the location log, subdir/two\n'
                        '** was expected to be present, '
                        'but its content is missing.')
    eq_(len(ds.repo.whereis('one.txt')), 1)
    # and the other way around: upload via ora-remote and have it available via
    # git-remote:
    ds.repo.copy_to('.', 'ora-remote')
    # fsck to make availability known
    assert_status('ok', [
        annexjson2result(r, ds)
        for r in ds.repo.fsck(remote='bare-git', fast=True)
    ])
    eq_(len(ds.repo.whereis('one.txt')), 3)
Beispiel #26
0
def test_update_follow_parentds_lazy(path):
    path = Path(path)
    ds_src = Dataset(path / "source").create()
    ds_src_s0 = ds_src.create("s0")
    ds_src_s0_s0 = ds_src_s0.create("s0")
    ds_src_s0.create("s1")
    ds_src_s1 = ds_src.create("s1")
    ds_src.create("s2")
    ds_src.save(recursive=True)
    assert_repo_status(ds_src.path)

    ds_clone = install(source=ds_src.path,
                       path=path / "clone",
                       recursive=True,
                       result_xfm="datasets")
    ds_clone_s0 = Dataset(ds_clone.pathobj / "s0")
    ds_clone_s0_s0 = Dataset(ds_clone.pathobj / "s0" / "s0")
    ds_clone_s0_s1 = Dataset(ds_clone.pathobj / "s0" / "s1")
    ds_clone_s1 = Dataset(ds_clone.pathobj / "s1")
    ds_clone_s2 = Dataset(ds_clone.pathobj / "s2")

    (ds_src_s0_s0.pathobj / "foo").write_text("in s0 s0")
    ds_src_s0_s0.save()
    (ds_src_s1.pathobj / "foo").write_text("in s1")
    ds_src.save(recursive=True)
    # State:
    # .
    # |-- s0
    # |   |-- s0
    # |   `-- s1  * matches registered commit
    # |-- s1
    # `-- s2      * matches registered commit
    res = ds_clone.update(follow="parentds-lazy",
                          merge=True,
                          recursive=True,
                          on_failure="ignore")
    on_adjusted = ds_clone.repo.is_managed_branch()
    # For adjusted branches, follow=parentds* bails with an impossible result,
    # so the s0 update doesn't get brought in and s0_s0 also matches the
    # registered commit.
    n_notneeded_expected = 3 if on_adjusted else 2
    assert_result_count(res,
                        n_notneeded_expected,
                        action="update",
                        status="notneeded")
    assert_in_results(res,
                      action="update",
                      status="notneeded",
                      path=ds_clone_s0_s1.repo.path)
    assert_in_results(res,
                      action="update",
                      status="notneeded",
                      path=ds_clone_s2.repo.path)
    if on_adjusted:
        assert_in_results(res,
                          action="update",
                          status="notneeded",
                          path=ds_clone_s0_s0.repo.path)
        assert_repo_status(
            ds_clone.path,
            modified=[ds_clone_s0.repo.path, ds_clone_s1.repo.path])
    else:
        assert_repo_status(ds_clone.path)
Beispiel #27
0
def _put_in_zip(zip, path, records):
    for k, v in records.items():
        if isinstance(v, dict):
            _put_in_zip(zip, path + [k], v)
        else:
            zip.writestr(str(Path(*path) / k), v)
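
A brief usage sketch of this helper: nested dictionary keys become path components inside the archive. The archive path is hypothetical.

import zipfile

records = {"README.txt": "top level", "meta": {"info.json": '{"a": 1}'}}
with zipfile.ZipFile("/tmp/example.zip", "w") as zf:  # hypothetical target path
    _put_in_zip(zf, [], records)
# resulting members: "README.txt" and "meta/info.json"
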
Beispiel #28
0
    def __call__(path=None,
                 initopts=None,
                 *,
                 force=False,
                 description=None,
                 dataset=None,
                 annex=True,
                 fake_dates=False,
                 cfg_proc=None):
        # we only perform negative tests below
        no_annex = not annex

        if dataset:
            if isinstance(dataset, Dataset):
                ds = dataset
            else:
                ds = Dataset(dataset)
            refds_path = ds.path
        else:
            ds = refds_path = None

        # two major cases
        # 1. we got a `dataset` -> we either want to create it (path is None),
        #    or another dataset in it (path is not None)
        # 2. we got no dataset -> we want to create a fresh dataset at the
        #    desired location, either at `path` or PWD

        # sanity check first
        if no_annex:
            if description:
                raise ValueError("Incompatible arguments: cannot specify "
                                 "description for annex repo and declaring "
                                 "no annex repo.")

        if (isinstance(initopts, (list, tuple))
                and '--bare' in initopts) or (isinstance(initopts, dict)
                                              and 'bare' in initopts):
            raise ValueError(
                "Creation of bare repositories is not supported. Consider "
                "one of the create-sibling commands, or use "
                "Git to init a bare repository and push an existing dataset "
                "into it.")

        if path:
            path = resolve_path(path, dataset)

        path = path if path \
            else getpwd() if ds is None \
            else refds_path

        # we know that we need to create a dataset at `path`
        assert (path is not None)

        # assure cfg_proc is a list (relevant if used via Python API)
        cfg_proc = ensure_list(cfg_proc)

        # prep for yield
        res = dict(action='create',
                   path=str(path),
                   logger=lgr,
                   type='dataset',
                   refds=refds_path)

        refds = None
        if refds_path and refds_path != str(path):
            refds = require_dataset(refds_path,
                                    check_installed=True,
                                    purpose='create a subdataset')

            path_inrefds = path_under_rev_dataset(refds, path)
            if path_inrefds is None:
                yield dict(
                    res,
                    status='error',
                    message=(
                        "dataset containing given paths is not underneath "
                        "the reference dataset %s: %s", ds, str(path)),
                )
                return

        # try to locate an immediate parent dataset
        # we want to know this (irrespective of whether we plan on adding
        # this new dataset to a parent) in order to avoid conflicts with
        # a potentially absent/uninstalled subdataset of the parent
        # in this location
        # it will cost some filesystem traversal though...
        parentds_path = get_dataset_root(
            op.normpath(op.join(str(path), os.pardir)))
        if parentds_path:
            prepo = GitRepo(parentds_path)
            parentds_path = Path(parentds_path)
            # we cannot get away with a simple
            # GitRepo.get_content_info(), as we need to detect
            # uninstalled/added subdatasets too
            check_path = Path(path)
            pstatus = prepo.status(
                untracked='no',
                # limit query to target path for a potentially massive speed-up
                paths=[check_path.relative_to(parentds_path)])
            if (not pstatus.get(check_path, {}).get("type") == "dataset"
                    and any(check_path == p or check_path in p.parents
                            for p in pstatus)):
                # redo the check in a slower fashion, it is already broken
                # let's take our time for a proper error message
                conflict = [
                    p for p in pstatus
                    if check_path == p or check_path in p.parents
                ]
                res.update({
                    'status':
                    'error',
                    'message':
                    ('collision with content in parent dataset at %s: %s',
                     str(parentds_path), [str(c) for c in conflict])
                })
                yield res
                return
            if not force:
                # another set of checks to see whether the target path is pointing
                # into a known subdataset that is not around ATM
                subds_status = {
                    parentds_path / k.relative_to(prepo.path)
                    for k, v in pstatus.items()
                    if v.get('type', None) == 'dataset'
                }
                check_paths = [check_path]
                check_paths.extend(check_path.parents)
                if any(p in subds_status for p in check_paths):
                    conflict = [p for p in check_paths if p in subds_status]
                    res.update({
                        'status':
                        'error',
                        'message':
                        ('collision with %s (dataset) in dataset %s',
                         str(conflict[0]), str(parentds_path))
                    })
                    yield res
                    return

        # important to use the given Dataset object to avoid spurious ID
        # changes with not-yet-materialized Datasets
        tbds = ds if isinstance(ds, Dataset) and \
            ds.path == path else Dataset(str(path))

        # don't create in non-empty directory without `force`:
        if op.isdir(tbds.path) and listdir(tbds.path) != [] and not force:
            res.update({
                'status':
                'error',
                'message':
                'will not create a dataset in a non-empty directory, use '
                '`--force` option to ignore'
            })
            yield res
            return

        # Check if specified cfg_proc(s) can be discovered, storing
        # the results so they can be used when the time comes to run
        # the procedure. If a procedure cannot be found, raise an
        # error to prevent creating the dataset.
        cfg_proc_specs = []
        if cfg_proc:
            discovered_procs = tbds.run_procedure(
                discover=True,
                result_renderer='disabled',
                return_type='generator',
            )
            for cfg_proc_ in cfg_proc:
                for discovered_proc in discovered_procs:
                    if discovered_proc['procedure_name'] == 'cfg_' + cfg_proc_:
                        cfg_proc_specs.append(discovered_proc)
                        break
                else:
                    raise ValueError("Cannot find procedure with name "
                                     "'%s'" % cfg_proc_)

        if initopts is not None and isinstance(initopts, list):
            initopts = {'_from_cmdline_': initopts}

        # Note for the code below:
        # OPT: be "smart" and avoid re-resolving .repo -- expensive in DataLad
        # Re-use tbrepo instance, do not use tbds.repo

        # create and configure desired repository
        # also provides initial set of content to be tracked with git (not annex)
        if no_annex:
            tbrepo, add_to_git = _setup_git_repo(path, initopts, fake_dates)
        else:
            tbrepo, add_to_git = _setup_annex_repo(path, initopts, fake_dates,
                                                   description)

        # OPT: be "smart" and avoid re-resolving .repo -- expensive in DataLad
        # Note: must not happen earlier (before the if/else above), since then
        # it would not be "smart"
        tbds_config = tbds.config

        # record an ID for this repo for the afterlife
        # to be able to track siblings and children
        id_var = 'datalad.dataset.id'
        # Note, that Dataset property `id` will change when we unset the
        # respective config. Therefore store it before:
        tbds_id = tbds.id
        if id_var in tbds_config:
            # make sure we reset this variable completely, in case of a
            # re-create
            tbds_config.unset(id_var, scope='branch')

        if _seed is None:
            # just the standard way
            # use a fully random identifier (i.e. UUID version 4)
            uuid_id = str(uuid.uuid4())
        else:
            # Let's generate preseeded ones
            uuid_id = str(uuid.UUID(int=random.getrandbits(128)))
        tbds_config.add(id_var,
                        tbds_id if tbds_id is not None else uuid_id,
                        scope='branch',
                        reload=False)

        # make config overrides permanent in the repo config
        # this is similar to what `annex init` does
        # we are only doing this for config overrides and do not expose
        # a dedicated argument, because it is sufficient for the cmdline
        # and unnecessary for the Python API (there could simply be a
        # subsequent ds.config.add() call)
        for k, v in tbds_config.overrides.items():
            tbds_config.add(k, v, scope='local', reload=False)

        # all config manipulation is done -> full reload
        tbds_config.reload()

        # must use the repo.pathobj as this will have resolved symlinks
        add_to_git[tbrepo.pathobj / '.datalad'] = {
            'type': 'directory',
            'state': 'untracked'
        }

        # save everything, we need to do this now and cannot merge with the
        # call below, because we may need to add this subdataset to a parent
        # but cannot until we have a first commit
        tbrepo.save(
            message='[DATALAD] new dataset',
            git=True,
            # we have to supply our own custom status, as the repo does
            # not have a single commit yet and there is no HEAD reference
            # TODO make `GitRepo.status()` robust to this state.
            _status=add_to_git,
        )

        for cfg_proc_spec in cfg_proc_specs:
            yield from tbds.run_procedure(
                cfg_proc_spec,
                result_renderer='disabled',
                return_type='generator',
            )

        # the next only makes sense if we saved the created dataset,
        # otherwise we have no committed state to be registered
        # in the parent
        if isinstance(refds, Dataset) and refds.path != tbds.path:
            # we created a dataset in another dataset
            # -> make submodule
            yield from refds.save(
                path=tbds.path,
                return_type='generator',
                result_renderer='disabled',
            )

        res.update({'status': 'ok'})
        yield res
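
A minimal sketch of the public `create` API that the command code above implements; the target path is hypothetical and `text2git` is assumed to be one of the configuration procedures shipped with DataLad.

from datalad.api import create

ds = create(path="/tmp/new-dataset", cfg_proc=["text2git"])
print(ds.id)  # the freshly recorded datalad.dataset.id
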
Beispiel #29
0
def test_datalad_credential_helper(path=None):

    ds = Dataset(path).create()

    # tell git to use git-credential-datalad
    ds.config.add('credential.helper', 'datalad', scope='local')
    ds.config.add('datalad.credentials.githelper.noninteractive',
                  'true',
                  scope='global')

    from datalad.downloaders.providers import Providers

    url1 = "https://datalad-test.org/some"
    url2 = "https://datalad-test.org/other"
    provider_name = "datalad-test.org"

    # `Providers` code is old and only considers a dataset root based on PWD
    # for config lookup. contextmanager below can be removed once the
    # provider/credential system is redesigned.
    with chpwd(ds.path):

        gitcred = GitCredentialInterface(url=url1, repo=ds)

        # There's nothing set up yet, helper should return empty
        gitcred.fill()
        eq_(gitcred['username'], '')
        eq_(gitcred['password'], '')

        # store new credentials
        # Note, that `Providers.enter_new()` currently uses user-level config
        # files for storage only. TODO: make that an option!
        # To not mess with existing ones, fail if it already exists:

        cfg_file = Path(Providers._get_providers_dirs()['user']) \
                   / f"{provider_name}.cfg"
        assert_false(cfg_file.exists())

        # Make sure we clean up
        from datalad.tests import _TEMP_PATHS_GENERATED
        _TEMP_PATHS_GENERATED.append(str(cfg_file))

        # Give credentials to git and ask it to store them:
        gitcred = GitCredentialInterface(url=url1,
                                         username="******",
                                         password="******",
                                         repo=ds)
        gitcred.approve()

        assert_true(cfg_file.exists())
        providers = Providers.from_config_files()
        p1 = providers.get_provider(url=url1, only_nondefault=True)
        assert_is_instance(p1.credential, UserPassword)
        eq_(p1.credential.get('user'), 'dl-user')
        eq_(p1.credential.get('password'), 'dl-pwd')

        # default regex should be host only, so matching url2, too
        p2 = providers.get_provider(url=url2, only_nondefault=True)
        assert_is_instance(p2.credential, UserPassword)
        eq_(p2.credential.get('user'), 'dl-user')
        eq_(p2.credential.get('password'), 'dl-pwd')

        # git, too, should now find it for both URLs
        gitcred = GitCredentialInterface(url=url1, repo=ds)
        gitcred.fill()
        eq_(gitcred['username'], 'dl-user')
        eq_(gitcred['password'], 'dl-pwd')

        gitcred = GitCredentialInterface(url=url2, repo=ds)
        gitcred.fill()
        eq_(gitcred['username'], 'dl-user')
        eq_(gitcred['password'], 'dl-pwd')

        # Rejection must not currently lead to deleting anything, since we would
        # delete too broadly.
        gitcred.reject()
        assert_true(cfg_file.exists())
        gitcred = GitCredentialInterface(url=url1, repo=ds)
        gitcred.fill()
        eq_(gitcred['username'], 'dl-user')
        eq_(gitcred['password'], 'dl-pwd')
        dlcred = UserPassword(name=provider_name)
        eq_(dlcred.get('user'), 'dl-user')
        eq_(dlcred.get('password'), 'dl-pwd')
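
A hedged sketch of exercising the configured helper through git itself, which is roughly what `GitCredentialInterface.fill()` wraps; the repository path is hypothetical, the URL matches the one used in the test above.

import subprocess

out = subprocess.run(
    ["git", "credential", "fill"],
    input="url=https://datalad-test.org/some\n\n",
    capture_output=True, text=True,
    cwd="/tmp/myds",  # hypothetical repo with credential.helper=datalad configured
)
print(out.stdout)  # expected to contain username=... and password=... lines
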
Beispiel #30
0
    def __call__(keyfile=None,
                 merge=False,
                 force_update=False,
                 bids=False,
                 non_bids_dir='non-bids',
                 dataset=None):
        ds = require_dataset(dataset, check_installed=True, purpose='update')

        repo = ds.repo
        if not keyfile:
            # will error out, if no config was given
            keyfile = repo.config.obtain('datalad.ukbiobank.keyfile')

        # prep for yield
        res = dict(
            action='ukb_update',
            path=ds.path,
            type='dataset',
            logger=lgr,
            refds=ds.path,
        )

        if repo.dirty:
            yield dict(
                res,
                status='error',
                message="Refuse to operate on dirty dataset",
            )
            return

        # check if we have 'ukbfetch' before we start fiddling with the dataset
        # and leave it in a mess for no reason
        try:
            subprocess.run(
                # pull version info
                ['ukbfetch', '-i'],
                capture_output=True,
            )
        except Exception as e:
            raise RuntimeError(
                "Cannot execute 'ukbfetch'. Original error: {}".format(e))

        # just to be nice, and to be able to check it out again,
        # when we are done
        initial_branch = repo.get_active_branch()
        initial_incoming = repo.get_hexsha('incoming')

        # make sure we are in incoming
        repo.call_git(['checkout', 'incoming'])

        # first wipe out all prev. downloaded zip files so we can detect
        # when some files are no longer available
        for fp in repo.pathobj.glob('[0-9]*_[0-9]*_[0-9]_[0-9].*'):
            fp.unlink()

        # a place to put the download logs
        # better be semi-persistent to ease inspection
        tmpdir = repo.pathobj / repo.get_git_dir(repo) / 'tmp' / 'ukb'
        tmpdir.mkdir(parents=True, exist_ok=True)

        # redownload, run with explicit mode, because we just deleted the
        # ZIP files and that is OK
        ds.run(
            cmd='ukbfetch -v -a{} -b.ukbbatch -o{}'.format(
                quote_cmdlinearg(keyfile),
                quote_cmdlinearg(str(tmpdir)),
            ),
            explicit=True,
            outputs=['.'],
            message="Update from UKbiobank",
        )

        # TODO what if something broke before? needs force switch
        if not force_update and repo.get_hexsha() == initial_incoming:
            yield dict(
                res,
                status='notneeded',
                message='No new content available',
            )
            repo.call_git(['checkout', initial_branch])
            # TODO drop?
            return

        # onto extraction and transformation of downloaded content
        repo.call_git(['checkout', 'incoming-processed'])

        # mark the incoming change as merged
        # (but we do not actually want any branch content)
        repo.call_git(['merge', 'incoming', '--strategy=ours', 'incoming'])

        for fp in repo.get_content_info(ref='incoming-processed',
                                        eval_file_type=False):
            fp.unlink()

        subid = None
        if bids:
            from datalad_ukbiobank.ukb2bids import restructure_ukb2bids
            # get participant ID from batch file
            subid = list(
                repo.call_git_items_(["cat-file", "-p", "incoming:.ukbbatch"
                                      ]))[0].split(maxsplit=1)[0]

        # discover all zip files present in the last commit in 'incoming'
        for fp, props in repo.get_content_annexinfo(
                ref='incoming', eval_availability=False).items():
            if fp.name.startswith('.'):
                # skip internals
                continue
            # we have to extract into per-instance directories, otherwise files
            # would conflict
            ids = fp.stem.split('_')
            if len(ids) < 3:
                raise RuntimeError(
                    'Unrecognized filename structure: {}'.format(fp))
            extract_dir = repo.pathobj / 'instance-{}'.format(ids[2])
            extract_dir.mkdir(exist_ok=True)

            if fp.suffix == '.zip':
                with chpwd(extract_dir):
                    # extract and add their content
                    AddArchiveContent.__call__(
                        props['key'],
                        key=True,
                        annex=repo,
                        # --use-current-dir due to
                        # https://github.com/datalad/datalad/issues/3995
                        use_current_dir=True,
                        allow_dirty=True,
                        commit=False,
                    )
            else:
                # move into the instance dir; strip the participant ID and the
                # instance ID, but keep the array index
                # e.g. -> 25747_3_0.adv -> instance-3/25747_0
                repo.call_git([
                    'annex', 'fromkey', props['key'],
                    str(extract_dir /
                        ('_'.join(ids[1::2]) + ''.join(fp.suffixes)))
                ])

            if bids:
                yield from restructure_ukb2bids(
                    ds,
                    subid=subid,
                    unrecognized_dir=Path('ses-{}'.format(ids[2])) /
                    non_bids_dir,
                    base_path=extract_dir,
                    session=ids[2],
                )

        # save whatever the state is now, `save` will discover deletions
        # automatically and also commit them -- wonderful!
        ds.save(message="Track ZIP file content")
        yield dict(
            res,
            status='ok',
        )

        if not merge:
            return

        # and update active branch
        repo.call_git(['checkout', initial_branch])

        if initial_branch in ('incoming', 'incoming-processed'):
            yield dict(
                res,
                action='ukb_merge_update',
                status='impossible',
                message='Refuse to merge into incoming* branch',
            )
            return

        repo.call_git([
            'merge', '-m', "Merge update from UKbiobank", 'incoming-processed'
        ])

        yield dict(
            res,
            action='ukb_merge_update',
            status='ok',
        )
        return
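
A hedged sketch of invoking this command through the Python API of the datalad-ukbiobank extension; the bound method name `ukb_update`, the dataset path, and the keyfile location are assumptions.

from datalad.api import Dataset

ds = Dataset("/tmp/ukb-sub-01")  # hypothetical UKB participant dataset
ds.ukb_update(keyfile=".ukbkey", merge=True, bids=True)  # assumed bound method name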