Example #1
def test_get_local_file_url():
    for path, url in (
                # relpaths are special-cased below
                ('test.txt', 'test.txt'),
                # static copy of "most_obscure_name"
                (' "\';a&b&cΔЙקم๗あ `| ',
                 # and translation by google chrome
                 "%20%22%27%3Ba%26b%26c%CE%94%D0%99%D7%A7%D9%85%E0%B9%97%E3%81%82%20%60%7C%20"),
            ) + (
                ('C:\\Windows\\Notepad.exe', 'file://C/Windows/Notepad.exe'),
            ) if on_windows else (
                ('/a', 'file:///a'),
                ('/a/b/c', 'file:///a/b/c'),
                ('/a~', 'file:///a~'),
                # there are no files with trailing slashes in the name
                #('/a b/', 'file:///a%20b/'),
                ('/a b/name', 'file:///a%20b/name'),
            ):
        if isabs(path):
            eq_(get_local_file_url(path), url)
        else:
            eq_(get_local_file_url(path),
                '/'.join((
                    get_local_file_url(os.getcwd()),
                    url))
            )
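For orientation, here is a minimal, hypothetical sketch of calling get_local_file_url directly with the compatibility flavors used by the later examples; the path below is made up and no particular output is asserted, since the exact encoding depends on the platform and the requested flavor:

from datalad.support.network import get_local_file_url

p = '/tmp/some dir/file.txt'                       # hypothetical local path
print(get_local_file_url(p))                       # default flavor
print(get_local_file_url(p, compatibility='git'))  # flavor passed to `git clone` in later examples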
Example #2
def test_get_local_file_url():
    for path, url in (
            # relpaths are special-cased below
        ('test.txt', 'test.txt'),
    ) + (
        ('C:\\Windows\\notepad.exe', 'file://C/Windows/notepad.exe'),
    ) if on_windows else (
            # static copy of "most_obscure_name"
        (
            ' "\';a&b&cΔЙקم๗あ `| ',
            # and translation by google chrome
            "%20%22%27%3Ba%26b%26c%CE%94%D0%99%D7%A7%D9%85%E0%B9%97%E3%81%82%20%60%7C%20"
        ),
        ('/a', 'file:///a'),
        ('/a/b/c', 'file:///a/b/c'),
        ('/a~', 'file:///a~'),
            # there are no files with trailing slashes in the name
            #('/a b/', 'file:///a%20b/'),
        ('/a b/name', 'file:///a%20b/name'),
    ):
        try:
            # Yarik found no better way to trigger.  .decode() isn't enough
            print("D: %s" % path)
        except UnicodeEncodeError:
            if sys.version_info < (3, 7):
                # observed test failing on ubuntu 18.04 with python 3.6
                # (reproduced locally in a conda env with python 3.6.10 when LANG=C);
                # we will just skip this tricky case
                continue
            raise
        if isabs(path):
            eq_(get_local_file_url(path), url)
        else:
            eq_(get_local_file_url(path), '/'.join(
                (get_local_file_url(os.getcwd()), url)))
Example #3
def test_source_candidate_subdataset(store1=None,
                                     store2=None,
                                     intermediate=None,
                                     super=None,
                                     clone=None):

    # This tests the scenario of gh-6159.
    # However, the actual point is to test that `get` does not overwrite a
    # source candidate config in subdatasets, if they already have such a
    # config. This could come from any postclone_cfg routine, but the only one
    # actually doing this ATM is postclone_cfg_ria.

    ds = Dataset(intermediate).create(force=True)
    ds.create("sub1", force=True)
    ds.create("sub2", force=True)
    ds.save(recursive=True)
    ria_url_1 = "ria+" + get_local_file_url(store1, compatibility='git')
    ds.create_sibling_ria(ria_url_1,
                          "firststore",
                          recursive=True,
                          new_store_ok=True)
    ds.push(".", to="firststore", recursive=True)
    superds = Dataset(super).create()
    superds.clone(source=ria_url_1 + "#" + ds.id, path="intermediate")
    ria_url_2 = "ria+" + get_local_file_url(store2, compatibility='git')
    superds.create_sibling_ria(ria_url_2, "secondstore", new_store_ok=True)
    superds.push(".", to="secondstore")

    cloneds = install(clone, source=ria_url_2 + "#" + superds.id)

    # This would fail if source candidates weren't right, since cloneds only
    # knows the second store so far (which doesn't have the subdatasets).
    cloneds.get("intermediate", recursive=True)
Example #4
def test_no_storage(store1, store2, ds_path):
    store1_url = 'ria+' + get_local_file_url(store1)
    store2_url = 'ria+' + get_local_file_url(store2)

    ds = Dataset(ds_path).create(force=True)
    ds.save(recursive=True)
    assert_repo_status(ds.path)

    res = ds.create_sibling_ria(store1_url,
                                "datastore1",
                                storage_sibling=False)
    assert_result_count(res, 1, status='ok', action='create-sibling-ria')
    eq_({'datastore1', 'here'},
        {s['name']
         for s in ds.siblings(result_renderer=None)})

    # deprecated way of disabling storage still works
    res = ds.create_sibling_ria(store2_url,
                                "datastore2",
                                disable_storage__=True)
    assert_result_count(res, 1, status='ok', action='create-sibling-ria')
    eq_({'datastore2', 'datastore1', 'here'},
        {s['name']
         for s in ds.siblings(result_renderer=None)})

    # smoke test that we can push to it
    res = ds.push(to='datastore1')
    assert_status('ok', res)
    # but nothing was copied, because there is no storage sibling
    assert_result_count(res, 0, action='copy')
Example #5
def test_install_dataset_from_just_source(src_repo=None, path=None):

    src_ds = Dataset(src_repo).create(result_renderer='disabled', force=True)
    src_ds.save(['INFO.txt', 'test.dat'], to_git=True)
    src_ds.save('test-annex.dat', to_git=False)
    # equivalent repo on github:
    src_url = "https://github.com/datalad/testrepo--basic--r1.git"
    sources = [
        src_ds.path,
        get_local_file_url(src_ds.path, compatibility='git')
    ]
    if not dl_cfg.get('datalad.tests.nonetwork'):
        sources.append(src_url)

    for url in sources:

        with chpwd(path, mkdir=True):
            ds = install(source=url)

        ok_startswith(ds.path, path)
        ok_(ds.is_installed())
        ok_(GitRepo.is_valid_repo(ds.path))
        assert_repo_status(ds.path, annex=None)
        assert_in('INFO.txt', ds.repo.get_indexed_files())

        # cleanup before next iteration
        rmtree(path)
Example #6
def test_custom_call_fmt(path, local_file):
    ds = Dataset(path).create()
    subds = ds.create('sub')

    # plug in a proper singularity image
    subds.containers_add(
        'mycontainer',
        url=get_local_file_url(op.join(local_file, 'some_container.img')),
        image='righthere',
        call_fmt='echo image={img} cmd={cmd} img_dspath={img_dspath} '
        # and environment variable being set/propagated by default
        'name=$DATALAD_CONTAINER_NAME')
    ds.save()  # record the effect in super-dataset

    # Running should work fine either within sub or within super
    out = WitlessRunner(cwd=subds.path).run(
        ['datalad', 'containers-run', '-n', 'mycontainer', 'XXX'],
        protocol=StdOutCapture)
    assert_in('image=righthere cmd=XXX img_dspath=. name=mycontainer',
              out['stdout'])

    out = WitlessRunner(cwd=ds.path).run(
        ['datalad', 'containers-run', '-n', 'sub/mycontainer', 'XXX'],
        protocol=StdOutCapture)
    assert_in('image=sub/righthere cmd=XXX img_dspath=sub', out['stdout'])

    # Test within subdirectory of the super-dataset
    subdir = op.join(ds.path, 'subdir')
    os.mkdir(subdir)
    out = WitlessRunner(cwd=subdir).run(
        ['datalad', 'containers-run', '-n', 'sub/mycontainer', 'XXX'],
        protocol=StdOutCapture)
    assert_in('image=../sub/righthere cmd=XXX img_dspath=../sub',
              out['stdout'])
Example #7
def test_optimized_cloning(path):
    # make test repo with one file and one commit
    originpath = op.join(path, 'origin')
    repo = GitRepo(originpath, create=True)
    with open(op.join(originpath, 'test'), 'w') as f:
        f.write('some')
    repo.add('test')
    repo.commit('init')
    ok_clean_git(originpath, annex=False)
    from glob import glob

    def _get_inodes(repo):
        return dict(
            [(os.path.join(*o.split(os.sep)[-2:]),
              os.stat(o).st_ino)
             for o in glob(os.path.join(repo.path,
                                        repo.get_git_dir(repo),
                                        'objects', '*', '*'))])

    origin_inodes = _get_inodes(repo)
    # now clone it in different ways and see what happens to the object storage
    from datalad.support.network import get_local_file_url
    clonepath = op.join(path, 'clone')
    for src in (originpath, get_local_file_url(originpath)):
        # deprecated
        assert_raises(DeprecatedError, GitRepo, url=src, path=clonepath)
        clone = GitRepo.clone(url=src, path=clonepath, create=True)
        clone_inodes = _get_inodes(clone)
        eq_(origin_inodes, clone_inodes, msg='with src={}'.format(src))
        rmtree(clonepath)
Example #8
def test_optimized_cloning(path):
    # make test repo with one file and one commit
    originpath = op.join(path, 'origin')
    repo = GitRepo(originpath, create=True)
    with open(op.join(originpath, 'test'), 'w') as f:
        f.write('some')
    repo.add('test')
    repo.commit('init')
    ok_clean_git(originpath, annex=False)
    from glob import glob

    def _get_inodes(repo):
        return dict([
            (os.path.join(*o.split(os.sep)[-2:]), os.stat(o).st_ino)
            for o in glob(os.path.join(repo.repo.git_dir, 'objects', '*', '*'))
        ])

    origin_inodes = _get_inodes(repo)
    # now clone it in different ways and see what happens to the object storage
    from datalad.support.network import get_local_file_url
    clonepath = op.join(path, 'clone')
    for src in (originpath, get_local_file_url(originpath)):
        # deprecated
        assert_raises(DeprecatedError, GitRepo, url=src, path=clonepath)
        clone = GitRepo.clone(url=src, path=clonepath, create=True)
        clone_inodes = _get_inodes(clone)
        eq_(origin_inodes, clone_inodes, msg='with src={}'.format(src))
        rmtree(clonepath)
Example #9
def test_ria_push(srcpath, dstpath):
    # complex test involving a git remote, a special remote, and a
    # publication dependency
    src = Dataset(srcpath).create()
    testfile = src.pathobj / 'test_mod_annex_file'
    testfile.write_text("Heavy stuff.")
    src.save()
    assert_status(
        'ok',
        src.create_sibling_ria(
            "ria+{}".format(get_local_file_url(dstpath, compatibility='git')),
            "datastore"))
    res = src.push(to='datastore')
    assert_in_results(res,
                      action='publish',
                      target='datastore',
                      status='ok',
                      refspec='refs/heads/master:refs/heads/master')
    assert_in_results(res,
                      action='publish',
                      target='datastore',
                      status='ok',
                      refspec='refs/heads/git-annex:refs/heads/git-annex')
    assert_in_results(res,
                      action='copy',
                      target='datastore-storage',
                      status='ok',
                      path=str(testfile))
Example #10
def test_ria_postclonecfg():

    if not has_symlink_capability():
        # Symlink capability is needed to create an ORA remote using a URL for
        # upload, which is then invalidated later on (by deleting the symlink
        # it is based on).
        raise SkipTest("Can't create symlinks")

    from datalad.utils import make_tempfile
    from datalad.tests.utils import HTTPPath

    with make_tempfile(mkdir=True) as lcl, make_tempfile(mkdir=True) as store:
        id = _postclonetest_prepare(lcl, store)

        # test cloning via ria+file://
        yield _test_ria_postclonecfg, get_local_file_url(
            store, compatibility='git'), id

        # Note: HTTP disabled for now. Requires proper implementation in ORA
        #       remote. See
        # https://github.com/datalad/datalad/pull/4203#discussion_r410284649

        # # test cloning via ria+http://
        # with HTTPPath(store) as url:
        #     yield _test_ria_postclonecfg, url, id

        # test cloning via ria+ssh://
        yield skip_ssh(_test_ria_postclonecfg), \
            "ssh://datalad-test:{}".format(Path(store).as_posix()), id
Example #11
def test_custom_call_fmt(path, local_file):
    ds = Dataset(path).create()
    subds = ds.create('sub')

    # plug in a proper singularity image
    subds.containers_add(
        'mycontainer',
        url=get_local_file_url(op.join(local_file, 'some_container.img')),
        image='righthere',
        call_fmt='echo image={img} cmd={cmd} img_dspath={img_dspath} '
                 # and environment variable being set/propagated by default
                 'name=$DATALAD_CONTAINER_NAME'
    )
    ds.save()  # record the effect in super-dataset

    # Running should work fine either within sub or within super
    with swallow_outputs() as cmo:
        subds.containers_run('XXX', container_name='mycontainer')
        assert_in('image=righthere cmd=XXX img_dspath=. name=mycontainer', cmo.out)

    with swallow_outputs() as cmo:
        ds.containers_run('XXX', container_name='sub/mycontainer')
        assert_in('image=sub/righthere cmd=XXX img_dspath=sub', cmo.out)

    # Test within subdirectory of the super-dataset
    subdir = op.join(ds.path, 'subdir')
    os.mkdir(subdir)
    with chpwd(subdir):
        with swallow_outputs() as cmo:
            containers_run('XXX', container_name='sub/mycontainer')
            assert_in('image=../sub/righthere cmd=XXX img_dspath=../sub', cmo.out)
Example #12
def test_get_local_file_url_compatibility(path):
    # smoke test for file:// URL compatibility with other datalad/git/annex
    # pieces
    path = Path(path)
    ds1 = Dataset(path / 'ds1').create()
    ds2 = Dataset(path / 'ds2').create()
    testfile = path / 'testfile.txt'
    testfile.write_text('some')

    # compat with annex addurl
    ds1.repo.add_url_to_file(
        'test.txt', get_local_file_url(testfile, compatibility='git-annex'))

    # compat with git clone/submodule
    assert_status(
        'ok',
        ds1.clone(get_local_file_url(ds2.path, compatibility='git'),
                  result_xfm=None,
                  return_type='generator'))
Example #13
def test_container_update(ds_path, local_file, url):
    url_foo = get_local_file_url(op.join(local_file, 'foo.img'))
    url_bar = get_local_file_url(op.join(local_file, 'bar.img'))
    img = op.join(".datalad", "environments", "foo", "image")

    ds = Dataset(ds_path).create()

    ds.containers_add(name="foo", call_fmt="call-fmt1", url=url_foo)

    # Abort without --update flag.
    res = ds.containers_add(name="foo", on_failure="ignore")
    assert_result_count(res, 1, action="containers_add", status="impossible")

    # Abort if nothing to update is specified.
    res = ds.containers_add(name="foo", update=True, on_failure="ignore")
    assert_result_count(res,
                        1,
                        action="containers_add",
                        status="impossible",
                        message="No values to update specified")

    # Update call format.
    ds.containers_add(name="foo", update=True, call_fmt="call-fmt2")
    assert_equal(ds.config.get("datalad.containers.foo.cmdexec"), "call-fmt2")
    ok_file_has_content(op.join(ds.path, img), "foo")

    # Update URL/image.
    ds.drop(img)  # Make sure it works even with absent content.
    res = ds.containers_add(name="foo", update=True, url=url_bar)
    assert_result_count(res, 1, action="remove", status="ok", path=img)
    assert_result_count(res, 1, action="save", status="ok")
    ok_file_has_content(op.join(ds.path, img), "bar")

    # Test commit message
    # In the above case an existing image was updated, so the commit message should contain "Update "
    get_commit_msg = lambda *args: ds.repo.format_commit('%B')
    assert_in("Update ", get_commit_msg())

    # If we add a new image with update=True, the message should say "Configure "
    res = ds.containers_add(name="foo2", update=True, url=url_bar)
    assert_in("Configure ", get_commit_msg())
Example #14
def test_get_local_file_url():
    for path, url in (
            # relpaths are special-cased below
        ('test.txt', 'test.txt'), ) + (
            ('C:\\Windows\\notepad.exe', 'file://C/Windows/notepad.exe'),
        ) if on_windows else (
            (OBSCURE_FILENAME, urlquote(OBSCURE_FILENAME)),
            ('/a', 'file:///a'),
            ('/a/b/c', 'file:///a/b/c'),
            ('/a~', 'file:///a~'),
            # there are no files with trailing slashes in the name
            #('/a b/', 'file:///a%20b/'),
            ('/a b/name', 'file:///a%20b/name'),
        ):
        # Yarik found no better way to trigger.  .decode() isn't enough
        print("D: %s" % path)
        if isabs(path):
            eq_(get_local_file_url(path), url)
        else:
            eq_(get_local_file_url(path), '/'.join(
                (get_local_file_url(os.getcwd()), url)))
Example #15
def test_container_files(ds_path, local_file, url):
    # setup things to add
    #
    # Note: Since "adding" a container doesn't actually call or use the
    # container in any way, but simply registers it, any file is sufficient
    # for testing.
    local_file = get_local_file_url(op.join(local_file, 'some_container.img'))

    # prepare dataset:
    ds = Dataset(ds_path).create()
    # non-default location:
    ds.config.add("datalad.containers.location",
                  value=op.join(".datalad", "test-environments"),
                  where='dataset')
    ds.save(message="Configure container mountpoint")

    # no containers yet:
    res = ds.containers_list(**RAW_KWDS)
    assert_result_count(res, 0)

    # add first "image": must end up at the configured default location
    target_path = op.join(ds.path, ".datalad", "test-environments", "first",
                          "image")
    res = ds.containers_add(name="first", url=local_file)
    ok_clean_git(ds.repo)

    assert_result_count(res,
                        1,
                        status="ok",
                        type="file",
                        path=target_path,
                        action="containers_add")
    ok_(op.lexists(target_path))

    res = ds.containers_list(**RAW_KWDS)
    assert_result_count(res, 1)
    assert_result_count(res,
                        1,
                        name='first',
                        type='file',
                        action='containers',
                        status='ok',
                        path=target_path)

    # and kill it again
    # but needs name
    assert_raises(TypeError, ds.containers_remove)
    res = ds.containers_remove('first', remove_image=True)
    assert_status('ok', res)
    assert_result_count(ds.containers_list(**RAW_KWDS), 0)
    # image removed
    assert (not op.lexists(target_path))
Example #16
def test_no_storage(store1=None, store2=None, ds_path=None):
    store1_url = 'ria+' + get_local_file_url(store1)
    store2_url = 'ria+' + get_local_file_url(store2)

    ds = Dataset(ds_path).create(force=True)
    ds.save(recursive=True)
    assert_repo_status(ds.path)

    res = ds.create_sibling_ria(store1_url,
                                "datastore1",
                                storage_sibling=False,
                                new_store_ok=True)
    assert_result_count(res, 1, status='ok', action='create-sibling-ria')
    eq_({'datastore1', 'here'},
        {s['name']
         for s in ds.siblings(result_renderer='disabled')})

    # adding a second sibling without storage works the same way
    res = ds.create_sibling_ria(store2_url,
                                "datastore2",
                                storage_sibling=False,
                                new_store_ok=True)
    assert_result_count(res, 1, status='ok', action='create-sibling-ria')
    eq_({'datastore2', 'datastore1', 'here'},
        {s['name']
         for s in ds.siblings(result_renderer='disabled')})

    # no annex/object dir should be created when there is no special remote
    # to use it.
    for s in [store1, store2]:
        p = Path(s) / ds.id[:3] / ds.id[3:] / 'annex' / 'objects'
        assert_false(p.exists())

    # smoke test that we can push to it
    res = ds.push(to='datastore1')
    assert_status('ok', res)
    # but nothing was copied, because there is no storage sibling
    assert_result_count(res, 0, action='copy')
Example #17
def _resolve_img_url(url):
    """Takes a URL and tries to resolve it to an actual download
    URL that `annex addurl` can handle"""
    if op.exists(url):
        lgr.debug('Convert local path specification into a file:// URL')
        # annex wants a real url
        url = get_local_file_url(url)
    elif url.startswith('shub://'):
        lgr.debug('Query singularity-hub for image download URL')
        import requests
        req = requests.get(
            'https://www.singularity-hub.org/api/container/{}'.format(url[7:]))
        shub_info = loads(req.text)
        url = shub_info['image']
    return url
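A short sketch of how the helper above might be exercised; the local image path and the shub:// name are hypothetical illustrations, not values from the surrounding code:

# local path branch: an existing file is converted into a file:// URL
local_img = '/tmp/containers/some_container.img'     # assumed to exist
print(_resolve_img_url(local_img))

# shub:// branch: singularity-hub is queried for the actual download URL
print(_resolve_img_url('shub://someuser/somecontainer'))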
Example #18
def test_add_local_path(path, local_file):
    ds = Dataset(path).create()
    res = ds.containers_add(name="foobert", url=op.join(local_file, "foo.img"))
    foo_target = op.join(path, ".datalad", "environments", "foobert", "image")
    assert_result_count(res,
                        1,
                        status="ok",
                        type="file",
                        path=foo_target,
                        action="containers_add")
    # We've just copied and added the file.
    assert_not_in(ds.repo.WEB_UUID, ds.repo.whereis(foo_target))

    # We can force the URL to be added. (Note: This works because datalad
    # overrides 'annex.security.allowed-url-schemes' in its tests.)
    ds.containers_add(name="barry",
                      url=get_local_file_url(op.join(local_file, "bar.img")))
    bar_target = op.join(path, ".datalad", "environments", "barry", "image")
    assert_in(ds.repo.WEB_UUID, ds.repo.whereis(bar_target))
Example #19
def test_storage_only(base_path, ds_path):
    store_url = 'ria+' + get_local_file_url(base_path)

    ds = Dataset(ds_path).create(force=True)
    ds.save(recursive=True)
    assert_repo_status(ds.path)

    res = ds.create_sibling_ria(store_url, "datastore", storage_sibling='only')
    assert_result_count(res, 1, status='ok', action='create-sibling-ria')
    eq_(len(res), 1)

    # the storage sibling uses the main name, not -storage
    siblings = ds.siblings(result_renderer=None)
    eq_({'datastore', 'here'}, {s['name'] for s in siblings})

    # smoke test that we can push to it
    res = ds.push(to='datastore')
    assert_status('ok', res)
    assert_result_count(res, 1, action='copy')
Example #20
def clone_dataset(
        srcs,
        destds,
        reckless=None,
        description=None,
        result_props=None,
        cfg=None):
    """Internal helper to perform cloning without sanity checks (assumed done)

    This helper does not handle any saving of subdataset modification or adding
    in a superdataset.

    Parameters
    ----------
    srcs : list
      Any suitable clone source specifications (paths, URLs)
    destds : Dataset
      Dataset instance for the clone destination
    reckless : {None, 'auto', 'ephemeral', 'shared-...'}, optional
      Mode switch to put cloned dataset into unsafe/throw-away configurations, i.e.
      sacrifice data safety for performance or resource footprint. When None
      and `cfg` is specified, use the value of `datalad.clone.reckless`.
    description : str, optional
      Location description for the annex of the dataset clone (if there is any).
    result_props : dict, optional
      Default properties for any yielded result, passed on to get_status_dict().
    cfg : ConfigManager, optional
      Configuration for parent dataset. This will be queried instead
      of the global DataLad configuration.

    Yields
    ------
    dict
      DataLad result records
    """
    if not result_props:
        # in case the caller had no specific idea on how results should look
        # like, provide sensible defaults
        result_props = dict(
            action='install',
            logger=lgr,
            ds=destds,
        )

    if reckless is None and cfg:
        # if reckless is not explicitly given, but we operate on a
        # superdataset, query whether it has been instructed to operate
        # in a reckless mode, and inherit it for the coming clone
        reckless = cfg.get('datalad.clone.reckless', None)

    dest_path = destds.pathobj

    # decode all source candidate specifications
    candidate_sources = [decode_source_spec(s, cfg=cfg) for s in srcs]

    # now expand the candidate sources with additional variants of the decoded
    # giturl, while duplicating the other properties in the additional records
    # for simplicity. The hope is to overcome a few corner cases and be more
    # robust than git clone
    candidate_sources = [
        dict(props, giturl=s) for props in candidate_sources
        for s in _get_flexible_source_candidates(props['giturl'])
    ]

    # important test! based on this `rmtree` will happen below after failed clone
    dest_path_existed = dest_path.exists()
    if dest_path_existed and any(dest_path.iterdir()):
        if destds.is_installed():
            # check if dest was cloned from the given source before
            # this is where we would have installed this from
            # this is where it was actually installed from
            track_name, track_url = _get_tracking_source(destds)
            try:
                # this will get us track_url in system native path conventions,
                # whenever it is a path (and not a URL)
                # this is needed to match it to any potentially incoming local
                # source path in the 'notneeded' test below
                track_path = str(Path(track_url))
            except Exception:
                # this should never happen, because Path() will let any non-path stringification
                # pass through unmodified, but we do not want any potential crash due to
                # pathlib behavior changes
                lgr.debug("Unexpected behavior of pathlib!")
                track_path = None
            for cand in candidate_sources:
                src = cand['giturl']
                if track_url == src \
                        or (not is_url(track_url)
                            and get_local_file_url(track_url, compatibility='git') == src) \
                        or track_path == expanduser(src):
                    yield get_status_dict(
                        status='notneeded',
                        message=("dataset %s was already cloned from '%s'",
                                 destds,
                                 src),
                        **result_props)
                    return
        # anything else is an error
        yield get_status_dict(
            status='error',
            message='target path already exists and not empty, refuse to clone into target path',
            **result_props)
        return

    log_progress(
        lgr.info,
        'cloneds',
        'Cloning dataset to %s', destds,
        total=len(candidate_sources),
        label='Clone attempt',
        unit=' Candidate locations',
    )
    error_msgs = OrderedDict()  # accumulate all error messages, formatted per URL
    for cand in candidate_sources:
        log_progress(
            lgr.info,
            'cloneds',
            'Attempting to clone from %s to %s', cand['giturl'], dest_path,
            update=1,
            increment=True)

        clone_opts = {}

        if cand.get('version', None):
            clone_opts['branch'] = cand['version']
        try:
            # TODO for now GitRepo.clone() cannot handle Path instances, and PY35
            # doesn't make it happen seamlessly
            GitRepo.clone(
                path=str(dest_path),
                url=cand['giturl'],
                clone_options=clone_opts,
                create=True)

        except CommandError as e:
            e_stderr = e.stderr

            error_msgs[cand['giturl']] = e
            lgr.debug("Failed to clone from URL: %s (%s)",
                      cand['giturl'], exc_str(e))
            if dest_path.exists():
                lgr.debug("Wiping out unsuccessful clone attempt at: %s",
                          dest_path)
                # We must not just rmtree since it might be curdir etc
                # we should remove all files/directories under it
                # TODO stringification can be removed once pathlib compatible
                # or if PY35 is no longer supported
                rmtree(str(dest_path), children_only=dest_path_existed)

            if e_stderr and 'could not create work tree' in e_stderr.lower():
                # this cannot be fixed by trying another URL
                re_match = re.match(r".*fatal: (.*)$", e_stderr,
                                    flags=re.MULTILINE | re.DOTALL)
                # cancel progress bar
                log_progress(
                    lgr.info,
                    'cloneds',
                    'Completed clone attempts for %s', destds
                )
                yield get_status_dict(
                    status='error',
                    message=re_match.group(1).strip()
                    if re_match else "stderr: " + e_stderr,
                    **result_props)
                return
            # next candidate
            continue

        result_props['source'] = cand
        # do not bother with other sources if succeeded
        break

    log_progress(
        lgr.info,
        'cloneds',
        'Completed clone attempts for %s', destds
    )

    if not destds.is_installed():
        if len(error_msgs):
            if all(not e.stdout and not e.stderr for e in error_msgs.values()):
                # there is nothing we can learn from the actual exception,
                # the exit code is uninformative, the command is predictable
                error_msg = "Failed to clone from all attempted sources: %s"
                error_args = list(error_msgs.keys())
            else:
                error_msg = "Failed to clone from any candidate source URL. " \
                            "Encountered errors per each url were:\n- %s"
                error_args = '\n- '.join(
                    '{}\n  {}'.format(url, exc_str(exc))
                    for url, exc in error_msgs.items()
                )
        else:
            # yoh: Not sure if we ever get here but I felt that there could
            #      be a case when this might happen and original error would
            #      not be sufficient to troubleshoot what is going on.
            error_msg = "Awkward error -- we failed to clone properly. " \
                        "Although no errors were encountered, target " \
                        "dataset at %s seems to be not fully installed. " \
                        "The 'succesful' source was: %s"
            error_args = (destds.path, cand['giturl'])
        yield get_status_dict(
            status='error',
            message=(error_msg, error_args),
            **result_props)
        return

    if not cand.get("version"):
        postclone_check_head(destds)

    # act on --reckless=shared-...
    # must happen prior git-annex-init, where we can cheaply alter the repo
    # setup through safe re-init'ing
    if reckless and reckless.startswith('shared-'):
        lgr.debug('Reinit %s to enable shared access permissions', destds)
        destds.repo.call_git(['init', '--shared={}'.format(reckless[7:])])

    yield from postclonecfg_annexdataset(
        destds,
        reckless,
        description)

    # perform any post-processing that needs to know details of the clone
    # source
    if result_props['source']['type'] == 'ria':
        yield from postclonecfg_ria(destds, result_props['source'])

    if reckless:
        # store the reckless setting in the dataset to make it
        # known to later clones of subdatasets via get()
        destds.config.set(
            'datalad.clone.reckless', reckless,
            where='local',
            reload=True)

    # yield successful clone of the base dataset now, as any possible
    # subdataset clone down below will not alter the Git-state of the
    # parent
    yield get_status_dict(status='ok', **result_props)
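A minimal, hypothetical driver for the generator above, assuming `Dataset` is importable from datalad.distribution.dataset as elsewhere in the codebase; the source URL and destination path are placeholders:

from datalad.distribution.dataset import Dataset

destds = Dataset('/tmp/clone-target')                  # placeholder destination
for res in clone_dataset(
        srcs=['https://example.com/some/repo.git'],    # placeholder clone source
        destds=destds,
        reckless=None,
        description='example clone'):
    # each yielded record is a DataLad result dict (status, action, message, ...)
    print(res['status'], res.get('message'))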
Example #21
    def __call__(
            source,
            path=None,
            dataset=None,
            description=None,
            reckless=False,
            alt_sources=None):
            # TODO next ones should be there, but cannot go anywhere
            # git_opts=None,
            # git_clone_opts=None,
            # annex_opts=None,
            # annex_init_opts=None

        # did we explicitly get a dataset to install into?
        # if we got a dataset, path will be resolved against it.
        # Otherwise path will be resolved first.
        dataset = require_dataset(
            dataset, check_installed=True, purpose='cloning') \
            if dataset is not None else dataset
        refds_path = dataset.path if dataset else None

        if isinstance(source, Dataset):
            source = source.path

        if source == path:
            # even if they turn out to be identical after resolving symlinks
            # and more sophisticated witchcraft, it would still happily say
            # "it appears to be already installed", so we just catch an
            # obviously pointless input combination
            raise ValueError(
                "clone `source` and destination `path` are identical [{}]. "
                "If you are trying to add a subdataset simply use `add`".format(
                    path))

        if path is not None:
            path = resolve_path(path, dataset)

        # Possibly do conversion from source into a git-friendly url
        # luckily GitRepo will undo any fancy file:/// url to make use of Git's
        # optimization for local clones....
        source_url = source
        source_ = _get_git_url_from_source(source)
        lgr.debug("Resolved clone source from '%s' to '%s'",
                  source, source_)
        source = source_

        # derive target from source:
        if path is None:
            # we got nothing but a source. do something similar to git clone
            # and derive the path from the source and continue
            path = _get_installationpath_from_url(source)
            # since this is a relative `path`, resolve it:
            path = resolve_path(path, dataset)
            lgr.debug("Determined clone target path from source")
        lgr.debug("Resolved clone target path to: '%s'", path)

        # there is no other way -- my intoxicated brain tells me
        assert(path is not None)

        destination_dataset = Dataset(path)
        dest_path = path

        status_kwargs = dict(
            action='install', ds=destination_dataset, logger=lgr,
            refds=refds_path, source_url=source_url)

        # important test! based on this `rmtree` will happen below after failed clone
        if exists(dest_path) and listdir(dest_path):
            if destination_dataset.is_installed():
                # check if dest was cloned from the given source before
                # this is where we would have installed this from
                guessed_sources = _get_flexible_source_candidates(
                    source, dest_path)
                # this is where it was actually installed from
                track_name, track_url = _get_tracking_source(destination_dataset)
                if track_url in guessed_sources or \
                        get_local_file_url(track_url) in guessed_sources:
                    yield get_status_dict(
                        status='notneeded',
                        message=("dataset %s was already cloned from '%s'",
                                 destination_dataset,
                                 source),
                        **status_kwargs)
                    return
            # anything else is an error
            yield get_status_dict(
                status='error',
                message='target path already exists and not empty, refuse to clone into target path',
                **status_kwargs)
            return

        if dataset is not None and relpath(path, start=dataset.path).startswith(pardir):
            yield get_status_dict(
                status='error',
                message=("clone target path '%s' not in specified target dataset '%s'",
                         path, dataset),
                **status_kwargs)
            return

        # generate candidate URLs from source argument to overcome a few corner cases
        # and hopefully be more robust than git clone
        candidate_sources = []
        # combine all given sources (incl. alternatives), maintain order
        for s in [source] + assure_list(alt_sources):
            candidate_sources.extend(_get_flexible_source_candidates(s))
        candidates_str = \
            " [%d other candidates]" % (len(candidate_sources) - 1) \
            if len(candidate_sources) > 1 \
            else ''
        lgr.info("Cloning %s%s into '%s'",
                 source, candidates_str, dest_path)
        dest_path_existed = exists(dest_path)
        error_msgs = OrderedDict()  # accumulate all error messages, formatted per URL
        for isource_, source_ in enumerate(candidate_sources):
            try:
                lgr.debug("Attempting to clone %s (%d out of %d candidates) to '%s'",
                          source_, isource_ + 1, len(candidate_sources), dest_path)
                GitRepo.clone(path=dest_path, url=source_, create=True)
                break  # do not bother with other sources if succeeded
            except GitCommandError as e:
                error_msgs[source_] = exc_str_ = exc_str(e)
                lgr.debug("Failed to clone from URL: %s (%s)",
                          source_, exc_str_)
                if exists(dest_path):
                    lgr.debug("Wiping out unsuccessful clone attempt at: %s",
                              dest_path)
                    # We must not just rmtree since it might be curdir etc
                    # we should remove all files/directories under it
                    rmtree(dest_path, children_only=dest_path_existed)
                # Whenever progress reporting is enabled, as it is now,
                # we end up without e.stderr since it is "processed" out by
                # GitPython/our progress handler.
                e_stderr = e.stderr
                from datalad.support.gitrepo import GitPythonProgressBar
                if not e_stderr and GitPythonProgressBar._last_error_lines:
                    e_stderr = os.linesep.join(GitPythonProgressBar._last_error_lines)
                if 'could not create work tree' in e_stderr.lower():
                    # this cannot be fixed by trying another URL
                    re_match = re.match(r".*fatal: (.*)$", e_stderr,
                                        flags=re.MULTILINE | re.DOTALL)
                    yield get_status_dict(
                        status='error',
                        message=re_match.group(1) if re_match else "stderr: " + e_stderr,
                        **status_kwargs)
                    return

        if not destination_dataset.is_installed():
            if len(error_msgs):
                error_msg = "Failed to clone from any candidate source URL. " \
                            "Encountered errors per each url were: %s"
                error_args = (error_msgs, )
            else:
                # yoh: Not sure if we ever get here but I felt that there could
                #      be a case when this might happen and original error would
                #      not be sufficient to troubleshoot what is going on.
                error_msg = "Awkward error -- we failed to clone properly. " \
                            "Although no errors were encountered, target " \
                            "dataset at %s seems to be not fully installed. " \
                            "The 'succesful' source was: %s"
                error_args = (destination_dataset.path, source_)
            yield get_status_dict(
                status='error',
                message=(error_msg, error_args),
                **status_kwargs)
            return

        if dataset is not None:
            # we created a dataset in another dataset
            # -> make submodule
            for r in dataset.save(
                    dest_path,
                    return_type='generator',
                    result_filter=None,
                    result_xfm=None,
                    on_failure='ignore'):
                yield r

        _handle_possible_annex_dataset(
            destination_dataset,
            reckless,
            description=description)

        # yield successful clone of the base dataset now, as any possible
        # subdataset clone down below will not alter the Git-state of the
        # parent
        yield get_status_dict(status='ok', **status_kwargs)
Example #22
    def __call__(source,
                 path=None,
                 dataset=None,
                 description=None,
                 reckless=False,
                 alt_sources=None):
        # TODO next ones should be there, but cannot go anywhere
        # git_opts=None,
        # git_clone_opts=None,
        # annex_opts=None,
        # annex_init_opts=None

        # did we explicitly get a dataset to install into?
        # if we got a dataset, path will be resolved against it.
        # Otherwise path will be resolved first.
        dataset = require_dataset(
            dataset, check_installed=True, purpose='cloning') \
            if dataset is not None else dataset
        refds_path = dataset.path if dataset else None

        if isinstance(source, Dataset):
            source = source.path

        if source == path:
            # even if they turn out to be identical after resolving symlinks
            # and more sophisticated witchcraft, it would still happily say
            # "it appears to be already installed", so we just catch an
            # obviously pointless input combination
            raise ValueError(
                "clone `source` and destination `path` are identical [{}]. "
                "If you are trying to add a subdataset simply use `add`".
                format(path))

        if path is not None:
            path = resolve_path(path, dataset)

        # Possibly do conversion from source into a git-friendly url
        # luckily GitRepo will undo any fancy file:/// url to make use of Git's
        # optimization for local clones....
        source_url = source
        source_ = _get_git_url_from_source(source)
        lgr.debug("Resolved clone source from '%s' to '%s'", source, source_)
        source = source_

        # derive target from source:
        if path is None:
            # we got nothing but a source. do something similar to git clone
            # and derive the path from the source and continue
            path = _get_installationpath_from_url(source)
            # since this is a relative `path`, resolve it:
            path = resolve_path(path, dataset)
            lgr.debug("Determined clone target path from source")
        lgr.debug("Resolved clone target path to: '%s'", path)

        # there is no other way -- my intoxicated brain tells me
        assert (path is not None)

        destination_dataset = Dataset(path)
        dest_path = path

        status_kwargs = dict(action='install',
                             ds=destination_dataset,
                             logger=lgr,
                             refds=refds_path,
                             source_url=source_url)

        # important test! based on this `rmtree` will happen below after failed clone
        if exists(dest_path) and listdir(dest_path):
            if destination_dataset.is_installed():
                # check if dest was cloned from the given source before
                # this is where we would have installed this from
                guessed_sources = _get_flexible_source_candidates(
                    source, dest_path)
                # this is where it was actually installed from
                track_name, track_url = _get_tracking_source(
                    destination_dataset)
                if track_url in guessed_sources or \
                        get_local_file_url(track_url) in guessed_sources:
                    yield get_status_dict(
                        status='notneeded',
                        message=("dataset %s was already cloned from '%s'",
                                 destination_dataset, source),
                        **status_kwargs)
                    return
            # anything else is an error
            yield get_status_dict(
                status='error',
                message=
                'target path already exists and not empty, refuse to clone into target path',
                **status_kwargs)
            return

        if dataset is not None and relpath(
                path, start=dataset.path).startswith(pardir):
            yield get_status_dict(
                status='error',
                message=
                ("clone target path '%s' not in specified target dataset '%s'",
                 path, dataset),
                **status_kwargs)
            return

        # generate candidate URLs from source argument to overcome a few corner cases
        # and hopefully be more robust than git clone
        candidate_sources = []
        # combine all given sources (incl. alternatives), maintain order
        for s in [source] + assure_list(alt_sources):
            candidate_sources.extend(_get_flexible_source_candidates(s))
        candidates_str = \
            " [%d other candidates]" % (len(candidate_sources) - 1) \
            if len(candidate_sources) > 1 \
            else ''
        lgr.info("Cloning %s%s into '%s'", source, candidates_str, dest_path)
        dest_path_existed = exists(dest_path)
        error_msgs = OrderedDict(
        )  # accumulate all error messages, formatted per URL
        for isource_, source_ in enumerate(candidate_sources):
            try:
                lgr.debug(
                    "Attempting to clone %s (%d out of %d candidates) to '%s'",
                    source_, isource_ + 1, len(candidate_sources), dest_path)
                GitRepo.clone(path=dest_path, url=source_, create=True)
                break  # do not bother with other sources if succeeded
            except GitCommandError as e:
                error_msgs[source_] = exc_str_ = exc_str(e)
                lgr.debug("Failed to clone from URL: %s (%s)", source_,
                          exc_str_)
                if exists(dest_path):
                    lgr.debug("Wiping out unsuccessful clone attempt at: %s",
                              dest_path)
                    # We must not just rmtree since it might be curdir etc
                    # we should remove all files/directories under it
                    rmtree(dest_path, children_only=dest_path_existed)
                # Whenever progress reporting is enabled, as it is now,
                # we end up without e.stderr since it is "processed" out by
                # GitPython/our progress handler.
                e_stderr = e.stderr
                from datalad.support.gitrepo import GitPythonProgressBar
                if not e_stderr and GitPythonProgressBar._last_error_lines:
                    e_stderr = os.linesep.join(
                        GitPythonProgressBar._last_error_lines)
                if 'could not create work tree' in e_stderr.lower():
                    # this cannot be fixed by trying another URL
                    re_match = re.match(r".*fatal: (.*)$",
                                        e_stderr,
                                        flags=re.MULTILINE | re.DOTALL)
                    yield get_status_dict(
                        status='error',
                        message=re_match.group(1) if re_match else "stderr: " +
                        e_stderr,
                        **status_kwargs)
                    return

        if not destination_dataset.is_installed():
            if len(error_msgs):
                error_msg = "Failed to clone from any candidate source URL. " \
                            "Encountered errors per each url were: %s"
                error_args = (error_msgs, )
            else:
                # yoh: Not sure if we ever get here but I felt that there could
                #      be a case when this might happen and original error would
                #      not be sufficient to troubleshoot what is going on.
                error_msg = "Awkward error -- we failed to clone properly. " \
                            "Although no errors were encountered, target " \
                            "dataset at %s seems to be not fully installed. " \
                            "The 'succesful' source was: %s"
                error_args = (destination_dataset.path, source_)
            yield get_status_dict(status='error',
                                  message=(error_msg, error_args),
                                  **status_kwargs)
            return

        if dataset is not None:
            # we created a dataset in another dataset
            # -> make submodule
            for r in dataset.add(dest_path,
                                 save=True,
                                 ds2super=True,
                                 return_type='generator',
                                 result_filter=None,
                                 result_xfm=None,
                                 on_failure='ignore'):
                yield r

        _handle_possible_annex_dataset(destination_dataset,
                                       reckless,
                                       description=description)

        # yield successful clone of the base dataset now, as any possible
        # subdataset clone down below will not alter the Git-state of the
        # parent
        yield get_status_dict(status='ok', **status_kwargs)
Example #23
def _postclonetest_prepare(lcl, storepath, link):

    from datalad.customremotes.ria_utils import (create_store,
                                                 create_ds_in_store,
                                                 get_layout_locations)
    from datalad.distributed.ora_remote import (
        LocalIO, )

    create_tree(lcl,
                tree={
                    'ds': {
                        'test.txt': 'some',
                        'subdir': {
                            'subds': {
                                'testsub.txt': 'somemore'
                            },
                            'subgit': {
                                'testgit.txt': 'even more'
                            }
                        },
                    },
                })

    # create a local dataset with a subdataset
    lcl = Path(lcl)
    storepath = Path(storepath)
    link = Path(link)
    link.symlink_to(storepath)
    subds = Dataset(lcl / 'ds' / 'subdir' / 'subds').create(force=True)
    subds.save()
    # add a plain git dataset as well
    subgit = Dataset(lcl / 'ds' / 'subdir' / 'subgit').create(force=True,
                                                              annex=False)
    subgit.save()
    ds = Dataset(lcl / 'ds').create(force=True)
    ds.save(version_tag='original')
    assert_repo_status(ds.path)

    io = LocalIO()
    create_store(io, storepath, '1')

    # URL to use for upload. The point is that this should be invalid for the
    # clone, so that autoenabling would fail. Therefore base it on a symlink
    # that will be deleted.
    upl_url = "ria+{}".format(get_local_file_url(str(link)))

    for d in (ds, subds, subgit):

        # TODO: create-sibling-ria required for config! => adapt to RF'd
        #       creation (missed on rebase?)
        create_ds_in_store(io, storepath, d.id, '2', '1')
        d.create_sibling_ria(upl_url, "store")

        if d is not subgit:
            # Now, simulate the problem by reconfiguring the special remote to
            # not be autoenabled.
            # Note, however, that the actual intention is a URL that isn't
            # valid from the clone's point of view (doesn't resolve, no
            # credentials, etc.), so that autoenabling on git-annex-init
            # during datalad-clone would fail.
            Runner(cwd=d.path).run([
                'git', 'annex', 'enableremote', 'store-storage',
                'autoenable=false'
            ])
        d.push('.', to='store')
        store_loc, _, _ = get_layout_locations(1, storepath, d.id)
        Runner(cwd=str(store_loc)).run(['git', 'update-server-info'])

    link.unlink()
    # We should now have a store with datasets that have an autoenabled ORA
    # remote relying on an inaccessible URL.
    # datalad-clone is supposed to reconfigure based on the URL we cloned from.
    # Test this feature for cloning via HTTP, SSH and FILE URLs.

    return ds.id
Example #24
    def __call__(path=None,
                 source=None,
                 dataset=None,
                 get_data=False,
                 description=None,
                 recursive=False,
                 recursion_limit=None,
                 if_dirty='save-before',
                 save=True,
                 reckless=False,
                 git_opts=None,
                 git_clone_opts=None,
                 annex_opts=None,
                 annex_init_opts=None,
                 jobs=None):

        # normalize path argument to be equal when called from cmdline and
        # python and nothing was passed into `path`
        path = assure_list(path)

        if not source and not path:
            raise InsufficientArgumentsError(
                "Please provide at least a source or a path")

        ## Common kwargs to pass to underlying git/install calls.
        #  They might need adjustments (e.g. for recursion_limit, but
        #  otherwise would be applicable throughout
        #
        # There should have been more of common options!
        # since underneath get could do similar installs, but now they
        # have duplicated implementations which differ (e.g. get does not
        # annex init installed annexes)
        common_kwargs = dict(
            get_data=get_data,
            recursive=recursive,
            recursion_limit=recursion_limit,
            git_opts=git_opts,
            annex_opts=annex_opts,
            reckless=reckless,
            jobs=jobs,
        )

        installed_items = []
        failed_items = []

        # did we explicitly get a dataset to install into?
        # if we got a dataset, path will be resolved against it.
        # Otherwise path will be resolved first.
        ds = None
        if dataset is not None:
            ds = require_dataset(dataset,
                                 check_installed=True,
                                 purpose='installation')
            handle_dirty_dataset(ds, if_dirty)

        # switch into scenario without --source:
        if source is None:
            # we need to collect URLs and paths
            to_install = []
            to_get = []
            for urlpath in path:
                ri = RI(urlpath)
                (to_get
                 if isinstance(ri, PathRI) else to_install).append(urlpath)

            common_kwargs['dataset'] = dataset

            # first install, and then get
            for s in to_install:
                lgr.debug("Install passes into install source=%s", s)
                try:
                    result = Install.__call__(source=s,
                                              description=description,
                                              if_dirty=if_dirty,
                                              save=save,
                                              git_clone_opts=git_clone_opts,
                                              annex_init_opts=annex_init_opts,
                                              **common_kwargs)
                    installed_items += assure_list(result)
                except Exception as exc:
                    lgr.warning("Installation of %s has failed: %s", s,
                                exc_str(exc))
                    failed_items.append(s)

            if to_get:
                lgr.debug("Install passes into get %d items", len(to_get))
                # The commented-out kwargs below hint at the inability to pass
                # those options into the underlying install-related calls.
                # Also would need to pass from get:
                #  annex_get_opts
                try:
                    installed_datasets = Get.__call__(
                        to_get,
                        # description=description,
                        # if_dirty=if_dirty,
                        # save=save,
                        # git_clone_opts=git_clone_opts,
                        # annex_init_opts=annex_init_opts
                        _return_datasets=True,
                        **common_kwargs)
                except IncompleteResultsError as exc:
                    exc_str_ = ': ' + exc_str(exc) if exc.results else ''
                    lgr.warning("Some items failed to install: %s", exc_str_)
                    installed_datasets = exc.results
                    failed_items.extend(exc.failed)

                # compose content_by_ds into result
                for dspath in installed_datasets:
                    ds_ = Dataset(dspath)
                    if ds_.is_installed():
                        installed_items.append(ds_)
                    else:
                        lgr.warning("%s was not installed", ds_)

            return Install._handle_and_return_installed_items(
                ds, installed_items, failed_items, save)

        if source and path and len(path) > 1:
            raise ValueError(
                "install needs a single PATH when source is provided.  "
                "Was given mutliple PATHs: %s" % str(path))

        # parameter constraints:
        if not source:
            raise InsufficientArgumentsError(
                "a `source` is required for installation")

        # code below deals with a single path only
        path = path[0] if path else None

        if source == path:
            # even if they turn out to be identical after resolving symlinks
            # and more sophisticated witchcraft, it would still happily say
            # "it appears to be already installed", so we just catch an
            # obviously pointless input combination
            raise ValueError(
                "installation `source` and destination `path` are identical [{}]. "
                "If you are trying to add a subdataset simply use `save`".format(path))

        # resolve the target location (if local) against the provided dataset
        # or CWD:
        if path is not None:
            # Should work out just fine for regular paths, so no additional
            # conditioning is necessary
            try:
                path_ri = RI(path)
            except Exception as e:
                raise ValueError("invalid path argument {}: ({})".format(
                    path, exc_str(e)))
            try:
                # Wouldn't work for SSHRI ATM, see TODO within SSHRI
                # yoh: path should be a local path, and mapping note within
                #      SSHRI about mapping localhost:path to path is kinda
                #      a peculiar use-case IMHO
                path = resolve_path(path_ri.localpath, dataset)
                # any `path` argument that points to something local is now
                # resolved and no longer a URL
            except ValueError:
                # URL doesn't point to a local something
                # so we have an actual URL in `path`. Since this is valid as a
                # single positional argument, `source` has to be None at this
                # point.
                if is_datalad_compat_ri(path) and source is None:
                    # we have an actual URL -> this should be the source
                    lgr.debug(
                        "Single argument given to install, that doesn't seem to "
                        "be a local path. "
                        "Assuming the argument identifies a source location.")
                    source = path
                    path = None

                else:
                    # `path` is neither a valid source nor a local path.
                    # TODO: The only thing left is a known subdataset with a
                    # name, that is not a path; Once we correctly distinguish
                    # between path and name of a submodule, we need to consider
                    # this.
                    # For now: Just raise
                    raise ValueError("Invalid path argument {0}".format(path))
        # `path` resolved, if there was any.

        # Possibly do conversion from source into a git-friendly url
        # luckily GitRepo will undo any fancy file:/// url to make use of Git's
        # optimization for local clones....
        source = _get_git_url_from_source(source)
        lgr.debug("Resolved source: {0}".format(source))
        # TODO: we probably need to resolve source, if it is a local path;
        # expandpath, normpath, ... Where exactly is the point to do it?

        # derive target from source:
        if path is None:
            # we got nothing but a source. do something similar to git clone
            # and derive the path from the source and continue
            lgr.debug(
                "Neither dataset nor target installation path provided. "
                "Deriving destination path from given source %s", source)
            path = _get_installationpath_from_url(source)
            # since this is a relative `path`, resolve it:
            path = resolve_path(path, dataset)

        # there is no other way -- my intoxicated brain tells me
        assert (path is not None)

        lgr.debug("Resolved installation target: {0}".format(path))
        destination_dataset = Dataset(path)

        if destination_dataset.is_installed():
            # this should not be, check if this is an error, or a reinstall
            # from the same source
            # this is where we would have installed this from
            candidate_sources = _get_flexible_source_candidates(
                source, destination_dataset.path)
            # this is where it was installed from
            track_name, track_url = _get_tracking_source(destination_dataset)
            if track_url in candidate_sources \
                    or get_local_file_url(track_url) in candidate_sources:
                # TODO: this one breaks "promise" assumptions of the repeated
                # invocations of install.
                # yoh thinks that we actually should be the ones to run update
                # (without merge) after basic
                # check that it is clean and up-to-date with its super dataset
                # and if so, not return here but continue with errands (recursive
                # installation and get_data) so we could provide the same
                # result if we rerun the same install twice.
                lgr.info(
                    "%s was already installed from %s. Use `update` to obtain "
                    "latest updates, or `get` or `install` with a path, not URL, "
                    "to (re)fetch data and / or subdatasets",
                    destination_dataset, track_url)
                return destination_dataset
            else:
                raise ValueError(
                    "There is already a dataset installed at the "
                    "destination: %s", destination_dataset)

        ###########
        # we should know everything necessary by now
        # actual installation starts
        ###########

        # FLOW GUIDE:
        # four cases:
        # 1. install into a dataset
        #   1.1. we install a known subdataset
        #        => git submodule update --init
        #   1.2. we install an existing repo as a subdataset inplace
        #        => git submodule add + magic
        #   1.3. we (recursively) try to install implicit subdatasets between
        #        ds and path
        #   1.4. we install a new subdataset from an explicit source
        #        => git submodule add
        # 2. we "just" install from an explicit source
        #    => git clone

        if ds is not None:
            # FLOW GUIDE: 1.

            # express the destination path relative to the root of
            # the dataset
            relativepath = relpath(path, start=ds.path)
            if relativepath.startswith(pardir):
                raise ValueError("installation path outside dataset "
                                 "({0})".format(path))
            lgr.debug("Resolved installation target relative to dataset "
                      "{0}: {1}".format(ds, relativepath))

            # FLOW_GUIDE 1.4.
            lgr.info("Installing subdataset from '{0}' at: {0}".format(
                source, relativepath))
            destination_dataset = _install_subds_from_flexible_source(
                ds, relativepath, source, reckless)
        else:
            # FLOW GUIDE: 2.
            lgr.info("Installing dataset at {0} from {1}".format(path, source))

            # Currently assuming there is nothing at the target to deal with
            # and rely on failures raising from the git call ...

            # We possibly need to consider /.git URL
            candidate_sources = _get_flexible_source_candidates(source)
            _clone_from_any_source(candidate_sources, destination_dataset.path)

        # FLOW GUIDE: All four cases done.
        if not destination_dataset.is_installed():
            # XXX  shouldn't we just fail!? (unless some explicit --skip-failing?)
            lgr.error("Installation failed.")
            return None

        _handle_possible_annex_dataset(destination_dataset, reckless)

        lgr.debug("Installation of %s done.", destination_dataset)

        if not destination_dataset.is_installed():
            # log error and don't report as installed item, but don't raise,
            # since we might be in a process of recursive installation where
            # a lot of other datasets can still be installed successfully.
            lgr.error(
                "Installation of {0} failed.".format(destination_dataset))
        else:
            installed_items.append(destination_dataset)

        # we need to decrease the recursion limit, relative to
        # subdatasets now
        subds_recursion_limit = max(0, recursion_limit - 1) \
                                  if isinstance(recursion_limit, int) \
                                  else recursion_limit
        # Now, recursive calls:
        if recursive:
            if description:
                # yoh: why?  especially if we somehow allow for templating them
                # with e.g. '%s' to catch the subdataset path
                lgr.warning("Description can't be assigned recursively.")

            subs = destination_dataset.get_subdatasets(
                # yes, it does make sense to combine no recursion with
                # recursion_limit: when the latter is 0 we get no subdatasets
                # reported, otherwise we always get the 1st-level subs
                recursive=False,
                recursion_limit=recursion_limit,
                absolute=False)

            if subs:
                lgr.debug("Obtaining subdatasets of %s: %s",
                          destination_dataset, subs)

                kwargs = common_kwargs.copy()
                kwargs['recursion_limit'] = subds_recursion_limit
                rec_installed = Get.__call__(
                    subs,  # all at once
                    dataset=destination_dataset,
                    # TODO expose this
                    # yoh: exactly!
                    #annex_get_opts=annex_get_opts,
                    **kwargs)
                # TODO do we want to filter this so `install` only returns
                # the datasets?
                if isinstance(rec_installed, list):
                    installed_items.extend(rec_installed)
                else:
                    installed_items.append(rec_installed)

        if get_data:
            lgr.debug("Getting data of {0}".format(destination_dataset))
            kwargs = common_kwargs.copy()
            kwargs['recursive'] = False
            destination_dataset.get(curdir, **kwargs)

        return Install._handle_and_return_installed_items(
            ds, installed_items, failed_items, save)
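
A brief usage sketch of the two call shapes handled by the `__call__` above, going through the public datalad.api entry point rather than calling the class directly; the destination path is hypothetical, the GitHub URL is the test repository also used further down on this page, and network access is required:

from datalad.api import install

# shape 1: only a `source`; the destination path is derived from the URL
ds = install(source='https://github.com/datalad/testrepo--basic--r1.git')

# shape 2: an explicit destination `path` plus `source`, recursing into
# subdatasets without fetching annexed data
ds = install(path='/tmp/testrepo--basic--r1',
             source='https://github.com/datalad/testrepo--basic--r1.git',
             recursive=True, get_data=False)
print(ds.path, ds.is_installed())
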
Example #25
0
def test_container_files(ds_path, local_file, url):

    # setup things to add
    #
    # Note: Since "adding" as a container doesn't actually call anything or use
    # the container in some way, but simply registers it, for testing any file
    # is sufficient.
    local_file = get_local_file_url(op.join(local_file, 'some_container.img'))
    remote_file = urljoin(url, 'some_container.img')

    # prepare dataset:
    ds = Dataset(ds_path).create()
    # non-default location:
    ds.config.add("datalad.containers.location",
                  value=op.join(".datalad", "test-environments"),
                  where='dataset')
    ds.save(message="Configure container mountpoint")

    # no containers yet:
    res = ds.containers_list()
    assert_result_count(res, 0)

    # add first "image":
    res = ds.containers_add(name="first", url=local_file)
    ok_clean_git(ds.repo)

    target_path = op.join(ds.path, ".datalad", "test-environments", "first")
    assert_result_count(res,
                        1,
                        status="ok",
                        type="file",
                        path=target_path,
                        action="containers_add")
    ok_(op.lexists(target_path))
    eq_(local_file, ds.config.get("datalad.containers.first.url"))

    # add a "remote" one:
    # don't provide url in the call, but in a config:
    ds.config.add("datalad.containers.second.url",
                  value=remote_file,
                  where='dataset')
    ds.save(message="Configure URL for container 'second'")

    res = ds.containers_add(name="second")
    ok_clean_git(ds.repo)
    target_path = op.join(ds.path, ".datalad", "test-environments", "second")
    assert_result_count(res,
                        1,
                        status="ok",
                        type="file",
                        path=target_path,
                        action="containers_add")
    ok_(op.lexists(target_path))
    # config wasn't changed:
    eq_(remote_file, ds.config.get("datalad.containers.second.url"))

    res = ds.containers_list()
    assert_result_count(res,
                        2,
                        status='ok',
                        type='file',
                        action='containers_list')
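
A condensed usage sketch of the container registration flow tested above; it assumes the datalad-container extension is installed, that a container image file exists at the hypothetical local path, and that `get_local_file_url` is importable from the assumed location:

from datalad.api import Dataset
from datalad.utils import get_local_file_url  # import location assumed

ds = Dataset('/tmp/demo-ds').create()  # hypothetical dataset location
ds.containers_add(name="first",
                  url=get_local_file_url('/tmp/images/some_container.img'))
for r in ds.containers_list():
    # each result record should carry the registered name and image path
    print(r.get('name'), r.get('path'))
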
Example #26
0
    def __call__(source,
                 path=None,
                 dataset=None,
                 description=None,
                 reckless=False,
                 alt_sources=None):
        # TODO next ones should be there, but cannot go anywhere
        # git_opts=None,
        # git_clone_opts=None,
        # annex_opts=None,
        # annex_init_opts=None

        # did we explicitly get a dataset to install into?
        # if we got a dataset, path will be resolved against it.
        # Otherwise path will be resolved first.
        dataset = require_dataset(
            dataset, check_installed=True, purpose='cloning') \
            if dataset is not None else dataset
        refds_path = dataset.path if dataset else None

        if isinstance(source, Dataset):
            source = source.path

        if source == path:
            # even if they turn out to be identical after resolving symlinks
            # and more sophisticated witchcraft, it would still happily say
            # "it appears to be already installed", so we just catch an
            # obviously pointless input combination
            raise ValueError(
                "clone `source` and destination `path` are identical [{}]. "
                "If you are trying to add a subdataset simply use `add`".
                format(path))

        if path is not None:
            path = resolve_path(path, dataset)

        # Possibly do conversion from source into a git-friendly url
        # luckily GitRepo will undo any fancy file:/// url to make use of Git's
        # optimization for local clones....
        source_url = source
        source_ = _get_git_url_from_source(source)
        lgr.debug("Resolved clone source from '%s' to '%s'", source, source_)
        source = source_

        # derive target from source:
        if path is None:
            # we got nothing but a source. do something similar to git clone
            # and derive the path from the source and continue
            path = _get_installationpath_from_url(source)
            # since this is a relative `path`, resolve it:
            path = resolve_path(path, dataset)
            lgr.debug("Determined clone target path from source")
        lgr.debug("Resolved clone target path to: '%s'", path)

        # there is no other way -- my intoxicated brain tells me
        assert (path is not None)

        destination_dataset = Dataset(path)
        dest_path = path

        status_kwargs = dict(action='install',
                             ds=destination_dataset,
                             logger=lgr,
                             refds=refds_path,
                             source_url=source_url)

        # important test! based on this, `rmtree` will happen below after a failed clone
        if exists(dest_path) and listdir(dest_path):
            if destination_dataset.is_installed():
                # check if dest was cloned from the given source before
                # this is where we would have installed this from
                guessed_sources = _get_flexible_source_candidates(
                    source, dest_path)
                # this is where it was actually installed from
                track_name, track_url = _get_tracking_source(
                    destination_dataset)
                if track_url in guessed_sources or \
                        get_local_file_url(track_url) in guessed_sources:
                    yield get_status_dict(
                        status='notneeded',
                        message=("dataset %s was already cloned from '%s'",
                                 destination_dataset, source),
                        **status_kwargs)
                    return
            # anything else is an error
            yield get_status_dict(
                status='error',
                message=
                'target path already exists and not empty, refuse to clone into target path',
                **status_kwargs)
            return

        if dataset is not None and relpath(
                path, start=dataset.path).startswith(pardir):
            yield get_status_dict(
                status='error',
                message=
                ("clone target path '%s' not in specified target dataset '%s'",
                 path, dataset),
                **status_kwargs)
            return

        # generate candidate URLs from source argument to overcome a few corner cases
        # and hopefully be more robust than git clone
        candidate_sources = []
        # combine all given sources (incl. alternatives), maintain order
        for s in [source] + assure_list(alt_sources):
            candidate_sources.extend(_get_flexible_source_candidates(s))
        lgr.info("Cloning %s to '%s'", source, dest_path)
        for isource_, source_ in enumerate(candidate_sources):
            try:
                lgr.debug(
                    "Attempting to clone %s (%d out of %d candidates) to '%s'",
                    source_, isource_ + 1, len(candidate_sources), dest_path)
                GitRepo.clone(path=dest_path, url=source_, create=True)
                break  # do not bother with other sources if succeeded
            except GitCommandError as e:
                lgr.debug("Failed to clone from URL: %s (%s)", source_,
                          exc_str(e))
                if exists(dest_path):
                    lgr.debug("Wiping out unsuccessful clone attempt at: %s",
                              dest_path)
                    rmtree(dest_path)
                if 'could not create work tree' in e.stderr.lower():
                    # this cannot be fixed by trying another URL
                    yield get_status_dict(
                        status='error',
                        message=re.match(r".*fatal: (.*)\n",
                                         e.stderr,
                                         flags=re.MULTILINE
                                         | re.DOTALL).group(1),
                        **status_kwargs)
                    return

        if not destination_dataset.is_installed():
            yield get_status_dict(
                status='error',
                message=(
                    "Failed to clone data from any candidate source URL: %s",
                    candidate_sources),
                **status_kwargs)
            return

        if dataset is not None:
            # we created a dataset in another dataset
            # -> make submodule
            for r in dataset.add(dest_path,
                                 save=True,
                                 ds2super=True,
                                 return_type='generator',
                                 result_filter=None,
                                 result_xfm=None,
                                 on_failure='ignore'):
                yield r

        _handle_possible_annex_dataset(destination_dataset,
                                       reckless,
                                       description=description)

        # yield successful clone of the base dataset now, as any possible
        # subdataset clone down below will not alter the Git-state of the
        # parent
        yield get_status_dict(status='ok', **status_kwargs)
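
The loop above tries each candidate URL in turn and wipes a partially created target after a failed attempt. A self-contained sketch of that retry pattern, with a stand-in for the actual GitRepo.clone call; URLs and paths are hypothetical:

import os
import shutil

def fake_clone(url, dest):
    # stand-in for GitRepo.clone(); here, pretend only URLs containing 'good' succeed
    if 'good' not in url:
        raise RuntimeError("clone failed")
    os.makedirs(dest, exist_ok=True)

candidates = ['file:///no/such/repo', 'file:///no/such/repo/.git', 'file:///good/repo']
dest = '/tmp/clone-target'
for url in candidates:
    try:
        fake_clone(url, dest)
        break  # do not bother with other candidates once one succeeded
    except RuntimeError:
        if os.path.exists(dest):
            shutil.rmtree(dest)  # wipe the unsuccessful attempt, as the real code does
else:
    print("no candidate URL worked:", candidates)
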
Example #27
0
def test_container_from_subdataset(ds_path, src_subds_path, local_file):

    # prepare a to-be subdataset with a registered container
    src_subds = Dataset(src_subds_path).create()
    src_subds.containers_add(name="first",
                             url=get_local_file_url(
                                 op.join(local_file, 'some_container.img')))
    # add it as subdataset to a super ds:
    ds = Dataset(ds_path).create()
    subds = ds.install("sub", source=src_subds_path)
    # add it again one level down to see actual recursion:
    subds.install("subsub", source=src_subds_path)

    # We come up empty without recursive:
    res = ds.containers_list(recursive=False, **RAW_KWDS)
    assert_result_count(res, 0)

    # query available containers from within super:
    res = ds.containers_list(recursive=True, **RAW_KWDS)
    assert_result_count(res, 2)
    assert_in_results(res, action="containers", refds=ds.path)

    # default location within the subdataset:
    target_path = op.join(subds.path, '.datalad', 'environments', 'first',
                          'image')
    assert_result_count(res,
                        1,
                        name='sub/first',
                        type='file',
                        action='containers',
                        status='ok',
                        path=target_path,
                        parentds=subds.path)

    # a not-installed subdataset doesn't pose an issue:
    sub2 = ds.create("sub2")
    assert_result_count(ds.subdatasets(), 2, type="dataset")
    ds.uninstall("sub2")
    from datalad.tests.utils import assert_false
    assert_false(sub2.is_installed())

    # same results as before, not crashing or somehow confused by a subdataset
    # that is not present:
    res = ds.containers_list(recursive=True, **RAW_KWDS)
    assert_result_count(res, 2)
    assert_result_count(res,
                        1,
                        name='sub/first',
                        type='file',
                        action='containers',
                        status='ok',
                        path=target_path,
                        parentds=subds.path)

    # The default renderer includes the image names.
    with swallow_outputs() as out:
        ds.containers_list(recursive=True)
        lines = out.out.splitlines()
    assert_re_in("sub/first", lines)
    assert_re_in("sub/subsub/first", lines)
    # But we are careful not to render partial names from subdataset traversals
    # (i.e. we recurse with containers_list(..., result_renderer=None)).
    with assert_raises(AssertionError):
        assert_re_in("subsub/first", lines)
Example #28
0
def test_nested_pushclone_cycle_allplatforms(origpath, storepath, clonepath):
    if 'DATALAD_SEED' in os.environ:
        # we are using create-sibling-ria via the cmdline in here
        # this will create random UUIDs for datasets
        # however, given a fixed seed each call to this command will start
        # with the same RNG seed, hence yield the same UUID on the same
        # machine -- leading to a collision
        raise SkipTest(
            'Test incompatible with fixed random number generator seed')
    # the aim here is to test, at a high level, a standard create-push-clone cycle for a
    # dataset with a subdataset, with the goal to ensure that correct branches
    # and commits are tracked, regardless of platform behavior and condition
    # of individual clones. Nothing fancy, just that the defaults behave in
    # sensible ways
    from datalad.cmd import WitlessRunner as Runner
    run = Runner().run

    # create original nested dataset
    with chpwd(origpath):
        run(['datalad', 'create', 'super'])
        run(['datalad', 'create', '-d', 'super', str(Path('super', 'sub'))])

    # verify essential linkage properties
    orig_super = Dataset(Path(origpath, 'super'))
    orig_sub = Dataset(orig_super.pathobj / 'sub')

    (orig_super.pathobj / 'file1.txt').write_text('some1')
    (orig_sub.pathobj / 'file2.txt').write_text('some1')
    with chpwd(orig_super.path):
        run(['datalad', 'save', '--recursive'])

    # TODO not yet reported clean with adjusted branches
    #assert_repo_status(orig_super.path)

    # the "true" branch that sub is on, and the gitsha of the HEAD commit of it
    orig_sub_corr_branch = \
        orig_sub.repo.get_corresponding_branch() or orig_sub.repo.get_active_branch()
    orig_sub_corr_commit = orig_sub.repo.get_hexsha(orig_sub_corr_branch)

    # make sure the super tracks this commit
    assert_in_results(
        orig_super.subdatasets(),
        path=orig_sub.path,
        gitshasum=orig_sub_corr_commit,
        # TODO it should also track the branch name
        # Attempted: https://github.com/datalad/datalad/pull/3817
        # But reverted: https://github.com/datalad/datalad/pull/4375
    )

    # publish to a store, to get into a platform-agnostic state
    # (i.e. no impact of an annex-init of any kind)
    store_url = 'ria+' + get_local_file_url(storepath)
    with chpwd(orig_super.path):
        run([
            'datalad', 'create-sibling-ria', '--recursive', '-s', 'store',
            store_url
        ])
        run(['datalad', 'push', '--recursive', '--to', 'store'])

    # we are using the 'store' sibling's URL, which should be a plain path
    store_super = AnnexRepo(orig_super.siblings(name='store')[0]['url'],
                            init=False)
    store_sub = AnnexRepo(orig_sub.siblings(name='store')[0]['url'],
                          init=False)

    # both datasets in the store only carry the real branches, and nothing
    # adjusted
    for r in (store_super, store_sub):
        eq_(set(r.get_branches()), set([orig_sub_corr_branch, 'git-annex']))

    # and reobtain from a store
    cloneurl = 'ria+' + get_local_file_url(str(storepath), compatibility='git')
    with chpwd(clonepath):
        run(['datalad', 'clone', cloneurl + '#' + orig_super.id, 'super'])
        run(['datalad', '-C', 'super', 'get', '--recursive', '.'])

    # verify that nothing has changed as a result of a push/clone cycle
    clone_super = Dataset(Path(clonepath, 'super'))
    clone_sub = Dataset(clone_super.pathobj / 'sub')
    assert_in_results(
        clone_super.subdatasets(),
        path=clone_sub.path,
        gitshasum=orig_sub_corr_commit,
    )

    for ds1, ds2, f in ((orig_super, clone_super, 'file1.txt'),
                        (orig_sub, clone_sub, 'file2.txt')):
        eq_((ds1.pathobj / f).read_text(), (ds2.pathobj / f).read_text())

    # get status info that does not recurse into subdatasets, i.e. not
    # looking for uncommitted changes
    # we should see no modification reported
    assert_not_in_results(clone_super.status(eval_subdataset_state='commit'),
                          state='modified')
    # and now the same for a more expensive full status
    assert_not_in_results(clone_super.status(recursive=True), state='modified')
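
The store addresses used above follow the 'ria+<URL>' scheme, with '#<dataset id>' appended when cloning a particular dataset out of the store. A small sketch of composing such URLs; the store path is hypothetical, the UUID is a stand-in for the real dataset id, and the import location of get_local_file_url is assumed:

import uuid
from datalad.utils import get_local_file_url  # import location assumed

storepath = '/tmp/ria-store'        # hypothetical store location
dataset_id = str(uuid.uuid4())      # stand-in for the dataset's actual id
push_url = 'ria+' + get_local_file_url(storepath)
clone_url = 'ria+' + get_local_file_url(storepath, compatibility='git') + '#' + dataset_id
print(push_url)
print(clone_url)
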
Example #29
0
def test_install_simple_local(src_repo=None, path=None, *, type_):

    src_ds = Dataset(src_repo).create(result_renderer='disabled',
                                      force=True,
                                      annex=(type_ == "annex"))
    src_ds.save(['INFO.txt', 'test.dat'], to_git=True)
    if type_ == 'annex':
        src_ds.save('test-annex.dat', to_git=False)
    elif type_ == 'git':
        pass
    else:
        raise ValueError("'type' must be 'git' or 'annex'")
    # equivalent repo on github:
    url = "https://github.com/datalad/testrepo--basic--r1.git"
    sources = [
        src_ds.path,
        get_local_file_url(src_ds.path, compatibility='git')
    ]
    if not dl_cfg.get('datalad.tests.nonetwork'):
        sources.append(url)

    for src in sources:
        origin = Dataset(path)

        # now install it somewhere else
        ds = install(path, source=src, description='mydummy')
        eq_(ds.path, path)
        ok_(ds.is_installed())
        if not isinstance(origin.repo, AnnexRepo):
            # this means it is a GitRepo
            ok_(isinstance(origin.repo, GitRepo))
            # stays plain Git repo
            ok_(isinstance(ds.repo, GitRepo))
            ok_(not isinstance(ds.repo, AnnexRepo))
            ok_(GitRepo.is_valid_repo(ds.path))
            files = ds.repo.get_indexed_files()
            assert_in('test.dat', files)
            assert_in('INFO.txt', files)
            assert_repo_status(path, annex=False)
        else:
            # must be an annex
            ok_(isinstance(ds.repo, AnnexRepo))
            ok_(AnnexRepo.is_valid_repo(ds.path, allow_noninitialized=False))
            files = ds.repo.get_indexed_files()
            assert_in('test.dat', files)
            assert_in('INFO.txt', files)
            assert_in('test-annex.dat', files)
            assert_repo_status(path, annex=True)
            # no content was installed:
            ok_(not ds.repo.file_has_content('test-annex.dat'))
            uuid_before = ds.repo.uuid
            ok_(uuid_before)  # we actually have a uuid
            eq_(ds.repo.get_description(), 'mydummy')

        # installing it again, shouldn't matter:
        res = install(path, source=src, result_xfm=None, return_type='list')
        assert_status('notneeded', res)
        ok_(ds.is_installed())
        if isinstance(origin.repo, AnnexRepo):
            eq_(uuid_before, ds.repo.uuid)

        # cleanup before next iteration
        rmtree(path)
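
The test above feeds the same local repository to `install` both as a plain path and as its file:// spelling, adding a network URL only when network tests are allowed. A tiny sketch of assembling that source list for a hypothetical path; the import locations are assumptions:

from datalad import cfg as dl_cfg              # assumed import of the datalad config manager
from datalad.utils import get_local_file_url   # import location assumed

src = '/tmp/src-dataset'    # hypothetical local dataset path
sources = [src, get_local_file_url(src, compatibility='git')]
if not dl_cfg.get('datalad.tests.nonetwork'):
    sources.append("https://github.com/datalad/testrepo--basic--r1.git")
print(sources)
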
Example #30
0
    def __call__(
            path=None,
            source=None,
            dataset=None,
            get_data=False,
            description=None,
            recursive=False,
            recursion_limit=None,
            if_dirty='save-before',
            save=True,
            reckless=False,
            git_opts=None,
            git_clone_opts=None,
            annex_opts=None,
            annex_init_opts=None,
            jobs=None):

        # normalize path argument to be equal when called from cmdline and
        # python and nothing was passed into `path`
        path = assure_list(path)

        if not source and not path:
            raise InsufficientArgumentsError(
                "Please provide at least a source or a path")


        ## Common kwargs to pass to underlying git/install calls.
        #  They might need adjustments (e.g. for recursion_limit), but
        #  otherwise are applicable throughout.
        #
        # There should have been more common options,
        # since underneath `get` could do similar installs, but for now they
        # have duplicated implementations which differ (e.g. get does not
        # annex-init installed annexes)
        common_kwargs = dict(
            get_data=get_data,
            recursive=recursive,
            recursion_limit=recursion_limit,
            git_opts=git_opts,
            annex_opts=annex_opts,
            reckless=reckless,
            jobs=jobs,
        )

        installed_items = []
        failed_items = []

        # did we explicitly get a dataset to install into?
        # if we got a dataset, path will be resolved against it.
        # Otherwise path will be resolved first.
        ds = None
        if dataset is not None:
            ds = require_dataset(dataset, check_installed=True,
                                 purpose='installation')
            handle_dirty_dataset(ds, if_dirty)

        # switch into scenario without --source:
        if source is None:
            # we need to collect URLs and paths
            to_install = []
            to_get = []
            for urlpath in path:
                ri = RI(urlpath)
                (to_get if isinstance(ri, PathRI) else to_install).append(urlpath)

            common_kwargs['dataset'] = dataset

            # first install, and then get
            for s in to_install:
                lgr.debug("Install passes into install source=%s", s)
                try:
                    result = Install.__call__(
                                    source=s,
                                    description=description,
                                    if_dirty=if_dirty,
                                    save=save,
                                    git_clone_opts=git_clone_opts,
                                    annex_init_opts=annex_init_opts,
                                    **common_kwargs
                                )
                    installed_items += assure_list(result)
                except Exception as exc:
                    lgr.warning("Installation of %s has failed: %s",
                                s, exc_str(exc))
                    failed_items.append(s)

            if to_get:
                lgr.debug("Install passes into get %d items", len(to_get))
                # The commented-out kwargs below hint at the inability to pass
                # those options into the underlying install-related calls.
                # Also would need to pass from get:
                #  annex_get_opts
                try:
                    installed_datasets = Get.__call__(
                        to_get,
                        # description=description,
                        # if_dirty=if_dirty,
                        # save=save,
                        # git_clone_opts=git_clone_opts,
                        # annex_init_opts=annex_init_opts
                        _return_datasets=True,
                        **common_kwargs
                    )
                except IncompleteResultsError as exc:
                    exc_str_ = ': ' + exc_str(exc) if exc.results else ''
                    lgr.warning("Some items failed to install: %s",
                                exc_str_)
                    installed_datasets = exc.results
                    failed_items.extend(exc.failed)

                # compose content_by_ds into result
                for dspath in installed_datasets:
                    ds_ = Dataset(dspath)
                    if ds_.is_installed():
                        installed_items.append(ds_)
                    else:
                        lgr.warning("%s was not installed", ds_)

            return Install._handle_and_return_installed_items(
                ds, installed_items, failed_items, save)

        if source and path and len(path) > 1:
            raise ValueError(
                "install needs a single PATH when source is provided.  "
                "Was given mutliple PATHs: %s" % str(path))

        # parameter constraints:
        if not source:
            raise InsufficientArgumentsError(
                "a `source` is required for installation")

        # code below deals with a single path only
        path = path[0] if path else None

        if source == path:
            # even if they turn out to be identical after resolving symlinks
            # and more sophisticated witchcraft, it would still happily say
            # "it appears to be already installed", so we just catch an
            # obviously pointless input combination
            raise ValueError(
                "installation `source` and destination `path` are identical [{}]. "
                "If you are trying to add a subdataset simply use `save`".format(
                    path))

        # resolve the target location (if local) against the provided dataset
        # or CWD:
        if path is not None:
            # Should work out just fine for regular paths, so no additional
            # conditioning is necessary
            try:
                path_ri = RI(path)
            except Exception as e:
                raise ValueError(
                    "invalid path argument {}: ({})".format(path, exc_str(e)))
            try:
                # Wouldn't work for SSHRI ATM, see TODO within SSHRI
                # yoh: path should be a local path, and mapping note within
                #      SSHRI about mapping localhost:path to path is kinda
                #      a peculiar use-case IMHO
                path = resolve_path(path_ri.localpath, dataset)
                # any `path` argument that points to something local is now
                # resolved and no longer a URL
            except ValueError:
                # URL doesn't point to a local something
                # so we have an actual URL in `path`. Since this is valid as a
                # single positional argument, `source` has to be None at this
                # point.
                if is_datalad_compat_ri(path) and source is None:
                    # we have an actual URL -> this should be the source
                    lgr.debug(
                        "Single argument given to install, that doesn't seem to "
                        "be a local path. "
                        "Assuming the argument identifies a source location.")
                    source = path
                    path = None

                else:
                    # `path` is neither a valid source nor a local path.
                    # TODO: The only thing left is a known subdataset with a
                    # name, that is not a path; Once we correctly distinguish
                    # between path and name of a submodule, we need to consider
                    # this.
                    # For now: Just raise
                    raise ValueError("Invalid path argument {0}".format(path))
        # `path` resolved, if there was any.

        # Possibly do conversion from source into a git-friendly url
        # luckily GitRepo will undo any fancy file:/// url to make use of Git's
        # optimization for local clones....
        source = _get_git_url_from_source(source)
        lgr.debug("Resolved source: {0}".format(source))
        # TODO: we probably need to resolve source, if it is a local path;
        # expandpath, normpath, ... Where exactly is the point to do it?

        # derive target from source:
        if path is None:
            # we got nothing but a source. do something similar to git clone
            # and derive the path from the source and continue
            lgr.debug(
                "Neither dataset nor target installation path provided. "
                "Deriving destination path from given source %s",
                source)
            path = _get_installationpath_from_url(source)
            # since this is a relative `path`, resolve it:
            path = resolve_path(path, dataset)

        # there is no other way -- my intoxicated brain tells me
        assert(path is not None)

        lgr.debug("Resolved installation target: {0}".format(path))
        destination_dataset = Dataset(path)

        if destination_dataset.is_installed():
            # this should not be, check if this is an error, or a reinstall
            # from the same source
            # this is where we would have installed this from
            candidate_sources = _get_flexible_source_candidates(
                source, destination_dataset.path)
            # this is where it was installed from
            track_name, track_url = _get_tracking_source(destination_dataset)
            if track_url in candidate_sources \
                    or get_local_file_url(track_url) in candidate_sources:
                # TODO: this one breaks "promise" assumptions of the repeated
                # invocations of install.
                # yoh thinks that we actually should be the ones to run update
                # (without merge) after basic
                # check that it is clean and up-to-date with its super dataset
                # and if so, not return here but continue with errands (recursive
                # installation and get_data) so we could provide the same
                # result if we rerun the same install twice.
                lgr.info(
                    "%s was already installed from %s. Use `update` to obtain "
                    "latest updates, or `get` or `install` with a path, not URL, "
                    "to (re)fetch data and / or subdatasets",
                    destination_dataset, track_url)
                return destination_dataset
            else:
                raise ValueError("There is already a dataset installed at the "
                                 "destination: %s", destination_dataset)

        ###########
        # we should know everything necessary by now
        # actual installation starts
        ###########

        # FLOW GUIDE:
        # four cases:
        # 1. install into a dataset
        #   1.1. we install a known subdataset
        #        => git submodule update --init
        #   1.2. we install an existing repo as a subdataset inplace
        #        => git submodule add + magic
        #   1.3. we (recursively) try to install implicit subdatasets between
        #        ds and path
        #   1.4. we install a new subdataset from an explicit source
        #        => git submodule add
        # 2. we "just" install from an explicit source
        #    => git clone

        if ds is not None:
            # FLOW GUIDE: 1.

            # express the destination path relative to the root of
            # the dataset
            relativepath = relpath(path, start=ds.path)
            if relativepath.startswith(pardir):
                raise ValueError("installation path outside dataset "
                                 "({0})".format(path))
            lgr.debug("Resolved installation target relative to dataset "
                      "{0}: {1}".format(ds, relativepath))

            # FLOW_GUIDE 1.4.
            lgr.info("Installing subdataset from '{0}' at: {0}".format(
                source, relativepath))
            destination_dataset = _install_subds_from_flexible_source(
                ds,
                relativepath,
                source,
                reckless)
        else:
            # FLOW GUIDE: 2.
            lgr.info("Installing dataset at {0} from {1}".format(path, source))

            # Currently assuming there is nothing at the target to deal with
            # and rely on failures raising from the git call ...

            # We possibly need to consider /.git URL
            candidate_sources = _get_flexible_source_candidates(source)
            _clone_from_any_source(candidate_sources, destination_dataset.path)

        # FLOW GUIDE: All four cases done.
        if not destination_dataset.is_installed():
            # XXX  shouldn't we just fail!? (unless some explicit --skip-failing?)
            lgr.error("Installation failed.")
            return None

        _handle_possible_annex_dataset(destination_dataset, reckless)

        lgr.debug("Installation of %s done.", destination_dataset)

        if not destination_dataset.is_installed():
            # log error and don't report as installed item, but don't raise,
            # since we might be in a process of recursive installation where
            # a lot of other datasets can still be installed successfully.
            lgr.error("Installation of {0} failed.".format(destination_dataset))
        else:
            installed_items.append(destination_dataset)

        # we need to decrease the recursion limit, relative to
        # subdatasets now
        subds_recursion_limit = max(0, recursion_limit - 1) \
                                  if isinstance(recursion_limit, int) \
                                  else recursion_limit
        # Now, recursive calls:
        if recursive:
            if description:
                # yoh: why?  especially if we somehow allow for templating them
                # with e.g. '%s' to catch the subdataset path
                lgr.warning("Description can't be assigned recursively.")

            subs = destination_dataset.get_subdatasets(
                # yes, it does make sense to combine no recursion with
                # recursion_limit: when the latter is 0 we get no subdatasets
                # reported, otherwise we always get the 1st-level subs
                recursive=False,
                recursion_limit=recursion_limit,
                absolute=False)

            if subs:
                lgr.debug("Obtaining subdatasets of %s: %s",
                          destination_dataset,
                          subs)

                kwargs = common_kwargs.copy()
                kwargs['recursion_limit'] = subds_recursion_limit
                rec_installed = Get.__call__(
                    subs,  # all at once
                    dataset=destination_dataset,
                    # TODO expose this
                    # yoh: exactly!
                    #annex_get_opts=annex_get_opts,
                    **kwargs
                )
                # TODO do we want to filter this so `install` only returns
                # the datasets?
                if isinstance(rec_installed, list):
                    installed_items.extend(rec_installed)
                else:
                    installed_items.append(rec_installed)

        if get_data:
            lgr.debug("Getting data of {0}".format(destination_dataset))
            kwargs = common_kwargs.copy()
            kwargs['recursive'] = False
            destination_dataset.get(curdir, **kwargs)

        return Install._handle_and_return_installed_items(
            ds, installed_items, failed_items, save)
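
A tiny worked example of the subds_recursion_limit rule used above: an integer limit is decreased by one for the next subdataset level (never going below zero), while None, meaning unlimited recursion, is passed through unchanged:

for limit in (3, 1, 0, None):
    sub_limit = max(0, limit - 1) if isinstance(limit, int) else limit
    print(limit, '->', sub_limit)   # 3 -> 2, 1 -> 0, 0 -> 0, None -> None
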