Example 1
def test_install_known_subdataset(src=None, path=None):

    _mk_submodule_annex(src, fname="test-annex.dat", fcontent="whatever")

    # get the superdataset:
    ds = install(path, source=src)
    # subdataset not installed:
    subds = Dataset(opj(path, 'subm 1'))
    assert_false(subds.is_installed())
    assert_in('subm 1', ds.subdatasets(state='absent', result_xfm='relpaths'))
    assert_not_in('subm 1',
                  ds.subdatasets(state='present', result_xfm='relpaths'))
    # install it:
    ds.install('subm 1')
    ok_(subds.is_installed())
    ok_(AnnexRepo.is_valid_repo(subds.path, allow_noninitialized=False))
    # Verify that the correct submodule was installed and not a
    # new repository initialized
    assert_in("test-annex.dat", subds.repo.get_indexed_files()),
    assert_not_in('subm 1',
                  ds.subdatasets(state='absent', result_xfm='relpaths'))
    assert_in('subm 1', ds.subdatasets(state='present', result_xfm='relpaths'))

    # now, get the data by reinstalling with -g:
    ok_(subds.repo.file_has_content('test-annex.dat') is False)
    with chpwd(ds.path):
        result = get(path='subm 1', dataset=os.curdir)
        assert_in_results(result, path=opj(subds.path, 'test-annex.dat'))
        ok_(subds.repo.file_has_content('test-annex.dat') is True)
        ok_(subds.is_installed())
Example 2
def test_install_known_subdataset(src, path):

    # get the superdataset:
    ds = install(path, source=src)
    # subdataset not installed:
    subds = Dataset(opj(path, 'subm 1'))
    assert_false(subds.is_installed())
    assert_in('subm 1', ds.subdatasets(fulfilled=False, result_xfm='relpaths'))
    assert_not_in('subm 1', ds.subdatasets(fulfilled=True, result_xfm='relpaths'))
    # install it:
    ds.install('subm 1')
    ok_(subds.is_installed())
    ok_(AnnexRepo.is_valid_repo(subds.path, allow_noninitialized=False))
    # Verify that the correct submodule was installed and not a
    # new repository initialized
    eq_(set(subds.repo.get_indexed_files()),
        {'test.dat', 'INFO.txt', 'test-annex.dat'})
    assert_not_in('subm 1', ds.subdatasets(fulfilled=False, result_xfm='relpaths'))
    assert_in('subm 1', ds.subdatasets(fulfilled=True, result_xfm='relpaths'))

    # now, get the data by reinstalling with -g:
    ok_(subds.repo.file_has_content('test-annex.dat') is False)
    with chpwd(ds.path):
        result = get(path='subm 1', dataset=os.curdir)
        assert_in_results(result, path=opj(subds.path, 'test-annex.dat'))
        ok_(subds.repo.file_has_content('test-annex.dat') is True)
        ok_(subds.is_installed())
Example 3
    def newfunc(*arg, **kw):

        if DATALAD_TESTS_CACHE:
            # Note: We can't pass keys based on `paths` parameter to
            # get_cached_dataset yet, since translation to keys depends on a
            # worktree. We'll have the worktree of `version` only after cloning.
            ds = get_cached_dataset(url, version=version)
            clone_ds = Clone()(ds.pathobj, arg[-1])
        else:
            clone_ds = Clone()(url, arg[-1])
        if version:
            clone_ds.repo.checkout(version)
        if paths and AnnexRepo.is_valid_repo(clone_ds.path):
            # just assume ds is annex as well. Otherwise `Clone` wouldn't
            # work correctly - we don't need to test its implementation here
            if DATALAD_TESTS_CACHE:
                # cache is enabled; we need to make sure it has the desired
                # content, so clone_ds can get it from there. However, we got
                # `paths` and potentially a `version` they refer to. We can't
                # assume the same (or any) worktree in cache. Hence we need to
                # translate to keys.
                keys = clone_ds.repo.get_file_key(paths)
                ds.repo.get(keys, key=True)
                clone_ds.repo.fsck(remote='origin', fast=True)

            clone_ds.get(paths)
        return f(*(arg[:-1] + (clone_ds, )), **kw)
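
The comments above capture the central trick of this wrapper: the cache clone may not share a worktree with the test clone, so annexed content has to be addressed by key rather than by path. A minimal sketch of that path-to-key step, reusing only calls that already appear in this snippet (the repository path and file name are made up; newer code does the same via get_content_annexinfo(), as in Example 9):

from datalad.support.annexrepo import AnnexRepo

repo = AnnexRepo("/tmp/some-annex", create=False)  # assumed pre-existing annex repository
key = repo.get_file_key("data/file.dat")           # translate a worktree path into its annex key
repo.get(key, key=True)                            # fetch content by key, independent of any worktree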
Example 4
def test_install_simple_local(src, path):
    origin = Dataset(path)

    # now install it somewhere else
    ds = install(path, source=src, description='mydummy')
    eq_(ds.path, path)
    ok_(ds.is_installed())
    if not isinstance(origin.repo, AnnexRepo):
        # this means it is a GitRepo
        ok_(isinstance(origin.repo, GitRepo))
        # stays plain Git repo
        ok_(isinstance(ds.repo, GitRepo))
        ok_(not isinstance(ds.repo, AnnexRepo))
        ok_(GitRepo.is_valid_repo(ds.path))
        eq_(set(ds.repo.get_indexed_files()),
            {'test.dat', 'INFO.txt'})
        ok_clean_git(path, annex=False)
    else:
        # must be an annex
        ok_(isinstance(ds.repo, AnnexRepo))
        ok_(AnnexRepo.is_valid_repo(ds.path, allow_noninitialized=False))
        eq_(set(ds.repo.get_indexed_files()),
            {'test.dat', 'INFO.txt', 'test-annex.dat'})
        ok_clean_git(path, annex=True)
        # no content was installed:
        ok_(not ds.repo.file_has_content('test-annex.dat'))
        uuid_before = ds.repo.uuid
        eq_(ds.repo.get_description(), 'mydummy')

    # installing it again, shouldn't matter:
    res = install(path, source=src, result_xfm=None, return_type='list')
    assert_status('notneeded', res)
    ok_(ds.is_installed())
    if isinstance(origin.repo, AnnexRepo):
        eq_(uuid_before, ds.repo.uuid)
Example 5
def test_notclone_known_subdataset(src, path):
    # get the superdataset:
    ds = clone(src, path, result_xfm='datasets', return_type='item-or-list')

    # subdataset not installed:
    subds = Dataset(opj(path, 'subm 1'))
    assert_false(subds.is_installed())
    assert_in('subm 1', ds.subdatasets(fulfilled=False, result_xfm='relpaths'))
    assert_not_in('subm 1',
                  ds.subdatasets(fulfilled=True, result_xfm='relpaths'))
    # clone is not meaningful
    res = ds.clone('subm 1', on_failure='ignore')
    assert_status('error', res)
    assert_message('Failed to clone data from any candidate source URL: %s',
                   res)
    # get does the job
    res = ds.get(path='subm 1', get_data=False)
    assert_status('ok', res)
    ok_(subds.is_installed())
    ok_(AnnexRepo.is_valid_repo(subds.path, allow_noninitialized=False))
    # Verify that the correct submodule was installed and not a
    # new repository initialized
    eq_(set(subds.repo.get_indexed_files()),
        {'test.dat', 'INFO.txt', 'test-annex.dat'})
    assert_not_in('subm 1',
                  ds.subdatasets(fulfilled=False, result_xfm='relpaths'))
    assert_in('subm 1', ds.subdatasets(fulfilled=True, result_xfm='relpaths'))
Example 6
def test_install_simple_local(src, path):
    origin = Dataset(path)

    # now install it somewhere else
    ds = install(path, source=src, description='mydummy')
    eq_(ds.path, path)
    ok_(ds.is_installed())
    if not isinstance(origin.repo, AnnexRepo):
        # this means it is a GitRepo
        ok_(isinstance(origin.repo, GitRepo))
        # stays plain Git repo
        ok_(isinstance(ds.repo, GitRepo))
        ok_(not isinstance(ds.repo, AnnexRepo))
        ok_(GitRepo.is_valid_repo(ds.path))
        eq_(set(ds.repo.get_indexed_files()), {'test.dat', 'INFO.txt'})
        ok_clean_git(path, annex=False)
    else:
        # must be an annex
        ok_(isinstance(ds.repo, AnnexRepo))
        ok_(AnnexRepo.is_valid_repo(ds.path, allow_noninitialized=False))
        eq_(set(ds.repo.get_indexed_files()),
            {'test.dat', 'INFO.txt', 'test-annex.dat'})
        ok_clean_git(path, annex=True)
        # no content was installed:
        ok_(not ds.repo.file_has_content('test-annex.dat'))
        uuid_before = ds.repo.uuid
        eq_(ds.repo.get_description(), 'mydummy')

    # installing it again, shouldn't matter:
    res = install(path, source=src, result_xfm=None, return_type='list')
    assert_status('notneeded', res)
    ok_(ds.is_installed())
    if isinstance(origin.repo, AnnexRepo):
        eq_(uuid_before, ds.repo.uuid)
Example 7
def test_notclone_known_subdataset(src, path):
    # get the superdataset:
    ds = clone(src, path,
               result_xfm='datasets', return_type='item-or-list')

    # subdataset not installed:
    subds = Dataset(opj(path, 'subm 1'))
    assert_false(subds.is_installed())
    assert_in('subm 1', ds.subdatasets(fulfilled=False, result_xfm='relpaths'))
    assert_not_in('subm 1', ds.subdatasets(fulfilled=True, result_xfm='relpaths'))
    # clone is not meaningful
    res = ds.clone('subm 1', on_failure='ignore')
    assert_status('error', res)
    assert_message('Failed to clone from any candidate source URL. '
                   'Encountered errors per each url were: %s',
                   res)
    # get does the job
    res = ds.get(path='subm 1', get_data=False)
    assert_status('ok', res)
    ok_(subds.is_installed())
    ok_(AnnexRepo.is_valid_repo(subds.path, allow_noninitialized=False))
    # Verify that the correct submodule was installed and not a
    # new repository initialized
    eq_(set(subds.repo.get_indexed_files()),
        {'test.dat', 'INFO.txt', 'test-annex.dat'})
    assert_not_in('subm 1', ds.subdatasets(fulfilled=False, result_xfm='relpaths'))
    assert_in('subm 1', ds.subdatasets(fulfilled=True, result_xfm='relpaths'))
Example 8
def test_install_known_subdataset(src, path):

    # get the superdataset:
    ds = install(path, source=src)
    # subdataset not installed:
    subds = Dataset(opj(path, 'subm 1'))
    assert_false(subds.is_installed())
    assert_in('subm 1', ds.subdatasets(fulfilled=False, result_xfm='relpaths'))
    assert_not_in('subm 1',
                  ds.subdatasets(fulfilled=True, result_xfm='relpaths'))
    # install it:
    ds.install('subm 1')
    ok_(subds.is_installed())
    ok_(AnnexRepo.is_valid_repo(subds.path, allow_noninitialized=False))
    # Verify that the correct submodule was installed and not a
    # new repository initialized
    eq_(set(subds.repo.get_indexed_files()),
        {'test.dat', 'INFO.txt', 'test-annex.dat'})
    assert_not_in('subm 1',
                  ds.subdatasets(fulfilled=False, result_xfm='relpaths'))
    assert_in('subm 1', ds.subdatasets(fulfilled=True, result_xfm='relpaths'))

    # now, get the data by reinstalling with -g:
    ok_(subds.repo.file_has_content('test-annex.dat') is False)
    with chpwd(ds.path):
        result = get(path='subm 1', dataset=os.curdir)
        assert_in_results(result, path=opj(subds.path, 'test-annex.dat'))
        ok_(subds.repo.file_has_content('test-annex.dat') is True)
        ok_(subds.is_installed())
Example 9
    def _wrap_cached_dataset(*arg, **kw):

        if DATALAD_TESTS_CACHE:
            # Note: We can't pass keys based on `paths` parameter to
            # get_cached_dataset yet, since translation to keys depends on a
            # worktree. We'll have the worktree of `version` only after cloning.
            ds = get_cached_dataset(url, version=version)
            clone_ds = Clone()(ds.pathobj, arg[-1])
        else:
            clone_ds = Clone()(url, arg[-1])
        # save some cycles
        clone_repo = clone_ds.repo
        if version:
            clone_repo.checkout(version)
        if paths and AnnexRepo.is_valid_repo(clone_ds.path):
            # just assume ds is annex as well. Otherwise `Clone` wouldn't
            # work correctly - we don't need to test its implementation here
            if DATALAD_TESTS_CACHE:
                # cache is enabled; we need to make sure it has the desired
                # content, so clone_ds can get it from there. However, we got
                # `paths` and potentially a `version` they refer to. We can't
                # assume the same (or any) worktree in cache. Hence we need to
                # translate to keys.
                # MIH Despite the variable names used in this function
                # (pathS, keyS) they ultimately are passed to get(..., key=True)
                # which means that it can ever only be a single path and a
                # single key -- this is very confusing.
                # the key determination could hence be done with
                # get_file_annexinfo() in a much simpler way, but it seems this
                # function wants to be ready for more, sigh
                keys = [
                    p['key'] for p in clone_repo.get_content_annexinfo(
                        ensure_list(paths), init=None).values() if 'key' in p
                ]
                if keys:
                    ds.repo.get(keys, key=True)
                clone_repo.fsck(remote=DEFAULT_REMOTE, fast=True)

            clone_ds.get(paths)
        return f(*(arg[:-1] + (clone_ds, )), **kw)
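
For context, _wrap_cached_dataset is the inner wrapper produced by a decorator factory (the cached_dataset decorator mentioned in the get_cached_dataset docstring of Example 19). A hedged sketch of how a decorated test might look; the decorator signature is an assumption inferred from the closure variables f, url, version and paths used above:

# Hypothetical usage; the real decorator's signature may differ.
@cached_dataset(url="https://example.com/some/dataset.git",
                paths="test-annex.dat")
def test_something(ds):
    # the wrapper replaces its last positional argument (normally a temporary
    # target path) with the freshly prepared clone `ds`
    assert ds.is_installed()
    assert ds.repo.file_has_content("test-annex.dat")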
Example 10
def test_clone_simple_local(src, path):
    origin = Dataset(path)

    # now install it somewhere else
    ds = clone(src,
               path,
               description='mydummy',
               result_xfm='datasets',
               return_type='item-or-list')
    eq_(ds.path, path)
    ok_(ds.is_installed())
    if not isinstance(origin.repo, AnnexRepo):
        # this means it is a GitRepo
        ok_(isinstance(origin.repo, GitRepo))
        # stays plain Git repo
        ok_(isinstance(ds.repo, GitRepo))
        ok_(not isinstance(ds.repo, AnnexRepo))
        ok_(GitRepo.is_valid_repo(ds.path))
        eq_(set(ds.repo.get_indexed_files()), {'test.dat', 'INFO.txt'})
        assert_repo_status(path, annex=False)
    else:
        # must be an annex
        ok_(isinstance(ds.repo, AnnexRepo))
        ok_(AnnexRepo.is_valid_repo(ds.path, allow_noninitialized=False))
        eq_(set(ds.repo.get_indexed_files()),
            {'test.dat', 'INFO.txt', 'test-annex.dat'})
        assert_repo_status(path, annex=True)
        # no content was installed:
        ok_(not ds.repo.file_has_content('test-annex.dat'))
        uuid_before = ds.repo.uuid
        eq_(ds.repo.get_description(), 'mydummy')

    # installing it again, shouldn't matter:
    res = clone(src, path, result_xfm=None, return_type='list')
    assert_result_values_equal(res, 'source_url', [src])
    assert_status('notneeded', res)
    assert_message("dataset %s was already cloned from '%s'", res)
    ok_(ds.is_installed())
    if isinstance(origin.repo, AnnexRepo):
        eq_(uuid_before, ds.repo.uuid)
Example 11
def test_install_simple_local(src, path):
    origin = Dataset(path)

    # now install it somewhere else
    ds = install(path, source=src)
    eq_(ds.path, path)
    ok_(ds.is_installed())
    if not isinstance(origin.repo, AnnexRepo):
        # this means it is a GitRepo
        ok_(isinstance(origin.repo, GitRepo))
        # stays plain Git repo
        ok_(isinstance(ds.repo, GitRepo))
        ok_(not isinstance(ds.repo, AnnexRepo))
        ok_(GitRepo.is_valid_repo(ds.path))
        eq_(set(ds.repo.get_indexed_files()), {'test.dat', 'INFO.txt'})
        ok_clean_git(path, annex=False)
    else:
        # must be an annex
        ok_(isinstance(ds.repo, AnnexRepo))
        ok_(AnnexRepo.is_valid_repo(ds.path, allow_noninitialized=False))
        eq_(set(ds.repo.get_indexed_files()),
            {'test.dat', 'INFO.txt', 'test-annex.dat'})
        ok_clean_git(path, annex=True)
        # no content was installed:
        ok_(not ds.repo.file_has_content('test-annex.dat'))
        uuid_before = ds.repo.uuid

    # installing it again, shouldn't matter:
    with swallow_logs(new_level=logging.INFO) as cml:
        ds = install(path, source=src)
        cml.assert_logged(msg="{0} was already installed from".format(ds),
                          regex=False,
                          level="INFO")
        ok_(ds.is_installed())
        if isinstance(origin.repo, AnnexRepo):
            eq_(uuid_before, ds.repo.uuid)
Example 12
def test_install_simple_local(src, path):
    origin = Dataset(path)

    # now install it somewhere else
    ds = install(path, source=src)
    eq_(ds.path, path)
    ok_(ds.is_installed())
    if not isinstance(origin.repo, AnnexRepo):
        # this means it is a GitRepo
        ok_(isinstance(origin.repo, GitRepo))
        # stays plain Git repo
        ok_(isinstance(ds.repo, GitRepo))
        ok_(not isinstance(ds.repo, AnnexRepo))
        ok_(GitRepo.is_valid_repo(ds.path))
        eq_(set(ds.repo.get_indexed_files()),
            {'test.dat', 'INFO.txt'})
        ok_clean_git(path, annex=False)
    else:
        # must be an annex
        ok_(isinstance(ds.repo, AnnexRepo))
        ok_(AnnexRepo.is_valid_repo(ds.path, allow_noninitialized=False))
        eq_(set(ds.repo.get_indexed_files()),
            {'test.dat', 'INFO.txt', 'test-annex.dat'})
        ok_clean_git(path, annex=True)
        # no content was installed:
        ok_(not ds.repo.file_has_content('test-annex.dat'))
        uuid_before = ds.repo.uuid

    # installing it again, shouldn't matter:
    with swallow_logs(new_level=logging.INFO) as cml:
        ds = install(path, source=src)
        cml.assert_logged(msg="{0} was already installed from".format(ds),
                          regex=False, level="INFO")
        ok_(ds.is_installed())
        if isinstance(origin.repo, AnnexRepo):
            eq_(uuid_before, ds.repo.uuid)
Example 13
def test_target_ssh_simple(origin, src_path, target_rootpath):

    # prepare src
    source = install(src_path, source=origin)

    target_path = opj(target_rootpath, "basic")
    # it will try to fetch it so would fail as well since sshurl is wrong
    with swallow_logs(new_level=logging.ERROR) as cml, \
            assert_raises(GitCommandError):
        create_sibling(
            dataset=source,
            target="local_target",
            sshurl="ssh://localhost",
            target_dir=target_path,
            ui=True)
    # is not actually happening on one of the two basic cases -- TODO figure it out
    # assert_in('enableremote local_target failed', cml.out)

    GitRepo(target_path, create=False)  # raises if not a git repo
    assert_in("local_target", source.repo.get_remotes())
    eq_("ssh://localhost", source.repo.get_remote_url("local_target"))
    # should NOT be able to push now, since url isn't correct:
    # TODO:  assumption is wrong if ~ does have .git! fix up!
    assert_raises(GitCommandError, publish, dataset=source, to="local_target")

    # Both must be annex or git repositories
    src_is_annex = AnnexRepo.is_valid_repo(src_path)
    eq_(src_is_annex, AnnexRepo.is_valid_repo(target_path))
    # And target one should be known to have a known UUID within the source if annex
    if src_is_annex:
        annex = AnnexRepo(src_path)
        local_target_cfg = annex.repo.remotes["local_target"].config_reader.get
        # for some reason this was "correct"
        # eq_(local_target_cfg('annex-ignore'), 'false')
        # but after fixing creating siblings in
        # 21f6dd012b2c7b9c0b8b348dcfb3b0ace7e8b2ec it started to fail
        # I think it is legit since we are trying to fetch now before calling
        # annex.enable_remote so it doesn't set it up, and fails before
        assert_raises(Exception, local_target_cfg, 'annex-ignore')
        # hm, but ATM wouldn't get a uuid since url is wrong
        assert_raises(Exception, local_target_cfg, 'annex-uuid')

    # do it again without force:
    with assert_raises(RuntimeError) as cm:
        assert_create_sshwebserver(
            dataset=source,
            target="local_target",
            sshurl="ssh://localhost",
            target_dir=target_path)
    eq_("Target directory %s already exists." % target_path,
        str(cm.exception))
    if src_is_annex:
        target_description = AnnexRepo(target_path, create=False).get_description()
        assert_not_equal(target_description, None)
        assert_not_equal(target_description, target_path)
        ok_endswith(target_description, target_path)
    # now, with force and correct url, which is also used to determine
    # target_dir
    # Note: on windows absolute path is not url conform. But this way it's easy
    # to test, that ssh path is correctly used.
    if not on_windows:
        # add random file under target_path, to explicitly test existing=replace
        open(opj(target_path, 'random'), 'w').write('123')

        assert_create_sshwebserver(
            dataset=source,
            target="local_target",
            sshurl="ssh://localhost" + target_path,
            existing='replace')
        eq_("ssh://localhost" + target_path,
            source.repo.get_remote_url("local_target"))
        ok_(source.repo.get_remote_url("local_target", push=True) is None)

        # ensure target tree actually replaced by source
        assert_false(exists(opj(target_path, 'random')))

        if src_is_annex:
            annex = AnnexRepo(src_path)
            local_target_cfg = annex.repo.remotes["local_target"].config_reader.get
            eq_(local_target_cfg('annex-ignore'), 'false')
            eq_(local_target_cfg('annex-uuid').count('-'), 4)  # valid uuid

        # again, by explicitly passing urls. Since we are on localhost, the
        # local path should work:
        cpkwargs = dict(
            dataset=source,
            target="local_target",
            sshurl="ssh://localhost",
            target_dir=target_path,
            target_url=target_path,
            target_pushurl="ssh://localhost" + target_path,
            ui=True,
        )
        assert_create_sshwebserver(existing='replace', **cpkwargs)
        if src_is_annex:
            target_description = AnnexRepo(target_path,
                                           create=False).get_description()
            eq_(target_description, target_path)

        eq_(target_path,
            source.repo.get_remote_url("local_target"))
        eq_("ssh://localhost" + target_path,
            source.repo.get_remote_url("local_target", push=True))

        _test_correct_publish(target_path)

        # now, push should work:
        publish(dataset=source, to="local_target")

        # and we should be able to 'reconfigure'
        def process_digests_mtimes(digests, mtimes):
            # it should have triggered a hook, which would have created log and metadata files
            check_metadata = False
            for part in 'logs', 'metadata':
                metafiles = [k for k in digests if k.startswith(_path_('.git/datalad/%s/' % part))]
                # This is in effect ONLY if we have "compatible" datalad installed on remote
                # end. ATM we don't have easy way to guarantee that AFAIK (yoh),
                # so let's not check/enforce (TODO)
                # assert(len(metafiles) >= 1)  # we might have 2 logs if timestamps do not collide ;)
                # Let's actually do it to some degree
                if part == 'logs':
                    # always should have those:
                    assert (len(metafiles) >= 1)
                    with open(opj(target_path, metafiles[0])) as f:
                        if 'no datalad found' not in f.read():
                            check_metadata = True
                if part == 'metadata':
                    eq_(len(metafiles), bool(check_metadata))
                for f in metafiles:
                    digests.pop(f)
                    mtimes.pop(f)
            # and just pop some leftovers from annex
            for f in list(digests):
                if f.startswith('.git/annex/mergedrefs'):
                    digests.pop(f)
                    mtimes.pop(f)

        orig_digests, orig_mtimes = get_mtimes_and_digests(target_path)
        process_digests_mtimes(orig_digests, orig_mtimes)

        import time; time.sleep(0.1)  # just so that mtimes change
        assert_create_sshwebserver(existing='reconfigure', **cpkwargs)
        digests, mtimes = get_mtimes_and_digests(target_path)
        process_digests_mtimes(digests, mtimes)

        assert_dict_equal(orig_digests, digests)  # nothing should change in terms of content

        # but some files should have been modified
        modified_files = {k for k in mtimes if orig_mtimes.get(k, 0) != mtimes.get(k, 0)}
        # collect which files were expected to be modified without incurring any changes
        ok_modified_files = {
            _path_('.git/hooks/post-update'), 'index.html',
            # files which hook would manage to generate
            _path_('.git/info/refs'), '.git/objects/info/packs'
        }
        if external_versions['cmd:system-git'] >= '2.4':
            # on elderly git we don't change receive setting
            ok_modified_files.add(_path_('.git/config'))
        ok_modified_files.update({f for f in digests if f.startswith(_path_('.git/datalad/web'))})
        assert_set_equal(modified_files, ok_modified_files)
Example 14
def test_target_ssh_simple(origin, src_path, target_rootpath):

    # prepare src
    source = install(src_path,
                     source=origin,
                     result_xfm='datasets',
                     return_type='item-or-list')

    target_path = opj(target_rootpath, "basic")
    with swallow_logs(new_level=logging.ERROR) as cml:
        create_sibling(dataset=source,
                       name="local_target",
                       sshurl="ssh://localhost:22",
                       target_dir=target_path,
                       ui=True)
        assert_not_in('enableremote local_target failed', cml.out)

    GitRepo(target_path, create=False)  # raises if not a git repo
    assert_in("local_target", source.repo.get_remotes())
    # Both must be annex or git repositories
    src_is_annex = AnnexRepo.is_valid_repo(src_path)
    eq_(src_is_annex, AnnexRepo.is_valid_repo(target_path))
    # And target one should be known to have a known UUID within the source if annex
    if src_is_annex:
        annex = AnnexRepo(src_path)
        local_target_cfg = annex.repo.remotes["local_target"].config_reader.get
        # basic config in place
        eq_(local_target_cfg('annex-ignore'), 'false')
        ok_(local_target_cfg('annex-uuid'))

    # do it again without force, but use a different name to avoid initial checks
    # for existing remotes:
    with assert_raises(RuntimeError) as cm:
        assert_create_sshwebserver(dataset=source,
                                   name="local_target_alt",
                                   sshurl="ssh://localhost",
                                   target_dir=target_path)
    ok_(
        text_type(cm.exception).startswith(
            "Target path %s already exists. And it fails to rmdir" %
            target_path))
    if src_is_annex:
        target_description = AnnexRepo(target_path,
                                       create=False).get_description()
        assert_not_equal(target_description, None)
        assert_not_equal(target_description, target_path)
        # on yoh's laptop TMPDIR is under HOME, so things start to become
        # tricky since then target_path is shortened and we would need to know
        # remote $HOME.  To not over-complicate and still test, test only for
        # the basename of the target_path
        ok_endswith(target_description, basename(target_path))
    # now, with force and correct url, which is also used to determine
    # target_dir
    # Note: on windows absolute path is not url conform. But this way it's easy
    # to test, that ssh path is correctly used.
    if not on_windows:
        # add random file under target_path, to explicitly test existing=replace
        open(opj(target_path, 'random'), 'w').write('123')

        assert_create_sshwebserver(dataset=source,
                                   name="local_target",
                                   sshurl="ssh://localhost" + target_path,
                                   publish_by_default='master',
                                   existing='replace')
        eq_("ssh://localhost" + urlquote(target_path),
            source.repo.get_remote_url("local_target"))
        ok_(source.repo.get_remote_url("local_target", push=True) is None)

        # ensure target tree actually replaced by source
        assert_false(exists(opj(target_path, 'random')))

        if src_is_annex:
            annex = AnnexRepo(src_path)
            local_target_cfg = annex.repo.remotes[
                "local_target"].config_reader.get
            eq_(local_target_cfg('annex-ignore'), 'false')
            eq_(local_target_cfg('annex-uuid').count('-'), 4)  # valid uuid
            # should be added too, even if URL matches prior state
            eq_(local_target_cfg('push'), 'master')

        # again, by explicitly passing urls. Since we are on localhost, the
        # local path should work:
        cpkwargs = dict(
            dataset=source,
            name="local_target",
            sshurl="ssh://localhost",
            target_dir=target_path,
            target_url=target_path,
            target_pushurl="ssh://localhost" + target_path,
            ui=True,
        )
        assert_create_sshwebserver(existing='replace', **cpkwargs)
        if src_is_annex:
            target_description = AnnexRepo(target_path,
                                           create=False).get_description()
            eq_(target_description, target_path)

        eq_(target_path, source.repo.get_remote_url("local_target"))
        eq_("ssh://localhost" + target_path,
            source.repo.get_remote_url("local_target", push=True))

        _test_correct_publish(target_path)

        # now, push should work:
        publish(dataset=source, to="local_target")

        # and we should be able to 'reconfigure'
        def process_digests_mtimes(digests, mtimes):
            # it should have triggered a hook, which would have created log and metadata files
            check_metadata = False
            for part in 'logs', 'metadata':
                metafiles = [
                    k for k in digests
                    if k.startswith(_path_('.git/datalad/%s/' % part))
                ]
                # This is in effect ONLY if we have "compatible" datalad installed on remote
                # end. ATM we don't have easy way to guarantee that AFAIK (yoh),
                # so let's not check/enforce (TODO)
                # assert(len(metafiles) >= 1)  # we might have 2 logs if timestamps do not collide ;)
                # Let's actually do it to some degree
                if part == 'logs':
                    # always should have those:
                    assert (len(metafiles) >= 1)
                    with open(opj(target_path, metafiles[0])) as f:
                        if 'no datalad found' not in f.read():
                            check_metadata = True
                if part == 'metadata':
                    eq_(len(metafiles), bool(check_metadata))
                for f in metafiles:
                    digests.pop(f)
                    mtimes.pop(f)
            # and just pop some leftovers from annex
            for f in list(digests):
                if f.startswith('.git/annex/mergedrefs'):
                    digests.pop(f)
                    mtimes.pop(f)

        orig_digests, orig_mtimes = get_mtimes_and_digests(target_path)
        process_digests_mtimes(orig_digests, orig_mtimes)

        import time
        time.sleep(0.1)  # just so that mtimes change
        assert_create_sshwebserver(existing='reconfigure', **cpkwargs)
        digests, mtimes = get_mtimes_and_digests(target_path)
        process_digests_mtimes(digests, mtimes)

        assert_dict_equal(orig_digests,
                          digests)  # nothing should change in terms of content

        # but some files should have been modified
        modified_files = {
            k
            for k in mtimes if orig_mtimes.get(k, 0) != mtimes.get(k, 0)
        }
        # collect which files were expected to be modified without incurring any changes
        ok_modified_files = {
            _path_('.git/hooks/post-update'),
            'index.html',
            # files which hook would manage to generate
            _path_('.git/info/refs'),
            '.git/objects/info/packs'
        }
        # on elderly git we don't change receive setting
        ok_modified_files.add(_path_('.git/config'))
        ok_modified_files.update(
            {f
             for f in digests if f.startswith(_path_('.git/datalad/web'))})
        # it seems that with some recent git behavior has changed a bit
        # and index might get touched
        if _path_('.git/index') in modified_files:
            ok_modified_files.add(_path_('.git/index'))
        assert_set_equal(modified_files, ok_modified_files)
Example 15
def test_target_ssh_simple(origin, src_path, target_rootpath):

    # prepare src
    source = install(src_path, source=origin)

    target_path = opj(target_rootpath, "basic")
    # it will try to fetch it so would fail as well since sshurl is wrong
    with swallow_logs(new_level=logging.ERROR) as cml, \
        assert_raises(GitCommandError):
        create_sibling(dataset=source,
                       target="local_target",
                       sshurl="ssh://localhost",
                       target_dir=target_path,
                       ui=True)
    # is not actually happening on one of the two basic cases -- TODO figure it out
    # assert_in('enableremote local_target failed', cml.out)

    GitRepo(target_path, create=False)  # raises if not a git repo
    assert_in("local_target", source.repo.get_remotes())
    eq_("ssh://localhost", source.repo.get_remote_url("local_target"))
    # should NOT be able to push now, since url isn't correct:
    # TODO:  assumption is wrong if ~ does have .git! fix up!
    assert_raises(GitCommandError, publish, dataset=source, to="local_target")

    # Both must be annex or git repositories
    src_is_annex = AnnexRepo.is_valid_repo(src_path)
    eq_(src_is_annex, AnnexRepo.is_valid_repo(target_path))
    # And target one should be known to have a known UUID within the source if annex
    if src_is_annex:
        annex = AnnexRepo(src_path)
        local_target_cfg = annex.repo.remotes["local_target"].config_reader.get
        # for some reason this was "correct"
        # eq_(local_target_cfg('annex-ignore'), 'false')
        # but after fixing creating siblings in
        # 21f6dd012b2c7b9c0b8b348dcfb3b0ace7e8b2ec it started to fail
        # I think it is legit since we are trying to fetch now before calling
        # annex.enable_remote so it doesn't set it up, and fails before
        assert_raises(Exception, local_target_cfg, 'annex-ignore')
        # hm, but ATM wouldn't get a uuid since url is wrong
        assert_raises(Exception, local_target_cfg, 'annex-uuid')

    # do it again without force:
    with assert_raises(RuntimeError) as cm:
        assert_create_sshwebserver(dataset=source,
                                   target="local_target",
                                   sshurl="ssh://localhost",
                                   target_dir=target_path)
    eq_("Target directory %s already exists." % target_path, str(cm.exception))
    if src_is_annex:
        target_description = AnnexRepo(target_path,
                                       create=False).get_description()
        assert_not_equal(target_description, None)
        assert_not_equal(target_description, target_path)
        ok_endswith(target_description, target_path)
    # now, with force and correct url, which is also used to determine
    # target_dir
    # Note: on windows absolute path is not url conform. But this way it's easy
    # to test, that ssh path is correctly used.
    if not on_windows:
        # add random file under target_path, to explicitly test existing=replace
        open(opj(target_path, 'random'), 'w').write('123')

        assert_create_sshwebserver(dataset=source,
                                   target="local_target",
                                   sshurl="ssh://localhost" + target_path,
                                   existing='replace')
        eq_("ssh://localhost" + target_path,
            source.repo.get_remote_url("local_target"))
        ok_(source.repo.get_remote_url("local_target", push=True) is None)

        # ensure target tree actually replaced by source
        assert_false(exists(opj(target_path, 'random')))

        if src_is_annex:
            annex = AnnexRepo(src_path)
            local_target_cfg = annex.repo.remotes[
                "local_target"].config_reader.get
            eq_(local_target_cfg('annex-ignore'), 'false')
            eq_(local_target_cfg('annex-uuid').count('-'), 4)  # valid uuid

        # again, by explicitly passing urls. Since we are on localhost, the
        # local path should work:
        cpkwargs = dict(
            dataset=source,
            target="local_target",
            sshurl="ssh://localhost",
            target_dir=target_path,
            target_url=target_path,
            target_pushurl="ssh://localhost" + target_path,
            ui=True,
        )
        assert_create_sshwebserver(existing='replace', **cpkwargs)
        if src_is_annex:
            target_description = AnnexRepo(target_path,
                                           create=False).get_description()
            eq_(target_description, target_path)

        eq_(target_path, source.repo.get_remote_url("local_target"))
        eq_("ssh://localhost" + target_path,
            source.repo.get_remote_url("local_target", push=True))

        _test_correct_publish(target_path)

        # now, push should work:
        publish(dataset=source, to="local_target")

        # and we should be able to 'reconfigure'
        def process_digests_mtimes(digests, mtimes):
            # it should have triggered a hook, which would have created log and metadata files
            check_metadata = False
            for part in 'logs', 'metadata':
                metafiles = [
                    k for k in digests
                    if k.startswith(_path_('.git/datalad/%s/' % part))
                ]
                # This is in effect ONLY if we have "compatible" datalad installed on remote
                # end. ATM we don't have easy way to guarantee that AFAIK (yoh),
                # so let's not check/enforce (TODO)
                # assert(len(metafiles) >= 1)  # we might have 2 logs if timestamps do not collide ;)
                # Let's actually do it to some degree
                if part == 'logs':
                    # always should have those:
                    assert (len(metafiles) >= 1)
                    with open(opj(target_path, metafiles[0])) as f:
                        if 'no datalad found' not in f.read():
                            check_metadata = True
                if part == 'metadata':
                    eq_(len(metafiles), bool(check_metadata))
                for f in metafiles:
                    digests.pop(f)
                    mtimes.pop(f)
            # and just pop some leftovers from annex
            for f in list(digests):
                if f.startswith('.git/annex/mergedrefs'):
                    digests.pop(f)
                    mtimes.pop(f)

        orig_digests, orig_mtimes = get_mtimes_and_digests(target_path)
        process_digests_mtimes(orig_digests, orig_mtimes)

        import time
        time.sleep(0.1)  # just so that mtimes change
        assert_create_sshwebserver(existing='reconfigure', **cpkwargs)
        digests, mtimes = get_mtimes_and_digests(target_path)
        process_digests_mtimes(digests, mtimes)

        assert_dict_equal(orig_digests,
                          digests)  # nothing should change in terms of content

        # but some files should have been modified
        modified_files = {
            k
            for k in mtimes if orig_mtimes.get(k, 0) != mtimes.get(k, 0)
        }
        # collect which files were expected to be modified without incurring any changes
        ok_modified_files = {
            _path_('.git/hooks/post-update'),
            'index.html',
            # files which hook would manage to generate
            _path_('.git/info/refs'),
            '.git/objects/info/packs'
        }
        if external_versions['cmd:system-git'] >= '2.4':
            # on elderly git we don't change receive setting
            ok_modified_files.add(_path_('.git/config'))
        ok_modified_files.update(
            {f
             for f in digests if f.startswith(_path_('.git/datalad/web'))})
        assert_set_equal(modified_files, ok_modified_files)
Example 16
    def repo(self):
        """Get an instance of the version control system/repo for this dataset,
        or None if there is none yet (or none anymore).

        If testing the validity of an instance of GitRepo is guaranteed to be
        really cheap, this could also serve as a test of whether a repo is
        present.

        Note that this property is evaluated every time it is used. If used
        multiple times within a function, it's probably a good idea to store
        its value in a local variable and use that variable instead.

        Returns
        -------
        GitRepo or AnnexRepo
        """

        # If we already got a *Repo instance, check whether it's still valid;
        # Note that this basically does part of the testing that would
        # (implicitly) be done in the loop below again. So, there's still
        # potential to speed up when we actually need to get a new instance
        # (or none). But it's still faster for the vast majority of cases.
        #
        # TODO: Dig deeper into it and merge with the new instance guessing.
        # This should also involve reducing the redundancy of testing such
        # things from within Flyweight.__call__, AnnexRepo.__init__ and
        # GitRepo.__init__!
        #
        # Also note that this could be forged into a single big condition, but
        # that would be hard to read, and we should be well aware of the actual
        # criteria here:
        if self._repo is not None and realpath(self.path) == self._repo.path:
            # we got a repo and path references still match
            if isinstance(self._repo, AnnexRepo):
                # it's supposed to be an annex
                if self._repo is AnnexRepo._unique_instances.get(
                        self._repo.path, None) and \
                        AnnexRepo.is_valid_repo(self._repo.path,
                                                allow_noninitialized=True):
                    # it's still the object registered as flyweight and it's a
                    # valid annex repo
                    return self._repo
            elif isinstance(self._repo, GitRepo):
                # it's supposed to be a plain git
                if self._repo is GitRepo._unique_instances.get(
                        self._repo.path, None) and \
                        GitRepo.is_valid_repo(self._repo.path) and not \
                        self._repo.is_with_annex():
                    # it's still the object registered as flyweight, it's a
                    # valid git repo and it hasn't turned into an annex
                    return self._repo

        # Note: Although it looks like the "self._repo = None" assignments
        # could be used instead of variable "valid", that's a big difference!
        # The *Repo instances are flyweights, not singletons. self._repo might
        # be the last reference, which would lead to those objects being
        # destroyed and therefore the constructor call would result in an
        # actually new instance. This is unnecessarily costly.
        valid = False
        for cls, ckw, kw in (
                # TODO: Do we really want to allow_noninitialized=True here?
                # And if so, leave a proper comment!
                (AnnexRepo, {'allow_noninitialized': True}, {'init': False}),
                (GitRepo, {}, {})
        ):
            if cls.is_valid_repo(self._path, **ckw):
                try:
                    lgr.log(5, "Detected %s at %s", cls, self._path)
                    self._repo = cls(self._path, create=False, **kw)
                    valid = True
                    break
                except (InvalidGitRepositoryError, NoSuchPathError,
                        InvalidAnnexRepositoryError) as exc:
                    lgr.log(5,
                            "Oops -- guess on repo type was wrong?: %s",
                            exc_str(exc))

        if not valid:
            self._repo = None

        if self._repo is None:
            # Often .repo is requested to 'sense' if anything is installed
            # under, and if so -- to proceed forward. Thus log here only
            # at DEBUG level and, if necessary, "complain upstairs"
            lgr.log(5, "Failed to detect a valid repo at %s", self.path)
        elif due.active:
            # Makes sense only on installed dataset - @never_fail'ed
            duecredit_dataset(self)

        return self._repo
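
As the docstring advises, the property is re-evaluated on every access and may return None, so callers typically bind it once. An illustrative sketch (the dataset path is made up):

from datalad.api import Dataset

ds = Dataset("/tmp/my-dataset")
repo = ds.repo                    # evaluate the property once and reuse the result
if repo is None:
    print("nothing installed at", ds.path)
else:
    print(len(repo.get_indexed_files()), "files in the index")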
Example 17
    def repo(self):
        """Get an instance of the version control system/repo for this dataset,
        or None if there is none yet (or none anymore).

        If testing the validity of an instance of GitRepo is guaranteed to be
        really cheap, this could also serve as a test of whether a repo is
        present.

        Note that this property is evaluated every time it is used. If used
        multiple times within a function, it's probably a good idea to store
        its value in a local variable and use that variable instead.

        Returns
        -------
        GitRepo or AnnexRepo
        """

        # If we already got a *Repo instance, check whether it's still valid;
        # Note that this basically does part of the testing that would
        # (implicitly) be done in the loop below again. So, there's still
        # potential to speed up when we actually need to get a new instance
        # (or none). But it's still faster for the vast majority of cases.
        #
        # TODO: Dig deeper into it and merge with the new instance guessing.
        # This should also involve reducing the redundancy of testing such
        # things from within Flyweight.__call__, AnnexRepo.__init__ and
        # GitRepo.__init__!
        #
        # Also note that this could be forged into a single big condition, but
        # that would be hard to read, and we should be well aware of the actual
        # criteria here:
        if self._repo is not None and realpath(self.path) == self._repo.path:
            # we got a repo and path references still match
            if isinstance(self._repo, AnnexRepo):
                # it's supposed to be an annex
                if self._repo is AnnexRepo._unique_instances.get(
                        self._repo.path, None) and \
                        AnnexRepo.is_valid_repo(self._repo.path,
                                                allow_noninitialized=True):
                    # it's still the object registered as flyweight and it's a
                    # valid annex repo
                    return self._repo
            elif isinstance(self._repo, GitRepo):
                # it's supposed to be a plain git
                if self._repo is GitRepo._unique_instances.get(
                        self._repo.path, None) and \
                        GitRepo.is_valid_repo(self._repo.path) and not \
                        self._repo.is_with_annex():
                    # it's still the object registered as flyweight, it's a
                    # valid git repo and it hasn't turned into an annex
                    return self._repo

        # Note: Although it looks like the "self._repo = None" assignments
        # could be used instead of variable "valid", that's a big difference!
        # The *Repo instances are flyweights, not singletons. self._repo might
        # be the last reference, which would lead to those objects being
        # destroyed and therefore the constructor call would result in an
        # actually new instance. This is unnecessarily costly.
        valid = False
        for cls, ckw, kw in (
                # TODO: Do we really want to allow_noninitialized=True here?
                # And if so, leave a proper comment!
                (AnnexRepo, {'allow_noninitialized': True}, {'init': False}),
                (GitRepo, {}, {})
        ):
            if cls.is_valid_repo(self._path, **ckw):
                try:
                    lgr.log(5, "Detected %s at %s", cls, self._path)
                    self._repo = cls(self._path, create=False, **kw)
                    valid = True
                    break
                except (InvalidGitRepositoryError, NoSuchPathError,
                        InvalidAnnexRepositoryError) as exc:
                    lgr.log(5,
                            "Oops -- guess on repo type was wrong?: %s",
                            exc_str(exc))

        if not valid:
            self._repo = None

        if self._repo is None:
            # Often .repo is requested to 'sense' if anything is installed
            # under, and if so -- to proceed forward. Thus log here only
            # at DEBUG level and, if necessary, "complain upstairs"
            lgr.log(5, "Failed to detect a valid repo at %s", self.path)

        return self._repo
Example 18
def test_install_simple_local(src_repo=None, path=None, *, type_):

    src_ds = Dataset(src_repo).create(result_renderer='disabled',
                                      force=True,
                                      annex=(type_ == "annex"))
    src_ds.save(['INFO.txt', 'test.dat'], to_git=True)
    if type_ == 'annex':
        src_ds.save('test-annex.dat', to_git=False)
    elif type_ == 'git':
        pass
    else:
        raise ValueError("'type' must be 'git' or 'annex'")
    # equivalent repo on github:
    url = "https://github.com/datalad/testrepo--basic--r1.git"
    sources = [
        src_ds.path,
        get_local_file_url(src_ds.path, compatibility='git')
    ]
    if not dl_cfg.get('datalad.tests.nonetwork'):
        sources.append(url)

    for src in sources:
        origin = Dataset(path)

        # now install it somewhere else
        ds = install(path, source=src, description='mydummy')
        eq_(ds.path, path)
        ok_(ds.is_installed())
        if not isinstance(origin.repo, AnnexRepo):
            # this means it is a GitRepo
            ok_(isinstance(origin.repo, GitRepo))
            # stays plain Git repo
            ok_(isinstance(ds.repo, GitRepo))
            ok_(not isinstance(ds.repo, AnnexRepo))
            ok_(GitRepo.is_valid_repo(ds.path))
            files = ds.repo.get_indexed_files()
            assert_in('test.dat', files)
            assert_in('INFO.txt', files)
            assert_repo_status(path, annex=False)
        else:
            # must be an annex
            ok_(isinstance(ds.repo, AnnexRepo))
            ok_(AnnexRepo.is_valid_repo(ds.path, allow_noninitialized=False))
            files = ds.repo.get_indexed_files()
            assert_in('test.dat', files)
            assert_in('INFO.txt', files)
            assert_in('test-annex.dat', files)
            assert_repo_status(path, annex=True)
            # no content was installed:
            ok_(not ds.repo.file_has_content('test-annex.dat'))
            uuid_before = ds.repo.uuid
            ok_(uuid_before)  # we actually have a uuid
            eq_(ds.repo.get_description(), 'mydummy')

        # installing it again, shouldn't matter:
        res = install(path, source=src, result_xfm=None, return_type='list')
        assert_status('notneeded', res)
        ok_(ds.is_installed())
        if isinstance(origin.repo, AnnexRepo):
            eq_(uuid_before, ds.repo.uuid)

        # cleanup before next iteration
        rmtree(path)
Example 19
def get_cached_dataset(url, version=None, keys=None):
    """ Helper to get a cached clone from url

    Intended for use from within the `cached_dataset` and `cached_url` decorators.
    Clones `url` into the user's cache under datalad/tests/`name`. If such a clone
    already exists, don't clone again but return the existing one. So it's meant
    to cache the original source in order to reduce time and traffic for tests,
    by letting subsequent requests clone from a local location directly.

    If it's an annex, get the content as provided by `keys`, too.
    Note that, as a transparent cache replacing the repo at URL from the POV of
    a test, we can't address content via paths, since those are valid only with
    respect to a particular worktree. If different tests clone from the same
    cached dataset, each requesting different versions and different paths
    thereof, we run into trouble if the cache itself checks out a particular
    requested version.

    Verifies that `version` can be checked out, but doesn't actually do it,
    since the cached dataset is intended to be used as origin instead of the
    original remote at URL by the `cached_dataset` test decorator. Checkout of
    a particular version should happen in its clone.

    Parameters
    ----------
    url: str
        URL to clone from
    keys: str or list or None
        (list of) annex keys to get content for.
    version: str or None
        A commit or an object that can be dereferenced to one.

    Returns
    -------
    Dataset
    """

    # TODO: What about recursive? Might be complicated. We would need to make
    #       sure we can recursively clone _from_ here then, potentially
    #       requiring submodule URL rewrites. Not sure about that ATM.

    # TODO: Given that it is supposed to be a cache for the original repo at
    #       `url`, we prob. should make this a bare repository. We don't need
    #       a potentially expensive checkout here. Need to double check
    #       `annex-get --key` in bare repos, though. Plus datalad-clone doesn't
    #       have --bare yet. But we want all the annex/special-remote/ria magic
    #       of datalad. So, plain git-clone --bare is not an option.

    if not DATALAD_TESTS_CACHE:
        raise ValueError("Caching disabled by config")

    ds = Dataset(DATALAD_TESTS_CACHE / url2filename(url))

    if not ds.is_installed():
        ds = Clone()(url, ds.pathobj)

    # When/How to update a dataset in cache? If version is a commit SHA and we
    # have it, there's no need for an update. Otherwise it gets tricky, because
    # this is a cache, not a checkout a test would operate on. It needs to
    # behave as if it was the thing at `url` from the point of view of the test
    # using it (cloning/getting content from here). We would need to update all
    # references, not just fetch them!
    #
    # Can we even (cheaply) tell whether `version` is an absolute reference
    # (actual SHA, not a branch/tag)?
    #
    # NOTE: - consider git-clone --mirror, but as w/ --bare: not an option for
    #         datalad-clone yet.
    #       - --reference[-if-able] might also be worth thinking about for
    #         the clone @cached_dataset creates wrt the clone in the cache
    #
    # So, for now fetch, figure whether there actually was something to fetch
    # and if so simply invalidate cache and re-clone/get. Don't overcomplicate
    # things. It's about datasets used in the tests - they shouldn't change too
    # frequently.
    elif any('uptodate' not in c['operations']
             for c in ds.repo.fetch(DEFAULT_REMOTE)):
        rmtree(ds.path)
        ds = Clone()(url, ds.pathobj)

    if version:
        # check whether version is available
        assert ds.repo.commit_exists(version)
    if keys and AnnexRepo.is_valid_repo(ds.path):
        ds.repo.get(keys, key=True)

    return ds
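
A minimal usage sketch of this helper, assuming the test cache is enabled (DATALAD_TESTS_CACHE set); the URL is the test repository already used in Example 18:

url = "https://github.com/datalad/testrepo--basic--r1.git"
ds = get_cached_dataset(url)   # clone into the user's cache, or reuse an existing cache clone
assert ds.is_installed()
# annexed content can be pre-fetched by key (not by path -- see the docstring above):
# get_cached_dataset(url, keys=["<annex key>"])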
Example 20
def test_target_ssh_simple(origin, src_path, target_rootpath):

    # prepare src
    source = install(
        src_path, source=origin,
        result_xfm='datasets', return_type='item-or-list')

    target_path = opj(target_rootpath, "basic")
    with swallow_logs(new_level=logging.ERROR) as cml:
        create_sibling(
            dataset=source,
            name="local_target",
            sshurl="ssh://localhost",
            target_dir=target_path,
            ui=True)
        assert_not_in('enableremote local_target failed', cml.out)

    GitRepo(target_path, create=False)  # raises if not a git repo
    assert_in("local_target", source.repo.get_remotes())
    # Both must be annex or git repositories
    src_is_annex = AnnexRepo.is_valid_repo(src_path)
    eq_(src_is_annex, AnnexRepo.is_valid_repo(target_path))
    # And target one should be known to have a known UUID within the source if annex
    if src_is_annex:
        annex = AnnexRepo(src_path)
        local_target_cfg = annex.repo.remotes["local_target"].config_reader.get
        # basic config in place
        eq_(local_target_cfg('annex-ignore'), 'false')
        ok_(local_target_cfg('annex-uuid'))

    # do it again without force, but use a different name to avoid initial checks
    # for existing remotes:
    with assert_raises(RuntimeError) as cm:
        assert_create_sshwebserver(
            dataset=source,
            name="local_target_alt",
            sshurl="ssh://localhost",
            target_dir=target_path)
    ok_(text_type(cm.exception).startswith(
        "Target path %s already exists. And it fails to rmdir" % target_path))
    if src_is_annex:
        target_description = AnnexRepo(target_path, create=False).get_description()
        assert_not_equal(target_description, None)
        assert_not_equal(target_description, target_path)
        # on yoh's laptop TMPDIR is under HOME, so things start to become
        # tricky because target_path then gets shortened and we would need to
        # know the remote $HOME.  To not over-complicate things and still test
        # something, check only the basename of target_path
        ok_endswith(target_description, basename(target_path))
    # now, with force and correct url, which is also used to determine
    # target_dir
    # Note: on Windows an absolute path is not URL-conformant. But this way it
    # is easy to test that the ssh path is used correctly.
    if not on_windows:
        # add random file under target_path, to explicitly test existing=replace
        with open(opj(target_path, 'random'), 'w') as f:
            f.write('123')

        assert_create_sshwebserver(
            dataset=source,
            name="local_target",
            sshurl="ssh://localhost" + target_path,
            publish_by_default='master',
            existing='replace')
        eq_("ssh://localhost" + urlquote(target_path),
            source.repo.get_remote_url("local_target"))
        ok_(source.repo.get_remote_url("local_target", push=True) is None)

        # ensure target tree actually replaced by source
        assert_false(exists(opj(target_path, 'random')))

        if src_is_annex:
            annex = AnnexRepo(src_path)
            local_target_cfg = annex.repo.remotes["local_target"].config_reader.get
            eq_(local_target_cfg('annex-ignore'), 'false')
            eq_(local_target_cfg('annex-uuid').count('-'), 4)  # valid uuid
            # should be added too, even if URL matches prior state
            eq_(local_target_cfg('push'), 'master')

        # again, by explicitly passing urls. Since we are on localhost, the
        # local path should work:
        cpkwargs = dict(
            dataset=source,
            name="local_target",
            sshurl="ssh://localhost",
            target_dir=target_path,
            target_url=target_path,
            target_pushurl="ssh://localhost" + target_path,
            ui=True,
        )
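        # target_url becomes the remote's regular (fetch) URL, while
        # target_pushurl becomes its push URL - the get_remote_url()
        # assertions further below check exactly that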
        assert_create_sshwebserver(existing='replace', **cpkwargs)
        if src_is_annex:
            target_description = AnnexRepo(target_path,
                                           create=False).get_description()
            eq_(target_description, target_path)

        eq_(target_path,
            source.repo.get_remote_url("local_target"))
        eq_("ssh://localhost" + target_path,
            source.repo.get_remote_url("local_target", push=True))

        _test_correct_publish(target_path)

        # now, push should work:
        publish(dataset=source, to="local_target")

        # and we should be able to 'reconfigure'
        def process_digests_mtimes(digests, mtimes):
            # it should have triggered a hook, which would have created log and metadata files
            check_metadata = False
            for part in 'logs', 'metadata':
                metafiles = [k for k in digests if k.startswith(_path_('.git/datalad/%s/' % part))]
                # This is in effect ONLY if we have a "compatible" datalad
                # installed on the remote end. ATM we don't have an easy way to
                # guarantee that AFAIK (yoh), so let's not check/enforce it (TODO)
                # assert(len(metafiles) >= 1)  # we might have 2 logs if timestamps do not collide ;)
                # Let's actually do it to some degree
                if part == 'logs':
                    # always should have those:
                    assert (len(metafiles) >= 1)
                    with open(opj(target_path, metafiles[0])) as f:
                        if 'no datalad found' not in f.read():
                            check_metadata = True
                if part == 'metadata':
                    eq_(len(metafiles), bool(check_metadata))
                for f in metafiles:
                    digests.pop(f)
                    mtimes.pop(f)
            # and just pop some leftovers from annex
            for f in list(digests):
                if f.startswith('.git/annex/mergedrefs'):
                    digests.pop(f)
                    mtimes.pop(f)

        orig_digests, orig_mtimes = get_mtimes_and_digests(target_path)
        process_digests_mtimes(orig_digests, orig_mtimes)

        import time
        time.sleep(0.1)  # just so that mtimes change
        assert_create_sshwebserver(existing='reconfigure', **cpkwargs)
        digests, mtimes = get_mtimes_and_digests(target_path)
        process_digests_mtimes(digests, mtimes)

        assert_dict_equal(orig_digests, digests)  # nothing should change in terms of content

        # but some files should have been modified
        modified_files = {k for k in mtimes if orig_mtimes.get(k, 0) != mtimes.get(k, 0)}
        # collect the files that are expected to get new mtimes without any
        # change in content
        ok_modified_files = {
            _path_('.git/hooks/post-update'), 'index.html',
            # files which hook would manage to generate
            _path_('.git/info/refs'), '.git/objects/info/packs'
        }
        # on older git versions we don't change the receive setting
        ok_modified_files.add(_path_('.git/config'))
        ok_modified_files.update({f for f in digests if f.startswith(_path_('.git/datalad/web'))})
        # it seems that with some recent git versions the behavior has changed
        # a bit and the index might get touched
        if _path_('.git/index') in modified_files:
            ok_modified_files.add(_path_('.git/index'))
        assert_set_equal(modified_files, ok_modified_files)
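
The basic create_sibling call exercised above might look roughly like this outside of the test harness (a sketch only; it assumes the standard datalad.api entry point, and the dataset path and target directory are placeholders):

from datalad.api import create_sibling

# set up an SSH-reachable sibling named "local_target" with the web UI enabled;
# '/path/to/dataset' and '/tmp/basic' are placeholder paths
create_sibling(
    dataset='/path/to/dataset',
    name='local_target',
    sshurl='ssh://localhost',
    target_dir='/tmp/basic',
    ui=True,
)
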
def test_get_cached_dataset(cache_dir):

    # patch DATALAD_TESTS_CACHE so that the test of that very cache does not
    # use the actual cache.
    cache_dir = Path(cache_dir)

    # store file-based values for testrepo-minimalds for readability:
    annexed_file = opj('inannex', 'animated.gif')
    annexed_file_key = "MD5E-s144625--4c458c62b7ac8ec8e19c8ff14b2e34ad.gif"

    with patch(CACHE_PATCH_STR, new=cache_dir):

        # tuples to test (url, version, keys, class):
        test_cases = [

            # a simple testrepo
            ("https://github.com/datalad/testrepo--minimalds",
             "541cf855d13c2a338ff2803d4488daf0035e568f", None, AnnexRepo),
            # Same repo, but request a key to be present. This should work
            # with a subsequent call, although the first one did not request
            # any:
            ("https://github.com/datalad/testrepo--minimalds",
             "9dd8b56cc706ab56185f2ceb75fbe9de9b606724", annexed_file_key,
             AnnexRepo),
            # Same repo again, but invalid version
            (
                "https://github.com/datalad/testrepo--minimalds",
                "nonexistent",
                "irrelevantkey",  # invalid version; don't even try to get the key
                AnnexRepo),
            # same thing with different name should be treated as a new thing:
            ("https://github.com/datalad/testrepo--minimalds", "git-annex",
             None, AnnexRepo),
            # try a plain git repo to make sure we can deal with that:
            # Note that we first need a test case w/o a `keys` parameter, so as
            # not to blow up the test when Clone is patched, which results in a
            # MagicMock instead of a Dataset instance within get_cached_dataset.
            # In the second case the dataset is already cached, so the patched
            # Clone is never executed.
            ("https://github.com/datalad/datalad.org", None, None, GitRepo),
            (
                "https://github.com/datalad/datalad.org",
                "gh-pages",
                "ignored-key",  # it's a git repo; don't even try to get a key
                GitRepo),
        ]
        for url, version, keys, cls in test_cases:
            target = cache_dir / url2filename(url)

            # assuming it doesn't exist yet - patched cache dir!
            in_cache_before = target.exists()
            with patch(CLONE_PATCH_STR) as exec_clone:
                try:
                    ds = get_cached_dataset(url, version, keys)
                    invalid_version = False
                except AssertionError:
                    # should happen only if `version` wasn't found. Implies
                    # that the dataset exists in cache (although not returned
                    # due to exception)
                    assert_true(version)
                    assert_false(Dataset(target).repo.commit_exists(version))
                    # mark for later assertions (most of them should still hold
                    # true)
                    invalid_version = True

            assert_equal(exec_clone.call_count, 0 if in_cache_before else 1)

            # The patch prevents actual execution. Now do it for real. Note
            # that this might be necessary for content retrieval even if the
            # dataset was in the cache before.
            try:
                ds = get_cached_dataset(url, version, keys)
            except AssertionError:
                # see previous call
                assert_true(invalid_version)

            assert_is_instance(ds, Dataset)
            assert_true(ds.is_installed())
            assert_equal(target, ds.pathobj)
            assert_is_instance(ds.repo, cls)

            if keys and not invalid_version and \
                    AnnexRepo.is_valid_repo(ds.path):
                # Note: it's not supposed to get that content if the passed
                # `version` wasn't available. get_cached_dataset would then
                # raise earlier rather than download anything only to raise
                # afterwards.
                here = ds.config.get("annex.uuid")
                where = ds.repo.whereis(ensure_list(keys), key=True)
                assert_true(all(here in remotes for remotes in where))

            # version check. Note that all `get_cached_dataset` is supposed to
            # do is verify that the specified version exists - NOT check it
            # out
            if version and not invalid_version:
                assert_true(ds.repo.commit_exists(version))

            # re-execution
            with patch(CLONE_PATCH_STR) as exec_clone:
                try:
                    ds2 = get_cached_dataset(url, version, keys)
                except AssertionError:
                    assert_true(invalid_version)
            exec_clone.assert_not_called()
            # returns the same Dataset as before:
            assert_is(ds, ds2)