Example #1
def test_newthings_coming_down(originpath=None, destpath=None):
    origin = GitRepo(originpath, create=True)
    create_tree(originpath, {'load.dat': 'heavy'})
    Dataset(originpath).save('load.dat')
    ds = install(source=originpath,
                 path=destpath,
                 result_xfm='datasets',
                 return_type='item-or-list')
    assert_is_instance(ds.repo, GitRepo)
    assert_in(DEFAULT_REMOTE, ds.repo.get_remotes())
    # turn origin into an annex
    origin = AnnexRepo(originpath, create=True)
    # clone doesn't know yet
    assert_false(knows_annex(ds.path))
    # but after an update it should
    # no merge; with only one sibling, no parameters should be specific enough
    assert_result_count(ds.update(), 1, status='ok', type='dataset')
    ok_(knows_annex(ds.path))
    # no branches appeared
    eq_(ds.repo.get_branches(), [DEFAULT_BRANCH])
    # now merge, and get an annex
    assert_result_count(ds.update(merge=True),
                        1,
                        action='update',
                        status='ok',
                        type='dataset')
    assert_in('git-annex', ds.repo.get_branches())
    assert_is_instance(ds.repo, AnnexRepo)
    # should be fully functional
    testfname = opj(ds.path, 'load.dat')
    assert_false(ds.repo.file_has_content(testfname))
    ds.get('.')
    ok_file_has_content(opj(ds.path, 'load.dat'), 'heavy')
    # check that a new tag comes down
    origin.tag('first!')
    assert_result_count(ds.update(), 1, status='ok', type='dataset')
    eq_(ds.repo.get_tags(output='name')[0], 'first!')

    # and now we destroy the remote annex
    origin.call_git(['config', '--remove-section', 'annex'])
    rmtree(opj(origin.path, '.git', 'annex'), chmod_files=True)
    origin.call_git(['branch', '-D', 'git-annex'])
    origin = GitRepo(originpath)
    assert_false(knows_annex(originpath))

    # and update the local clone
    # for now this should simply not fail (see gh-793); later it might be
    # enhanced to perform a graceful downgrade
    before_branches = ds.repo.get_branches()
    ok_(any("git-annex" in b for b in ds.repo.get_remote_branches()))
    assert_result_count(ds.update(), 1, status='ok', type='dataset')
    eq_(before_branches, ds.repo.get_branches())
    # annex branch got pruned
    assert_false(any("git-annex" in b for b in ds.repo.get_remote_branches()))
    # check that a new tag comes down even if repo types mismatch
    origin.tag('second!')
    assert_result_count(ds.update(), 1, status='ok', type='dataset')
    eq_(ds.repo.get_tags(output='name')[-1], 'second!')
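Example #1 hinges on `knows_annex` detecting whether a clone has learned about an annex from its remote. A minimal sketch of such a check using plain git plumbing follows; it is an illustrative stand-in, not datalad's actual `knows_annex` implementation:

import subprocess

def knows_annex_sketch(path):
    # True if the repo at `path` has a local or remote 'git-annex' branch;
    # datalad's real helper may be implemented differently
    out = subprocess.run(
        ['git', '-C', path, 'branch', '--all', '--list'],
        capture_output=True, text=True, check=True).stdout
    # entries look like '* master' or '  remotes/origin/git-annex'
    return any(line.strip().lstrip('* ').split('/')[-1] == 'git-annex'
               for line in out.splitlines())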
Example #2
def test_install_dataset_from_instance(src=None, dst=None):
    origin = Dataset(src).create(result_renderer='disabled', force=True)
    origin.save(['INFO.txt', 'test.dat'], to_git=True)
    origin.save('test-annex.dat', to_git=False)

    clone = install(source=origin, path=dst)

    assert_is_instance(clone, Dataset)
    ok_startswith(clone.path, dst)
    ok_(clone.is_installed())
    ok_(GitRepo.is_valid_repo(clone.path))
    assert_repo_status(clone.path, annex=None)
    assert_in('INFO.txt', clone.repo.get_indexed_files())
# The following two helpers do not belong to the test above: they are nested
# test functions, evidently meant to run under a cached-dataset decorator,
# and rely on `cache_dir`, `name_in_cache`, and `annexed_file` from their
# enclosing scope.
def decorated_test2(ds):
    # we get a Dataset instance
    assert_is_instance(ds, Dataset)
    # it's a clone in a temp. location, not within the cache
    assert_not_in(cache_dir, ds.pathobj.parents)
    assert_result_count(ds.siblings(),
                        1,
                        type="sibling",
                        name=DEFAULT_REMOTE,
                        url=str(cache_dir / name_in_cache))
    here = ds.config.get("annex.uuid")
    origin = ds.config.get(f"remote.{DEFAULT_REMOTE}.annex-uuid")
    where = ds.repo.whereis(str(annexed_file))
    assert_in(here, where)
    assert_in(origin, where)

    return ds.pathobj, ds.repo.pathobj

def decorated_test3(ds):
    # we get a Dataset instance
    assert_is_instance(ds, Dataset)
    # it's a clone in a temp. location, not within the cache
    assert_not_in(cache_dir, ds.pathobj.parents)
    assert_result_count(ds.siblings(),
                        1,
                        type="sibling",
                        name=DEFAULT_REMOTE,
                        url=str(cache_dir / name_in_cache))
    # origin is the same cached dataset that got this content in
    # decorated_test2 before. It should still be there, but "here" we
    # didn't request it.
    here = ds.config.get("annex.uuid")
    origin = ds.config.get(f"remote.{DEFAULT_REMOTE}.annex-uuid")
    where = ds.repo.whereis(str(annexed_file))
    assert_not_in(here, where)
    assert_in(origin, where)

    return ds.pathobj, ds.repo.pathobj
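The two helpers above expect to be handed a fresh clone of a cached dataset. A sketch of a decorator that could drive them, reusing the `get_cached_dataset` helper exercised later in this file; the decorator name and the use of `datalad.api.clone` here are assumptions for illustration, not a confirmed datalad API:

import functools
import tempfile

from datalad.api import clone

def cached_dataset_sketch(url, version=None, keys=None):
    # hypothetical decorator: clone the cached dataset into a throwaway
    # location and pass the clone to the test function;
    # get_cached_dataset is assumed in scope (see the test later in this file)
    def decorator(test_func):
        @functools.wraps(test_func)
        def wrapper(*args, **kwargs):
            cached = get_cached_dataset(url, version, keys)
            with tempfile.TemporaryDirectory() as tmp:
                ds = clone(source=cached.path, path=tmp)
                return test_func(*args, ds, **kwargs)
        return wrapper
    return decorator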
Example #5
def test_ssh_get_connection():

    manager = SSHManager()
    if _ssh_manager_is_multiplex:
        assert manager._socket_dir is None, \
            "Should be unset upon initialization. Got %s" % str(manager._socket_dir)
    c1 = manager.get_connection('ssh://datalad-test')

    if _ssh_manager_is_multiplex:
        assert manager._socket_dir, "Should be set after interactions with the manager"
        assert_is_instance(c1, MultiplexSSHConnection)
        # subsequent call returns the very same instance:
        ok_(manager.get_connection('ssh://datalad-test') is c1)
    else:
        assert_is_instance(c1, NoMultiplexSSHConnection)

    # fail on malformed URLs (meaning: our fancy URL parser can't correctly
    # deal with them):
    #assert_raises(ValueError, manager.get_connection, 'localhost')
    # we now allow such simple host specifications to be passed to get_connection
    c2 = manager.get_connection('datalad-test')
    assert_is_instance(c2, SSHConnection)

    # but should fail if it looks like something else
    assert_raises(ValueError, manager.get_connection, 'datalad-test/')
    assert_raises(ValueError, manager.get_connection, ':datalad-test')

    # we can do what urlparse cannot
    # assert_raises(ValueError, manager.get_connection, 'someone@localhost')
    # next one is considered a proper url by urlparse (netloc='',
    # path='/localhost'), but eventually gets turned into SSHRI(hostname='ssh',
    # path='/localhost') -- which is fair IMHO -> invalid test
    # assert_raises(ValueError, manager.get_connection, 'ssh:/localhost')

    manager.close()
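get_connection() behaves like a per-host cache: the same target yields the very same live connection object, and host strings that look like paths are rejected. A shape-only sketch of that contract, independent of SSHManager's real internals:

class ConnectionCacheSketch:
    # one shared connection object per host; illustrative only
    def __init__(self):
        self._connections = {}

    def get_connection(self, url):
        host = url[len('ssh://'):] if url.startswith('ssh://') else url
        if '/' in host or host.startswith(':'):
            # mirrors the rejections asserted above
            raise ValueError('not a plain host specification: %r' % url)
        # repeated requests return the very same object
        return self._connections.setdefault(host, object())

    def close(self):
        self._connections.clear()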
Example #6
def test_GitRepo_instance_from_not_existing(path=None, path2=None):
    # 1. no init(), and path doesn't exist:
    repo = GitRepo(path)
    assert_false(op.exists(path))

    # 2. no init(); path exists, but is not a git repo:
    os.mkdir(path)
    ok_(op.exists(path))
    repo = GitRepo(path)
    assert_false(op.exists(op.join(path, '.git')))

    # 3. init(); path doesn't exist:
    gr = GitRepo(path2).init()
    assert_is_instance(gr, GitRepo, "GitRepo was not created.")
    ok_(op.exists(op.join(path2, '.git')))
    # re-enable once the core GitRepo has a status() method
    #assert_repo_status(path2, annex=False)

    # 4. init(); path exists, but is not a git repo:
    gr = GitRepo(path).init()
    assert_is_instance(gr, GitRepo, "GitRepo was not created.")
    ok_(op.exists(op.join(path, '.git')))
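The four cases above pin down a constructor/init split: merely instantiating GitRepo must not touch the filesystem, while init() creates the .git directory whether or not the path already exists. A toy sketch of that contract (not the real class):

import os
import subprocess

class LazyInitSketch:
    def __init__(self, path):
        # record the path only; no filesystem side effects here
        self.path = path

    def init(self):
        # creating the directory and running `git init` is deferred to here
        os.makedirs(self.path, exist_ok=True)
        subprocess.run(['git', 'init', self.path],
                       check=True, capture_output=True)
        return self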
Example #7
def test_datalad_credential_helper(path=None):

    ds = Dataset(path).create()

    # tell git to use git-credential-datalad
    ds.config.add('credential.helper', 'datalad', scope='local')
    ds.config.add('datalad.credentials.githelper.noninteractive',
                  'true',
                  scope='global')

    from datalad.downloaders.providers import Providers

    url1 = "https://datalad-test.org/some"
    url2 = "https://datalad-test.org/other"
    provider_name = "datalad-test.org"

    # `Providers` code is old and only considers a dataset root based on PWD
    # for config lookup. The contextmanager below can be removed once the
    # provider/credential system is redesigned.
    with chpwd(ds.path):

        gitcred = GitCredentialInterface(url=url1, repo=ds)

        # There's nothing set up yet; the helper should return empty values
        gitcred.fill()
        eq_(gitcred['username'], '')
        eq_(gitcred['password'], '')

        # store new credentials
        # Note that `Providers.enter_new()` currently uses user-level config
        # files for storage only. TODO: make that an option!
        # To not mess with existing ones, fail if one already exists:

        cfg_file = Path(Providers._get_providers_dirs()['user']) \
                   / f"{provider_name}.cfg"
        assert_false(cfg_file.exists())

        # Make sure we clean up
        from datalad.tests import _TEMP_PATHS_GENERATED
        _TEMP_PATHS_GENERATED.append(str(cfg_file))

        # Give credentials to git and ask it to store them:
        gitcred = GitCredentialInterface(url=url1,
                                         username="dl-user",
                                         password="dl-pwd",
                                         repo=ds)
        gitcred.approve()

        assert_true(cfg_file.exists())
        providers = Providers.from_config_files()
        p1 = providers.get_provider(url=url1, only_nondefault=True)
        assert_is_instance(p1.credential, UserPassword)
        eq_(p1.credential.get('user'), 'dl-user')
        eq_(p1.credential.get('password'), 'dl-pwd')

        # default regex should be host only, so matching url2, too
        p2 = providers.get_provider(url=url2, only_nondefault=True)
        assert_is_instance(p2.credential, UserPassword)
        eq_(p2.credential.get('user'), 'dl-user')
        eq_(p2.credential.get('password'), 'dl-pwd')

        # git, too, should now find it for both URLs
        gitcred = GitCredentialInterface(url=url1, repo=ds)
        gitcred.fill()
        eq_(gitcred['username'], 'dl-user')
        eq_(gitcred['password'], 'dl-pwd')

        gitcred = GitCredentialInterface(url=url2, repo=ds)
        gitcred.fill()
        eq_(gitcred['username'], 'dl-user')
        eq_(gitcred['password'], 'dl-pwd')

        # Rejection must not currently lead to deleting anything, since we would
        # delete too broadly.
        gitcred.reject()
        assert_true(cfg_file.exists())
        gitcred = GitCredentialInterface(url=url1, repo=ds)
        gitcred.fill()
        eq_(gitcred['username'], 'dl-user')
        eq_(gitcred['password'], 'dl-pwd')
        dlcred = UserPassword(name=provider_name)
        eq_(dlcred.get('user'), 'dl-user')
        eq_(dlcred.get('password'), 'dl-pwd')
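Under the hood, GitCredentialInterface speaks git's credential protocol: key=value attributes on stdin terminated by a blank line, driven through `git credential fill|approve|reject`. A bare-bones sketch of that protocol using stock git directly (the helper name `git_credential_sketch` is ours, not datalad's):

import subprocess

def git_credential_sketch(action, host, protocol='https', **attrs):
    # action is one of 'fill', 'approve', 'reject'
    payload = {'protocol': protocol, 'host': host, **attrs}
    stdin = ''.join(f'{k}={v}\n' for k, v in payload.items()) + '\n'
    out = subprocess.run(['git', 'credential', action],
                         input=stdin, capture_output=True, text=True,
                         check=True).stdout
    # fill prints the resolved attributes; approve/reject print nothing
    return dict(line.split('=', 1)
                for line in out.splitlines() if '=' in line)

For example, git_credential_sketch('fill', 'datalad-test.org') would consult the configured helpers, including git-credential-datalad when set up as in the test above.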
Example #8
def test_Dataset_flyweight(path1=None, path2=None):

    import gc
    import sys

    ds1 = Dataset(path1)
    assert_is_instance(ds1, Dataset)
    # Don't create circular references or anything similar
    assert_equal(1, sys.getrefcount(ds1) - 1)

    ds1.create()

    # Due to issue 4862, we currently still require gc.collect() under unclear
    # circumstances to get rid of an exception traceback when creating in an
    # existing directory. That traceback references the respective function
    # frames which in turn reference the repo instance (they are methods).
    # Doesn't happen on all systems, though. Eventually we need to figure that
    # out.
    # However, still test for the refcount after gc.collect() to ensure we don't
    # introduce new circular references and make the issue worse!
    gc.collect()

    # refcount still fine after repo creation:
    assert_equal(1, sys.getrefcount(ds1) - 1)

    # instantiate again:
    ds2 = Dataset(path1)
    assert_is_instance(ds2, Dataset)
    # the very same object:
    ok_(ds1 is ds2)

    # reference the same via relative path:
    with chpwd(path1):
        ds3 = Dataset(relpath(path1, start=path2))
        ok_(ds1 == ds3)
        ok_(ds1 is ds3)

    # gc knows one such object only:
    eq_(
        1,
        len([
            o for o in gc.get_objects()
            if isinstance(o, Dataset) and o.path == path1
        ]))

    # on windows a symlink is not what you think it is
    if not on_windows:
        # reference the same via symlink:
        with chpwd(path2):
            os.symlink(path1, 'linked')
            ds4 = Dataset('linked')
            ds4_id = id(ds4)
            ok_(ds4 == ds1)
            ok_(ds4 is not ds1)

        # underlying repo, however, IS the same:
        ok_(ds4.repo is ds1.repo)

    # deleting one reference has no effect on the other:
    del ds1
    gc.collect()  # TODO: see first comment above
    ok_(ds2 is not None)
    ok_(ds2.repo is ds3.repo)
    if not on_windows:
        ok_(ds2.repo is ds4.repo)

    # deleting remaining references should lead to garbage collection
    del ds2

    with swallow_logs(new_level=1) as cml:
        del ds3
        gc.collect()  # TODO: see first comment above
        # flyweight vanished:
        assert_not_in(path1, Dataset._unique_instances.keys())
        # no such instance known to gc anymore:
        eq_([], [
            o for o in gc.get_objects()
            if isinstance(o, Dataset) and o.path == path1
        ])
        # the underlying repo should only be cleaned up if ds3 was the last
        # reference to it. Otherwise the repo instance should live on
        # (via symlinked ds4):
        finalizer_log = "Finalizer called on: AnnexRepo(%s)" % path1
        if on_windows:
            cml.assert_logged(msg=finalizer_log, level="Level 1", regex=False)
        else:
            assert_not_in(finalizer_log, cml.out)
            # symlinked is still there:
            ok_(ds4 is not None)
            eq_(ds4_id, id(ds4))
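Both this test and the GitRepo variant in Example #11 assert classic flyweight behavior backed by a `_unique_instances` mapping. A minimal sketch of the pattern with weak references, so that dropping the last strong reference removes the cache entry. Keying on realpath here is a simplification: the test above shows Dataset comparing equal but not identical across a symlink, while the underlying repo is identical.

import os.path as op
import weakref

class FlyweightSketch:
    # one live instance per resolved path; entries vanish once the last
    # strong reference is garbage collected
    _unique_instances = weakref.WeakValueDictionary()

    def __new__(cls, path):
        key = op.realpath(op.abspath(path))
        instance = cls._unique_instances.get(key)
        if instance is None:
            instance = super().__new__(cls)
            instance.path = key
            cls._unique_instances[key] = instance
        return instance

FlyweightSketch('some/path') is FlyweightSketch('some/path') then holds by construction, and deleting all references empties _unique_instances, mirroring the assertions above.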
Example #9
def test_get_cached_dataset(cache_dir=None):

    # patch DATALAD_TESTS_CACHE so this test doesn't use the actual cache
    # while testing that very cache.
    cache_dir = Path(cache_dir)

    # store file-based values for testrepo-minimalds for readability:
    annexed_file = opj('inannex', 'animated.gif')
    annexed_file_key = "MD5E-s144625--4c458c62b7ac8ec8e19c8ff14b2e34ad.gif"

    with patch(CACHE_PATCH_STR, new=cache_dir):

        # tuples to test (url, version, keys, class):
        test_cases = [

            # a simple testrepo
            ("https://github.com/datalad/testrepo--minimalds",
             "541cf855d13c2a338ff2803d4488daf0035e568f", None, AnnexRepo),
            # Same repo, but request paths to be present. This should work
            # on a subsequent call, even though the first call did not
            # request any:
            ("https://github.com/datalad/testrepo--minimalds",
             "9dd8b56cc706ab56185f2ceb75fbe9de9b606724", annexed_file_key,
             AnnexRepo),
            # Same repo again, but invalid version
            (
                "https://github.com/datalad/testrepo--minimalds",
                "nonexistent",
                "irrelevantkey",  # invalid version; don't even try to get the key
                AnnexRepo),
            # same thing with different name should be treated as a new thing:
            ("https://github.com/datalad/testrepo--minimalds", "git-annex",
             None, AnnexRepo),
            # try a plain git repo to make sure we can deal with that:
            # Note that we first need a test case w/o a `keys` parameter, so
            # the test doesn't blow up when Clone is patched, which would
            # yield a MagicMock instead of a Dataset instance within
            # get_cached_dataset. In the second case the dataset is already
            # cached, so the patched Clone is never executed.
            ("https://github.com/datalad/datalad.org", None, None, GitRepo),
            (
                "https://github.com/datalad/datalad.org",
                "gh-pages",
                "ignored-key",  # it's a git repo; don't even try to get a key
                GitRepo),
        ]
        for url, version, keys, cls in test_cases:
            target = cache_dir / url2filename(url)

            # assuming it doesn't exist yet - patched cache dir!
            in_cache_before = target.exists()
            with patch(CLONE_PATCH_STR) as exec_clone:
                try:
                    ds = get_cached_dataset(url, version, keys)
                    invalid_version = False
                except AssertionError:
                    # should happen only if `version` wasn't found. Implies
                    # that the dataset exists in cache (although not returned
                    # due to exception)
                    assert_true(version)
                    assert_false(Dataset(target).repo.commit_exists(version))
                    # mark for later assertions (most of them should still hold
                    # true)
                    invalid_version = True

            assert_equal(exec_clone.call_count, 0 if in_cache_before else 1)

            # The patch prevented actual execution. Now do it for real. Note
            # that this might be necessary for content retrieval even if the
            # dataset was in the cache before.
            try:
                ds = get_cached_dataset(url, version, keys)
            except AssertionError:
                # see previous call
                assert_true(invalid_version)

            assert_is_instance(ds, Dataset)
            assert_true(ds.is_installed())
            assert_equal(target, ds.pathobj)
            assert_is_instance(ds.repo, cls)

            if keys and not invalid_version and \
                    AnnexRepo.is_valid_repo(ds.path):
                # Note: it's not supposed to get that content if the passed
                # `version` wasn't available. get_cached_dataset would then
                # raise beforehand rather than download anything only to
                # raise afterwards.
                here = ds.config.get("annex.uuid")
                where = ds.repo.whereis(ensure_list(keys), key=True)
                assert_true(all(here in remotes for remotes in where))

            # version check. Note that all `get_cached_dataset` is supposed
            # to do is verify that the specified version exists - NOT check
            # it out
            if version and not invalid_version:
                assert_true(ds.repo.commit_exists(version))

            # re-execution
            with patch(CLONE_PATCH_STR) as exec_clone:
                try:
                    ds2 = get_cached_dataset(url, version, keys)
                except AssertionError:
                    assert_true(invalid_version)
            exec_clone.assert_not_called()
            # returns the same Dataset as before:
            assert_is(ds, ds2)
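The cache locates clones via url2filename. A plausible sketch of such a mapping, assuming plain percent-encoding so each URL collapses to a single path component; the real helper may normalize differently:

from urllib.parse import quote

def url2filename_sketch(url):
    # quote() with no safe characters escapes '/' and ':' as %XX,
    # producing one flat filename per URL
    return quote(url, safe='')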
Example #10
def test_GitRepo_instance_from_existing(path=None):
    GitRepo(path).init()

    gr = GitRepo(path)
    assert_is_instance(gr, GitRepo, "GitRepo was not created.")
    ok_(op.exists(op.join(path, '.git')))
Example #11
def test_GitRepo_flyweight(path1=None, path2=None):

    import gc
    import sys

    repo1 = GitRepo(path1).init()
    assert_is_instance(repo1, GitRepo)

    # Due to issue 4862, we currently still require gc.collect() under unclear
    # circumstances to get rid of an exception traceback when creating in an
    # existing directory. That traceback references the respective function
    # frames which in turn reference the repo instance (they are methods).
    # Doesn't happen on all systems, though. Eventually we need to figure that
    # out.
    # However, still test for the refcount after gc.collect() to ensure we don't
    # introduce new circular references and make the issue worse!
    gc.collect()

    # As long as we don't reintroduce any circular references or produce
    # garbage during instantiation that isn't picked up immediately, `repo1`
    # should be the only counted reference to this instance.
    # Note that sys.getrefcount counts its own argument and therefore
    # reports one reference too many.
    assert_equal(1, sys.getrefcount(repo1) - 1)

    # instantiate again:
    repo2 = GitRepo(path1).init()
    assert_is_instance(repo2, GitRepo)

    # the very same object:
    ok_(repo1 is repo2)

    # reference the same in a different way:
    with chpwd(path1):
        repo3 = GitRepo(op.relpath(path1, start=path2))

    # it's the same object:
    ok_(repo1 is repo3)

    # and realpath attribute is the same, so they are still equal:
    ok_(repo1 == repo3)

    orig_id = id(repo1)

    # Be sure we have exactly one object in memory:
    assert_equal(1, len([o for o in gc.get_objects()
                         if isinstance(o, GitRepo) and o.pathobj == Path(path1)]))

    # deleting one reference doesn't change anything - we still get the same
    # thing:
    gc.collect()  # TODO: see first comment above
    del repo1
    ok_(repo2 is not None)
    ok_(repo2 is repo3)
    ok_(repo2 == repo3)

    # re-requesting still delivers the same thing:
    repo1 = GitRepo(path1)
    assert_equal(orig_id, id(repo1))

    # killing all references should result in the instance being gc'd, and a
    # re-request should yield a new object:
    del repo1
    del repo2

    # Killing last reference will lead to garbage collection which will call
    # GitRepo's finalizer:
    with swallow_logs(new_level=1) as cml:
        del repo3
        gc.collect()  # TODO: see first comment above
        cml.assert_logged(msg="Finalizer called on: GitRepo(%s)" % path1,
                          level="Level 1",
                          regex=False)

    # Flyweight is gone:
    assert_not_in(path1, GitRepo._unique_instances.keys())
    # gc doesn't know any instance anymore:
    assert_equal([], [o for o in gc.get_objects()
                      if isinstance(o, GitRepo) and o.pathobj == Path(path1)])

    # new object is created on re-request:
    repo1 = GitRepo(path1)
    assert_equal(1, len([o for o in gc.get_objects()
                         if isinstance(o, GitRepo) and o.pathobj == Path(path1)]))
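The finalizer assertions in Examples #8 and #11 require the instance to log at level 1 when it is collected. A minimal sketch of such a hook via weakref.finalize; how GitRepo actually registers its finalizer may differ:

import logging
import weakref

lgr = logging.getLogger('sketch')

class FinalizingRepoSketch:
    def __init__(self, path):
        self.path = path
        # the callback must not reference `self`, or the instance would be
        # kept alive and never finalized
        weakref.finalize(self, lgr.log, 1,
                         "Finalizer called on: GitRepo(%s)", path)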