def test_newthings_coming_down(originpath=None, destpath=None):
    origin = GitRepo(originpath, create=True)
    create_tree(originpath, {'load.dat': 'heavy'})
    Dataset(originpath).save('load.dat')
    ds = install(
        source=originpath, path=destpath,
        result_xfm='datasets', return_type='item-or-list')
    assert_is_instance(ds.repo, GitRepo)
    assert_in(DEFAULT_REMOTE, ds.repo.get_remotes())
    # turn origin into an annex
    origin = AnnexRepo(originpath, create=True)
    # clone doesn't know yet
    assert_false(knows_annex(ds.path))
    # but after an update it should
    # no merge, only one sibling, no parameters should be specific enough
    assert_result_count(ds.update(), 1, status='ok', type='dataset')
    assert knows_annex(ds.path)
    # no branches appeared
    eq_(ds.repo.get_branches(), [DEFAULT_BRANCH])
    # now merge, and get an annex
    assert_result_count(ds.update(merge=True),
                        1, action='update', status='ok', type='dataset')
    assert_in('git-annex', ds.repo.get_branches())
    assert_is_instance(ds.repo, AnnexRepo)
    # should be fully functional
    testfname = opj(ds.path, 'load.dat')
    assert_false(ds.repo.file_has_content(testfname))
    ds.get('.')
    ok_file_has_content(opj(ds.path, 'load.dat'), 'heavy')
    # check that a new tag comes down
    origin.tag('first!')
    assert_result_count(ds.update(), 1, status='ok', type='dataset')
    eq_(ds.repo.get_tags(output='name')[0], 'first!')

    # and now we destroy the remote annex
    origin.call_git(['config', '--remove-section', 'annex'])
    rmtree(opj(origin.path, '.git', 'annex'), chmod_files=True)
    origin.call_git(['branch', '-D', 'git-annex'])
    origin = GitRepo(originpath)
    assert_false(knows_annex(originpath))

    # and update the local clone. For now this should simply not fail
    # (see gh-793); later it might be enhanced to a graceful downgrade.
    before_branches = ds.repo.get_branches()
    ok_(any("git-annex" in b for b in ds.repo.get_remote_branches()))
    assert_result_count(ds.update(), 1, status='ok', type='dataset')
    eq_(before_branches, ds.repo.get_branches())
    # annex branch got pruned
    assert_false(any("git-annex" in b for b in ds.repo.get_remote_branches()))
    # check that a new tag comes down even if repo types mismatch
    origin.tag('second!')
    assert_result_count(ds.update(), 1, status='ok', type='dataset')
    eq_(ds.repo.get_tags(output='name')[-1], 'second!')
def test_install_dataset_from_instance(src=None, dst=None):
    origin = Dataset(src).create(result_renderer='disabled', force=True)
    origin.save(['INFO.txt', 'test.dat'], to_git=True)
    origin.save('test-annex.dat', to_git=False)

    clone = install(source=origin, path=dst)

    assert_is_instance(clone, Dataset)
    ok_startswith(clone.path, dst)
    ok_(clone.is_installed())
    ok_(GitRepo.is_valid_repo(clone.path))
    assert_repo_status(clone.path, annex=None)
    assert_in('INFO.txt', clone.repo.get_indexed_files())
def decorated_test2(ds):
    # we get a Dataset instance
    assert_is_instance(ds, Dataset)
    # it's a clone in a temp. location, not within the cache
    assert_not_in(cache_dir, ds.pathobj.parents)
    assert_result_count(ds.siblings(), 1, type="sibling",
                        name=DEFAULT_REMOTE,
                        url=str(cache_dir / name_in_cache))
    here = ds.config.get("annex.uuid")
    origin = ds.config.get(f"remote.{DEFAULT_REMOTE}.annex-uuid")
    where = ds.repo.whereis(str(annexed_file))
    assert_in(here, where)
    assert_in(origin, where)

    return ds.pathobj, ds.repo.pathobj
def decorated_test3(ds):
    # we get a Dataset instance
    assert_is_instance(ds, Dataset)
    # it's a clone in a temp. location, not within the cache
    assert_not_in(cache_dir, ds.pathobj.parents)
    assert_result_count(ds.siblings(), 1, type="sibling",
                        name=DEFAULT_REMOTE,
                        url=str(cache_dir / name_in_cache))
    # origin is the same cached dataset that got this content in
    # decorated_test2 before. It should still be there, but "here" we
    # didn't request the content.
    here = ds.config.get("annex.uuid")
    origin = ds.config.get(f"remote.{DEFAULT_REMOTE}.annex-uuid")
    where = ds.repo.whereis(str(annexed_file))
    assert_not_in(here, where)
    assert_in(origin, where)

    return ds.pathobj, ds.repo.pathobj
def test_ssh_get_connection():
    manager = SSHManager()
    if _ssh_manager_is_multiplex:
        assert manager._socket_dir is None, \
            "Should be unset upon initialization. Got %s" \
            % str(manager._socket_dir)
    c1 = manager.get_connection('ssh://datalad-test')
    if _ssh_manager_is_multiplex:
        assert manager._socket_dir, \
            "Should be set after interactions with the manager"
        assert_is_instance(c1, MultiplexSSHConnection)
        # a subsequent call returns the very same instance:
        ok_(manager.get_connection('ssh://datalad-test') is c1)
    else:
        assert_is_instance(c1, NoMultiplexSSHConnection)

    # fail on malformed URLs (meaning: our fancy URL parser can't correctly
    # deal with them):
    #assert_raises(ValueError, manager.get_connection, 'localhost')
    # we now allow those simple specifications of host to get_connection
    c2 = manager.get_connection('datalad-test')
    assert_is_instance(c2, SSHConnection)

    # but it should fail if it looks like something else
    assert_raises(ValueError, manager.get_connection, 'datalad-test/')
    assert_raises(ValueError, manager.get_connection, ':datalad-test')

    # we can do what urlparse cannot
    # assert_raises(ValueError, manager.get_connection, 'someone@localhost')
    # the next one is considered a proper url by urlparse (netloc='',
    # path='/localhost'), but eventually gets turned into
    # SSHRI(hostname='ssh', path='/localhost') -- which is fair IMHO
    # -> invalid test
    # assert_raises(ValueError, manager.get_connection, 'ssh:/localhost')

    manager.close()
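# A minimal usage sketch (not part of the test suite) of the SSHManager API
# exercised above, using only calls that appear in the test. The URL
# 'ssh://datalad-test' is the test fixture's host alias; substitute any
# reachable SSH host. Note that identity of repeated get_connection() results
# is only asserted for the multiplexed case above.
def _demo_ssh_manager(host='ssh://datalad-test'):
    manager = SSHManager()
    # lazily creates (or reuses) a connection object for this host
    conn = manager.get_connection(host)
    # ... use the connection ...
    # release any sockets/connections the manager holds when done
    manager.close()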
def test_GitRepo_instance_from_not_existing(path=None, path2=None):
    # 1. create=False and path doesn't exist:
    repo = GitRepo(path)
    assert_false(op.exists(path))

    # 2. create=False, path exists, but no git repo:
    os.mkdir(path)
    ok_(op.exists(path))
    repo = GitRepo(path)
    assert_false(op.exists(op.join(path, '.git')))

    # 3. create=True, path doesn't exist:
    gr = GitRepo(path2).init()
    assert_is_instance(gr, GitRepo, "GitRepo was not created.")
    ok_(op.exists(op.join(path2, '.git')))
    # re-enable once core GitRepo has a status() method
    #assert_repo_status(path2, annex=False)

    # 4. create=True, path exists, but no git repo:
    gr = GitRepo(path).init()
    assert_is_instance(gr, GitRepo, "GitRepo was not created.")
    ok_(op.exists(op.join(path, '.git')))
def test_datalad_credential_helper(path=None):

    ds = Dataset(path).create()

    # tell git to use git-credential-datalad
    ds.config.add('credential.helper', 'datalad', scope='local')
    ds.config.add('datalad.credentials.githelper.noninteractive', 'true',
                  scope='global')

    from datalad.downloaders.providers import Providers

    url1 = "https://datalad-test.org/some"
    url2 = "https://datalad-test.org/other"
    provider_name = "datalad-test.org"

    # `Providers` code is old and only considers a dataset root based on PWD
    # for config lookup. The contextmanager below can be removed once the
    # provider/credential system is redesigned.
    with chpwd(ds.path):

        gitcred = GitCredentialInterface(url=url1, repo=ds)

        # There's nothing set up yet; the helper should return empty
        gitcred.fill()
        eq_(gitcred['username'], '')
        eq_(gitcred['password'], '')

        # store new credentials
        # Note that `Providers.enter_new()` currently uses user-level config
        # files for storage only. TODO: make that an option!
        # To not mess with existing ones, fail if it already exists:
        cfg_file = Path(Providers._get_providers_dirs()['user']) \
            / f"{provider_name}.cfg"
        assert_false(cfg_file.exists())
        # Make sure we clean up
        from datalad.tests import _TEMP_PATHS_GENERATED
        _TEMP_PATHS_GENERATED.append(str(cfg_file))

        # Give credentials to git and ask it to store them:
        gitcred = GitCredentialInterface(url=url1, username="dl-user",
                                         password="dl-pwd", repo=ds)
        gitcred.approve()
        assert_true(cfg_file.exists())
        providers = Providers.from_config_files()
        p1 = providers.get_provider(url=url1, only_nondefault=True)
        assert_is_instance(p1.credential, UserPassword)
        eq_(p1.credential.get('user'), 'dl-user')
        eq_(p1.credential.get('password'), 'dl-pwd')
        # the default regex should match the host only, hence url2, too
        p2 = providers.get_provider(url=url2, only_nondefault=True)
        assert_is_instance(p2.credential, UserPassword)
        eq_(p2.credential.get('user'), 'dl-user')
        eq_(p2.credential.get('password'), 'dl-pwd')

        # git, too, should now find it for both URLs
        gitcred = GitCredentialInterface(url=url1, repo=ds)
        gitcred.fill()
        eq_(gitcred['username'], 'dl-user')
        eq_(gitcred['password'], 'dl-pwd')

        gitcred = GitCredentialInterface(url=url2, repo=ds)
        gitcred.fill()
        eq_(gitcred['username'], 'dl-user')
        eq_(gitcred['password'], 'dl-pwd')

        # Rejection must not currently lead to deleting anything, since we
        # would delete too broadly.
        gitcred.reject()
        assert_true(cfg_file.exists())
        gitcred = GitCredentialInterface(url=url1, repo=ds)
        gitcred.fill()
        eq_(gitcred['username'], 'dl-user')
        eq_(gitcred['password'], 'dl-pwd')
        dlcred = UserPassword(name=provider_name)
        eq_(dlcred.get('user'), 'dl-user')
        eq_(dlcred.get('password'), 'dl-pwd')
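# Illustrative sketch (not from the test suite): the git-credential protocol
# that a helper like git-credential-datalad plugs into. `git credential fill`
# reads key=value lines terminated by a blank line and answers in the same
# format; the helper's internals are DataLad implementation details.
import subprocess

def _demo_git_credential_fill(url, cwd=None):
    # `url=...` is parsed by git into protocol/host/path attributes
    query = f"url={url}\n\n"
    proc = subprocess.run(
        ['git', 'credential', 'fill'],
        input=query, capture_output=True, text=True, cwd=cwd)
    # the response is again key=value lines (username=..., password=...)
    return dict(line.split('=', 1)
                for line in proc.stdout.splitlines() if '=' in line)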
def test_Dataset_flyweight(path1=None, path2=None):

    import gc
    import sys

    ds1 = Dataset(path1)
    assert_is_instance(ds1, Dataset)
    # Don't create circular references or anything similar
    assert_equal(1, sys.getrefcount(ds1) - 1)

    ds1.create()

    # Due to issue 4862, we currently still require gc.collect() under unclear
    # circumstances to get rid of an exception traceback when creating in an
    # existing directory. That traceback references the respective function
    # frames which in turn reference the repo instance (they are methods).
    # Doesn't happen on all systems, though. Eventually we need to figure that
    # out.
    # However, still test for the refcount after gc.collect() to ensure we
    # don't introduce new circular references and make the issue worse!
    gc.collect()

    # refcount still fine after repo creation:
    assert_equal(1, sys.getrefcount(ds1) - 1)

    # instantiate again:
    ds2 = Dataset(path1)
    assert_is_instance(ds2, Dataset)
    # the very same object:
    ok_(ds1 is ds2)

    # reference the same via relative path:
    with chpwd(path1):
        ds3 = Dataset(relpath(path1, start=path2))
        ok_(ds1 == ds3)
        ok_(ds1 is ds3)

    # gc knows one such object only:
    eq_(1, len([o for o in gc.get_objects()
                if isinstance(o, Dataset) and o.path == path1]))

    # on windows a symlink is not what you think it is
    if not on_windows:
        # reference the same via symlink:
        with chpwd(path2):
            os.symlink(path1, 'linked')
            ds4 = Dataset('linked')
            ds4_id = id(ds4)
            ok_(ds4 == ds1)
            ok_(ds4 is not ds1)
            # the underlying repo, however, IS the same:
            ok_(ds4.repo is ds1.repo)

    # deleting one reference has no effect on the other:
    del ds1
    gc.collect()  # TODO: see first comment above
    ok_(ds2 is not None)
    ok_(ds2.repo is ds3.repo)
    if not on_windows:
        ok_(ds2.repo is ds4.repo)

    # deleting the remaining references should lead to garbage collection
    del ds2
    with swallow_logs(new_level=1) as cml:
        del ds3
        gc.collect()  # TODO: see first comment above
        # flyweight vanished:
        assert_not_in(path1, Dataset._unique_instances.keys())
        # no such instance known to gc anymore:
        eq_([], [o for o in gc.get_objects()
                 if isinstance(o, Dataset) and o.path == path1])
        # The underlying repo should only be cleaned up if ds3 was the last
        # reference to it. Otherwise the repo instance should live on
        # (via symlinked ds4):
        finalizer_log = "Finalizer called on: AnnexRepo(%s)" % path1
        if on_windows:
            cml.assert_logged(msg=finalizer_log,
                              level="Level 1",
                              regex=False)
        else:
            assert_not_in(finalizer_log, cml.out)
            # symlinked is still there:
            ok_(ds4 is not None)
            eq_(ds4_id, id(ds4))
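# A minimal sketch of the flyweight pattern this test exercises - NOT
# DataLad's actual implementation (which additionally deals with weak
# references, so instances can be garbage collected and finalized once the
# last reference is gone). The core idea: __new__ returns the already-known
# instance for an equivalent (realpath-resolved) path.
import os

class _FlyweightDemo:
    _unique_instances = {}

    def __new__(cls, path):
        # symlinked and relative spellings of the same location map to one
        # canonical key, hence to one shared instance
        key = os.path.realpath(path)
        existing = cls._unique_instances.get(key)
        if existing is not None:
            return existing
        obj = super().__new__(cls)
        obj.path = key
        cls._unique_instances[key] = obj
        return obj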
def test_get_cached_dataset(cache_dir=None):

    # patch DATALAD_TESTS_CACHE to not use the actual cache with
    # the test testing that very cache.
    cache_dir = Path(cache_dir)

    # store file-based values for testrepo-minimalds for readability:
    annexed_file = opj('inannex', 'animated.gif')
    annexed_file_key = "MD5E-s144625--4c458c62b7ac8ec8e19c8ff14b2e34ad.gif"

    with patch(CACHE_PATCH_STR, new=cache_dir):

        # tuples to test (url, version, keys, class):
        test_cases = [
            # a simple testrepo
            ("https://github.com/datalad/testrepo--minimalds",
             "541cf855d13c2a338ff2803d4488daf0035e568f",
             None,
             AnnexRepo),
            # Same repo, but request paths to be present. This should work
            # with a subsequent call, although the first one did not already
            # request any:
            ("https://github.com/datalad/testrepo--minimalds",
             "9dd8b56cc706ab56185f2ceb75fbe9de9b606724",
             annexed_file_key,
             AnnexRepo),
            # Same repo again, but invalid version
            ("https://github.com/datalad/testrepo--minimalds",
             "nonexistent",
             "irrelevantkey",  # invalid version; don't even try to get the key
             AnnexRepo),
            # same thing with different name should be treated as a new thing:
            ("https://github.com/datalad/testrepo--minimalds",
             "git-annex",
             None,
             AnnexRepo),
            # try a plain git repo to make sure we can deal with that:
            # Note that we first need a test case w/o a `key` parameter to not
            # blow up the test when Clone is patched, resulting in a MagicMock
            # instead of a Dataset instance within get_cached_dataset. In the
            # second case it's already cached then, so the patched Clone is
            # never executed.
            ("https://github.com/datalad/datalad.org",
             None,
             None,
             GitRepo),
            ("https://github.com/datalad/datalad.org",
             "gh-pages",
             "ignored-key",  # it's a git repo; don't even try to get a key
             GitRepo),
        ]
        for url, version, keys, cls in test_cases:
            target = cache_dir / url2filename(url)

            # assuming it doesn't exist yet - patched cache dir!
            in_cache_before = target.exists()
            with patch(CLONE_PATCH_STR) as exec_clone:
                try:
                    ds = get_cached_dataset(url, version, keys)
                    invalid_version = False
                except AssertionError:
                    # should happen only if `version` wasn't found. Implies
                    # that the dataset exists in cache (although not returned
                    # due to the exception)
                    assert_true(version)
                    assert_false(Dataset(target).repo.commit_exists(version))
                    # mark for later assertions (most of them should still
                    # hold true)
                    invalid_version = True
            assert_equal(exec_clone.call_count, 0 if in_cache_before else 1)

            # The patch prevents actual execution. Now do it for real. Note
            # that this might be necessary for content retrieval even if the
            # dataset was in cache before.
            try:
                ds = get_cached_dataset(url, version, keys)
            except AssertionError:
                # see previous call
                assert_true(invalid_version)

            assert_is_instance(ds, Dataset)
            assert_true(ds.is_installed())
            assert_equal(target, ds.pathobj)
            assert_is_instance(ds.repo, cls)

            if keys and not invalid_version and \
                    AnnexRepo.is_valid_repo(ds.path):
                # Note: it's not supposed to get that content if the passed
                # `version` wasn't available. get_cached_dataset would then
                # raise before and not download anything only to raise
                # afterwards.
                here = ds.config.get("annex.uuid")
                where = ds.repo.whereis(ensure_list(keys), key=True)
                assert_true(all(here in remotes for remotes in where))

            # version check. Note that all `get_cached_dataset` is supposed
            # to do is verify that the specified version exists - NOT check
            # it out.
            if version and not invalid_version:
                assert_true(ds.repo.commit_exists(version))

            # re-execution
            with patch(CLONE_PATCH_STR) as exec_clone:
                try:
                    ds2 = get_cached_dataset(url, version, keys)
                except AssertionError:
                    assert_true(invalid_version)
                exec_clone.assert_not_called()
            # returns the same Dataset as before:
            assert_is(ds, ds2)
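# A short usage sketch for `get_cached_dataset` following the calls above
# (positional signature url, version, keys as used in the test; the cache
# location and clone mechanics are implementation details):
def _demo_cached_dataset():
    url = "https://github.com/datalad/testrepo--minimalds"
    # the first call clones into the cache; subsequent calls with the same
    # URL reuse the cached dataset instead of cloning again
    ds = get_cached_dataset(url, None, None)
    assert ds.is_installed()
    return ds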
def test_GitRepo_instance_from_existing(path=None):
    GitRepo(path).init()

    gr = GitRepo(path)
    assert_is_instance(gr, GitRepo, "GitRepo was not created.")
    ok_(op.exists(op.join(path, '.git')))
def test_GitRepo_flyweight(path1=None, path2=None):

    import gc
    import sys

    repo1 = GitRepo(path1).init()
    assert_is_instance(repo1, GitRepo)

    # Due to issue 4862, we currently still require gc.collect() under unclear
    # circumstances to get rid of an exception traceback when creating in an
    # existing directory. That traceback references the respective function
    # frames which in turn reference the repo instance (they are methods).
    # Doesn't happen on all systems, though. Eventually we need to figure that
    # out.
    # However, still test for the refcount after gc.collect() to ensure we
    # don't introduce new circular references and make the issue worse!
    gc.collect()

    # As long as we don't reintroduce any circular references or produce
    # garbage during instantiation that isn't picked up immediately, `repo1`
    # should be the only counted reference to this instance.
    # Note that sys.getrefcount reports its own argument and therefore one
    # reference too many.
    assert_equal(1, sys.getrefcount(repo1) - 1)

    # instantiate again:
    repo2 = GitRepo(path1).init()
    assert_is_instance(repo2, GitRepo)
    # the very same object:
    ok_(repo1 is repo2)

    # reference the same in a different way:
    with chpwd(path1):
        repo3 = GitRepo(op.relpath(path1, start=path2))
    # it's the same object:
    ok_(repo1 is repo3)
    # and the realpath attribute is the same, so they are still equal:
    ok_(repo1 == repo3)

    orig_id = id(repo1)

    # Be sure we have exactly one object in memory:
    assert_equal(1, len([o for o in gc.get_objects()
                         if isinstance(o, GitRepo)
                         and o.pathobj == Path(path1)]))

    # deleting one reference doesn't change anything - we still get the same
    # thing:
    gc.collect()  # TODO: see first comment above
    del repo1
    ok_(repo2 is not None)
    ok_(repo2 is repo3)
    ok_(repo2 == repo3)

    # re-requesting still delivers the same thing:
    repo1 = GitRepo(path1)
    assert_equal(orig_id, id(repo1))

    # killing all references should result in the instance being gc'd, and a
    # re-request yields a new object:
    del repo1
    del repo2

    # Killing the last reference will lead to garbage collection, which will
    # call GitRepo's finalizer:
    with swallow_logs(new_level=1) as cml:
        del repo3
        gc.collect()  # TODO: see first comment above
        cml.assert_logged(msg="Finalizer called on: GitRepo(%s)" % path1,
                          level="Level 1",
                          regex=False)

    # Flyweight is gone:
    assert_not_in(path1, GitRepo._unique_instances.keys())
    # gc doesn't know any instance anymore:
    assert_equal([], [o for o in gc.get_objects()
                      if isinstance(o, GitRepo)
                      and o.pathobj == Path(path1)])

    # a new object is created on re-request:
    repo1 = GitRepo(path1)
    assert_equal(1, len([o for o in gc.get_objects()
                         if isinstance(o, GitRepo)
                         and o.pathobj == Path(path1)]))
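# Stdlib-only sketch of the finalizer mechanism the log assertion above
# relies on (assumption: DataLad registers something similar via weakref;
# the real wiring differs in detail). Once the last reference to the object
# is gone and it is collected, the registered callback emits the log line.
import logging
import weakref

def _demo_finalizer(path):
    lgr = logging.getLogger('demo')

    class Repo:
        pass

    repo = Repo()
    weakref.finalize(
        repo, lgr.log, 1, "Finalizer called on: GitRepo(%s)" % path)
    # dropping the last reference triggers collection (immediately in
    # CPython), which runs the finalizer
    del repo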