def test_ssh_get_connection():
    manager = SSHManager()
    assert manager._socket_dir is None, \
        "Should be unset upon initialization. Got %s" % str(manager._socket_dir)
    c1 = manager.get_connection('ssh://localhost')
    assert manager._socket_dir, "Should be set after interactions with the manager"
    assert_is_instance(c1, SSHConnection)

    # subsequent call returns the very same instance:
    ok_(manager.get_connection('ssh://localhost') is c1)

    # fail on malformed URLs (meaning: our fancy URL parser can't correctly
    # deal with them):
    #assert_raises(ValueError, manager.get_connection, 'localhost')
    # we now allow those simple specifications of host to get_connection
    c2 = manager.get_connection('localhost')
    assert_is_instance(c2, SSHConnection)

    # but should fail if it looks like something else
    assert_raises(ValueError, manager.get_connection, 'localhost/')
    assert_raises(ValueError, manager.get_connection, ':localhost')

    # we can do what urlparse cannot
    # assert_raises(ValueError, manager.get_connection, 'someone@localhost')
    # next one is considered a proper url by urlparse (netloc:'',
    # path='/localhost'), but eventually gets turned into SSHRI(hostname='ssh',
    # path='/localhost') -- which is fair IMHO -> invalid test
    # assert_raises(ValueError, manager.get_connection, 'ssh:/localhost')

    manager.close()
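# A quick, self-contained illustration (stdlib only) of the urlparse behavior
# the comments in test_ssh_get_connection refer to: 'ssh:/localhost' parses as
# a "proper" URL with an empty netloc and the host in the path, which is why
# the commented-out assertion above was deemed an invalid test.
def _demo_urlparse_ssh_specs():
    from urllib.parse import urlparse
    # a proper ssh URL: the host lands in netloc
    assert urlparse('ssh://localhost').netloc == 'localhost'
    # single slash: urlparse still accepts it, but the host ends up in path
    p = urlparse('ssh:/localhost')
    assert (p.scheme, p.netloc, p.path) == ('ssh', '', '/localhost')
    # a bare host is all path, no scheme -- ambiguous, hence special-cased
    assert urlparse('localhost').path == 'localhost'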
def test_protocol_commons(protocol_file):
    for protocol_class in [
            DryRunProtocol, DryRunExternalsProtocol,
            ExecutionTimeProtocol, ExecutionTimeExternalsProtocol,
            NullProtocol]:
        protocol = protocol_class()
        assert_is_instance(protocol, ProtocolInterface)
        assert_equal(len(protocol), 0)
        protocol.add_section(['some_command', 'some_option'],
                             Exception("Whatever exception"))
        protocol.add_section(['another_command'], None)
        assert_equal(len(protocol), 2 if protocol_class != NullProtocol else 0)

        # test iterable:
        assert_raises(AssertionError, assert_raises, TypeError, iter, protocol)
        for section in protocol:
            assert_in('command', section)
        for item in range(len(protocol)):
            assert_is_instance(protocol.__getitem__(item), dict)

        # test __str__:
        str_ = str(protocol)

        # test write_to_file:
        protocol.write_to_file(protocol_file)
        read_str = ''
        with open(protocol_file, 'r') as f:
            for line in f.readlines():
                read_str += line
        assert_equal(str_, read_str)
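# Minimal sketch (hypothetical, not DataLad's actual implementation) of the
# interface contract test_protocol_commons exercises: sections are dicts
# carrying at least a 'command' key, the protocol is sized, iterable, and
# indexable, and write_to_file() persists exactly what str() renders.
class _SketchProtocol:
    def __init__(self):
        self._sections = []

    def add_section(self, cmd, exception):
        # record the executed command and any exception it raised
        self._sections.append({'command': cmd, 'exception': exception})

    def __len__(self):
        return len(self._sections)

    def __iter__(self):
        return iter(self._sections)

    def __getitem__(self, index):
        return self._sections[index]

    def __str__(self):
        return ''.join(str(s) + '\n' for s in self._sections)

    def write_to_file(self, filename):
        # persist the same rendering str() produces, so both stay in sync
        with open(filename, 'w') as f:
            f.write(str(self))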
def test_newthings_coming_down(originpath, destpath):
    origin = GitRepo(originpath, create=True)
    create_tree(originpath, {'load.dat': 'heavy'})
    Dataset(originpath).save('load.dat')

    ds = install(
        source=originpath, path=destpath,
        result_xfm='datasets', return_type='item-or-list')
    assert_is_instance(ds.repo, GitRepo)
    assert_in('origin', ds.repo.get_remotes())

    # turn origin into an annex
    origin = AnnexRepo(originpath, create=True)
    # clone doesn't know yet
    assert_false(knows_annex(ds.path))
    # but after an update it should
    # no merge, only one sibling, no parameters should be specific enough
    assert_result_count(ds.update(), 1, status='ok', type='dataset')
    assert knows_annex(ds.path)
    # no branches appeared
    eq_(ds.repo.get_branches(), [DEFAULT_BRANCH])
    # now merge, and get an annex
    assert_result_count(ds.update(merge=True),
                        1,
                        action='update',
                        status='ok',
                        type='dataset')
    assert_in('git-annex', ds.repo.get_branches())
    assert_is_instance(ds.repo, AnnexRepo)
    # should be fully functional
    testfname = opj(ds.path, 'load.dat')
    assert_false(ds.repo.file_has_content(testfname))
    ds.get('.')
    ok_file_has_content(opj(ds.path, 'load.dat'), 'heavy')
    # check that a new tag comes down
    origin.tag('first!')
    assert_result_count(ds.update(), 1, status='ok', type='dataset')
    eq_(ds.repo.get_tags(output='name')[0], 'first!')

    # and now we destroy the remote annex
    origin.call_git(['config', '--remove-section', 'annex'])
    rmtree(opj(origin.path, '.git', 'annex'), chmod_files=True)
    origin.call_git(['branch', '-D', 'git-annex'])
    origin = GitRepo(originpath)
    assert_false(knows_annex(originpath))

    # and update the local clone
    # for now this should simply not fail (see gh-793), later might be
    # enhanced to a graceful downgrade
    before_branches = ds.repo.get_branches()
    assert_result_count(ds.update(), 1, status='ok', type='dataset')
    eq_(before_branches, ds.repo.get_branches())
    # annex branch got pruned
    eq_(['origin/HEAD', 'origin/' + DEFAULT_BRANCH],
        ds.repo.get_remote_branches())
    # check that a new tag comes down even if repo types mismatch
    origin.tag('second!')
    assert_result_count(ds.update(), 1, status='ok', type='dataset')
    eq_(ds.repo.get_tags(output='name')[-1], 'second!')
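# Hypothetical sketch of roughly what the `knows_annex` helper used above
# checks (an assumption for illustration, not DataLad's implementation):
# whether any local or remote ref for a 'git-annex' branch exists, which is
# how a plain-git clone learns that its origin has become an annex.
def _sketch_knows_annex(path):
    import subprocess
    refs = subprocess.run(
        ['git', '-C', path, 'for-each-ref', '--format=%(refname)'],
        capture_output=True, text=True, check=True).stdout
    # matches refs/heads/git-annex as well as refs/remotes/*/git-annex
    return any(ref.endswith('/git-annex') for ref in refs.splitlines())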
def test_install_dataset_from_instance(src, dst):
    origin = Dataset(src)
    clone = install(source=origin, path=dst)

    assert_is_instance(clone, Dataset)
    ok_startswith(clone.path, dst)
    ok_(clone.is_installed())
    ok_(GitRepo.is_valid_repo(clone.path))
    ok_clean_git(clone.path, annex=None)
    assert_in('INFO.txt', clone.repo.get_indexed_files())
def test_ssh_get_connection():
    manager = SSHManager()
    c1 = manager.get_connection('ssh://localhost')
    assert_is_instance(c1, SSHConnection)

    # subsequent call returns the very same instance:
    ok_(manager.get_connection('ssh://localhost') is c1)

    # fail on malformed URLs (meaning: our fancy URL parser can't correctly
    # deal with them):
    assert_raises(ValueError, manager.get_connection, 'localhost')
def decorated_test2(ds):
    # we get a Dataset instance
    assert_is_instance(ds, Dataset)
    # it's a clone in a temp. location, not within the cache
    assert_not_in(cache_dir, ds.pathobj.parents)
    assert_result_count(ds.siblings(), 1, type="sibling",
                        name="origin",
                        url=str(cache_dir / name_in_cache))
    here = ds.config.get("annex.uuid")
    origin = ds.config.get("remote.origin.annex-uuid")
    where = ds.repo.whereis(str(annexed_file))
    assert_in(here, where)
    assert_in(origin, where)

    return ds.pathobj, ds.repo.pathobj
def decorated_test3(ds):
    # we get a Dataset instance
    assert_is_instance(ds, Dataset)
    # it's a clone in a temp. location, not within the cache
    assert_not_in(cache_dir, ds.pathobj.parents)
    assert_result_count(ds.siblings(), 1, type="sibling",
                        name="origin",
                        url=str(cache_dir / name_in_cache))
    # origin is the same cached dataset that got this content in
    # decorated_test2 before. Should still be there. But "here" we
    # didn't request it.
    here = ds.config.get("annex.uuid")
    origin = ds.config.get("remote.origin.annex-uuid")
    where = ds.repo.whereis(str(annexed_file))
    assert_not_in(here, where)
    assert_in(origin, where)

    return ds.pathobj, ds.repo.pathobj
def test_ssh_get_connection():
    manager = SSHManager()
    c1 = manager.get_connection('ssh://localhost')
    assert_is_instance(c1, SSHConnection)

    # subsequent call returns the very same instance:
    ok_(manager.get_connection('ssh://localhost') is c1)

    # fail on malformed URLs (meaning: our fancy URL parser can't correctly
    # deal with them):
    assert_raises(ValueError, manager.get_connection, 'localhost')

    # we can do what urlparse cannot
    # assert_raises(ValueError, manager.get_connection, 'someone@localhost')
    # next one is considered a proper url by urlparse (netloc:'',
    # path='/localhost'), but eventually gets turned into SSHRI(hostname='ssh',
    # path='/localhost') -- which is fair IMHO -> invalid test
    # assert_raises(ValueError, manager.get_connection, 'ssh:/localhost')

    manager.close()
def decorated_test4(ds):
    # we get a Dataset instance
    assert_is_instance(ds, Dataset)
    # it's a clone in a temp. location, not within the cache
    assert_not_in(cache_dir, ds.pathobj.parents)
    assert_result_count(ds.siblings(), 1, type="sibling",
                        name=DEFAULT_REMOTE,
                        url=str(cache_dir / name_in_cache))
    # origin is the same cached dataset that got this content in
    # decorated_test2 before. Should still be there. But "here" we
    # didn't request it.
    here = ds.config.get("annex.uuid")
    origin = ds.config.get(f"remote.{DEFAULT_REMOTE}.annex-uuid")
    where = ds.repo.whereis(str(annexed_file))
    assert_not_in(here, where)
    assert_in(origin, where)
    assert_equal(ds.repo.get_hexsha(),
                 "541cf855d13c2a338ff2803d4488daf0035e568f")

    return ds.pathobj, ds.repo.pathobj
def test_find_containers(path):
    ds = Dataset(path).create(force=True)
    ds.save(path=[op.join('sub', 'i.img')], message="dummy container")
    ds.containers_add("i", image=op.join('sub', 'i.img'))
    ok_clean_git(path)

    # find the only one
    res = find_container(ds)
    assert_is_instance(res, dict)
    assert_result_count([res], 1, status="ok",
                        path=op.join(ds.path, "sub", "i.img"))

    # find by name
    res = find_container(ds, "i")
    assert_is_instance(res, dict)
    assert_result_count([res], 1, status="ok",
                        path=op.join(ds.path, "sub", "i.img"))

    # find by path
    res = find_container(ds, op.join("sub", "i.img"))
    assert_is_instance(res, dict)
    assert_result_count([res], 1, status="ok",
                        path=op.join(ds.path, "sub", "i.img"))

    # don't find another thing
    assert_raises(ValueError, find_container, ds, "nothere")
def test_get_repo_instance_annex(path):
    # get instance from path
    repo = get_repo_instance(path, AnnexRepo)
    assert_is_instance(repo, AnnexRepo)
    eq_(realpath(repo.path), realpath(path))

    old_pwd = getpwd()

    # get instance from current dir
    chpwd(path)
    repo = get_repo_instance()
    assert_is_instance(repo, AnnexRepo)
    eq_(realpath(repo.path), realpath(path))

    # get instance from current subdir
    new_subdir = opj(path, "subdir")
    mkdir(new_subdir)
    chpwd(new_subdir)
    eq_(new_subdir, getpwd())
    repo = get_repo_instance()
    assert_is_instance(repo, AnnexRepo)
    eq_(realpath(repo.path), realpath(path))

    chpwd(old_pwd)
def test_get_repo_instance_git(path):
    real_path = Path(path).resolve()

    # get instance from path
    repo = get_repo_instance(path, GitRepo)
    assert_is_instance(repo, GitRepo)
    eq_(repo.pathobj, real_path)

    old_pwd = getpwd()

    # get instance from current dir
    chpwd(path)
    repo = get_repo_instance()
    assert_is_instance(repo, GitRepo)
    eq_(repo.pathobj, real_path)

    # get instance from current subdir
    new_subdir = opj(path, "subdir")
    mkdir(new_subdir)
    chpwd(new_subdir)
    eq_(new_subdir, getpwd())
    repo = get_repo_instance()
    assert_is_instance(repo, GitRepo)
    eq_(repo.pathobj, real_path)

    chpwd(old_pwd)
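# Sketch (an assumption, not the actual get_repo_instance implementation) of
# the behavior the two tests above rely on: resolving a repository from the
# current directory or any subdirectory amounts to walking up the tree until
# a '.git' entry is found.
def _find_repo_root(start):
    from pathlib import Path
    p = Path(start).resolve()
    for candidate in [p] + list(p.parents):
        if (candidate / '.git').exists():
            return candidate
    raise RuntimeError("no repository found at or above %s" % start)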
def test_GitRepo_instance_from_not_existing(path, path2):
    # 1. create=False and path doesn't exist:
    repo = GitRepo(path)
    assert_false(op.exists(path))

    # 2. create=False, path exists, but no git repo:
    os.mkdir(path)
    ok_(op.exists(path))
    repo = GitRepo(path)
    assert_false(op.exists(op.join(path, '.git')))

    # 3. create=True, path doesn't exist:
    gr = GitRepo(path2).init()
    assert_is_instance(gr, GitRepo, "GitRepo was not created.")
    ok_(op.exists(op.join(path2, '.git')))
    # TODO: re-enable once core GitRepo has a status() method
    #assert_repo_status(path2, annex=False)

    # 4. create=True, path exists, but no git repo:
    gr = GitRepo(path).init()
    assert_is_instance(gr, GitRepo, "GitRepo was not created.")
    ok_(op.exists(op.join(path, '.git')))
def test_Dataset_flyweight(path1, path2):
    ds1 = Dataset(path1)
    assert_is_instance(ds1, Dataset)
    # instantiate again:
    ds2 = Dataset(path1)
    assert_is_instance(ds2, Dataset)
    # the very same object:
    ok_(ds1 is ds2)

    # reference the same via relative path:
    with chpwd(path1):
        ds3 = Dataset(relpath(path1, start=path2))
        ok_(ds1 == ds3)
        ok_(ds1 is ds3)

    # on windows a symlink is not what you think it is
    if not on_windows:
        # reference the same via symlink:
        with chpwd(path2):
            os.symlink(path1, 'linked')
            ds3 = Dataset('linked')
            ok_(ds3 == ds1)
            ok_(ds3 is not ds1)
def test_GitRepo_flyweight(path1, path2):
    import gc

    repo1 = GitRepo(path1).init()
    assert_is_instance(repo1, GitRepo)

    # Due to issue 4862, we currently still require gc.collect() under unclear
    # circumstances to get rid of an exception traceback when creating in an
    # existing directory. That traceback references the respective function
    # frames which in turn reference the repo instance (they are methods).
    # Doesn't happen on all systems, though. Eventually we need to figure that
    # out.
    # However, still test for the refcount after gc.collect() to ensure we
    # don't introduce new circular references and make the issue worse!
    gc.collect()

    # As long as we don't reintroduce any circular references or produce
    # garbage during instantiation that isn't picked up immediately, `repo1`
    # should be the only counted reference to this instance.
    # Note that sys.getrefcount reports its own argument and therefore one
    # reference too many.
    assert_equal(1, sys.getrefcount(repo1) - 1)

    # instantiate again:
    repo2 = GitRepo(path1).init()
    assert_is_instance(repo2, GitRepo)
    # the very same object:
    ok_(repo1 is repo2)

    # reference the same in a different way:
    with chpwd(path1):
        repo3 = GitRepo(op.relpath(path1, start=path2))
    # it's the same object:
    ok_(repo1 is repo3)
    # and realpath attribute is the same, so they are still equal:
    ok_(repo1 == repo3)

    orig_id = id(repo1)

    # Be sure we have exactly one object in memory:
    assert_equal(1, len([o for o in gc.get_objects()
                         if isinstance(o, GitRepo)
                         and o.pathobj == Path(path1)]))

    # deleting one reference doesn't change anything -- we still get the same
    # thing:
    gc.collect()  # TODO: see first comment above
    del repo1
    ok_(repo2 is not None)
    ok_(repo2 is repo3)
    ok_(repo2 == repo3)

    # re-requesting still delivers the same thing:
    repo1 = GitRepo(path1)
    assert_equal(orig_id, id(repo1))

    # killing all references should result in the instance being gc'd and a
    # re-request yields a new object:
    del repo1
    del repo2

    # Killing the last reference will lead to garbage collection, which will
    # call GitRepo's finalizer:
    with swallow_logs(new_level=1) as cml:
        del repo3
        gc.collect()  # TODO: see first comment above
        cml.assert_logged(msg="Finalizer called on: GitRepo(%s)" % path1,
                          level="Level 1",
                          regex=False)

    # Flyweight is gone:
    assert_not_in(path1, GitRepo._unique_instances.keys())
    # gc doesn't know any instance anymore:
    assert_equal([], [o for o in gc.get_objects()
                      if isinstance(o, GitRepo)
                      and o.pathobj == Path(path1)])

    # new object is created on re-request:
    repo1 = GitRepo(path1)
    assert_equal(1, len([o for o in gc.get_objects()
                         if isinstance(o, GitRepo)
                         and o.pathobj == Path(path1)]))
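# Minimal sketch of the flyweight pattern the tests above exercise (a generic
# illustration, not DataLad's actual implementation): keep one instance per
# normalized path in a WeakValueDictionary, so repeated construction returns
# the identical object, and the entry vanishes once the last strong reference
# is dropped -- matching the `_unique_instances` assertions above.
import os.path
import weakref

class _SketchFlyweight:
    _unique_instances = weakref.WeakValueDictionary()

    def __new__(cls, path):
        key = os.path.realpath(path)
        instance = cls._unique_instances.get(key)
        if instance is None:
            instance = super().__new__(cls)
            instance.path = key
            cls._unique_instances[key] = instance
        # note: a real implementation must also guard against __init__
        # re-running on cache hits
        return instance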
def test_get_cached_dataset(cache_dir):
    # patch DATALAD_TESTS_CACHE to not use the actual cache with
    # the test testing that very cache.
    cache_dir = Path(cache_dir)

    # store file-based values for testrepo-minimalds for readability:
    annexed_file = opj('inannex', 'animated.gif')
    annexed_file_key = "MD5E-s144625--4c458c62b7ac8ec8e19c8ff14b2e34ad.gif"

    with patch(CACHE_PATCH_STR, new=cache_dir):
        # tuples to test (url, version, keys, class):
        test_cases = [
            # a simple testrepo
            ("https://github.com/datalad/testrepo--minimalds",
             "541cf855d13c2a338ff2803d4488daf0035e568f",
             None,
             AnnexRepo),
            # Same repo, but request paths to be present. This should work
            # with a subsequent call, although the first one did not already
            # request any:
            ("https://github.com/datalad/testrepo--minimalds",
             "9dd8b56cc706ab56185f2ceb75fbe9de9b606724",
             annexed_file_key,
             AnnexRepo),
            # Same repo again, but invalid version
            ("https://github.com/datalad/testrepo--minimalds",
             "nonexistent",
             "irrelevantkey",  # invalid version; don't even try to get the key
             AnnexRepo),
            # same thing with different name should be treated as a new thing:
            ("https://github.com/datalad/testrepo--minimalds",
             "git-annex",
             None,
             AnnexRepo),
            # try a plain git repo to make sure we can deal with that:
            # Note that we first need a test case w/o a `key` parameter to not
            # blow up the test when Clone is patched, resulting in a MagicMock
            # instead of a Dataset instance within get_cached_dataset. In the
            # second case it's already cached then, so the patched Clone is
            # never executed.
            ("https://github.com/datalad/datalad.org",
             None,
             None,
             GitRepo),
            ("https://github.com/datalad/datalad.org",
             "gh-pages",
             "ignored-key",  # it's a git repo; don't even try to get a key
             GitRepo),
        ]
        for url, version, keys, cls in test_cases:
            target = cache_dir / url2filename(url)

            # assuming it doesn't exist yet - patched cache dir!
            in_cache_before = target.exists()
            with patch(CLONE_PATCH_STR) as exec_clone:
                try:
                    ds = get_cached_dataset(url, version, keys)
                    invalid_version = False
                except AssertionError:
                    # should happen only if `version` wasn't found. Implies
                    # that the dataset exists in cache (although not returned
                    # due to the exception)
                    assert_true(version)
                    assert_false(Dataset(target).repo.commit_exists(version))
                    # mark for later assertions (most of them should still
                    # hold true)
                    invalid_version = True
            assert_equal(exec_clone.call_count, 0 if in_cache_before else 1)

            # Patch prevents actual execution. Now do it for real. Note that
            # this might be necessary for content retrieval even if the
            # dataset was in cache before.
            try:
                ds = get_cached_dataset(url, version, keys)
            except AssertionError:
                # see previous call
                assert_true(invalid_version)

            assert_is_instance(ds, Dataset)
            assert_true(ds.is_installed())
            assert_equal(target, ds.pathobj)
            assert_is_instance(ds.repo, cls)

            if keys and not invalid_version and \
                    AnnexRepo.is_valid_repo(ds.path):
                # Note: it's not supposed to get that content if the passed
                # `version` wasn't available. get_cached_dataset would then
                # raise before and not download anything only to raise
                # afterwards.
                here = ds.config.get("annex.uuid")
                where = ds.repo.whereis(ensure_list(keys), key=True)
                assert_true(all(here in remotes for remotes in where))

            # version check. Note that all `get_cached_dataset` is supposed
            # to do is verify that the specified version exists -- NOT check
            # it out.
            if version and not invalid_version:
                assert_true(ds.repo.commit_exists(version))

            # re-execution
            with patch(CLONE_PATCH_STR) as exec_clone:
                try:
                    ds2 = get_cached_dataset(url, version, keys)
                except AssertionError:
                    assert_true(invalid_version)
            exec_clone.assert_not_called()
            # returns the same Dataset as before:
            assert_is(ds, ds2)
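# The patching technique used above, in self-contained form (stdlib
# unittest.mock): patch() temporarily replaces a named attribute with a
# MagicMock, so a test can count how often the expensive operation *would*
# have been invoked without actually running it. `os.getcwd` here is just a
# convenient stand-in target.
def _demo_patch_call_count():
    import os
    from unittest.mock import patch

    with patch('os.getcwd') as mocked:
        os.getcwd()  # hits the MagicMock, not the real getcwd
        assert mocked.call_count == 1
    # outside the context manager the real function is restored
    assert isinstance(os.getcwd(), str)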
def test_Dataset_flyweight(path1, path2):
    import gc
    import sys

    ds1 = Dataset(path1)
    assert_is_instance(ds1, Dataset)
    # Don't create circular references or anything similar
    assert_equal(1, sys.getrefcount(ds1) - 1)

    ds1.create()

    # Due to issue 4862, we currently still require gc.collect() under unclear
    # circumstances to get rid of an exception traceback when creating in an
    # existing directory. That traceback references the respective function
    # frames which in turn reference the repo instance (they are methods).
    # Doesn't happen on all systems, though. Eventually we need to figure that
    # out.
    # However, still test for the refcount after gc.collect() to ensure we
    # don't introduce new circular references and make the issue worse!
    gc.collect()

    # refcount still fine after repo creation:
    assert_equal(1, sys.getrefcount(ds1) - 1)

    # instantiate again:
    ds2 = Dataset(path1)
    assert_is_instance(ds2, Dataset)
    # the very same object:
    ok_(ds1 is ds2)

    # reference the same via relative path:
    with chpwd(path1):
        ds3 = Dataset(relpath(path1, start=path2))
        ok_(ds1 == ds3)
        ok_(ds1 is ds3)

    # gc knows one such object only:
    eq_(1, len([o for o in gc.get_objects()
                if isinstance(o, Dataset) and o.path == path1]))

    # on windows a symlink is not what you think it is
    if not on_windows:
        # reference the same via symlink:
        with chpwd(path2):
            os.symlink(path1, 'linked')
            ds4 = Dataset('linked')
            ds4_id = id(ds4)
            ok_(ds4 == ds1)
            ok_(ds4 is not ds1)
            # underlying repo, however, IS the same:
            ok_(ds4.repo is ds1.repo)

    # deleting one reference has no effect on the other:
    del ds1
    gc.collect()  # TODO: see first comment above
    ok_(ds2 is not None)
    ok_(ds2.repo is ds3.repo)
    if not on_windows:
        ok_(ds2.repo is ds4.repo)

    # deleting remaining references should lead to garbage collection
    del ds2
    with swallow_logs(new_level=1) as cml:
        del ds3
        gc.collect()  # TODO: see first comment above

        # flyweight vanished:
        assert_not_in(path1, Dataset._unique_instances.keys())
        # no such instance known to gc anymore:
        eq_([], [o for o in gc.get_objects()
                 if isinstance(o, Dataset) and o.path == path1])
        # underlying repo should only be cleaned up if ds3 was the last
        # reference to it. Otherwise the repo instance should live on
        # (via symlinked ds4):
        finalizer_log = "Finalizer called on: AnnexRepo(%s)" % path1
        if on_windows:
            cml.assert_logged(msg=finalizer_log,
                              level="Level 1",
                              regex=False)
        else:
            assert_not_in(finalizer_log, cml.out)
            # symlinked is still there:
            ok_(ds4 is not None)
            eq_(ds4_id, id(ds4))
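# A self-contained illustration (stdlib weakref; the class and message are
# made up) of the finalizer mechanism the flyweight tests assert on: a
# weakref.finalize callback fires once the last reference to an instance is
# garbage-collected, which is what produces the "Finalizer called on: ..."
# log lines checked above.
def _demo_finalizer():
    import gc
    import weakref

    messages = []

    class Resource:
        def __init__(self, name):
            self.name = name
            weakref.finalize(
                self, messages.append, "Finalizer called on: %s" % name)

    r = Resource("demo")
    del r
    gc.collect()  # make collection deterministic across interpreters
    assert messages == ["Finalizer called on: demo"]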
def test_GitRepo_instance_from_existing(path):
    GitRepo(path).init()

    gr = GitRepo(path)
    assert_is_instance(gr, GitRepo, "GitRepo was not created.")
    ok_(op.exists(op.join(path, '.git')))