def test_get_local_file_url(): for path, url in ( # relpaths are special-cased below ('test.txt', 'test.txt'), # static copy of "most_obscure_name" (' "\';a&b&cΔЙקم๗あ `| ', # and translation by Google Chrome "%20%22%27%3Ba%26b%26c%CE%94%D0%99%D7%A7%D9%85%E0%B9%97%E3%81%82%20%60%7C%20"), ) + ( ('C:\\Windows\\Notepad.exe', 'file://C/Windows/Notepad.exe'), ) if on_windows else ( ('/a', 'file:///a'), ('/a/b/c', 'file:///a/b/c'), ('/a~', 'file:///a~'), # there are no files with trailing slashes in the name #('/a b/', 'file:///a%20b/'), ('/a b/name', 'file:///a%20b/name'), ): if isabs(path): eq_(get_local_file_url(path), url) else: eq_(get_local_file_url(path), '/'.join(( get_local_file_url(os.getcwd()), url)) )
def test_get_local_file_url(): for path, url in ( # relpaths are special-cased below ('test.txt', 'test.txt'), ) + ( ('C:\\Windows\\notepad.exe', 'file://C/Windows/notepad.exe'), ) if on_windows else ( # static copy of "most_obscure_name" ( ' "\';a&b&cΔЙקم๗あ `| ', # and translation by Google Chrome "%20%22%27%3Ba%26b%26c%CE%94%D0%99%D7%A7%D9%85%E0%B9%97%E3%81%82%20%60%7C%20" ), ('/a', 'file:///a'), ('/a/b/c', 'file:///a/b/c'), ('/a~', 'file:///a~'), # there are no files with trailing slashes in the name #('/a b/', 'file:///a%20b/'), ('/a b/name', 'file:///a%20b/name'), ): try: # Yarik found no better way to trigger. .decode() isn't enough print("D: %s" % path) except UnicodeEncodeError: if sys.version_info < (3, 7): # observed test failing on ubuntu 18.04 with python 3.6 # (reproduced in conda env locally with python 3.6.10 when LANG=C) # We will just skip this tricky one continue raise if isabs(path): eq_(get_local_file_url(path), url) else: eq_(get_local_file_url(path), '/'.join( (get_local_file_url(os.getcwd()), url)))
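For reference, the "translation by Google Chrome" literal used above is ordinary percent-encoding of the UTF-8 bytes of the obscure name; a minimal sketch with plain urllib reproduces it:

# Sketch: the escaped literal in the test is standard RFC 3986 percent-encoding
from urllib.parse import quote

obscure = ' "\';a&b&cΔЙקم๗あ `| '
assert quote(obscure) == (
    "%20%22%27%3Ba%26b%26c%CE%94%D0%99%D7%A7"
    "%D9%85%E0%B9%97%E3%81%82%20%60%7C%20")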
def test_source_candidate_subdataset(store1=None, store2=None, intermediate=None, super=None, clone=None): # This tests the scenario of gh-6159. # However, the actual point is to test that `get` does not overwrite a # source candidate config in subdatasets, if they already have such a # config. This could come from any postclone_cfg routine, but the only one # actually doing this ATM is postclone_cfg_ria. ds = Dataset(intermediate).create(force=True) ds.create("sub1", force=True) ds.create("sub2", force=True) ds.save(recursive=True) ria_url_1 = "ria+" + get_local_file_url(store1, compatibility='git') ds.create_sibling_ria(ria_url_1, "firststore", recursive=True, new_store_ok=True) ds.push(".", to="firststore", recursive=True) superds = Dataset(super).create() superds.clone(source=ria_url_1 + "#" + ds.id, path="intermediate") ria_url_2 = "ria+" + get_local_file_url(store2, compatibility='git') superds.create_sibling_ria(ria_url_2, "secondstore", new_store_ok=True) superds.push(".", to="secondstore") cloneds = install(clone, source=ria_url_2 + "#" + superds.id) # This would fail if source candidates weren't right, since cloneds only # knows the second store so far (which doesn't have the subdatasets). cloneds.get("intermediate", recursive=True)
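A quick way to see the candidate configuration this test protects would be to list matching keys in the cloned intermediate dataset; this is only a sketch, and the `datalad.get.subdataset-source-candidate-` key prefix is an assumption about DataLad's naming convention (the exact suffix written by postclone_cfg_ria is not asserted here):

# Sketch: list whatever clone-source candidates the intermediate dataset
# carries in its config after the final `get`
intermediate_clone = Dataset(cloneds.pathobj / "intermediate")
out = intermediate_clone.repo.call_git(
    ["config", "--get-regexp", r"datalad\.get\.subdataset-source-candidate-.*"])
print(out)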
def test_no_storage(store1, store2, ds_path): store1_url = 'ria+' + get_local_file_url(store1) store2_url = 'ria+' + get_local_file_url(store2) ds = Dataset(ds_path).create(force=True) ds.save(recursive=True) assert_repo_status(ds.path) res = ds.create_sibling_ria(store1_url, "datastore1", storage_sibling=False) assert_result_count(res, 1, status='ok', action='create-sibling-ria') eq_({'datastore1', 'here'}, {s['name'] for s in ds.siblings(result_renderer=None)}) # deprecated way of disabling storage still works res = ds.create_sibling_ria(store2_url, "datastore2", disable_storage__=True) assert_result_count(res, 1, status='ok', action='create-sibling-ria') eq_({'datastore2', 'datastore1', 'here'}, {s['name'] for s in ds.siblings(result_renderer=None)}) # smoke test that we can push to it res = ds.push(to='datastore1') assert_status('ok', res) # but nothing was copied, because there is no storage sibling assert_result_count(res, 0, action='copy')
def test_install_dataset_from_just_source(src_repo=None, path=None): src_ds = Dataset(src_repo).create(result_renderer='disabled', force=True) src_ds.save(['INFO.txt', 'test.dat'], to_git=True) src_ds.save('test-annex.dat', to_git=False) # equivalent repo on github: src_url = "https://github.com/datalad/testrepo--basic--r1.git" sources = [ src_ds.path, get_local_file_url(src_ds.path, compatibility='git') ] if not dl_cfg.get('datalad.tests.nonetwork'): sources.append(src_url) for url in sources: with chpwd(path, mkdir=True): ds = install(source=url) ok_startswith(ds.path, path) ok_(ds.is_installed()) ok_(GitRepo.is_valid_repo(ds.path)) assert_repo_status(ds.path, annex=None) assert_in('INFO.txt', ds.repo.get_indexed_files()) # cleanup before next iteration rmtree(path)
def test_custom_call_fmt(path, local_file): ds = Dataset(path).create() subds = ds.create('sub') # plug in a proper singularity image subds.containers_add( 'mycontainer', url=get_local_file_url(op.join(local_file, 'some_container.img')), image='righthere', call_fmt='echo image={img} cmd={cmd} img_dspath={img_dspath} ' # and environment variable being set/propagated by default 'name=$DATALAD_CONTAINER_NAME') ds.save() # record the effect in super-dataset # Running should work fine either within sub or within super out = WitlessRunner(cwd=subds.path).run( ['datalad', 'containers-run', '-n', 'mycontainer', 'XXX'], protocol=StdOutCapture) assert_in('image=righthere cmd=XXX img_dspath=. name=mycontainer', out['stdout']) out = WitlessRunner(cwd=ds.path).run( ['datalad', 'containers-run', '-n', 'sub/mycontainer', 'XXX'], protocol=StdOutCapture) assert_in('image=sub/righthere cmd=XXX img_dspath=sub', out['stdout']) # Test within subdirectory of the super-dataset subdir = op.join(ds.path, 'subdir') os.mkdir(subdir) out = WitlessRunner(cwd=subdir).run( ['datalad', 'containers-run', '-n', 'sub/mycontainer', 'XXX'], protocol=StdOutCapture) assert_in('image=../sub/righthere cmd=XXX img_dspath=../sub', out['stdout'])
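The test uses echo as a stand-in for a runtime; with an actual image one would typically register a runtime invocation instead. A sketch, assuming a Singularity/Apptainer installation (only the call_fmt value differs from the setup above):

# Hypothetical registration against a real container runtime;
# 'singularity' could equally be 'apptainer' on newer systems
subds.containers_add(
    'mycontainer',
    url=get_local_file_url(op.join(local_file, 'some_container.img')),
    call_fmt='singularity exec {img} {cmd}',
)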
def test_optimized_cloning(path): # make test repo with one file and one commit originpath = op.join(path, 'origin') repo = GitRepo(originpath, create=True) with open(op.join(originpath, 'test'), 'w') as f: f.write('some') repo.add('test') repo.commit('init') ok_clean_git(originpath, annex=False) from glob import glob def _get_inodes(repo): return dict( [(os.path.join(*o.split(os.sep)[-2:]), os.stat(o).st_ino) for o in glob(os.path.join(repo.path, repo.get_git_dir(repo), 'objects', '*', '*'))]) origin_inodes = _get_inodes(repo) # now clone it in different ways and see what happens to the object storage from datalad.support.network import get_local_file_url clonepath = op.join(path, 'clone') for src in (originpath, get_local_file_url(originpath)): # deprecated assert_raises(DeprecatedError, GitRepo, url=src, path=clonepath) clone = GitRepo.clone(url=src, path=clonepath, create=True) clone_inodes = _get_inodes(clone) eq_(origin_inodes, clone_inodes, msg='with src={}'.format(src)) rmtree(clonepath)
def test_optimized_cloning(path): # make test repo with one file and one commit originpath = op.join(path, 'origin') repo = GitRepo(originpath, create=True) with open(op.join(originpath, 'test'), 'w') as f: f.write('some') repo.add('test') repo.commit('init') ok_clean_git(originpath, annex=False) from glob import glob def _get_inodes(repo): return dict([ (os.path.join(*o.split(os.sep)[-2:]), os.stat(o).st_ino) for o in glob(os.path.join(repo.repo.git_dir, 'objects', '*', '*')) ]) origin_inodes = _get_inodes(repo) # now clone it in different ways and see what happens to the object storage from datalad.support.network import get_local_file_url clonepath = op.join(path, 'clone') for src in (originpath, get_local_file_url(originpath)): # deprecated assert_raises(DeprecatedError, GitRepo, url=src, path=clonepath) clone = GitRepo.clone(url=src, path=clonepath, create=True) clone_inodes = _get_inodes(clone) eq_(origin_inodes, clone_inodes, msg='with src={}'.format(src)) rmtree(clonepath)
def test_ria_push(srcpath, dstpath): # complex test involving a git remote, a special remote, and a # publication dependency src = Dataset(srcpath).create() testfile = src.pathobj / 'test_mod_annex_file' testfile.write_text("Heavy stuff.") src.save() assert_status( 'ok', src.create_sibling_ria( "ria+{}".format(get_local_file_url(dstpath, compatibility='git')), "datastore")) res = src.push(to='datastore') assert_in_results(res, action='publish', target='datastore', status='ok', refspec='refs/heads/master:refs/heads/master') assert_in_results(res, action='publish', target='datastore', status='ok', refspec='refs/heads/git-annex:refs/heads/git-annex') assert_in_results(res, action='copy', target='datastore-storage', status='ok', path=str(testfile))
def test_ria_postclonecfg(): if not has_symlink_capability(): # This is needed to create an ORA remote using an URL for upload, # that is then invalidated later on (delete the symlink it's based on). raise SkipTest("Can't create symlinks") from datalad.utils import make_tempfile from datalad.tests.utils import HTTPPath with make_tempfile(mkdir=True) as lcl, make_tempfile(mkdir=True) as store: id = _postclonetest_prepare(lcl, store) # test cloning via ria+file:// yield _test_ria_postclonecfg, get_local_file_url( store, compatibility='git'), id # Note: HTTP disabled for now. Requires proper implementation in ORA # remote. See # https://github.com/datalad/datalad/pull/4203#discussion_r410284649 # # test cloning via ria+http:// # with HTTPPath(store) as url: # yield _test_ria_postclonecfg, url, id # test cloning via ria+ssh:// yield skip_ssh(_test_ria_postclonecfg), \ "ssh://datalad-test:{}".format(Path(store).as_posix()), id
def test_custom_call_fmt(path, local_file): ds = Dataset(path).create() subds = ds.create('sub') # plug in a proper singularity image subds.containers_add( 'mycontainer', url=get_local_file_url(op.join(local_file, 'some_container.img')), image='righthere', call_fmt='echo image={img} cmd={cmd} img_dspath={img_dspath} ' # and environment variable being set/propagated by default 'name=$DATALAD_CONTAINER_NAME' ) ds.save() # record the effect in super-dataset # Running should work fine either within sub or within super with swallow_outputs() as cmo: subds.containers_run('XXX', container_name='mycontainer') assert_in('image=righthere cmd=XXX img_dspath=. name=mycontainer', cmo.out) with swallow_outputs() as cmo: ds.containers_run('XXX', container_name='sub/mycontainer') assert_in('image=sub/righthere cmd=XXX img_dspath=sub', cmo.out) # Test within subdirectory of the super-dataset subdir = op.join(ds.path, 'subdir') os.mkdir(subdir) with chpwd(subdir): with swallow_outputs() as cmo: containers_run('XXX', container_name='sub/mycontainer') assert_in('image=../sub/righthere cmd=XXX img_dspath=../sub', cmo.out)
def test_get_local_file_url_compatibility(path): # smoke test for file:// URL compatibility with other datalad/git/annex # pieces path = Path(path) ds1 = Dataset(path / 'ds1').create() ds2 = Dataset(path / 'ds2').create() testfile = path / 'testfile.txt' testfile.write_text('some') # compat with annex addurl ds1.repo.add_url_to_file( 'test.txt', get_local_file_url(testfile, compatibility='git-annex')) # compat with git clone/submodule assert_status( 'ok', ds1.clone(get_local_file_url(ds2.path, compatibility='git'), result_xfm=None, return_type='generator'))
def test_container_update(ds_path, local_file, url): url_foo = get_local_file_url(op.join(local_file, 'foo.img')) url_bar = get_local_file_url(op.join(local_file, 'bar.img')) img = op.join(".datalad", "environments", "foo", "image") ds = Dataset(ds_path).create() ds.containers_add(name="foo", call_fmt="call-fmt1", url=url_foo) # Abort without --update flag. res = ds.containers_add(name="foo", on_failure="ignore") assert_result_count(res, 1, action="containers_add", status="impossible") # Abort if nothing to update is specified. res = ds.containers_add(name="foo", update=True, on_failure="ignore") assert_result_count(res, 1, action="containers_add", status="impossible", message="No values to update specified") # Update call format. ds.containers_add(name="foo", update=True, call_fmt="call-fmt2") assert_equal(ds.config.get("datalad.containers.foo.cmdexec"), "call-fmt2") ok_file_has_content(op.join(ds.path, img), "foo") # Update URL/image. ds.drop(img) # Make sure it works even with absent content. res = ds.containers_add(name="foo", update=True, url=url_bar) assert_result_count(res, 1, action="remove", status="ok", path=img) assert_result_count(res, 1, action="save", status="ok") ok_file_has_content(op.join(ds.path, img), "bar") # Test commit message # In the above case it was updating existing image so should have "Update " get_commit_msg = lambda *args: ds.repo.format_commit('%B') assert_in("Update ", get_commit_msg()) # If we add a new image with update=True should say Configure res = ds.containers_add(name="foo2", update=True, url=url_bar) assert_in("Configure ", get_commit_msg())
def test_get_local_file_url(): for path, url in ( # relpaths are special-cased below ('test.txt', 'test.txt'), ) + ( ('C:\\Windows\\notepad.exe', 'file://C/Windows/notepad.exe'), ) if on_windows else ( (OBSCURE_FILENAME, urlquote(OBSCURE_FILENAME)), ('/a', 'file:///a'), ('/a/b/c', 'file:///a/b/c'), ('/a~', 'file:///a~'), # there are no files with trailing slashes in the name #('/a b/', 'file:///a%20b/'), ('/a b/name', 'file:///a%20b/name'), ): # Yarik found no better way to trigger. .decode() isn't enough print("D: %s" % path) if isabs(path): eq_(get_local_file_url(path), url) else: eq_(get_local_file_url(path), '/'.join( (get_local_file_url(os.getcwd()), url)))
def test_container_files(ds_path, local_file, url): # setup things to add # # Note: Since "adding" as a container doesn't actually call anything or use # the container in some way, but simply registers it, for testing any file # is sufficient. local_file = get_local_file_url(op.join(local_file, 'some_container.img')) # prepare dataset: ds = Dataset(ds_path).create() # non-default location: ds.config.add("datalad.containers.location", value=op.join(".datalad", "test-environments"), where='dataset') ds.save(message="Configure container mountpoint") # no containers yet: res = ds.containers_list(**RAW_KWDS) assert_result_count(res, 0) # add first "image": must end up at the configured default location target_path = op.join(ds.path, ".datalad", "test-environments", "first", "image") res = ds.containers_add(name="first", url=local_file) ok_clean_git(ds.repo) assert_result_count(res, 1, status="ok", type="file", path=target_path, action="containers_add") ok_(op.lexists(target_path)) res = ds.containers_list(**RAW_KWDS) assert_result_count(res, 1) assert_result_count(res, 1, name='first', type='file', action='containers', status='ok', path=target_path) # and kill it again # but needs name assert_raises(TypeError, ds.containers_remove) res = ds.containers_remove('first', remove_image=True) assert_status('ok', res) assert_result_count(ds.containers_list(**RAW_KWDS), 0) # image removed assert (not op.lexists(target_path))
def test_no_storage(store1=None, store2=None, ds_path=None): store1_url = 'ria+' + get_local_file_url(store1) store2_url = 'ria+' + get_local_file_url(store2) ds = Dataset(ds_path).create(force=True) ds.save(recursive=True) assert_repo_status(ds.path) res = ds.create_sibling_ria(store1_url, "datastore1", storage_sibling=False, new_store_ok=True) assert_result_count(res, 1, status='ok', action='create-sibling-ria') eq_({'datastore1', 'here'}, {s['name'] for s in ds.siblings(result_renderer='disabled')}) # deprecated way of disabling storage still works res = ds.create_sibling_ria(store2_url, "datastore2", storage_sibling=False, new_store_ok=True) assert_result_count(res, 1, status='ok', action='create-sibling-ria') eq_({'datastore2', 'datastore1', 'here'}, {s['name'] for s in ds.siblings(result_renderer='disabled')}) # no annex/object dir should be created when there is no special remote # to use it. for s in [store1, store2]: p = Path(s) / ds.id[:3] / ds.id[3:] / 'annex' / 'objects' assert_false(p.exists()) # smoke test that we can push to it res = ds.push(to='datastore1') assert_status('ok', res) # but nothing was copied, because there is no storage sibling assert_result_count(res, 0, action='copy')
def _resolve_img_url(url): """Takes a URL and tries to resolve it to an actual download URL that `annex addurl` can handle""" if op.exists(url): lgr.debug('Convert local path specification into a file:// URL') # annex wants a real url url = get_local_file_url(url) elif url.startswith('shub://'): lgr.debug('Query singularity-hub for image download URL') import requests req = requests.get( 'https://www.singularity-hub.org/api/container/{}'.format(url[7:])) shub_info = loads(req.text) url = shub_info['image'] return url
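An illustration of the two branches handled above; a sketch only, since the helper is private to the containers_add module of the datalad-container extension, and the paths and the shub:// name are hypothetical:

# A local file that exists is turned into a file:// URL for `annex addurl` ...
print(_resolve_img_url('/tmp/some_container.img'))   # -> file:///tmp/some_container.img
# ... while a shub:// name is resolved via the (now defunct) Singularity Hub
# API to a direct download URL (requires network access):
# _resolve_img_url('shub://vsoch/hello-world')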
def test_add_local_path(path, local_file): ds = Dataset(path).create() res = ds.containers_add(name="foobert", url=op.join(local_file, "foo.img")) foo_target = op.join(path, ".datalad", "environments", "foobert", "image") assert_result_count(res, 1, status="ok", type="file", path=foo_target, action="containers_add") # We've just copied and added the file. assert_not_in(ds.repo.WEB_UUID, ds.repo.whereis(foo_target)) # We can force the URL to be added. (Note: This works because datalad # overrides 'annex.security.allowed-url-schemes' in its tests.) ds.containers_add(name="barry", url=get_local_file_url(op.join(local_file, "bar.img"))) bar_target = op.join(path, ".datalad", "environments", "barry", "image") assert_in(ds.repo.WEB_UUID, ds.repo.whereis(bar_target))
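Outside the test suite, git-annex would refuse the file:// URL by default; the comment above alludes to DataLad's test setup overriding that. A sketch of loosening the restriction per repository, assuming git-annex's `annex.security.allowed-url-schemes` config and mirroring the ConfigManager usage seen elsewhere in this code:

# Permit file:// in addition to git-annex's default schemes for this repo,
# so that containers_add(url=get_local_file_url(...)) can be annexed
ds.config.set('annex.security.allowed-url-schemes', 'http https file',
              where='local', reload=True)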
def test_storage_only(base_path, ds_path): store_url = 'ria+' + get_local_file_url(base_path) ds = Dataset(ds_path).create(force=True) ds.save(recursive=True) assert_repo_status(ds.path) res = ds.create_sibling_ria(store_url, "datastore", storage_sibling='only') assert_result_count(res, 1, status='ok', action='create-sibling-ria') eq_(len(res), 1) # the storage sibling uses the main name, not -storage siblings = ds.siblings(result_renderer=None) eq_({'datastore', 'here'}, {s['name'] for s in siblings}) # smoke test that we can push to it res = ds.push(to='datastore') assert_status('ok', res) assert_result_count(res, 1, action='copy')
def clone_dataset( srcs, destds, reckless=None, description=None, result_props=None, cfg=None): """Internal helper to perform cloning without sanity checks (assumed done) This helper does not handle any saving of subdataset modification or adding in a superdataset. Parameters ---------- srcs : list Any suitable clone source specifications (paths, URLs) destds : Dataset Dataset instance for the clone destination reckless : {None, 'auto', 'ephemeral', 'shared-...'}, optional Mode switch to put cloned dataset into unsafe/throw-away configurations, i.e. sacrifice data safety for performance or resource footprint. When None and `cfg` is specified, use the value of `datalad.clone.reckless`. description : str, optional Location description for the annex of the dataset clone (if there is any). result_props : dict, optional Default properties for any yielded result, passed on to get_status_dict(). cfg : ConfigManager, optional Configuration for parent dataset. This will be queried instead of the global DataLad configuration. Yields ------ dict DataLad result records """ if not result_props: # in case the caller had no specific idea on how results should look # like, provide sensible defaults result_props = dict( action='install', logger=lgr, ds=destds, ) if reckless is None and cfg: # if reckless is not explicitly given, but we operate on a # superdataset, query whether it has been instructed to operate # in a reckless mode, and inherit it for the coming clone reckless = cfg.get('datalad.clone.reckless', None) dest_path = destds.pathobj # decode all source candidate specifications candidate_sources = [decode_source_spec(s, cfg=cfg) for s in srcs] # now expand the candidate sources with additional variants of the decoded # giturl, while duplicating the other properties in the additional records # for simplicity. The hope is to overcome a few corner cases and be more # robust than git clone candidate_sources = [ dict(props, giturl=s) for props in candidate_sources for s in _get_flexible_source_candidates(props['giturl']) ] # important test! based on this `rmtree` will happen below after failed clone dest_path_existed = dest_path.exists() if dest_path_existed and any(dest_path.iterdir()): if destds.is_installed(): # check if dest was cloned from the given source before # this is where we would have installed this from # this is where it was actually installed from track_name, track_url = _get_tracking_source(destds) try: # this will get us track_url in system native path conventions, # whenever it is a path (and not a URL) # this is needed to match it to any potentially incoming local # source path in the 'notneeded' test below track_path = str(Path(track_url)) except Exception: # this should never happen, because Path() will let any non-path stringification # pass through unmodified, but we do not want any potential crash due to # pathlib behavior changes lgr.debug("Unexpected behavior of pathlib!") track_path = None for cand in candidate_sources: src = cand['giturl'] if track_url == src \ or (not is_url(track_url) and get_local_file_url(track_url, compatibility='git') == src) \ or track_path == expanduser(src): yield get_status_dict( status='notneeded', message=("dataset %s was already cloned from '%s'", destds, src), **result_props) return # anything else is an error yield get_status_dict( status='error', message='target path already exists and not empty, refuse to clone into target path', **result_props) return log_progress( lgr.info, 'cloneds', 'Cloning dataset to %s', destds, total=len(candidate_sources), label='Clone attempt', unit=' Candidate locations', ) error_msgs = OrderedDict() # accumulate all error messages formatted per each url for cand in candidate_sources: log_progress( lgr.info, 'cloneds', 'Attempting to clone from %s to %s', cand['giturl'], dest_path, update=1, increment=True) clone_opts = {} if cand.get('version', None): clone_opts['branch'] = cand['version'] try: # TODO for now GitRepo.clone() cannot handle Path instances, and PY35 # doesn't make it happen seamlessly GitRepo.clone( path=str(dest_path), url=cand['giturl'], clone_options=clone_opts, create=True) except CommandError as e: e_stderr = e.stderr error_msgs[cand['giturl']] = e lgr.debug("Failed to clone from URL: %s (%s)", cand['giturl'], exc_str(e)) if dest_path.exists(): lgr.debug("Wiping out unsuccessful clone attempt at: %s", dest_path) # We must not just rmtree since it might be curdir etc # we should remove all files/directories under it # TODO stringification can be removed once pathlib compatible # or if PY35 is no longer supported rmtree(str(dest_path), children_only=dest_path_existed) if e_stderr and 'could not create work tree' in e_stderr.lower(): # this cannot be fixed by trying another URL re_match = re.match(r".*fatal: (.*)$", e_stderr, flags=re.MULTILINE | re.DOTALL) # cancel progress bar log_progress( lgr.info, 'cloneds', 'Completed clone attempts for %s', destds ) yield get_status_dict( status='error', message=re_match.group(1).strip() if re_match else "stderr: " + e_stderr, **result_props) return # next candidate continue result_props['source'] = cand # do not bother with other sources if succeeded break log_progress( lgr.info, 'cloneds', 'Completed clone attempts for %s', destds ) if not destds.is_installed(): if len(error_msgs): if all(not e.stdout and not e.stderr for e in error_msgs.values()): # there is nothing we can learn from the actual exception, # the exit code is uninformative, the command is predictable error_msg = "Failed to clone from all attempted sources: %s" error_args = list(error_msgs.keys()) else: error_msg = "Failed to clone from any candidate source URL. " \ "Encountered errors per each url were:\n- %s" error_args = '\n- '.join( '{}\n {}'.format(url, exc_str(exc)) for url, exc in error_msgs.items() ) else: # yoh: Not sure if we ever get here but I felt that there could # be a case when this might happen and original error would # not be sufficient to troubleshoot what is going on. error_msg = "Awkward error -- we failed to clone properly. " \ "Although no errors were encountered, target " \ "dataset at %s seems to be not fully installed. " \ "The 'successful' source was: %s" error_args = (destds.path, cand['giturl']) yield get_status_dict( status='error', message=(error_msg, error_args), **result_props) return if not cand.get("version"): postclone_check_head(destds) # act on --reckless=shared-... # must happen prior git-annex-init, where we can cheaply alter the repo # setup through safe re-init'ing if reckless and reckless.startswith('shared-'): lgr.debug('Reinit %s to enable shared access permissions', destds) destds.repo.call_git(['init', '--shared={}'.format(reckless[7:])]) yield from postclonecfg_annexdataset( destds, reckless, description) # perform any post-processing that needs to know details of the clone # source if result_props['source']['type'] == 'ria': yield from postclonecfg_ria(destds, result_props['source']) if reckless: # store the reckless setting in the dataset to make it # known to later clones of subdatasets via get() destds.config.set( 'datalad.clone.reckless', reckless, where='local', reload=True) # yield successful clone of the base dataset now, as any possible # subdataset clone down below will not alter the Git-state of the # parent yield get_status_dict(status='ok', **result_props)
def __call__( source, path=None, dataset=None, description=None, reckless=False, alt_sources=None): # TODO next ones should be there, but cannot go anywhere # git_opts=None, # git_clone_opts=None, # annex_opts=None, # annex_init_opts=None # did we explicitly get a dataset to install into? # if we got a dataset, path will be resolved against it. # Otherwise path will be resolved first. dataset = require_dataset( dataset, check_installed=True, purpose='cloning') \ if dataset is not None else dataset refds_path = dataset.path if dataset else None if isinstance(source, Dataset): source = source.path if source == path: # even if they turn out to be identical after resolving symlinks # and more sophisticated witchcraft, it would still happily say # "it appears to be already installed", so we just catch an # obviously pointless input combination raise ValueError( "clone `source` and destination `path` are identical [{}]. " "If you are trying to add a subdataset simply use `add`".format( path)) if path is not None: path = resolve_path(path, dataset) # Possibly do conversion from source into a git-friendly url # luckily GitRepo will undo any fancy file:/// url to make use of Git's # optimization for local clones.... source_url = source source_ = _get_git_url_from_source(source) lgr.debug("Resolved clone source from '%s' to '%s'", source, source_) source = source_ # derive target from source: if path is None: # we got nothing but a source. do something similar to git clone # and derive the path from the source and continue path = _get_installationpath_from_url(source) # since this is a relative `path`, resolve it: path = resolve_path(path, dataset) lgr.debug("Determined clone target path from source") lgr.debug("Resolved clone target path to: '%s'", path) # there is no other way -- my intoxicated brain tells me assert(path is not None) destination_dataset = Dataset(path) dest_path = path status_kwargs = dict( action='install', ds=destination_dataset, logger=lgr, refds=refds_path, source_url=source_url) # important test! based on this `rmtree` will happen below after failed clone if exists(dest_path) and listdir(dest_path): if destination_dataset.is_installed(): # check if dest was cloned from the given source before # this is where we would have installed this from guessed_sources = _get_flexible_source_candidates( source, dest_path) # this is where it was actually installed from track_name, track_url = _get_tracking_source(destination_dataset) if track_url in guessed_sources or \ get_local_file_url(track_url) in guessed_sources: yield get_status_dict( status='notneeded', message=("dataset %s was already cloned from '%s'", destination_dataset, source), **status_kwargs) return # anything else is an error yield get_status_dict( status='error', message='target path already exists and not empty, refuse to clone into target path', **status_kwargs) return if dataset is not None and relpath(path, start=dataset.path).startswith(pardir): yield get_status_dict( status='error', message=("clone target path '%s' not in specified target dataset '%s'", path, dataset), **status_kwargs) return # generate candidate URLs from source argument to overcome a few corner cases # and hopefully be more robust than git clone candidate_sources = [] # combine all given sources (incl. alternatives), maintain order for s in [source] + assure_list(alt_sources): candidate_sources.extend(_get_flexible_source_candidates(s)) candidates_str = \ " [%d other candidates]" % (len(candidate_sources) - 1) \ if len(candidate_sources) > 1 \ else '' lgr.info("Cloning %s%s into '%s'", source, candidates_str, dest_path) dest_path_existed = exists(dest_path) error_msgs = OrderedDict() # accumulate all error messages formatted per each url for isource_, source_ in enumerate(candidate_sources): try: lgr.debug("Attempting to clone %s (%d out of %d candidates) to '%s'", source_, isource_ + 1, len(candidate_sources), dest_path) GitRepo.clone(path=dest_path, url=source_, create=True) break # do not bother with other sources if succeeded except GitCommandError as e: error_msgs[source_] = exc_str_ = exc_str(e) lgr.debug("Failed to clone from URL: %s (%s)", source_, exc_str_) if exists(dest_path): lgr.debug("Wiping out unsuccessful clone attempt at: %s", dest_path) # We must not just rmtree since it might be curdir etc # we should remove all files/directories under it rmtree(dest_path, children_only=dest_path_existed) # Whenever progress reporting is enabled, as it is now, # we end up without e.stderr since it is "processed" out by # GitPython/our progress handler. e_stderr = e.stderr from datalad.support.gitrepo import GitPythonProgressBar if not e_stderr and GitPythonProgressBar._last_error_lines: e_stderr = os.linesep.join(GitPythonProgressBar._last_error_lines) if 'could not create work tree' in e_stderr.lower(): # this cannot be fixed by trying another URL re_match = re.match(r".*fatal: (.*)$", e_stderr, flags=re.MULTILINE | re.DOTALL) yield get_status_dict( status='error', message=re_match.group(1) if re_match else "stderr: " + e_stderr, **status_kwargs) return if not destination_dataset.is_installed(): if len(error_msgs): error_msg = "Failed to clone from any candidate source URL. " \ "Encountered errors per each url were: %s" error_args = (error_msgs, ) else: # yoh: Not sure if we ever get here but I felt that there could # be a case when this might happen and original error would # not be sufficient to troubleshoot what is going on. error_msg = "Awkward error -- we failed to clone properly. " \ "Although no errors were encountered, target " \ "dataset at %s seems to be not fully installed. " \ "The 'successful' source was: %s" error_args = (destination_dataset.path, source_) yield get_status_dict( status='error', message=(error_msg, error_args), **status_kwargs) return if dataset is not None: # we created a dataset in another dataset # -> make submodule for r in dataset.save( dest_path, return_type='generator', result_filter=None, result_xfm=None, on_failure='ignore'): yield r _handle_possible_annex_dataset( destination_dataset, reckless, description=description) # yield successful clone of the base dataset now, as any possible # subdataset clone down below will not alter the Git-state of the # parent yield get_status_dict(status='ok', **status_kwargs)
def __call__(source, path=None, dataset=None, description=None, reckless=False, alt_sources=None): # TODO next ones should be there, but cannot go anywhere # git_opts=None, # git_clone_opts=None, # annex_opts=None, # annex_init_opts=None # did we explicitly get a dataset to install into? # if we got a dataset, path will be resolved against it. # Otherwise path will be resolved first. dataset = require_dataset( dataset, check_installed=True, purpose='cloning') \ if dataset is not None else dataset refds_path = dataset.path if dataset else None if isinstance(source, Dataset): source = source.path if source == path: # even if they turn out to be identical after resolving symlinks # and more sophisticated witchcraft, it would still happily say # "it appears to be already installed", so we just catch an # obviously pointless input combination raise ValueError( "clone `source` and destination `path` are identical [{}]. " "If you are trying to add a subdataset simply use `add`". format(path)) if path is not None: path = resolve_path(path, dataset) # Possibly do conversion from source into a git-friendly url # luckily GitRepo will undo any fancy file:/// url to make use of Git's # optimization for local clones.... source_url = source source_ = _get_git_url_from_source(source) lgr.debug("Resolved clone source from '%s' to '%s'", source, source_) source = source_ # derive target from source: if path is None: # we got nothing but a source. do something similar to git clone # and derive the path from the source and continue path = _get_installationpath_from_url(source) # since this is a relative `path`, resolve it: path = resolve_path(path, dataset) lgr.debug("Determined clone target path from source") lgr.debug("Resolved clone target path to: '%s'", path) # there is no other way -- my intoxicated brain tells me assert (path is not None) destination_dataset = Dataset(path) dest_path = path status_kwargs = dict(action='install', ds=destination_dataset, logger=lgr, refds=refds_path, source_url=source_url) # important test! based on this `rmtree` will happen below after failed clone if exists(dest_path) and listdir(dest_path): if destination_dataset.is_installed(): # check if dest was cloned from the given source before # this is where we would have installed this from guessed_sources = _get_flexible_source_candidates( source, dest_path) # this is where it was actually installed from track_name, track_url = _get_tracking_source( destination_dataset) if track_url in guessed_sources or \ get_local_file_url(track_url) in guessed_sources: yield get_status_dict( status='notneeded', message=("dataset %s was already cloned from '%s'", destination_dataset, source), **status_kwargs) return # anything else is an error yield get_status_dict( status='error', message= 'target path already exists and not empty, refuse to clone into target path', **status_kwargs) return if dataset is not None and relpath( path, start=dataset.path).startswith(pardir): yield get_status_dict( status='error', message= ("clone target path '%s' not in specified target dataset '%s'", path, dataset), **status_kwargs) return # generate candidate URLs from source argument to overcome a few corner cases # and hopefully be more robust than git clone candidate_sources = [] # combine all given sources (incl. alternatives), maintain order for s in [source] + assure_list(alt_sources): candidate_sources.extend(_get_flexible_source_candidates(s)) candidates_str = \ " [%d other candidates]" % (len(candidate_sources) - 1) \ if len(candidate_sources) > 1 \ else '' lgr.info("Cloning %s%s into '%s'", source, candidates_str, dest_path) dest_path_existed = exists(dest_path) error_msgs = OrderedDict( ) # accumulate all error messages formatted per each url for isource_, source_ in enumerate(candidate_sources): try: lgr.debug( "Attempting to clone %s (%d out of %d candidates) to '%s'", source_, isource_ + 1, len(candidate_sources), dest_path) GitRepo.clone(path=dest_path, url=source_, create=True) break # do not bother with other sources if succeeded except GitCommandError as e: error_msgs[source_] = exc_str_ = exc_str(e) lgr.debug("Failed to clone from URL: %s (%s)", source_, exc_str_) if exists(dest_path): lgr.debug("Wiping out unsuccessful clone attempt at: %s", dest_path) # We must not just rmtree since it might be curdir etc # we should remove all files/directories under it rmtree(dest_path, children_only=dest_path_existed) # Whenever progress reporting is enabled, as it is now, # we end up without e.stderr since it is "processed" out by # GitPython/our progress handler. e_stderr = e.stderr from datalad.support.gitrepo import GitPythonProgressBar if not e_stderr and GitPythonProgressBar._last_error_lines: e_stderr = os.linesep.join( GitPythonProgressBar._last_error_lines) if 'could not create work tree' in e_stderr.lower(): # this cannot be fixed by trying another URL re_match = re.match(r".*fatal: (.*)$", e_stderr, flags=re.MULTILINE | re.DOTALL) yield get_status_dict( status='error', message=re_match.group(1) if re_match else "stderr: " + e_stderr, **status_kwargs) return if not destination_dataset.is_installed(): if len(error_msgs): error_msg = "Failed to clone from any candidate source URL. " \ "Encountered errors per each url were: %s" error_args = (error_msgs, ) else: # yoh: Not sure if we ever get here but I felt that there could # be a case when this might happen and original error would # not be sufficient to troubleshoot what is going on. error_msg = "Awkward error -- we failed to clone properly. " \ "Although no errors were encountered, target " \ "dataset at %s seems to be not fully installed. " \ "The 'successful' source was: %s" error_args = (destination_dataset.path, source_) yield get_status_dict(status='error', message=(error_msg, error_args), **status_kwargs) return if dataset is not None: # we created a dataset in another dataset # -> make submodule for r in dataset.add(dest_path, save=True, ds2super=True, return_type='generator', result_filter=None, result_xfm=None, on_failure='ignore'): yield r _handle_possible_annex_dataset(destination_dataset, reckless, description=description) # yield successful clone of the base dataset now, as any possible # subdataset clone down below will not alter the Git-state of the # parent yield get_status_dict(status='ok', **status_kwargs)
def _postclonetest_prepare(lcl, storepath, link): from datalad.customremotes.ria_utils import (create_store, create_ds_in_store, get_layout_locations) from datalad.distributed.ora_remote import ( LocalIO, ) create_tree(lcl, tree={ 'ds': { 'test.txt': 'some', 'subdir': { 'subds': { 'testsub.txt': 'somemore' }, 'subgit': { 'testgit.txt': 'even more' } }, }, }) # create a local dataset with a subdataset lcl = Path(lcl) storepath = Path(storepath) link = Path(link) link.symlink_to(storepath) subds = Dataset(lcl / 'ds' / 'subdir' / 'subds').create(force=True) subds.save() # add a plain git dataset as well subgit = Dataset(lcl / 'ds' / 'subdir' / 'subgit').create(force=True, annex=False) subgit.save() ds = Dataset(lcl / 'ds').create(force=True) ds.save(version_tag='original') assert_repo_status(ds.path) io = LocalIO() create_store(io, storepath, '1') # URL to use for upload. Point is, that this should be invalid for the clone # so that autoenable would fail. Therefore let it be based on a to be # deleted symlink upl_url = "ria+{}".format(get_local_file_url(str(link))) for d in (ds, subds, subgit): # TODO: create-sibling-ria required for config! => adapt to RF'd # creation (missed on rebase?) create_ds_in_store(io, storepath, d.id, '2', '1') d.create_sibling_ria(upl_url, "store") if d is not subgit: # Now, simulate the problem by reconfiguring the special remote to # not be autoenabled. # Note, however, that the actual intention is a URL, that isn't # valid from the point of view of the clone (doesn't resolve, no # credentials, etc.) and therefore autoenabling on git-annex-init # when datalad-cloning would fail to succeed. Runner(cwd=d.path).run([ 'git', 'annex', 'enableremote', 'store-storage', 'autoenable=false' ]) d.push('.', to='store') store_loc, _, _ = get_layout_locations(1, storepath, d.id) Runner(cwd=str(store_loc)).run(['git', 'update-server-info']) link.unlink() # We should now have a store with datasets that have an autoenabled ORA # remote relying on an inaccessible URL. # datalad-clone is supposed to reconfigure based on the URL we cloned from. # Test this feature for cloning via HTTP, SSH and FILE URLs. return ds.id
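For orientation, a sketch of the store layout that get_layout_locations() resolves for layout version 1; the id-split is grounded in the 'annex/objects' check in test_no_storage above, while the names of the remaining tuple elements are assumptions:

# Each dataset lives at <store>/<id[:3]>/<id[3:]> as a bare git repository
store_git_dir, _, _ = get_layout_locations(1, storepath, ds.id)
assert store_git_dir == storepath / ds.id[:3] / ds.id[3:]
# the dataset's annex objects are kept underneath that directory, i.e.
# <store>/<id[:3]>/<id[3:]>/annex/objects (cf. the check in test_no_storage)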
def __call__(path=None, source=None, dataset=None, get_data=False, description=None, recursive=False, recursion_limit=None, if_dirty='save-before', save=True, reckless=False, git_opts=None, git_clone_opts=None, annex_opts=None, annex_init_opts=None, jobs=None): # normalize path argument to be equal when called from cmdline and # python and nothing was passed into `path` path = assure_list(path) if not source and not path: raise InsufficientArgumentsError( "Please provide at least a source or a path") ## Common kwargs to pass to underlying git/install calls. # They might need adjustments (e.g. for recursion_limit, but # otherwise would be applicable throughout # # There should have been more of common options! # since underneath get could do similar installs, but now they # have duplicated implementations which differ (e.g. get does not # annex init installed annexes) common_kwargs = dict( get_data=get_data, recursive=recursive, recursion_limit=recursion_limit, git_opts=git_opts, annex_opts=annex_opts, reckless=reckless, jobs=jobs, ) installed_items = [] failed_items = [] # did we explicitly get a dataset to install into? # if we got a dataset, path will be resolved against it. # Otherwise path will be resolved first. ds = None if dataset is not None: ds = require_dataset(dataset, check_installed=True, purpose='installation') handle_dirty_dataset(ds, if_dirty) # switch into scenario without --source: if source is None: # we need to collect URLs and paths to_install = [] to_get = [] for urlpath in path: ri = RI(urlpath) (to_get if isinstance(ri, PathRI) else to_install).append(urlpath) common_kwargs['dataset'] = dataset # first install, and then get for s in to_install: lgr.debug("Install passes into install source=%s", s) try: result = Install.__call__(source=s, description=description, if_dirty=if_dirty, save=save, git_clone_opts=git_clone_opts, annex_init_opts=annex_init_opts, **common_kwargs) installed_items += assure_list(result) except Exception as exc: lgr.warning("Installation of %s has failed: %s", s, exc_str(exc)) failed_items.append(s) if to_get: lgr.debug("Install passes into get %d items", len(to_get)) # all commented out hint on inability to pass those options # into underlying install-related calls. # Also need to pass from get: # annex_get_opts try: installed_datasets = Get.__call__( to_get, # description=description, # if_dirty=if_dirty, # save=save, # git_clone_opts=git_clone_opts, # annex_init_opts=annex_init_opts _return_datasets=True, **common_kwargs) except IncompleteResultsError as exc: exc_str_ = ': ' + exc_str(exc) if exc.results else '' lgr.warning("Some items failed to install: %s", exc_str_) installed_datasets = exc.results failed_items.extend(exc.failed) # compose content_by_ds into result for dspath in installed_datasets: ds_ = Dataset(dspath) if ds_.is_installed(): installed_items.append(ds_) else: lgr.warning("%s was not installed", ds_) return Install._handle_and_return_installed_items( ds, installed_items, failed_items, save) if source and path and len(path) > 1: raise ValueError( "install needs a single PATH when source is provided. " "Was given multiple PATHs: %s" % str(path)) # parameter constraints: if not source: raise InsufficientArgumentsError( "a `source` is required for installation") # code below deals with a single path only path = path[0] if path else None if source == path: # even if they turn out to be identical after resolving symlinks # and more sophisticated witchcraft, it would still happily say # "it appears to be already installed", so we just catch an # obviously pointless input combination raise ValueError( "installation `source` and destination `path` are identical. " "If you are trying to add a subdataset simply use `save` {}". format(path)) # resolve the target location (if local) against the provided dataset # or CWD: if path is not None: # Should work out just fine for regular paths, so no additional # conditioning is necessary try: path_ri = RI(path) except Exception as e: raise ValueError("invalid path argument {}: ({})".format( path, exc_str(e))) try: # Wouldn't work for SSHRI ATM, see TODO within SSHRI # yoh: path should be a local path, and mapping note within # SSHRI about mapping localhost:path to path is kinda # a peculiar use-case IMHO path = resolve_path(path_ri.localpath, dataset) # any `path` argument that point to something local now # resolved and is no longer a URL except ValueError: # URL doesn't point to a local something # so we have an actual URL in `path`. Since this is valid as a # single positional argument, `source` has to be None at this # point. if is_datalad_compat_ri(path) and source is None: # we have an actual URL -> this should be the source lgr.debug( "Single argument given to install, that doesn't seem to " "be a local path. " "Assuming the argument identifies a source location.") source = path path = None else: # `path` is neither a valid source nor a local path. # TODO: The only thing left is a known subdataset with a # name, that is not a path; Once we correctly distinguish # between path and name of a submodule, we need to consider # this. # For now: Just raise raise ValueError("Invalid path argument {0}".format(path)) # `path` resolved, if there was any. # Possibly do conversion from source into a git-friendly url # luckily GitRepo will undo any fancy file:/// url to make use of Git's # optimization for local clones.... source = _get_git_url_from_source(source) lgr.debug("Resolved source: {0}".format(source)) # TODO: we probably need to resolve source, if it is a local path; # expandpath, normpath, ... Where exactly is the point to do it? # derive target from source: if path is None: # we got nothing but a source. do something similar to git clone # and derive the path from the source and continue lgr.debug( "Neither dataset nor target installation path provided. " "Deriving destination path from given source %s", source) path = _get_installationpath_from_url(source) # since this is a relative `path`, resolve it: path = resolve_path(path, dataset) # there is no other way -- my intoxicated brain tells me assert (path is not None) lgr.debug("Resolved installation target: {0}".format(path)) destination_dataset = Dataset(path) if destination_dataset.is_installed(): # this should not be, check if this is an error, or a reinstall # from the same source # this is where we would have installed this from candidate_sources = _get_flexible_source_candidates( source, destination_dataset.path) # this is where it was installed from track_name, track_url = _get_tracking_source(destination_dataset) if track_url in candidate_sources or get_local_file_url(track_url) in candidate_sources: # TODO: this one breaks "promise" assumptions of the repeated # invocations of install. # yoh thinks that we actually should be the ones to run update # (without merge) after basic # check that it is clean and up-to-date with its super dataset # and if so, not return here but continue with errands (recursive # installation and get_data) so we could provide the same # result if we rerun the same install twice. lgr.info( "%s was already installed from %s. Use `update` to obtain " "latest updates, or `get` or `install` with a path, not URL, " "to (re)fetch data and / or subdatasets", destination_dataset, track_url) return destination_dataset else: raise ValueError( "There is already a dataset installed at the " "destination: %s" % destination_dataset) ########### # we should know everything necessary by now # actual installation starts ########### # FLOW GUIDE: # four cases: # 1. install into a dataset # 1.1. we install a known subdataset # => git submodule update --init # 1.2. we install an existing repo as a subdataset inplace # => git submodule add + magic # 1.3. we (recursively) try to install implicit subdatasets between # ds and path # 1.4. we install a new subdataset from an explicit source # => git submodule add # 2. we "just" install from an explicit source # => git clone if ds is not None: # FLOW GUIDE: 1. # express the destination path relative to the root of # the dataset relativepath = relpath(path, start=ds.path) if relativepath.startswith(pardir): raise ValueError("installation path outside dataset " "({0})".format(path)) lgr.debug("Resolved installation target relative to dataset " "{0}: {1}".format(ds, relativepath)) # FLOW_GUIDE 1.4. lgr.info("Installing subdataset from '{0}' at: {1}".format( source, relativepath)) destination_dataset = _install_subds_from_flexible_source( ds, relativepath, source, reckless) else: # FLOW GUIDE: 2. lgr.info("Installing dataset at {0} from {1}".format(path, source)) # Currently assuming there is nothing at the target to deal with # and rely on failures raising from the git call ... # We possibly need to consider /.git URL candidate_sources = _get_flexible_source_candidates(source) _clone_from_any_source(candidate_sources, destination_dataset.path) # FLOW GUIDE: All four cases done. if not destination_dataset.is_installed(): # XXX shouldn't we just fail!? (unless some explicit --skip-failing?) lgr.error("Installation failed.") return None _handle_possible_annex_dataset(destination_dataset, reckless) lgr.debug("Installation of %s done.", destination_dataset) if not destination_dataset.is_installed(): # log error and don't report as installed item, but don't raise, # since we might be in a process of recursive installation where # a lot of other datasets can still be installed successfully. lgr.error( "Installation of {0} failed.".format(destination_dataset)) else: installed_items.append(destination_dataset) # we need to decrease the recursion limit, relative to # subdatasets now subds_recursion_limit = max(0, recursion_limit - 1) \ if isinstance(recursion_limit, int) \ else recursion_limit # Now, recursive calls: if recursive: if description: # yoh: why? especially if we somehow allow for templating them # with e.g. '%s' to catch the subdataset path lgr.warning("Description can't be assigned recursively.") subs = destination_dataset.get_subdatasets( # yes, it does make sense to combine no recursion with # recursion_limit: when the latter is 0 we get no subdatasets # reported, otherwise we always get the 1st-level subs recursive=False, recursion_limit=recursion_limit, absolute=False) if subs: lgr.debug("Obtaining subdatasets of %s: %s", destination_dataset, subs) kwargs = common_kwargs.copy() kwargs['recursion_limit'] = subds_recursion_limit rec_installed = Get.__call__( subs, # all at once dataset=destination_dataset, # TODO expose this # yoh: exactly! #annex_get_opts=annex_get_opts, **kwargs) # TODO do we want to filter this so `install` only returns # the datasets? if isinstance(rec_installed, list): installed_items.extend(rec_installed) else: installed_items.append(rec_installed) if get_data: lgr.debug("Getting data of {0}".format(destination_dataset)) kwargs = common_kwargs.copy() kwargs['recursive'] = False destination_dataset.get(curdir, **kwargs) return Install._handle_and_return_installed_items( ds, installed_items, failed_items, save)
def test_container_files(ds_path, local_file, url): # setup things to add # # Note: Since "adding" as a container doesn't actually call anything or use # the container in some way, but simply registers it, for testing any file # is sufficient. local_file = get_local_file_url(op.join(local_file, 'some_container.img')) remote_file = urljoin(url, 'some_container.img') # prepare dataset: ds = Dataset(ds_path).create() # non-default location: ds.config.add("datalad.containers.location", value=op.join(".datalad", "test-environments"), where='dataset') ds.save(message="Configure container mountpoint") # no containers yet: res = ds.containers_list() assert_result_count(res, 0) # add first "image": res = ds.containers_add(name="first", url=local_file) ok_clean_git(ds.repo) target_path = op.join(ds.path, ".datalad", "test-environments", "first") assert_result_count(res, 1, status="ok", type="file", path=target_path, action="containers_add") ok_(op.lexists(target_path)) eq_(local_file, ds.config.get("datalad.containers.first.url")) # add a "remote" one: # don't provide url in the call, but in a config: ds.config.add("datalad.containers.second.url", value=remote_file, where='dataset') ds.save(message="Configure URL for container 'second'") res = ds.containers_add(name="second") ok_clean_git(ds.repo) target_path = op.join(ds.path, ".datalad", "test-environments", "second") assert_result_count(res, 1, status="ok", type="file", path=target_path, action="containers_add") ok_(op.lexists(target_path)) # config wasn't changed: eq_(remote_file, ds.config.get("datalad.containers.second.url")) res = ds.containers_list() assert_result_count(res, 2, status='ok', type='file', action='containers_list')
def __call__(source, path=None, dataset=None, description=None, reckless=False, alt_sources=None): # TODO next ones should be there, but cannot go anywhere # git_opts=None, # git_clone_opts=None, # annex_opts=None, # annex_init_opts=None # did we explicitly get a dataset to install into? # if we got a dataset, path will be resolved against it. # Otherwise path will be resolved first. dataset = require_dataset( dataset, check_installed=True, purpose='cloning') \ if dataset is not None else dataset refds_path = dataset.path if dataset else None if isinstance(source, Dataset): source = source.path if source == path: # even if they turn out to be identical after resolving symlinks # and more sophisticated witchcraft, it would still happily say # "it appears to be already installed", so we just catch an # obviously pointless input combination raise ValueError( "clone `source` and destination `path` are identical [{}]. " "If you are trying to add a subdataset simply use `add`". format(path)) if path is not None: path = resolve_path(path, dataset) # Possibly do conversion from source into a git-friendly url # luckily GitRepo will undo any fancy file:/// url to make use of Git's # optimization for local clones.... source_url = source source_ = _get_git_url_from_source(source) lgr.debug("Resolved clone source from '%s' to '%s'", source, source_) source = source_ # derive target from source: if path is None: # we got nothing but a source. do something similar to git clone # and derive the path from the source and continue path = _get_installationpath_from_url(source) # since this is a relative `path`, resolve it: path = resolve_path(path, dataset) lgr.debug("Determined clone target path from source") lgr.debug("Resolved clone target path to: '%s'", path) # there is no other way -- my intoxicated brain tells me assert (path is not None) destination_dataset = Dataset(path) dest_path = path status_kwargs = dict(action='install', ds=destination_dataset, logger=lgr, refds=refds_path, source_url=source_url) # important test! based on this `rmtree` will happen below after failed clone if exists(dest_path) and listdir(dest_path): if destination_dataset.is_installed(): # check if dest was cloned from the given source before # this is where we would have installed this from guessed_sources = _get_flexible_source_candidates( source, dest_path) # this is where it was actually installed from track_name, track_url = _get_tracking_source( destination_dataset) if track_url in guessed_sources or \ get_local_file_url(track_url) in guessed_sources: yield get_status_dict( status='notneeded', message=("dataset %s was already cloned from '%s'", destination_dataset, source), **status_kwargs) return # anything else is an error yield get_status_dict( status='error', message= 'target path already exists and not empty, refuse to clone into target path', **status_kwargs) return if dataset is not None and relpath( path, start=dataset.path).startswith(pardir): yield get_status_dict( status='error', message= ("clone target path '%s' not in specified target dataset '%s'", path, dataset), **status_kwargs) return # generate candidate URLs from source argument to overcome a few corner cases # and hopefully be more robust than git clone candidate_sources = [] # combine all given sources (incl. alternatives), maintain order for s in [source] + assure_list(alt_sources): candidate_sources.extend(_get_flexible_source_candidates(s)) lgr.info("Cloning %s to '%s'", source, dest_path) for isource_, source_ in enumerate(candidate_sources): try: lgr.debug( "Attempting to clone %s (%d out of %d candidates) to '%s'", source_, isource_ + 1, len(candidate_sources), dest_path) GitRepo.clone(path=dest_path, url=source_, create=True) break # do not bother with other sources if succeeded except GitCommandError as e: lgr.debug("Failed to clone from URL: %s (%s)", source_, exc_str(e)) if exists(dest_path): lgr.debug("Wiping out unsuccessful clone attempt at: %s", dest_path) rmtree(dest_path) if 'could not create work tree' in e.stderr.lower(): # this cannot be fixed by trying another URL yield get_status_dict( status='error', message=re.match(r".*fatal: (.*)\n", e.stderr, flags=re.MULTILINE | re.DOTALL).group(1), **status_kwargs) return if not destination_dataset.is_installed(): yield get_status_dict( status='error', message=( "Failed to clone data from any candidate source URL: %s", candidate_sources), **status_kwargs) return if dataset is not None: # we created a dataset in another dataset # -> make submodule for r in dataset.add(dest_path, save=True, ds2super=True, return_type='generator', result_filter=None, result_xfm=None, on_failure='ignore'): yield r _handle_possible_annex_dataset(destination_dataset, reckless, description=description) # yield successful clone of the base dataset now, as any possible # subdataset clone down below will not alter the Git-state of the # parent yield get_status_dict(status='ok', **status_kwargs)
def test_container_from_subdataset(ds_path, src_subds_path, local_file):

    # prepare a to-be subdataset with a registered container
    src_subds = Dataset(src_subds_path).create()
    src_subds.containers_add(
        name="first",
        url=get_local_file_url(op.join(local_file, 'some_container.img')))
    # add it as subdataset to a super ds:
    ds = Dataset(ds_path).create()
    subds = ds.install("sub", source=src_subds_path)

    # add it again one level down to see actual recursion:
    subds.install("subsub", source=src_subds_path)

    # We come up empty without recursive:
    res = ds.containers_list(recursive=False, **RAW_KWDS)
    assert_result_count(res, 0)

    # query available containers from within super:
    res = ds.containers_list(recursive=True, **RAW_KWDS)
    assert_result_count(res, 2)
    assert_in_results(res, action="containers", refds=ds.path)

    # default location within the subdataset:
    target_path = op.join(
        subds.path, '.datalad', 'environments', 'first', 'image')
    assert_result_count(
        res, 1,
        name='sub/first', type='file', action='containers', status='ok',
        path=target_path, parentds=subds.path)

    # not installed subdataset doesn't pose an issue:
    sub2 = ds.create("sub2")
    assert_result_count(ds.subdatasets(), 2, type="dataset")
    ds.uninstall("sub2")
    from datalad.tests.utils import assert_false
    assert_false(sub2.is_installed())

    # same results as before, not crashing or somehow confused by a not present
    # subds:
    res = ds.containers_list(recursive=True, **RAW_KWDS)
    assert_result_count(res, 2)
    assert_result_count(
        res, 1,
        name='sub/first', type='file', action='containers', status='ok',
        path=target_path, parentds=subds.path)

    # The default renderer includes the image names.
    with swallow_outputs() as out:
        ds.containers_list(recursive=True)
        lines = out.out.splitlines()
    assert_re_in("sub/first", lines)
    assert_re_in("sub/subsub/first", lines)
    # But we are careful not to render partial names from subdataset traversals
    # (i.e. we recurse with containers_list(..., result_renderer=None)).
    with assert_raises(AssertionError):
        assert_re_in("subsub/first", lines)
def test_nested_pushclone_cycle_allplatforms(origpath, storepath, clonepath):
    if 'DATALAD_SEED' in os.environ:
        # we are using create-sibling-ria via the cmdline in here
        # this will create random UUIDs for datasets
        # however, given a fixed seed each call to this command will start
        # with the same RNG seed, hence yield the same UUID on the same
        # machine -- leading to a collision
        raise SkipTest(
            'Test incompatible with fixed random number generator seed')
    # the aim here is a high-level test of a standard create-push-clone cycle
    # for a dataset with a subdataset, with the goal to ensure that correct
    # branches and commits are tracked, regardless of platform behavior and
    # condition of individual clones. Nothing fancy, just that the defaults
    # behave in sensible ways
    from datalad.cmd import WitlessRunner as Runner
    run = Runner().run

    # create original nested dataset
    with chpwd(origpath):
        run(['datalad', 'create', 'super'])
        run(['datalad', 'create', '-d', 'super', str(Path('super', 'sub'))])

    # verify essential linkage properties
    orig_super = Dataset(Path(origpath, 'super'))
    orig_sub = Dataset(orig_super.pathobj / 'sub')

    (orig_super.pathobj / 'file1.txt').write_text('some1')
    (orig_sub.pathobj / 'file2.txt').write_text('some1')

    with chpwd(orig_super.path):
        run(['datalad', 'save', '--recursive'])

    # TODO not yet reported clean with adjusted branches
    #assert_repo_status(orig_super.path)

    # the "true" branch that sub is on, and the gitsha of the HEAD commit of it
    orig_sub_corr_branch = \
        orig_sub.repo.get_corresponding_branch() or orig_sub.repo.get_active_branch()
    orig_sub_corr_commit = orig_sub.repo.get_hexsha(orig_sub_corr_branch)

    # make sure the super tracks this commit
    assert_in_results(
        orig_super.subdatasets(),
        path=orig_sub.path,
        gitshasum=orig_sub_corr_commit,
        # TODO it should also track the branch name
        # Attempted: https://github.com/datalad/datalad/pull/3817
        # But reverted: https://github.com/datalad/datalad/pull/4375
    )

    # publish to a store, to get into a platform-agnostic state
    # (i.e. no impact of an annex-init of any kind)
    store_url = 'ria+' + get_local_file_url(storepath)
    with chpwd(orig_super.path):
        run(['datalad', 'create-sibling-ria', '--recursive',
             '-s', 'store', store_url])
        run(['datalad', 'push', '--recursive', '--to', 'store'])

    # we are using the 'store' sibling's URL, which should be a plain path
    store_super = AnnexRepo(orig_super.siblings(name='store')[0]['url'],
                            init=False)
    store_sub = AnnexRepo(orig_sub.siblings(name='store')[0]['url'],
                          init=False)

    # both datasets in the store only carry the real branches, and nothing
    # adjusted
    for r in (store_super, store_sub):
        eq_(set(r.get_branches()), set([orig_sub_corr_branch, 'git-annex']))

    # and reobtain from a store
    cloneurl = 'ria+' + get_local_file_url(str(storepath), compatibility='git')
    with chpwd(clonepath):
        run(['datalad', 'clone', cloneurl + '#' + orig_super.id, 'super'])
        run(['datalad', '-C', 'super', 'get', '--recursive', '.'])

    # verify that nothing has changed as a result of a push/clone cycle
    clone_super = Dataset(Path(clonepath, 'super'))
    clone_sub = Dataset(clone_super.pathobj / 'sub')
    assert_in_results(
        clone_super.subdatasets(),
        path=clone_sub.path,
        gitshasum=orig_sub_corr_commit,
    )

    for ds1, ds2, f in ((orig_super, clone_super, 'file1.txt'),
                        (orig_sub, clone_sub, 'file2.txt')):
        eq_((ds1.pathobj / f).read_text(), (ds2.pathobj / f).read_text())

    # get status info that does not recurse into subdatasets, i.e. not
    # looking for uncommitted changes
    # we should see no modification reported
    assert_not_in_results(
        clone_super.status(eval_subdataset_state='commit'),
        state='modified')
    # and now the same for a more expensive full status
    assert_not_in_results(
        clone_super.status(recursive=True),
        state='modified')
def test_install_simple_local(src_repo=None, path=None, *, type_):
    src_ds = Dataset(src_repo).create(result_renderer='disabled', force=True,
                                      annex=(type_ == "annex"))
    src_ds.save(['INFO.txt', 'test.dat'], to_git=True)
    if type_ == 'annex':
        src_ds.save('test-annex.dat', to_git=False)
    elif type_ == 'git':
        pass
    else:
        raise ValueError("'type' must be 'git' or 'annex'")
    # equivalent repo on github:
    url = "https://github.com/datalad/testrepo--basic--r1.git"
    sources = [
        src_ds.path,
        get_local_file_url(src_ds.path, compatibility='git')]
    if not dl_cfg.get('datalad.tests.nonetwork'):
        sources.append(url)

    for src in sources:
        origin = Dataset(path)

        # now install it somewhere else
        ds = install(path, source=src, description='mydummy')
        eq_(ds.path, path)
        ok_(ds.is_installed())
        if not isinstance(origin.repo, AnnexRepo):
            # this means it is a GitRepo
            ok_(isinstance(origin.repo, GitRepo))
            # stays plain Git repo
            ok_(isinstance(ds.repo, GitRepo))
            ok_(not isinstance(ds.repo, AnnexRepo))
            ok_(GitRepo.is_valid_repo(ds.path))
            files = ds.repo.get_indexed_files()
            assert_in('test.dat', files)
            assert_in('INFO.txt', files)
            assert_repo_status(path, annex=False)
        else:
            # must be an annex
            ok_(isinstance(ds.repo, AnnexRepo))
            ok_(AnnexRepo.is_valid_repo(ds.path, allow_noninitialized=False))
            files = ds.repo.get_indexed_files()
            assert_in('test.dat', files)
            assert_in('INFO.txt', files)
            assert_in('test-annex.dat', files)
            assert_repo_status(path, annex=True)
            # no content was installed:
            ok_(not ds.repo.file_has_content('test-annex.dat'))
            uuid_before = ds.repo.uuid
            ok_(uuid_before)  # we actually have a uuid
            eq_(ds.repo.get_description(), 'mydummy')

        # installing it again, shouldn't matter:
        res = install(path, source=src, result_xfm=None, return_type='list')
        assert_status('notneeded', res)
        ok_(ds.is_installed())
        if isinstance(origin.repo, AnnexRepo):
            eq_(uuid_before, ds.repo.uuid)

        # cleanup before next iteration
        rmtree(path)
def __call__(
        path=None,
        source=None,
        dataset=None,
        get_data=False,
        description=None,
        recursive=False,
        recursion_limit=None,
        if_dirty='save-before',
        save=True,
        reckless=False,
        git_opts=None,
        git_clone_opts=None,
        annex_opts=None,
        annex_init_opts=None,
        jobs=None):

    # normalize path argument to be equal when called from cmdline and
    # python and nothing was passed into `path`
    path = assure_list(path)

    if not source and not path:
        raise InsufficientArgumentsError(
            "Please provide at least a source or a path")

    ## Common kwargs to pass to underlying git/install calls.
    #  They might need adjustments (e.g. for recursion_limit), but
    #  otherwise would be applicable throughout
    #
    # There should have been more of common options!
    # since underneath get could do similar installs, but now they
    # have duplicated implementations which differ (e.g. get does not
    # annex init installed annexes)
    common_kwargs = dict(
        get_data=get_data,
        recursive=recursive,
        recursion_limit=recursion_limit,
        git_opts=git_opts,
        annex_opts=annex_opts,
        reckless=reckless,
        jobs=jobs,
    )

    installed_items = []
    failed_items = []

    # did we explicitly get a dataset to install into?
    # if we got a dataset, path will be resolved against it.
    # Otherwise path will be resolved first.
    ds = None
    if dataset is not None:
        ds = require_dataset(dataset, check_installed=True,
                             purpose='installation')
        handle_dirty_dataset(ds, if_dirty)

    # switch into scenario without --source:
    if source is None:
        # we need to collect URLs and paths
        to_install = []
        to_get = []
        for urlpath in path:
            ri = RI(urlpath)
            (to_get if isinstance(ri, PathRI) else to_install).append(urlpath)

        common_kwargs['dataset'] = dataset

        # first install, and then get
        for s in to_install:
            lgr.debug("Install passes into install source=%s", s)
            try:
                result = Install.__call__(
                    source=s,
                    description=description,
                    if_dirty=if_dirty,
                    save=save,
                    git_clone_opts=git_clone_opts,
                    annex_init_opts=annex_init_opts,
                    **common_kwargs
                )
                installed_items += assure_list(result)
            except Exception as exc:
                lgr.warning("Installation of %s has failed: %s",
                            s, exc_str(exc))
                failed_items.append(s)

        if to_get:
            lgr.debug("Install passes into get %d items", len(to_get))
            # the commented-out entries hint at the inability to pass those
            # options into underlying install-related calls.
            # Also need to pass from get:
            #  annex_get_opts
            try:
                installed_datasets = Get.__call__(
                    to_get,
                    # description=description,
                    # if_dirty=if_dirty,
                    # save=save,
                    # git_clone_opts=git_clone_opts,
                    # annex_init_opts=annex_init_opts
                    _return_datasets=True,
                    **common_kwargs
                )
            except IncompleteResultsError as exc:
                exc_str_ = ': ' + exc_str(exc) if exc.results else ''
                lgr.warning("Some items failed to install: %s", exc_str_)
                installed_datasets = exc.results
                failed_items.extend(exc.failed)

            # compose content_by_ds into result
            for dspath in installed_datasets:
                ds_ = Dataset(dspath)
                if ds_.is_installed():
                    installed_items.append(ds_)
                else:
                    lgr.warning("%s was not installed", ds_)

        return Install._handle_and_return_installed_items(
            ds, installed_items, failed_items, save)

    if source and path and len(path) > 1:
        raise ValueError(
            "install needs a single PATH when source is provided. "
            "Was given multiple PATHs: %s" % str(path))

    # parameter constraints:
    if not source:
        raise InsufficientArgumentsError(
            "a `source` is required for installation")

    # code below deals with a single path only
    path = path[0] if path else None

    if source == path:
        # even if they turn out to be identical after resolving symlinks
        # and more sophisticated witchcraft, it would still happily say
        # "it appears to be already installed", so we just catch an
        # obviously pointless input combination
        raise ValueError(
            "installation `source` and destination `path` are identical [{}]. "
            "If you are trying to add a subdataset simply use `save`".format(
                path))

    # resolve the target location (if local) against the provided dataset
    # or CWD:
    if path is not None:
        # Should work out just fine for regular paths, so no additional
        # conditioning is necessary
        try:
            path_ri = RI(path)
        except Exception as e:
            raise ValueError(
                "invalid path argument {}: ({})".format(path, exc_str(e)))
        try:
            # Wouldn't work for SSHRI ATM, see TODO within SSHRI
            # yoh: path should be a local path, and mapping note within
            #      SSHRI about mapping localhost:path to path is kinda
            #      a peculiar use-case IMHO
            path = resolve_path(path_ri.localpath, dataset)
            # any `path` argument that points to something local is now
            # resolved and no longer a URL
        except ValueError:
            # URL doesn't point to a local something
            # so we have an actual URL in `path`. Since this is valid as a
            # single positional argument, `source` has to be None at this
            # point.
            if is_datalad_compat_ri(path) and source is None:
                # we have an actual URL -> this should be the source
                lgr.debug(
                    "Single argument given to install, that doesn't seem to "
                    "be a local path. "
                    "Assuming the argument identifies a source location.")
                source = path
                path = None
            else:
                # `path` is neither a valid source nor a local path.
                # TODO: The only thing left is a known subdataset with a
                # name, that is not a path; Once we correctly distinguish
                # between path and name of a submodule, we need to consider
                # this.
                # For now: Just raise
                raise ValueError("Invalid path argument {0}".format(path))
    # `path` resolved, if there was any.

    # Possibly do conversion from source into a git-friendly url
    # luckily GitRepo will undo any fancy file:/// url to make use of Git's
    # optimization for local clones....
    source = _get_git_url_from_source(source)
    lgr.debug("Resolved source: {0}".format(source))
    # TODO: we probably need to resolve source, if it is a local path;
    # expandpath, normpath, ... Where exactly is the point to do it?

    # derive target from source:
    if path is None:
        # we got nothing but a source. do something similar to git clone
        # and derive the path from the source and continue
        lgr.debug(
            "Neither dataset nor target installation path provided. "
            "Deriving destination path from given source %s",
            source)
        path = _get_installationpath_from_url(source)
        # since this is a relative `path`, resolve it:
        path = resolve_path(path, dataset)

    # there is no other way -- my intoxicated brain tells me
    assert(path is not None)

    lgr.debug("Resolved installation target: {0}".format(path))
    destination_dataset = Dataset(path)

    if destination_dataset.is_installed():
        # this should not be, check if this is an error, or a reinstall
        # from the same source
        # this is where we would have installed this from
        candidate_sources = _get_flexible_source_candidates(
            source, destination_dataset.path)
        # this is where it was installed from
        track_name, track_url = _get_tracking_source(destination_dataset)
        if track_url in candidate_sources or \
                get_local_file_url(track_url) in candidate_sources:
            # TODO: this one breaks "promise" assumptions of the repeated
            # invocations of install.
            # yoh thinks that we actually should be the ones to run update
            # (without merge) after basic
            # check that it is clean and up-to-date with its super dataset
            # and if so, not return here but continue with errands (recursive
            # installation and get_data) so we could provide the same
            # result if we rerun the same install twice.
            lgr.info(
                "%s was already installed from %s. Use `update` to obtain "
                "latest updates, or `get` or `install` with a path, not URL, "
                "to (re)fetch data and / or subdatasets",
                destination_dataset, track_url)
            return destination_dataset
        else:
            raise ValueError(
                "There is already a dataset installed at the "
                "destination: %s" % destination_dataset)

    ###########
    # we should know everything necessary by now
    # actual installation starts
    ###########

    # FLOW GUIDE:
    # four cases:
    # 1. install into a dataset
    #   1.1. we install a known subdataset
    #        => git submodule update --init
    #   1.2. we install an existing repo as a subdataset inplace
    #        => git submodule add + magic
    #   1.3. we (recursively) try to install implicit subdatasets between
    #        ds and path
    #   1.4. we install a new subdataset from an explicit source
    #        => git submodule add
    # 2. we "just" install from an explicit source
    #    => git clone

    if ds is not None:
        # FLOW GUIDE: 1.

        # express the destination path relative to the root of
        # the dataset
        relativepath = relpath(path, start=ds.path)
        if relativepath.startswith(pardir):
            raise ValueError("installation path outside dataset "
                             "({0})".format(path))
        lgr.debug("Resolved installation target relative to dataset "
                  "{0}: {1}".format(ds, relativepath))

        # FLOW_GUIDE 1.4.
        lgr.info("Installing subdataset from '{0}' at: {1}".format(
            source, relativepath))
        destination_dataset = _install_subds_from_flexible_source(
            ds,
            relativepath,
            source,
            reckless)
    else:
        # FLOW GUIDE: 2.
        lgr.info("Installing dataset at {0} from {1}".format(path, source))

        # Currently assuming there is nothing at the target to deal with
        # and rely on failures raising from the git call ...

        # We possibly need to consider /.git URL
        candidate_sources = _get_flexible_source_candidates(source)
        _clone_from_any_source(candidate_sources, destination_dataset.path)

    # FLOW GUIDE: All four cases done.
    if not destination_dataset.is_installed():
        # XXX shouldn't we just fail!? (unless some explicit --skip-failing?)
        lgr.error("Installation failed.")
        return None

    _handle_possible_annex_dataset(destination_dataset, reckless)

    lgr.debug("Installation of %s done.", destination_dataset)

    if not destination_dataset.is_installed():
        # log error and don't report as installed item, but don't raise,
        # since we might be in a process of recursive installation where
        # a lot of other datasets can still be installed successfully.
        lgr.error("Installation of {0} failed.".format(destination_dataset))
    else:
        installed_items.append(destination_dataset)

    # we need to decrease the recursion limit, relative to
    # subdatasets now
    subds_recursion_limit = max(0, recursion_limit - 1) \
        if isinstance(recursion_limit, int) \
        else recursion_limit

    # Now, recursive calls:
    if recursive:
        if description:
            # yoh: why? especially if we somehow allow for templating them
            # with e.g. '%s' to catch the subdataset path
            lgr.warning("Description can't be assigned recursively.")

        subs = destination_dataset.get_subdatasets(
            # yes, it does make sense to combine no recursion with
            # recursion_limit: when the latter is 0 we get no subdatasets
            # reported, otherwise we always get the 1st-level subs
            recursive=False,
            recursion_limit=recursion_limit,
            absolute=False)

        if subs:
            lgr.debug("Obtaining subdatasets of %s: %s",
                      destination_dataset,
                      subs)

            kwargs = common_kwargs.copy()
            kwargs['recursion_limit'] = subds_recursion_limit
            rec_installed = Get.__call__(
                subs,  # all at once
                dataset=destination_dataset,
                # TODO expose this
                # yoh: exactly!
                #annex_get_opts=annex_get_opts,
                **kwargs
            )
            # TODO do we want to filter this so `install` only returns
            # the datasets?
            if isinstance(rec_installed, list):
                installed_items.extend(rec_installed)
            else:
                installed_items.append(rec_installed)

    if get_data:
        lgr.debug("Getting data of {0}".format(destination_dataset))
        kwargs = common_kwargs.copy()
        kwargs['recursive'] = False
        destination_dataset.get(curdir, **kwargs)

    return Install._handle_and_return_installed_items(
        ds, installed_items, failed_items, save)