def newfunc(*arg, **kw):
    if DATALAD_TESTS_CACHE:
        # Note: We can't pass keys based on `paths` parameter to
        # get_cached_dataset yet, since translation to keys depends on a
        # worktree. We'll have the worktree of `version` only after cloning.
        ds = get_cached_dataset(url, version=version)
        clone_ds = Clone()(ds.pathobj, arg[-1])
    else:
        clone_ds = Clone()(url, arg[-1])
    if version:
        clone_ds.repo.checkout(version)
    if paths and AnnexRepo.is_valid_repo(clone_ds.path):
        # just assume ds is annex as well. Otherwise `Clone` wouldn't
        # work correctly - we don't need to test its implementation here
        if DATALAD_TESTS_CACHE:
            # cache is enabled; we need to make sure it has the desired
            # content, so clone_ds can get it from there. However, we got
            # `paths` and potentially a `version` they refer to. We can't
            # assume the same (or any) worktree in cache. Hence we need to
            # translate to keys.
            keys = clone_ds.repo.get_file_key(paths)
            ds.repo.get(keys, key=True)
            clone_ds.repo.fsck(remote='origin', fast=True)
        clone_ds.get(paths)
    return f(*(arg[:-1] + (clone_ds,)), **kw)
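# Hedged usage sketch (not part of the original module): `newfunc` above is
# the wrapper that the `cached_dataset` decorator (referenced in
# get_cached_dataset's docstring) returns around a test `f`. Assuming the
# decorator factory accepts `url`, `version`, and `paths` (the closure
# variables used above), a decorated test would receive the throwaway clone
# as its last positional argument, roughly like this:
#
#   @cached_dataset(
#       url='https://example.com/ds.git',  # hypothetical URL
#       version='v1.0',                    # optional commit-ish to check out
#       paths='data/file.dat')             # hypothetical annexed content
#   def test_something(ds):
#       assert ds.repo.file_has_content('data/file.dat')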
def test_gh1811(srcpath, clonepath):
    orig = Dataset(srcpath).create()
    (orig.pathobj / 'some').write_text('some')
    orig.save()
    clone = Clone.__call__(source=orig.path, path=clonepath)
    (clone.pathobj / 'somemore').write_text('somemore')
    clone.save()
    clone.repo.call_git(['checkout', 'HEAD~1'])
    res = clone.push(to='origin', on_failure='ignore')
    assert_result_count(res, 1)
    assert_result_count(
        res, 1,
        path=clone.path, type='dataset', action='publish',
        status='impossible',
        message='There is no active branch, cannot determine remote '
                'branch',
    )
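# Note (added): the 'impossible' result asserted above stems from the
# detached HEAD created by `checkout HEAD~1` -- with no active branch, push
# cannot map a local branch to a remote one. A minimal sketch of that
# precondition, assuming a GitRepo instance `repo`:
#
#   repo.call_git(['checkout', 'HEAD~1'])
#   assert repo.get_active_branch() is None  # detached HEAD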
def _wrap_cached_dataset(*arg, **kw):
    if DATALAD_TESTS_CACHE:
        # Note: We can't pass keys based on `paths` parameter to
        # get_cached_dataset yet, since translation to keys depends on a
        # worktree. We'll have the worktree of `version` only after cloning.
        ds = get_cached_dataset(url, version=version)
        clone_ds = Clone()(ds.pathobj, arg[-1])
    else:
        clone_ds = Clone()(url, arg[-1])
    # save some cycles
    clone_repo = clone_ds.repo
    if version:
        clone_repo.checkout(version)
    if paths and AnnexRepo.is_valid_repo(clone_ds.path):
        # just assume ds is annex as well. Otherwise `Clone` wouldn't
        # work correctly - we don't need to test its implementation here
        if DATALAD_TESTS_CACHE:
            # cache is enabled; we need to make sure it has the desired
            # content, so clone_ds can get it from there. However, we got
            # `paths` and potentially a `version` they refer to. We can't
            # assume the same (or any) worktree in cache. Hence we need to
            # translate to keys.
            # MIH Despite the variable names used in this function
            # (pathS, keyS) they ultimately are passed to get(..., key=True)
            # which means that it can only ever be a single path and a
            # single key -- this is very confusing.
            # The key determination could hence be done with
            # get_file_annexinfo() in a much simpler way, but it seems this
            # function wants to be ready for more, sigh
            keys = [
                p['key']
                for p in clone_repo.get_content_annexinfo(
                    ensure_list(paths), init=None).values()
                if 'key' in p
            ]
            if keys:
                ds.repo.get(keys, key=True)
                clone_repo.fsck(remote=DEFAULT_REMOTE, fast=True)
        clone_ds.get(paths)
    return f(*(arg[:-1] + (clone_ds,)), **kw)
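# Hedged sketch (added for illustration) of the path->key translation done
# above: get_content_annexinfo() maps worktree paths to records that carry
# the annex 'key' for annexed files, and keys can then be fetched
# worktree-independently. Assuming an AnnexRepo `repo` and a relative path:
#
#   info = repo.get_content_annexinfo(['data/file.dat'], init=None)
#   keys = [rec['key'] for rec in info.values() if 'key' in rec]
#   repo.get(keys, key=True)  # fetch by key, no worktree required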
def test_push_recursive(
        origin_path, src_path, dst_top, dst_sub, dst_subnoannex, dst_subsub):
    # dataset with two submodules and one subsubmodule
    origin = Dataset(origin_path).create()
    origin_subm1 = origin.create('sub m')
    origin_subm1.create('subsub m')
    origin.create('subm noannex', annex=False)
    origin.save()
    assert_repo_status(origin.path)
    # prepare src as a fresh clone with all subdatasets checked out
    # recursively. Running on a clone should make the test scenario more
    # different from test_push(), even for the pieces that should be
    # identical
    top = Clone.__call__(source=origin.path, path=src_path)
    sub, subsub, subnoannex = top.get(
        '.', recursive=True, get_data=False, result_xfm='datasets')

    target_top = mk_push_target(top, 'target', dst_top, annex=True)
    # subdatasets have no remote yet, so recursive publishing should fail:
    res = top.push(to="target", recursive=True, on_failure='ignore')
    assert_in_results(
        res, path=top.path, type='dataset',
        refspec='refs/heads/master:refs/heads/master',
        operations=['new-branch'], action='publish', status='ok',
        target='target')
    for d in (sub, subsub, subnoannex):
        assert_in_results(
            res, status='error', type='dataset', path=d.path,
            message=("Unknown target sibling '%s'.", 'target'))

    # now fix that and set up targets for the submodules
    target_sub = mk_push_target(sub, 'target', dst_sub, annex=True)
    target_subnoannex = mk_push_target(
        subnoannex, 'target', dst_subnoannex, annex=False)
    target_subsub = mk_push_target(subsub, 'target', dst_subsub, annex=True)

    # and same push call as above
    res = top.push(to="target", recursive=True)
    # topds skipped
    assert_in_results(
        res, path=top.path, type='dataset', action='publish',
        status='notneeded', target='target')
    # the rest pushed
    for d in (sub, subsub, subnoannex):
        assert_in_results(
            res, status='ok', type='dataset', path=d.path,
            refspec='refs/heads/master:refs/heads/master')
    # all corresponding branches match across all datasets
    for s, d in zip(
            (top, sub, subnoannex, subsub),
            (target_top, target_sub, target_subnoannex, target_subsub)):
        eq_(list(s.repo.get_branch_commits_("master")),
            list(d.get_branch_commits_("master")))
        if s != subnoannex:
            eq_(list(s.repo.get_branch_commits_("git-annex")),
                list(d.get_branch_commits_("git-annex")))

    # rerun should not result in further pushes of master
    res = top.push(to="target", recursive=True)
    assert_not_in_results(
        res, status='ok', refspec="refs/heads/master:refs/heads/master")
    assert_in_results(
        res, status='notneeded',
        refspec="refs/heads/master:refs/heads/master")

    if top.repo.is_managed_branch():
        raise SkipTest(
            'Save/status of subdataset with managed branches is a still '
            'unresolved issue')

    # now annex a file in subsub
    test_copy_file = subsub.pathobj / 'test_mod_annex_file'
    test_copy_file.write_text("Heavy stuff.")
    # save all the way up
    assert_status(
        ('ok', 'notneeded'),
        top.save(message='subsub got something', recursive=True))
    assert_repo_status(top.path)
    # publish straight up, should be smart by default
    res = top.push(to="target", recursive=True)
    # we see 3 out of 4 datasets pushed (sub noannex was left unchanged)
    for d in (top, sub, subsub):
        assert_in_results(
            res, status='ok', type='dataset', path=d.path,
            refspec='refs/heads/master:refs/heads/master')
    # file content copied too
    assert_in_results(
        res, action='copy', status='ok', path=str(test_copy_file))
    # verify it is accessible, drop and bring back
    assert_status('ok', top.drop(str(test_copy_file)))
    ok_(not subsub.repo.file_has_content('test_mod_annex_file'))
    top.get(test_copy_file)
    ok_file_has_content(test_copy_file, 'Heavy stuff.')

    # make two modifications
    (sub.pathobj / 'test_mod_annex_file').write_text('annex')
    (subnoannex.pathobj / 'test_mod_file').write_text('git')
    # save separately
    top.save(sub.pathobj, message='annexadd', recursive=True)
    top.save(subnoannex.pathobj, message='gitadd', recursive=True)

    # now only publish the latter one
    res = top.push(to="target", since='HEAD~1', recursive=True)
    # nothing copied, no reports on the other modification
    assert_not_in_results(res, action='copy')
    assert_not_in_results(res, path=sub.path)
    for d in (top, subnoannex):
        assert_in_results(
            res, status='ok', type='dataset', path=d.path,
            refspec='refs/heads/master:refs/heads/master')

    # an unconditional push should now pick up the remaining changes
    res = top.push(to="target", recursive=True)
    assert_in_results(
        res, action='copy', status='ok',
        path=str(sub.pathobj / 'test_mod_annex_file'))
    assert_in_results(
        res, status='ok', type='dataset', path=sub.path,
        refspec='refs/heads/master:refs/heads/master')
    for d in (top, subnoannex, subsub):
        assert_in_results(
            res, status='notneeded', type='dataset', path=d.path,
            refspec='refs/heads/master:refs/heads/master')
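# Note (added): the `since='HEAD~1'` push above exercises push's `since`
# parameter -- only (sub)dataset states changed after the given commit-ish
# of the superdataset are considered for publication. Hedged API-level
# summary of the two calls in the test:
#
#   top.push(to='target', since='HEAD~1', recursive=True)  # only new changes
#   top.push(to='target', recursive=True)                  # everything else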
def __call__(
        path=None,
        source=None,
        dataset=None,
        get_data=False,
        description=None,
        recursive=False,
        recursion_limit=None,
        reckless=None,
        jobs="auto"):
    # normalize path argument to be equal when called from cmdline and
    # python and nothing was passed into `path`
    path = ensure_list(path)

    if not source and not path:
        raise InsufficientArgumentsError(
            "Please provide at least a source or a path")

    # Common kwargs to pass to underlying git/install calls.
    # They might need adjustments (e.g. for recursion_limit), but
    # otherwise would be applicable throughout.
    #
    # There should have been more common options, since underneath
    # `get` could do similar installs.
    common_kwargs = dict(
        get_data=get_data,
        recursive=recursive,
        recursion_limit=recursion_limit,
        # git_opts=git_opts,
        # annex_opts=annex_opts,
        reckless=reckless,
        jobs=jobs,
    )

    # did we explicitly get a dataset to install into?
    # if we got a dataset, path will be resolved against it.
    # Otherwise path will be resolved first.
    ds = None
    if dataset is not None:
        ds = require_dataset(
            dataset, check_installed=True, purpose='installation')
        common_kwargs['dataset'] = dataset
    # pre-compute for results below
    refds_path = Interface.get_refds_path(ds)

    # switch into the two scenarios without --source:
    # 1. list of URLs
    # 2. list of (sub)dataset content
    if source is None:
        # we need to collect URLs and paths
        to_install = []
        to_get = []
        # TODO: this approach is problematic, it disrupts the order of
        # input args. Consequently results will be returned in an
        # unexpected order when a mixture of source URLs and paths is
        # given. Reordering is only possible when everything in here is
        # fully processed before any results can be yielded.
        # Moreover, I think the semantics of the status quo implementation
        # are a bit complicated: in a mixed list a source URL will lead to
        # a new dataset at a generated default location, but a path will
        # lead to a subdataset at that exact location.
        for urlpath in path:
            ri = RI(urlpath)
            (to_get if isinstance(ri, PathRI) else to_install).append(
                urlpath)

        # 1. multiple source URLs
        for s in to_install:
            lgr.debug("Install passes into install source=%s", s)
            for r in Install.__call__(
                    source=s,
                    description=description,
                    # we need to disable error handling in order to have it
                    # done at the very top, otherwise we are not able to
                    # order a global "ignore-and-keep-going"
                    on_failure='ignore',
                    return_type='generator',
                    result_xfm=None,
                    result_filter=None,
                    **common_kwargs):
                # no post-processing of the installed content on disk
                # should be necessary here, all done by code further
                # down that deals with an install from an actual `source`
                # any necessary fixes should go there too!
                r['refds'] = refds_path
                yield r

        # 2. one or more dataset content paths
        if to_get:
            lgr.debug("Install passes into get %d items", len(to_get))
            # all commented out hint on inability to pass those options
            # into underlying install-related calls.
            # Also need to pass from get:
            #  annex_get_opts
            for r in Get.__call__(
                    to_get,
                    # TODO should pass-through description, not sure why
                    # disabled
                    # description=description,
                    # we need to disable error handling in order to have it
                    # done at the very top, otherwise we are not able to
                    # order a global "ignore-and-keep-going"
                    on_failure='ignore',
                    return_type='generator',
                    result_xfm=None,
                    result_filter=None,
                    **common_kwargs):
                # no post-processing of get'ed content on disk should be
                # necessary here, this is the responsibility of `get`
                # (incl. adjusting parent's gitmodules when submodules end
                # up in an "updated" state (done in get helpers))
                # any required fixes should go there!
                r['refds'] = refds_path
                yield r

        # we are done here
        # the rest is about install from a `source`
        return

    # an actual `source` was given
    if source and path and len(path) > 1:
        # exception is ok here, if this fails it is either direct user
        # error or we f****d up one of our internal calls
        raise ValueError(
            "install needs a single PATH when source is provided. "
            "Was given multiple PATHs: %s" % str(path))

    # parameter constraints:
    if not source:
        # exception is ok here, if this fails it is either direct user
        # error or we f****d up one of our internal calls
        raise InsufficientArgumentsError(
            "a `source` is required for installation")

    # code below deals with a single path only
    path = path[0] if path else None

    if source == path:
        # even if they turn out to be identical after resolving symlinks
        # and more sophisticated witchcraft, it would still happily say
        # "it appears to be already installed", so we just catch an
        # obviously pointless input combination
        yield get_status_dict(
            'install', path=path, status='impossible', logger=lgr,
            source_url=source, refds=refds_path,
            message="installation `source` and destination `path` are "
                    "identical. If you are trying to add a subdataset "
                    "simply use the `save` command")
        return

    # resolve the target location (if local) against the provided dataset
    # or CWD:
    if path is not None:
        # MIH everything in here is highly similar to what common
        # interface helpers do (or should/could do), but at the same time
        # is very much tailored to just apply to `install` -- I guess
        # it has to stay special

        # Should work out just fine for regular paths, so no additional
        # conditioning is necessary
        try:
            path_ri = RI(path)
        except Exception as e:
            raise ValueError(
                "invalid path argument {}: ({})".format(path, exc_str(e)))
        try:
            # Wouldn't work for SSHRI ATM, see TODO within SSHRI
            # yoh: path should be a local path, and mapping note within
            #      SSHRI about mapping localhost:path to path is kinda
            #      a peculiar use-case IMHO
            # TODO Stringification can be removed once PY35 is no longer
            # supported
            path = str(resolve_path(path_ri.localpath, dataset))
            # any `path` argument that points to something local is now
            # resolved and no longer a URL
        except ValueError:
            # `path` is neither a valid source nor a local path.
            # TODO: The only thing left is a known subdataset with a
            # name, that is not a path; Once we correctly distinguish
            # between path and name of a submodule, we need to consider
            # this.
            # For now: Just raise
            raise ValueError("Invalid path argument {0}".format(path))
    # `path` resolved, if there was any.

    # clone dataset, will also take care of adding to superdataset, if one
    # is given
    res = Clone.__call__(
        source, path, dataset=ds, description=description,
        reckless=reckless,
        # we need to disable error handling in order to have it done at
        # the very top, otherwise we are not able to order a global
        # "ignore-and-keep-going"
        result_xfm=None,
        return_type='generator',
        result_filter=None,
        on_failure='ignore')

    # helper
    as_ds = YieldDatasets()
    destination_dataset = None
    for r in res:
        if r['action'] == 'install' and r['type'] == 'dataset':
            # make sure logic below is valid, only one dataset result is
            # coming back
            assert destination_dataset is None
            destination_dataset = as_ds(r)
        r['refds'] = refds_path
        yield r
    assert destination_dataset

    # Now, recursive calls:
    if recursive or get_data:
        # dataset argument must not be passed inside since we use bound
        # .get. It is ok to do "inplace" as long as we still return right
        # after the loop ends.
        common_kwargs.pop('dataset', '')
        for r in destination_dataset.get(
                curdir,
                description=description,
                # we need to disable error handling in order to have it
                # done at the very top, otherwise we are not able to order
                # a global "ignore-and-keep-going"
                on_failure='ignore',
                return_type='generator',
                result_xfm=None,
                **common_kwargs):
            r['refds'] = refds_path
            yield r
    # at this point no further post-processing should be necessary,
    # `clone` and `get` must have done that (incl. parent handling)
    # if not, bugs should be fixed in those commands
    return
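# Hedged sketch (added) of the URL-vs-path dispatch used at the top of
# __call__: RI() parses a string into a resource identifier, and plain
# filesystem paths come back as PathRI instances, which is how `path`
# arguments are split into `to_get` (local content) and `to_install`
# (clone sources):
#
#   from datalad.support.network import RI, PathRI
#   assert isinstance(RI('some/local/path'), PathRI)
#   assert not isinstance(RI('https://example.com/ds.git'), PathRI)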
def get_cached_dataset(url, version=None, keys=None):
    """Helper to get a cached clone from url

    Intended for use from within `cached_dataset` and `cached_url`
    decorators.
    Clones `url` into the user's cache under datalad/tests/`name`. If such
    a clone already exists, don't clone but return the existing one. So,
    it's supposed to cache the original source in order to reduce time and
    traffic for tests, by letting subsequent requests clone from a local
    location directly.

    If it's an annex, get the content as provided by `keys`, too.

    Note that, as a transparent cache replacing the repo at URL from the
    POV of a test, we can't address content via paths, since those are
    valid only with respect to a particular worktree. If different tests
    clone from the same cached dataset, each requesting different versions
    and different paths thereof, we run into trouble if the cache itself
    checks out a particular requested version.

    Verifies that `version` can be checked out, but doesn't actually do it,
    since the cached dataset is intended to be used as origin instead of
    the original remote at URL by the `cached_dataset` test decorator.
    Checkout of a particular version should happen in its clone.

    Parameters
    ----------
    url: str
        URL to clone from
    version: str or None
        A commit or an object that can be dereferenced to one.
    keys: str or list or None
        (list of) annex keys to get content for.

    Returns
    -------
    Dataset
    """
    # TODO: What about recursive? Might be complicated. We would need to
    #       make sure we can recursively clone _from_ here then,
    #       potentially requiring submodule URL rewrites. Not sure about
    #       that ATM.

    # TODO: Given that it is supposed to be a cache for the original repo
    #       at `url`, we prob. should make this a bare repository. We
    #       don't need a potentially expensive checkout here. Need to
    #       double check `annex-get --key` in bare repos, though. Plus
    #       datalad-clone doesn't have --bare yet. But we want all the
    #       annex/special-remote/ria magic of datalad. So, plain
    #       git-clone --bare is not an option.

    if not DATALAD_TESTS_CACHE:
        raise ValueError("Caching disabled by config")

    ds = Dataset(DATALAD_TESTS_CACHE / url2filename(url))

    if not ds.is_installed():
        ds = Clone()(url, ds.pathobj)
    # When/How to update a dataset in cache? If version is a commit SHA
    # and we have it, there's no need for an update. Otherwise it gets
    # tricky, because this is a cache, not a checkout a test would operate
    # on. It needs to behave as if it was the thing at `url` from the
    # point of view of the test using it (cloning/getting content from
    # here). We would need to update all references, not just fetch them!
    #
    # Can we even (cheaply) tell whether `version` is an absolute
    # reference (actual SHA, not a branch/tag)?
    #
    # NOTE: - consider git-clone --mirror, but as w/ --bare: not an option
    #         for datalad-clone yet.
    #       - --reference[-if-able] might also be worth thinking about for
    #         the clone @cached_dataset creates wrt clone in cache
    #
    # So, for now fetch, figure whether there actually was something to
    # fetch and if so simply invalidate cache and re-clone/get. Don't
    # overcomplicate things. It's about datasets used in the tests - they
    # shouldn't change too frequently.
    elif any('uptodate' not in c['operations']
             for c in ds.repo.fetch(DEFAULT_REMOTE)):
        rmtree(ds.path)
        ds = Clone()(url, ds.pathobj)

    if version:
        # check whether version is available
        assert ds.repo.commit_exists(version)
    if keys and AnnexRepo.is_valid_repo(ds.path):
        ds.repo.get(keys, key=True)
    return ds
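# Hedged usage sketch (added): with the test cache enabled (i.e.
# DATALAD_TESTS_CACHE resolving to a cache directory), repeated calls with
# the same URL reuse the clone under DATALAD_TESTS_CACHE / url2filename(url):
#
#   ds = get_cached_dataset(
#       'https://example.com/ds.git',       # hypothetical URL
#       version='0123abc',                  # must resolve to an existing commit
#       keys=['MD5E-s1024--deadbeef.dat'])  # hypothetical annex key
#   clone = Clone()(ds.pathobj, 'local/clone/path')  # tests clone from cache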