Example #1
0
    def newfunc(*arg, **kw):

        if DATALAD_TESTS_CACHE:
            # Note: We can't pass keys based on `paths` parameter to
            # get_cached_dataset yet, since translation to keys depends on a
            # worktree. We'll have the worktree of `version` only after cloning.
            ds = get_cached_dataset(url, version=version)
            clone_ds = Clone()(ds.pathobj, arg[-1])
        else:
            clone_ds = Clone()(url, arg[-1])
        if version:
            clone_ds.repo.checkout(version)
        if paths and AnnexRepo.is_valid_repo(clone_ds.path):
            # just assume ds is annex as well. Otherwise `Clone` wouldn't
            # work correctly - we don't need to test its implementation here
            if DATALAD_TESTS_CACHE:
                # cache is enabled; we need to make sure it has the desired
                # content, so clone_ds can get it from there. However, we got
                # `paths` and potentially a `version` they refer to. We can't
                # assume the same (or any) worktree in cache. Hence we need to
                # translate to keys.
                keys = clone_ds.repo.get_file_key(paths)
                ds.repo.get(keys, key=True)
                clone_ds.repo.fsck(remote='origin', fast=True)

            clone_ds.get(paths)
        return f(*(arg[:-1] + (clone_ds, )), **kw)
Example #2
0
def test_gh1811(srcpath, clonepath):
    orig = Dataset(srcpath).create()
    (orig.pathobj / 'some').write_text('some')
    orig.save()
    clone = Clone.__call__(source=orig.path, path=clonepath)
    (clone.pathobj / 'somemore').write_text('somemore')
    clone.save()
    clone.repo.call_git(['checkout', 'HEAD~1'])
    res = clone.push(to='origin', on_failure='ignore')
    assert_result_count(res, 1)
    assert_result_count(
        res, 1,
        path=clone.path, type='dataset', action='publish',
        status='impossible',
        message='There is no active branch, cannot determine remote '
                'branch',
    )
Example #3
0
    def _wrap_cached_dataset(*arg, **kw):

        if DATALAD_TESTS_CACHE:
            # Note: We can't pass keys based on `paths` parameter to
            # get_cached_dataset yet, since translation to keys depends on a
            # worktree. We'll have the worktree of `version` only after cloning.
            ds = get_cached_dataset(url, version=version)
            clone_ds = Clone()(ds.pathobj, arg[-1])
        else:
            clone_ds = Clone()(url, arg[-1])
        #save some cycles
        clone_repo = clone_ds.repo
        if version:
            clone_repo.checkout(version)
        if paths and AnnexRepo.is_valid_repo(clone_ds.path):
            # just assume ds is annex as well. Otherwise `Clone` wouldn't
            # work correctly - we don't need to test its implementation here
            if DATALAD_TESTS_CACHE:
                # cache is enabled; we need to make sure it has the desired
                # content, so clone_ds can get it from there. However, we got
                # `paths` and potentially a `version` they refer to. We can't
                # assume the same (or any) worktree in cache. Hence we need to
                # translate to keys.
                # MIH Despite the variable names used in this function
                # (pathS, keyS) they ultimately are passed to get(..., key=True)
                # which means that it can ever only be a single path and a
                # single key -- this is very confusing.
                # the key determination could hence be done with
                # get_file_annexinfo() in a much simpler way, but it seems this
                # function wants to be ready for more, sigh
                keys = [
                    p['key'] for p in clone_repo.get_content_annexinfo(
                        ensure_list(paths), init=None).values() if 'key' in p
                ]
                if keys:
                    ds.repo.get(keys, key=True)
                clone_repo.fsck(remote=DEFAULT_REMOTE, fast=True)

            clone_ds.get(paths)
        return f(*(arg[:-1] + (clone_ds, )), **kw)
Example #4
0
def test_push_recursive(origin_path, src_path, dst_top, dst_sub,
                        dst_subnoannex, dst_subsub):
    # dataset with two submodules and one subsubmodule
    origin = Dataset(origin_path).create()
    origin_subm1 = origin.create('sub m')
    origin_subm1.create('subsub m')
    origin.create('subm noannex', annex=False)
    origin.save()
    assert_repo_status(origin.path)
    # prepare src as a fresh clone with all subdatasets checkout out recursively
    # running on a clone should make the test scenario more different than
    # test_push(), even for the pieces that should be identical
    top = Clone.__call__(source=origin.path, path=src_path)
    sub, subsub, subnoannex = top.get('.',
                                      recursive=True,
                                      get_data=False,
                                      result_xfm='datasets')

    target_top = mk_push_target(top, 'target', dst_top, annex=True)
    # subdatasets have no remote yet, so recursive publishing should fail:
    res = top.push(to="target", recursive=True, on_failure='ignore')
    assert_in_results(res,
                      path=top.path,
                      type='dataset',
                      refspec='refs/heads/master:refs/heads/master',
                      operations=['new-branch'],
                      action='publish',
                      status='ok',
                      target='target')
    for d in (sub, subsub, subnoannex):
        assert_in_results(res,
                          status='error',
                          type='dataset',
                          path=d.path,
                          message=("Unknown target sibling '%s'.", 'target'))
    # now fix that and set up targets for the submodules
    target_sub = mk_push_target(sub, 'target', dst_sub, annex=True)
    target_subnoannex = mk_push_target(subnoannex,
                                       'target',
                                       dst_subnoannex,
                                       annex=False)
    target_subsub = mk_push_target(subsub, 'target', dst_subsub, annex=True)

    # and same push call as above
    res = top.push(to="target", recursive=True)
    # topds skipped
    assert_in_results(res,
                      path=top.path,
                      type='dataset',
                      action='publish',
                      status='notneeded',
                      target='target')
    # the rest pushed
    for d in (sub, subsub, subnoannex):
        assert_in_results(res,
                          status='ok',
                          type='dataset',
                          path=d.path,
                          refspec='refs/heads/master:refs/heads/master')
    # all correspondig branches match across all datasets
    for s, d in zip(
        (top, sub, subnoannex, subsub),
        (target_top, target_sub, target_subnoannex, target_subsub)):
        eq_(list(s.repo.get_branch_commits_("master")),
            list(d.get_branch_commits_("master")))
        if s != subnoannex:
            eq_(list(s.repo.get_branch_commits_("git-annex")),
                list(d.get_branch_commits_("git-annex")))

    # rerun should not result in further pushes of master
    res = top.push(to="target", recursive=True)
    assert_not_in_results(res,
                          status='ok',
                          refspec="refs/heads/master:refs/heads/master")
    assert_in_results(res,
                      status='notneeded',
                      refspec="refs/heads/master:refs/heads/master")

    if top.repo.is_managed_branch():
        raise SkipTest(
            'Save/status of subdataset with managed branches is an still '
            'unresolved issue')

    # now annex a file in subsub
    test_copy_file = subsub.pathobj / 'test_mod_annex_file'
    test_copy_file.write_text("Heavy stuff.")
    # save all the way up
    assert_status(('ok', 'notneeded'),
                  top.save(message='subsub got something', recursive=True))
    assert_repo_status(top.path)
    # publish straight up, should be smart by default
    res = top.push(to="target", recursive=True)
    # we see 3 out of 4 datasets pushed (sub noannex was left unchanged)
    for d in (top, sub, subsub):
        assert_in_results(res,
                          status='ok',
                          type='dataset',
                          path=d.path,
                          refspec='refs/heads/master:refs/heads/master')
    # file content copied too
    assert_in_results(res,
                      action='copy',
                      status='ok',
                      path=str(test_copy_file))
    # verify it is accessible, drop and bring back
    assert_status('ok', top.drop(str(test_copy_file)))
    ok_(not subsub.repo.file_has_content('test_mod_annex_file'))
    top.get(test_copy_file)
    ok_file_has_content(test_copy_file, 'Heavy stuff.')

    # make two modification
    (sub.pathobj / 'test_mod_annex_file').write_text('annex')
    (subnoannex.pathobj / 'test_mod_file').write_text('git')
    # save separately
    top.save(sub.pathobj, message='annexadd', recursive=True)
    top.save(subnoannex.pathobj, message='gitadd', recursive=True)
    # now only publish the latter one
    res = top.push(to="target", since='HEAD~1', recursive=True)
    # nothing copied, no reports on the other modification
    assert_not_in_results(res, action='copy')
    assert_not_in_results(res, path=sub.path)
    for d in (top, subnoannex):
        assert_in_results(res,
                          status='ok',
                          type='dataset',
                          path=d.path,
                          refspec='refs/heads/master:refs/heads/master')
    # an unconditional push should now pick up the remaining changes
    res = top.push(to="target", recursive=True)
    assert_in_results(res,
                      action='copy',
                      status='ok',
                      path=str(sub.pathobj / 'test_mod_annex_file'))
    assert_in_results(res,
                      status='ok',
                      type='dataset',
                      path=sub.path,
                      refspec='refs/heads/master:refs/heads/master')
    for d in (top, subnoannex, subsub):
        assert_in_results(res,
                          status='notneeded',
                          type='dataset',
                          path=d.path,
                          refspec='refs/heads/master:refs/heads/master')
Example #5
0
    def __call__(path=None,
                 source=None,
                 dataset=None,
                 get_data=False,
                 description=None,
                 recursive=False,
                 recursion_limit=None,
                 reckless=None,
                 jobs="auto"):

        # normalize path argument to be equal when called from cmdline and
        # python and nothing was passed into `path`
        path = ensure_list(path)

        if not source and not path:
            raise InsufficientArgumentsError(
                "Please provide at least a source or a path")

        #  Common kwargs to pass to underlying git/install calls.
        #  They might need adjustments (e.g. for recursion_limit, but
        #  otherwise would be applicable throughout
        #
        # There should have been more of common options!
        # since underneath get could do similar installs
        common_kwargs = dict(
            get_data=get_data,
            recursive=recursive,
            recursion_limit=recursion_limit,
            # git_opts=git_opts,
            # annex_opts=annex_opts,
            reckless=reckless,
            jobs=jobs,
        )

        # did we explicitly get a dataset to install into?
        # if we got a dataset, path will be resolved against it.
        # Otherwise path will be resolved first.
        ds = None
        if dataset is not None:
            ds = require_dataset(dataset,
                                 check_installed=True,
                                 purpose='installation')
            common_kwargs['dataset'] = dataset
        # pre-compute for results below
        refds_path = Interface.get_refds_path(ds)

        # switch into the two scenarios without --source:
        # 1. list of URLs
        # 2. list of (sub)dataset content
        if source is None:
            # we need to collect URLs and paths
            to_install = []
            to_get = []
            # TODO: this approach is problematic, it disrupts the order of input args.
            # consequently results will be returned in an unexpected order when a
            # mixture of source URL and paths is given. Reordering is only possible when
            # everything in here is fully processed before any results can be yielded.
            # moreover, I think the semantics of the status quo implementation are a
            # bit complicated: in a mixture list a source URL will lead to a new dataset
            # at a generated default location, but a path will lead to a subdataset
            # at that exact location
            for urlpath in path:
                ri = RI(urlpath)
                (to_get
                 if isinstance(ri, PathRI) else to_install).append(urlpath)

            # 1. multiple source URLs
            for s in to_install:
                lgr.debug("Install passes into install source=%s", s)
                for r in Install.__call__(
                        source=s,
                        description=description,
                        # we need to disable error handling in order to have it done at
                        # the very top, otherwise we are not able to order a global
                        # "ignore-and-keep-going"
                        on_failure='ignore',
                        return_type='generator',
                        result_xfm=None,
                        result_filter=None,
                        **common_kwargs):
                    # no post-processing of the installed content on disk
                    # should be necessary here, all done by code further
                    # down that deals with an install from an actuall `source`
                    # any necessary fixes should go there too!
                    r['refds'] = refds_path
                    yield r

            # 2. one or more dataset content paths
            if to_get:
                lgr.debug("Install passes into get %d items", len(to_get))
                # all commented out hint on inability to pass those options
                # into underlying install-related calls.
                # Also need to pass from get:
                #  annex_get_opts

                for r in Get.__call__(
                        to_get,
                        # TODO should pass-through description, not sure why disabled
                        # description=description,
                        # we need to disable error handling in order to have it done at
                        # the very top, otherwise we are not able to order a global
                        # "ignore-and-keep-going"
                        on_failure='ignore',
                        return_type='generator',
                        result_xfm=None,
                        result_filter=None,
                        **common_kwargs):
                    # no post-processing of get'ed content on disk should be
                    # necessary here, this is the responsibility of `get`
                    # (incl. adjusting parent's gitmodules when submodules end
                    # up in an "updated" state (done in get helpers)
                    # any required fixes should go there!
                    r['refds'] = refds_path
                    yield r

            # we are done here
            # the rest is about install from a `source`
            return

        # an actual `source` was given
        if source and path and len(path) > 1:
            # exception is ok here, if this fails it is either direct user error
            # or we f****d up one of our internal calls
            raise ValueError(
                "install needs a single PATH when source is provided.  "
                "Was given mutliple PATHs: %s" % str(path))

        # parameter constraints:
        if not source:
            # exception is ok here, if this fails it is either direct user error
            # or we f****d up one of our internal calls
            raise InsufficientArgumentsError(
                "a `source` is required for installation")

        # code below deals with a single path only
        path = path[0] if path else None

        if source == path:
            # even if they turn out to be identical after resolving symlinks
            # and more sophisticated witchcraft, it would still happily say
            # "it appears to be already installed", so we just catch an
            # obviously pointless input combination
            yield get_status_dict(
                'install',
                path=path,
                status='impossible',
                logger=lgr,
                source_url=source,
                refds=refds_path,
                message=
                "installation `source` and destination `path` are identical. "
                "If you are trying to add a subdataset simply use the `save` command"
            )
            return

        # resolve the target location (if local) against the provided dataset
        # or CWD:
        if path is not None:
            # MIH everything in here is highly similar to what common
            # interface helpers do (or should/could do), but at the same
            # is very much tailored to just apply to `install` -- I guess
            # it has to stay special

            # Should work out just fine for regular paths, so no additional
            # conditioning is necessary
            try:
                path_ri = RI(path)
            except Exception as e:
                raise ValueError("invalid path argument {}: ({})".format(
                    path, exc_str(e)))
            try:
                # Wouldn't work for SSHRI ATM, see TODO within SSHRI
                # yoh: path should be a local path, and mapping note within
                #      SSHRI about mapping localhost:path to path is kinda
                #      a peculiar use-case IMHO
                # TODO Stringification can be removed once PY35 is no longer
                # supported
                path = str(resolve_path(path_ri.localpath, dataset))
                # any `path` argument that point to something local now
                # resolved and is no longer a URL
            except ValueError:
                # `path` is neither a valid source nor a local path.
                # TODO: The only thing left is a known subdataset with a
                # name, that is not a path; Once we correctly distinguish
                # between path and name of a submodule, we need to consider
                # this.
                # For now: Just raise
                raise ValueError("Invalid path argument {0}".format(path))
        # `path` resolved, if there was any.

        # clone dataset, will also take care of adding to superdataset, if one
        # is given
        res = Clone.__call__(
            source,
            path,
            dataset=ds,
            description=description,
            reckless=reckless,
            # we need to disable error handling in order to have it done at
            # the very top, otherwise we are not able to order a global
            # "ignore-and-keep-going"
            result_xfm=None,
            return_type='generator',
            result_filter=None,
            on_failure='ignore')
        # helper
        as_ds = YieldDatasets()
        destination_dataset = None
        for r in res:
            if r['action'] == 'install' and r['type'] == 'dataset':
                # make sure logic below is valid, only one dataset result is
                # coming back
                assert (destination_dataset is None)
                destination_dataset = as_ds(r)
            r['refds'] = refds_path
            yield r
        assert (destination_dataset)

        # Now, recursive calls:
        if recursive or get_data:
            # dataset argument must not be passed inside since we use bound .get
            # It is ok to do "inplace" as long as we still return right
            # after the loop ends
            common_kwargs.pop('dataset', '')
            for r in destination_dataset.get(
                    curdir,
                    description=description,
                    # we need to disable error handling in order to have it done at
                    # the very top, otherwise we are not able to order a global
                    # "ignore-and-keep-going"
                    on_failure='ignore',
                    return_type='generator',
                    result_xfm=None,
                    **common_kwargs):
                r['refds'] = refds_path
                yield r
        # at this point no futher post-processing should be necessary,
        # `clone` and `get` must have done that (incl. parent handling)
        # if not, bugs should be fixed in those commands
        return
Example #6
0
def get_cached_dataset(url, version=None, keys=None):
    """ Helper to get a cached clone from url

    Intended for use from within `cached_dataset` and `cached_url` decorators.
    Clones `url` into user's cache under datalad/tests/`name`. If such a clone
    already exists, don't clone but return the existing one. So, it's supposed
    to cache the original source in order to reduce time and traffic for tests,
    by letting subsequent requests clone from a local location directly.

    If it's an annex get the content as provided by `keys`, too.
    Note, that as a transparent cache replacing the repo at URL from the POV of
    a test, we can't address content via paths, since those are valid only with
    respect to a particular worktree. If different tests clone from the same
    cached dataset, each requesting different versions and different paths
    thereof, we run into trouble if the cache itself checks out a particular
    requested version.

    Verifies that `version` can be checked out, but doesn't actually do it,
    since the cached dataset is intended to be used as origin instead of the
    original remote at URL by the `cached_dataset` test decorator. Checkout of
    a particular version should happen in its clone.

    Parameters
    ----------
    url: str
        URL to clone from
    keys: str or list or None
        (list of) annex keys to get content for.
    version: str or None
        A commit or an object that can be dereferenced to one.

    Returns
    -------
    Dataset
    """

    # TODO: What about recursive? Might be complicated. We would need to make
    #       sure we can recursively clone _from_ here then, potentially
    #       requiring submodule URL rewrites. Not sure about that ATM.

    # TODO: Given that it is supposed to be a cache for the original repo at
    #       `url`, we prob. should make this a bare repository. We don't need
    #       a potentially expensive checkout here. Need to double check
    #       `annex-get --key` in bare repos, though. Plus datalad-clone doesn't
    #       have --bare yet. But we want all the annex/special-remote/ria magic
    #       of datalad. So, plain git-clone --bare is not an option.

    if not DATALAD_TESTS_CACHE:
        raise ValueError("Caching disabled by config")

    ds = Dataset(DATALAD_TESTS_CACHE / url2filename(url))

    if not ds.is_installed():
        ds = Clone()(url, ds.pathobj)

    # When/How to update a dataset in cache? If version is a commit SHA and we
    # have it, there's no need for an update. Otherwise it gets tricky, because
    # this is a cache, not a checkout a test would operate on. It needs to
    # behave as if it was the thing at `url` from the point of view of the test
    # using it (cloning/getting content from here). We would need to update all
    # references, not just fetch them!
    #
    # Can we even (cheaply) tell whether `version` is an absolute reference
    # (actual SHA, not a branch/tag)?
    #
    # NOTE: - consider git-clone --mirror, but as w/ --bare: not an option for
    #         datalad-clone yet.
    #       - --reference[-if-able] might also be worth thinking about for
    #         the clone @cached_dataset creates wrt clone in cacheq
    #
    # So, for now fetch, figure whether there actually was something to fetch
    # and if so simply invalidate cache and re-clone/get. Don't overcomplicate
    # things. It's about datasets used in the tests - they shouldn't change too
    # frequently.
    elif any('uptodate' not in c['operations']
             for c in ds.repo.fetch(DEFAULT_REMOTE)):
        rmtree(ds.path)
        ds = Clone()(url, ds.pathobj)

    if version:
        # check whether version is available
        assert ds.repo.commit_exists(version)
    if keys and AnnexRepo.is_valid_repo(ds.path):
        ds.repo.get(keys, key=True)

    return ds