Example #1
def test_resolve_path(path):
    if str(Path(path).resolve()) != path:
        raise SkipTest("Test assumptions require non-symlinked parent paths")
    # initially ran into on OSX https://github.com/datalad/datalad/issues/2406
    opath = op.join(path, "origin")
    os.makedirs(opath)
    if not on_windows:
        lpath = op.join(path, "linked")
        os.symlink('origin', lpath)

    ds_global = Dataset(path)
    # path resolution of absolute paths is not influenced by symlinks
    # ignore the linked path on windows, it is not a symlink in the POSIX sense
    for d in (opath, ) if on_windows else (opath, lpath):
        ds_local = Dataset(d)
        # no symlink resolution
        eq_(str(resolve_path(d)), d)
        # list comes out as a list
        eq_(resolve_path([d]), [Path(d)])
        # multiple OK
        eq_(resolve_path([d, d]), [Path(d), Path(d)])

        with chpwd(d):
            # be aware: knows about cwd, but this CWD has symlinks resolved
            eq_(str(resolve_path(d).cwd()), opath)
            # using pathlib's `resolve()` will resolve any
            # symlinks
            # also resolve `opath`, as on old windows systems the path might
            # come in crippled (e.g. C:\Users\MIKE~1/...)
            # and the comparison would fail unjustifiably
            eq_(resolve_path('.').resolve(), ut.Path(opath).resolve())
            # no norming, but absolute paths, without resolving links
            eq_(resolve_path('.'), ut.Path(d))
            eq_(str(resolve_path('.')), d)

            # there is no concept of an "explicit" relative path anymore
            # relative is relative, regardless of the specific syntax
            eq_(resolve_path(op.join(os.curdir, 'bu'), ds=ds_global),
                ds_global.pathobj / 'bu')
            # there is no full normpath-ing or other funky resolution of
            # parent directory back-reference
            eq_(str(resolve_path(op.join(os.pardir, 'bu'), ds=ds_global)),
                op.join(ds_global.path, os.pardir, 'bu'))

        # resolve against a dataset given as a path/str
        # (cmdline input scenario)
        eq_(resolve_path('bu', ds=ds_local.path), Path.cwd() / 'bu')
        eq_(resolve_path('bu', ds=ds_global.path), Path.cwd() / 'bu')
        # resolve against a dataset given as a dataset instance
        # (object method scenario)
        eq_(resolve_path('bu', ds=ds_local), ds_local.pathobj / 'bu')
        eq_(resolve_path('bu', ds=ds_global), ds_global.pathobj / 'bu')
        # not being inside a dataset doesn't change the resolution result
        eq_(resolve_path(op.join(os.curdir, 'bu'), ds=ds_global),
            ds_global.pathobj / 'bu')
        eq_(str(resolve_path(op.join(os.pardir, 'bu'), ds=ds_global)),
            op.join(ds_global.path, os.pardir, 'bu'))
Example #2
 def custom_result_renderer(res, **kwargs):  # pragma: no cover
     if not (res['status'] == 'ok' \
             and res['action'] in ('status', 'diff') \
             and res.get('state', None) != 'clean'):
         # logging reported already
         return
     from datalad.ui import ui
     # when to render relative paths:
     #  1) if a dataset arg was given
     #  2) if CWD is the refds
     refds = res.get('refds', None)
     refds = refds if kwargs.get('dataset', None) is not None \
         or refds == os.getcwd() else None
     path = res['path'] if refds is None \
         else str(ut.Path(res['path']).relative_to(refds))
     type_ = res.get('type', res.get('type_src', ''))
     max_len = len('untracked')
     state = res['state']
     ui.message('{fill}{state}: {path}{type_}'.format(
         fill=' ' * max(0, max_len - len(state)),
         state=ac.color_word(
             state,
             STATE_COLOR_MAP.get(res['state'], ac.WHITE)),
         path=path,
         type_=' ({})'.format(
             ac.color_word(type_, ac.MAGENTA) if type_ else '')))
Example #3
 def pathobj(self):
     """pathobj for the dataset"""
     # XXX this relies on the assumption that self._path as managed
     # by the base class is always a native path
     if not self._pathobj:
         self._pathobj = ut.Path(self._path)
     return self._pathobj
Example #4
 def custom_result_renderer(res, **kwargs):  # pragma: no cover
     if not (res['status'] == 'ok' \
             and res['action'] in ('status', 'diff') \
             and res.get('state', None) != 'clean'):
         # logging reported already
         return
     from datalad.ui import ui
     # when to render relative paths:
     #  1) if a dataset arg was given
     #  2) if CWD is the refds
     refds = res.get('refds', None)
     refds = refds if kwargs.get('dataset', None) is not None \
         or refds == os.getcwd() else None
     # Note: We have to force unicode for res['path'] because
     # interface.utils encodes it on py2 before passing it to
     # custom_result_renderer().
     path = assure_unicode(res['path']) if refds is None \
         else text_type(ut.Path(res['path']).relative_to(refds))
     type_ = res.get('type', res.get('type_src', ''))
     max_len = len('untracked')
     state = res.get('state', 'unknown')
     ui.message(u'{fill}{state}: {path}{type_}'.format(
         fill=' ' * max(0, max_len - len(state)),
         state=ac.color_word(
             state, STATE_COLOR_MAP.get(res.get('state', 'unknown'))),
         path=path,
         type_=' ({})'.format(
             ac.color_word(type_, ac.MAGENTA) if type_ else '')))
Example #5
def test_status_basics(path, linkpath, otherdir):
    if has_symlink_capability():
        # make it more complicated by default
        ut.Path(linkpath).symlink_to(path, target_is_directory=True)
        path = linkpath

    with chpwd(path):
        assert_raises(NoDatasetFound, status)
    ds = Dataset(path).create()
    # outcome identical between ds= and auto-discovery
    with chpwd(path):
        assert_raises(IncompleteResultsError, status, path=otherdir)
        stat = status(result_renderer=None)
    eq_(stat, ds.status(result_renderer=None))
    assert_status('ok', stat)
    # we have a bunch of reports (be vague to be robust to future changes)
    assert len(stat) > 2
    # check the composition
    for s in stat:
        eq_(s['status'], 'ok')
        eq_(s['action'], 'status')
        eq_(s['state'], 'clean')
        eq_(s['type'], 'file')
        assert_in('gitshasum', s)
        assert_in('bytesize', s)
        eq_(s['refds'], ds.path)
Example #6
 def custom_result_renderer(res, **kwargs):  # pragma: more cover
     if (res['status'] == 'ok' and res['action'] in ('status', 'diff')
             and res.get('state') == 'clean'):
         # this renderer will be silent for clean status|diff results
         return
     if res['status'] != 'ok' or res['action'] not in ('status', 'diff'):
         # whatever this renderer cannot account for, send to generic
         generic_result_renderer(res)
         return
     from datalad.ui import ui
     # when to render relative paths:
     #  1) if a dataset arg was given
     #  2) if CWD is the refds
     refds = res.get('refds', None)
     refds = refds if kwargs.get('dataset', None) is not None \
         or refds == os.getcwd() else None
     path = res['path'] if refds is None \
         else str(ut.Path(res['path']).relative_to(refds))
     type_ = res.get('type', res.get('type_src', ''))
     max_len = len('untracked')
     state = res.get('state', 'unknown')
     ui.message(u'{fill}{state}: {path}{type_}'.format(
         fill=' ' * max(0, max_len - len(state)),
         state=ac.color_word(
             state, STATE_COLOR_MAP.get(res.get('state', 'unknown'))),
         path=path,
         type_=' ({})'.format(
             ac.color_word(type_, ac.MAGENTA) if type_ else '')))
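
For orientation, the renderer above pads the state label to the width of 'untracked' and appends the result type in parentheses. A minimal, self-contained sketch of that line layout (colors omitted; the states and paths below are made up for illustration):

def demo_render():
    # mirror the '{fill}{state}: {path}{type_}' layout used above
    max_len = len('untracked')
    for state, path, type_ in [('untracked', 'newfile.txt', 'file'),
                               ('modified', 'subdir/annexed_file.txt', 'file'),
                               ('deleted', 'obsolete.dat', 'file')]:
        print('{fill}{state}: {path}{type_}'.format(
            fill=' ' * max(0, max_len - len(state)),
            state=state,
            path=path,
            type_=' ({})'.format(type_) if type_ else ''))

# demo_render() prints:
# untracked: newfile.txt (file)
#  modified: subdir/annexed_file.txt (file)
#   deleted: obsolete.dat (file)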
Example #7
def test_add_files(path):
    ds = Dataset(path).create(force=True)

    test_list_1 = ['test_annex.txt']
    test_list_2 = ['test.txt']
    test_list_3 = ['test1.dat', 'test2.dat']
    test_list_4 = [
        op.join('dir', 'testindir'),
        op.join('dir', OBSCURE_FILENAME)
    ]

    for arg in [(test_list_1[0], False), (test_list_2[0], True),
                (test_list_3, False), (test_list_4, False)]:
        # special case 4: give the dir:
        if arg[0] == test_list_4:
            result = ds.save('dir', to_git=arg[1])
            status = ds.repo.annexstatus(['dir'])
        else:
            result = ds.save(arg[0], to_git=arg[1])
            for a in assure_list(arg[0]):
                assert_result_count(result, 1, path=text_type(ds.pathobj / a))
            status = ds.repo.get_content_annexinfo(
                ut.Path(p) for p in assure_list(arg[0]))
        for f, p in iteritems(status):
            if arg[1]:
                assert p.get('key', None) is None, f
            else:
                assert p.get('key', None) is not None, f
Example #8
def test_rev_resolve_path(path):
    if op.realpath(path) != path:
        raise SkipTest("Test assumptions require non-symlinked parent paths")
    # initially ran into on OSX https://github.com/datalad/datalad/issues/2406
    opath = op.join(path, "origin")
    os.makedirs(opath)
    if not on_windows:
        lpath = op.join(path, "linked")
        os.symlink('origin', lpath)

    ds_global = Dataset(path)
    # path resolution of absolute paths is not influenced by symlinks
    # ignore the linked path on windows, it is not a symlink in the POSIX sense
    for d in (opath, ) if on_windows else (opath, lpath):
        ds_local = Dataset(d)
        # no symlink resolution
        eq_(str(rev_resolve_path(d)), d)
        with chpwd(d):
            # be aware: knows about cwd, but this CWD has symlinks resolved
            eq_(str(rev_resolve_path(d).cwd()), opath)
            # using pathlib's `resolve()` will resolve any
            # symlinks
            # also resolve `opath`, as on old windows systems the path might
            # come in crippled (e.g. C:\Users\MIKE~1/...)
            # and the comparison would fail unjustifiably
            eq_(rev_resolve_path('.').resolve(), ut.Path(opath).resolve())
            # no norming, but absolute paths, without resolving links
            eq_(rev_resolve_path('.'), ut.Path(d))
            eq_(str(rev_resolve_path('.')), d)

            eq_(str(rev_resolve_path(op.join(os.curdir, 'bu'), ds=ds_global)),
                op.join(d, 'bu'))
            eq_(str(rev_resolve_path(op.join(os.pardir, 'bu'), ds=ds_global)),
                op.join(ds_global.path, 'bu'))

        # resolve against a dataset
        eq_(str(rev_resolve_path('bu', ds=ds_local)), op.join(d, 'bu'))
        eq_(str(rev_resolve_path('bu', ds=ds_global)), op.join(path, 'bu'))
        # but paths outside the dataset are left untouched
        eq_(str(rev_resolve_path(op.join(os.curdir, 'bu'), ds=ds_global)),
            op.join(getpwd(), 'bu'))
        eq_(str(rev_resolve_path(op.join(os.pardir, 'bu'), ds=ds_global)),
            op.normpath(op.join(getpwd(), os.pardir, 'bu')))
Example #9
    def _flyweight_postproc_path(cls, path):
        """perform any desired path post-processing (e.g., dereferencing etc)

        By default - realpath to guarantee reuse. Derived classes (e.g.,
        Dataset) could override to allow for symlinked datasets to have
        individual instances for multiple symlinks
        """
        # resolve symlinks to make sure we have exactly one instance per
        # physical repository at a time
        # do absolute() in addition to always get an absolute path
        # even with non-existing paths on windows
        resolved = str(ut.Path(path).resolve().absolute())
        if ut.on_windows and resolved.startswith('\\\\'):
            # resolve() ended up converting a mounted network drive into a UNC path.
            # such paths are not supported (e.g. as cmd.exe CWD), hence redo and take
            # absolute path at face value. This has the consequence we cannot determine
            # repo duplicates mounted on different drives, but this is no worse than
            # on UNIX
            return str(ut.Path(path).absolute())
        return resolved
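
The docstring above notes that derived classes (e.g. Dataset) could override this hook so that symlinked dataset locations keep individual flyweight instances. A hypothetical sketch of such an override (the class name is illustrative and not part of DataLad; the imports assume the usual locations):

import datalad.utils as ut
from datalad.distribution.dataset import Dataset


class SymlinkKeepingDataset(Dataset):
    """Illustrative subclass: one flyweight instance per symlinked location"""

    @classmethod
    def _flyweight_postproc_path(cls, path):
        # only absolutize; skip resolve() so that symlinked locations are
        # not collapsed onto the same physical repository path
        return str(ut.Path(path).absolute())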
Example #10
def test_hashable(path):
    path = ut.Path(path)
    tryme = set()
    # is it considered hashable at all
    tryme.add(Dataset(path / 'one'))
    eq_(len(tryme), 1)
    # do another one, same class different path
    tryme.add(Dataset(path / 'two'))
    eq_(len(tryme), 2)
    # test whether two different types of repo instances pointing
    # to the same repo on disk are considered different
    Dataset(path).create()
    tryme.add(GitRepo(path))
    eq_(len(tryme), 3)
    tryme.add(AnnexRepo(path))
    eq_(len(tryme), 4)
Example #11
def test_gh1597_simpler(path):
    ds = Dataset(path).create()
    # same goes for .gitattributes
    with open(op.join(ds.path, '.gitignore'), 'a') as f:
        f.write('*.swp\n')
    ds.save('.gitignore')
    assert_repo_status(ds.path)
    # put .gitattributes in some subdir and add all, should also go into Git
    attrfile = op.join('subdir', '.gitattributes')
    ds.repo.set_gitattributes([('*', dict(mycustomthing='this'))], attrfile)
    assert_repo_status(ds.path, untracked=[attrfile], untracked_mode='all')
    ds.save()
    assert_repo_status(ds.path)
    # no annex key, not in annex
    assert_not_in(
        'key',
        ds.repo.get_content_annexinfo([ut.Path(attrfile)]).popitem()[1])
Example #12
def test_resolve_path_symlink_edition(path):
    deepest = ut.Path(path) / 'one' / 'two' / 'three'
    deepest_str = str(deepest)
    os.makedirs(deepest_str)
    with chpwd(deepest_str):
        # direct absolute
        eq_(deepest, resolve_path(deepest))
        eq_(deepest, resolve_path(deepest_str))
        # explicit direct relative
        eq_(deepest, resolve_path('.'))
        eq_(deepest, resolve_path(op.join('.', '.')))
        eq_(deepest, resolve_path(op.join('..', 'three')))
        eq_(deepest, resolve_path(op.join('..', '..', 'two', 'three')))
        eq_(deepest,
            resolve_path(op.join('..', '..', '..', 'one', 'two', 'three')))
        # weird ones
        eq_(deepest, resolve_path(op.join('..', '.', 'three')))
        eq_(deepest, resolve_path(op.join('..', 'three', '.')))
        eq_(deepest, resolve_path(op.join('.', '..', 'three')))
Example #13
def path_under_rev_dataset(ds, path):
    ds_path = ds.pathobj
    try:
        rpath = text_type(ut.Path(path).relative_to(ds_path))
        if not rpath.startswith(op.pardir):
            # path is already underneath the dataset
            return path
    except Exception:
        # whatever went wrong, we gotta play it safe
        pass

    root = rev_get_dataset_root(text_type(path))
    while root is not None and not ds_path.samefile(root):
        # path and therefore root could be relative paths,
        # hence in the next round we cannot use dirname()
        # to jump to the next directory up, but we have
        # to use ./.. and get_dataset_root() will handle
        # the rest just fine
        root = rev_get_dataset_root(op.join(root, op.pardir))
    if root is None:
        return None
    return ds_path / op.relpath(text_type(path), root)
Example #14
    def __call__(
        path=None,
        message=None,
        dataset=None,
        version_tag=None,
        recursive=False,
        recursion_limit=None,
        updated=False,
        message_file=None,
        to_git=None,
    ):
        if message and message_file:
            raise ValueError(
                "Both a message and message file were specified for save()")

        path = assure_list(path)

        if message_file:
            with open(message_file) as mfh:
                message = mfh.read()

        # we want 'normal' to achieve the most compact argument list
        # for git calls
        # untracked_mode = 'no' if updated else 'normal'
        # TODO however, Repo.add() would refuse to add any dotfiles
        # in a directory that is itself untracked, hence the only
        # choice is to go with potentially crazy long lists
        # until https://github.com/datalad/datalad/issues/1454
        # has a resolution
        untracked_mode = 'no' if updated else 'all'

        # there are three basic scenarios:
        # 1. save modifications to any already tracked content
        # 2. save any content (including removal of deleted content)
        #    to bring things to a clean state
        # 3. like (2), but only operate on a given subset of content
        #    identified by paths
        # - all three have to work in conjunction with --recursive
        # - the difference between (1) and (2) should be no more
        #   than a switch from --untracked=no to --untracked=all
        #   in Repo.save()

        # we do not support
        # - simultaneous operations on multiple datasets from disjoint
        #   dataset hierarchies, hence a single reference dataset must be
        #   identifiable from either
        #   - curdir or
        #   - the `dataset` argument.
        #   This avoids complex annotation loops and hierarchy tracking.
        # - any modification upwards from the root dataset

        ds = require_dataset(dataset, check_installed=True, purpose='saving')

        # use status() to do all discovery and annotation of paths
        paths_by_ds = {}
        for s in Status()(
                # ATTN: it is vital to pass the `dataset` argument as-is,
                # and not a dataset instance, in order to maintain the path
                # semantics between here and the status() call
                dataset=dataset,
                path=path,
                untracked=untracked_mode,
                recursive=recursive,
                recursion_limit=recursion_limit,
                result_renderer='disabled'):
            # fish out status dict for this parent dataset
            ds_status = paths_by_ds.get(s['parentds'], {})
            # reassemble path status info as repo.status() would have made it
            ds_status[ut.Path(s['path'])] = \
                {k: v for k, v in iteritems(s)
                 if k not in (
                     'path', 'parentds', 'refds', 'status', 'action',
                     'logger')}
            paths_by_ds[s['parentds']] = ds_status

        lgr.debug('Determined %i datasets for saving from input arguments',
                  len(paths_by_ds))
        # figure out what datasets to process, start with the ones containing
        # the paths that were given as arguments
        discovered_datasets = list(paths_by_ds.keys())
        if dataset:
            # if a reference dataset was given we want to save all the way up
            # to it, so let's throw it into the mix
            discovered_datasets.append(ds.path)
        # sort the datasets into (potentially) disjoint hierarchies,
        # or a single one, if a reference dataset was given
        dataset_hierarchies = get_tree_roots(discovered_datasets)
        for rootds, children in iteritems(dataset_hierarchies):
            edges = {}
            discover_dataset_trace_to_targets(rootds,
                                              children, [],
                                              edges,
                                              includeds=children)
            for superds, subdss in iteritems(edges):
                superds_status = paths_by_ds.get(superds, {})
                for subds in subdss:
                    # TODO actually start from an entry that may already
                    # exist in the status record
                    superds_status[ut.Path(subds)] = dict(
                        # shot from the hip, some status config
                        # to trigger this specific super/sub
                        # relation to be saved
                        state='untracked',
                        type='dataset')
                paths_by_ds[superds] = superds_status

        # TODO parallelize; whenever we have multiple subdatasets of a single
        # dataset they can all be processed simultaneously
        # sort the list of datasets to handle, starting with the ones deep down
        for pdspath in sorted(paths_by_ds, reverse=True):
            pds = Dataset(pdspath)
            # pop status for this dataset, we are not coming back to it
            pds_status = {
                # for handing over to the low-level code, we recode any
                # path relative to the real repo location; this avoids
                # cumbersome symlink handling without context in the
                # lower levels
                pds.repo.pathobj / p.relative_to(pdspath): props
                for p, props in iteritems(paths_by_ds.pop(pdspath))
            }
            start_commit = pds.repo.get_hexsha()
            if not all(p['state'] == 'clean' for p in pds_status.values()):
                for res in pds.repo.save_(
                        message=message,
                        # make sure to have the `path` arg be None, as we want
                        # to prevent and bypass any additional repo.status()
                        # calls
                        paths=None,
                        # prevent whining of GitRepo
                        git=True
                        if not hasattr(ds.repo, 'annexstatus') else to_git,
                        # we are supplying the full status already, do not
                        # detect anything else
                        untracked='no',
                        _status=pds_status):
                    # TODO remove stringification when datalad-core can handle
                    # path objects, or when PY3.6 is the lowest supported
                    # version
                    for k in ('path', 'refds'):
                        if k in res:
                            res[k] = str(
                                # recode path back to dataset path anchor
                                pds.pathobj /
                                res[k].relative_to(pds.repo.pathobj))
                    yield res
            # report on the dataset itself
            dsres = dict(
                action='save',
                type='dataset',
                path=pds.path,
                refds=ds.path,
                status='ok'
                if start_commit != pds.repo.get_hexsha() else 'notneeded',
                logger=lgr,
            )
            if not version_tag:
                yield dsres
                continue
            try:
                pds.repo.tag(version_tag)
                dsres.update(status='ok', version_tag=version_tag)
                yield dsres
            except CommandError as e:
                if dsres['status'] == 'ok':
                    # first we yield the result for the actual save
                    yield dsres.copy()
                # and now complain that tagging didn't work
                dsres.update(status='error',
                             message=('cannot tag this version: %s',
                                      e.stderr.strip()))
                yield dsres
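
The three scenarios outlined in the comments above (save tracked modifications, save everything, save a path subset) map onto straightforward calls of the bound Dataset method. A rough usage sketch (the dataset location and file name are illustrative):

from datalad.api import Dataset

ds = Dataset('/tmp/demo_ds')              # illustrative location
ds.create(force=True)                     # ensure a dataset exists
(ds.pathobj / 'notes.txt').write_text(u'some content')
# scenario 3: save only a given path, tag the new state,
# and commit the file to git rather than annex
res = ds.save(path='notes.txt',
              message='add notes',
              version_tag='v0.1',
              to_git=True)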
Example #15
def get_paths_by_ds(refds, dataset_arg, paths, subdsroot_mode='rsync'):
    """Resolve and sort any paths into their containing datasets

    Any path will be associated with (sorted into) its nearest containing dataset.
    It is irrelevant whether or not a path presently exists on the file system.
    However, only datasets that exist on the file system are used for
    sorting/association -- known, but non-existent subdatasets are not
    considered.

    Parameters
    ----------
    refds: Dataset
    dataset_arg: Dataset or str or Path or None
      Any supported value given to a command's `dataset` argument. Given
      to `resolve_path()`.
    paths: list
      Any number of absolute or relative paths, in str-form or as
      Path instances, to be sorted into their respective datasets. See also
      the `subdsroot_mode` parameter.
    subdsroot_mode: {'rsync', 'super', 'sub'}
      Switch behavior for paths that are the root of a subdataset. By default
      ('rsync'), such a path is associated with its parent/superdataset,
      unless the path ends with a trailing directory separator, in which case
      it is sorted into the subdataset record (this resembles the path
      semantics of rsync, hence the label). In 'super' mode, the path is always
      placed with the superdataset record. Likewise, in 'sub' mode the path
      is always placed into the subdataset record.

    Returns
    -------
    dict, list
      The first return value is the main result, a dictionary with root
      directories of all discovered datasets as keys and a list of the
      associated paths inside these datasets as values.  Keys and values are
      normalized to be Path instances of absolute paths.
      The second return value is a list of all paths (again Path instances)
      that are not located underneath the reference dataset.
    """
    ds_path = refds.path
    paths_by_ds = OrderedDict()
    errors = []

    if not paths:
        # that was quick
        paths_by_ds[refds.pathobj] = None
        return paths_by_ds, errors

    # in order to guarantee proper path sorting, we first need to resolve all
    # of them (some may be str, some Path, some relative, some absolute)
    # step 1: normalize to unicode
    paths = map(ensure_unicode, paths)
    # step 2: resolve
    # for later comparison, we need to preserve the original value too
    paths = [(resolve_path(p, dataset_arg), str(p)) for p in paths]
    # OPT: store cache for dataset roots for each directory directly
    #      listed in paths, or containing the path (if file)
    roots_cache = {}
    # sort any path argument into the respective subdatasets
    # sort by comparing the resolved Path instances; this puts top-level
    # paths first, leading to their datasets being injected into the result
    # dict first
    for p, orig_path in sorted(paths, key=lambda x: x[0]):
        # TODO (left from implementing caching OPT):
        # Logic here sounds duplicated with discover_dataset_trace_to_targets
        # and even get_tree_roots of save.
        str_p = str(p)

        # query get_dataset_root caching for repeated queries within the same
        # directory
        if p.is_dir():
            p_dir = str(p)
        else:  # symlink, file, whatnot - seems to match logic in get_dataset_root
            p_dir = str(p.parent)

        try:
            root = roots_cache[p_dir]
        except KeyError:
            root = roots_cache[p_dir] = get_dataset_root(p_dir)

        # to become the root of the dataset that contains the path in question
        # in the context of (i.e. with the same basepath as) the reference dataset
        qds_inrefds = None
        if root is not None:
            qds_inrefds = path_under_rev_dataset(refds, root)
        if root is None or qds_inrefds is None:
            # no root, not possibly underneath the refds
            # or root that is not underneath/equal the reference dataset root
            errors.append(p)
            continue

        if root != qds_inrefds:
            # recode the dataset path wrt the reference dataset:
            # the path this dataset was located by (possibly a resolved
            # realpath or another funky form) is not how it would be
            # referenced underneath the refds
            # -> recode all paths to be underneath the refds
            p = qds_inrefds / p.relative_to(root)
            root = qds_inrefds

        # Note: Compare to Dataset(root).path rather
        # than root to get same path normalization.
        if root == str_p and not Dataset(root).path == ds_path and (
                subdsroot_mode == 'super' or
            (subdsroot_mode == 'rsync' and dataset_arg
             and not orig_path.endswith(op.sep))):
            # the given path is pointing to a subdataset
            # and we are either in 'super' mode, or in 'rsync' and found
            # rsync-link syntax to identify the dataset as whole
            # (e.g. 'ds') vs its content (e.g. 'ds/')
            root_dir = op.dirname(root)
            try:
                super_root = roots_cache[root_dir]
            except KeyError:
                super_root = roots_cache[root_dir] = get_dataset_root(root_dir)
            if super_root:
                # the dataset identified by the path argument
                # is contained in a superdataset, and no
                # trailing path separator was found in the
                # argument -> user wants to address the dataset
                # as a whole (in the superdataset)
                root = super_root

        root = ut.Path(root)
        ps = paths_by_ds.get(root, [])
        ps.append(p)
        paths_by_ds[root] = ps
    return paths_by_ds, errors
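
A rough usage sketch for get_paths_by_ds() as defined above (the import location is an assumption about current DataLad layouts, and the dataset/file names are illustrative):

from datalad.distribution.dataset import Dataset
# assumed import location of the helper shown above
from datalad.core.local.status import get_paths_by_ds

refds = Dataset('/tmp/super')   # illustrative superdataset containing 'sub'
paths_by_ds, errors = get_paths_by_ds(
    refds,
    dataset_arg=refds.path,
    # with 'rsync' semantics and a dataset argument given, 'sub' (no trailing
    # separator) is sorted into the superdataset record, while 'sub/' would be
    # sorted into the subdataset itself
    paths=['file1.txt', 'sub'],
)
for root, contained in paths_by_ds.items():
    print(root, contained)
print('outside the reference dataset:', errors)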
Example #16
    def __call__(path=None,
                 initopts=None,
                 force=False,
                 description=None,
                 dataset=None,
                 no_annex=False,
                 fake_dates=False,
                 cfg_proc=None):
        refds_path = dataset.path if hasattr(dataset, 'path') else dataset

        # two major cases
        # 1. we got a `dataset` -> we either want to create it (path is None),
        #    or another dataset in it (path is not None)
        # 2. we got no dataset -> we want to create a fresh dataset at the
        #    desired location, either at `path` or PWD

        # sanity check first
        if no_annex:
            if description:
                raise ValueError("Incompatible arguments: cannot specify "
                                 "description for annex repo and declaring "
                                 "no annex repo.")

        if path:
            path = rev_resolve_path(path, dataset)

        path = path if path \
            else getpwd() if dataset is None \
            else refds_path

        # we know that we need to create a dataset at `path`
        assert (path is not None)

        # prep for yield
        res = dict(action='create',
                   path=text_type(path),
                   logger=lgr,
                   type='dataset',
                   refds=refds_path)

        refds = None
        if refds_path and refds_path != path:
            refds = require_dataset(refds_path,
                                    check_installed=True,
                                    purpose='creating a subdataset')

            path_inrefds = path_under_rev_dataset(refds, path)
            if path_inrefds is None:
                yield dict(
                    res,
                    status='error',
                    message=(
                        "dataset containing given paths is not underneath "
                        "the reference dataset %s: %s", dataset,
                        text_type(path)),
                )
                return

        # try to locate an immediate parent dataset
        # we want to know this (irrespective of whether we plan on adding
        # this new dataset to a parent) in order to avoid conflicts with
        # a potentially absent/uninstalled subdataset of the parent
        # in this location
        # it will cost some filesystem traversal though...
        parentds_path = rev_get_dataset_root(
            op.normpath(op.join(text_type(path), os.pardir)))
        if parentds_path:
            prepo = GitRepo(parentds_path)
            parentds_path = ut.Path(parentds_path)
            # we cannot get away with a simple
            # GitRepo.get_content_info(), as we need to detect
            # uninstalled/added subdatasets too
            check_path = ut.Path(path)
            pstatus = prepo.status(
                untracked='no',
                # limit query to target path for a potentially massive speed-up
                paths=[check_path.relative_to(parentds_path)])
            if any(check_path == p or check_path in p.parents
                   for p in pstatus):
                # redo the check in a slower fashion, it is already broken
                # let's take our time for a proper error message
                conflict = [
                    p for p in pstatus
                    if check_path == p or check_path in p.parents
                ]
                res.update({
                    'status':
                    'error',
                    'message':
                    ('collision with content in parent dataset at %s: %s',
                     text_type(parentds_path),
                     [text_type(c) for c in conflict])
                })
                yield res
                return
            # another set of checks to see whether the target path is pointing
            # into a known subdataset that is not around ATM
            subds_status = {
                parentds_path / k.relative_to(prepo.path)
                for k, v in iteritems(pstatus)
                if v.get('type', None) == 'dataset'
            }
            check_paths = [check_path]
            check_paths.extend(check_path.parents)
            if any(p in subds_status for p in check_paths):
                conflict = [p for p in check_paths if p in subds_status]
                res.update({
                    'status':
                    'error',
                    'message':
                    ('collision with %s (dataset) in dataset %s',
                     text_type(conflict[0]), text_type(parentds_path))
                })
                yield res
                return

        # important to use the given Dataset object to avoid spurious ID
        # changes with not-yet-materialized Datasets
        tbds = dataset if isinstance(dataset, Dataset) and \
            dataset.path == path else Dataset(text_type(path))

        # don't create in non-empty directory without `force`:
        if op.isdir(tbds.path) and listdir(tbds.path) != [] and not force:
            res.update({
                'status':
                'error',
                'message':
                'will not create a dataset in a non-empty directory, use '
                '`force` option to ignore'
            })
            yield res
            return

        # stuff that we create and want to have tracked with git (not annex)
        add_to_git = {}

        if initopts is not None and isinstance(initopts, list):
            initopts = {'_from_cmdline_': initopts}

        # create and configure desired repository
        if no_annex:
            lgr.info("Creating a new git repo at %s", tbds.path)
            tbrepo = GitRepo(tbds.path,
                             url=None,
                             create=True,
                             create_sanity_checks=False,
                             git_opts=initopts,
                             fake_dates=fake_dates)
            # place a .noannex file to signal annex to leave this repo alone
            stamp_path = ut.Path(tbrepo.path) / '.noannex'
            stamp_path.touch()
            add_to_git[stamp_path] = {'type': 'file', 'state': 'untracked'}
        else:
            # always come with annex when created from scratch
            lgr.info("Creating a new annex repo at %s", tbds.path)
            tbrepo = AnnexRepo(
                tbds.path,
                url=None,
                create=True,
                create_sanity_checks=False,
                # do not set backend here, to avoid a dedicated commit
                backend=None,
                # None causes version to be taken from config
                version=None,
                description=description,
                git_opts=initopts,
                fake_dates=fake_dates)
            # set the annex backend in .gitattributes as a staged change
            tbrepo.set_default_backend(cfg.obtain('datalad.repo.backend'),
                                       persistent=True,
                                       commit=False)
            add_to_git[tbds.repo.pathobj / '.gitattributes'] = {
                'type': 'file',
                'state': 'added'
            }
            # make sure that v6 annex repos never commit content under .datalad
            attrs_cfg = (('config', 'annex.largefiles', 'nothing'), (
                'metadata/aggregate*', 'annex.largefiles', 'nothing'
            ), ('metadata/objects/**', 'annex.largefiles', '({})'.format(
                cfg.obtain('datalad.metadata.create-aggregate-annex-limit'))))
            attrs = tbds.repo.get_gitattributes(
                [op.join('.datalad', i[0]) for i in attrs_cfg])
            set_attrs = []
            for p, k, v in attrs_cfg:
                if not attrs.get(op.join('.datalad', p), {}).get(k, None) == v:
                    set_attrs.append((p, {k: v}))
            if set_attrs:
                tbds.repo.set_gitattributes(set_attrs,
                                            attrfile=op.join(
                                                '.datalad', '.gitattributes'))

            # prevent git annex from ever annexing .git* stuff (gh-1597)
            attrs = tbds.repo.get_gitattributes('.git')
            if not attrs.get('.git', {}).get('annex.largefiles',
                                             None) == 'nothing':
                tbds.repo.set_gitattributes([('**/.git*', {
                    'annex.largefiles': 'nothing'
                })])
                # must use the repo.pathobj as this will have resolved symlinks
                add_to_git[tbds.repo.pathobj / '.gitattributes'] = {
                    'type': 'file',
                    'state': 'untracked'
                }

        # record an ID for this repo for the afterlife
        # to be able to track siblings and children
        id_var = 'datalad.dataset.id'
        # Note that the Dataset property `id` will change when we unset the
        # respective config. Therefore store it before:
        tbds_id = tbds.id
        if id_var in tbds.config:
            # make sure we reset this variable completely, in case of a
            # re-create
            tbds.config.unset(id_var, where='dataset')

        if _seed is None:
            # just the standard way
            uuid_id = uuid.uuid1().urn.split(':')[-1]
        else:
            # Let's generate preseeded ones
            uuid_id = str(uuid.UUID(int=random.getrandbits(128)))
        tbds.config.add(id_var,
                        tbds_id if tbds_id is not None else uuid_id,
                        where='dataset',
                        reload=False)

        # make config overrides permanent in the repo config
        # this is similar to what `annex init` does
        # we are only doing this for config overrides and do not expose
        # a dedicated argument, because it is sufficient for the cmdline
        # and unnecessary for the Python API (there could simply be a
        # subsequent ds.config.add() call)
        for k, v in iteritems(tbds.config.overrides):
            tbds.config.add(k, v, where='local', reload=False)

        # all config manipulation is done -> full reload
        tbds.config.reload()

        # must use the repo.pathobj as this will have resolved symlinks
        add_to_git[tbds.repo.pathobj / '.datalad'] = {
            'type': 'directory',
            'state': 'untracked'
        }

        # save everything, we need to do this now and cannot merge with the
        # call below, because we may need to add this subdataset to a parent
        # but cannot until we have a first commit
        tbds.repo.save(
            message='[DATALAD] new dataset',
            git=True,
            # we have to supply our own custom status, as the repo does
            # not have a single commit yet and there is no HEAD reference
            # TODO make `GitRepo.status()` robust to this state.
            _status=add_to_git,
        )

        # the next only makes sense if we saved the created dataset,
        # otherwise we have no committed state to be registered
        # in the parent
        if isinstance(refds, Dataset) and refds.path != tbds.path:
            # we created a dataset in another dataset
            # -> make submodule
            for r in refds.save(path=tbds.path, ):
                yield r

        res.update({'status': 'ok'})
        yield res

        for cfg_proc_ in cfg_proc or []:
            for r in tbds.run_procedure('cfg_' + cfg_proc_):
                yield r
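
The two major cases named at the top of this example (create the dataset given via `dataset`/PWD, or create a new subdataset at `path` inside it) correspond to calls like the following sketch (the paths are illustrative; the cfg_proc value assumes the stock text2git procedure is available):

from datalad.api import create, Dataset

# case 2: a fresh standalone dataset at an explicit location
ds = create(path='/tmp/standalone_ds')

# case 1: a new subdataset created and registered inside an existing dataset
super_ds = Dataset('/tmp/standalone_ds')
sub = super_ds.create(path='subds', cfg_proc=['text2git'])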
Example #17
def rev_resolve_path(path, ds=None):
    """Resolve a path specification (against a Dataset location)

    Any explicit path (absolute or relative) is returned as an absolute path.
    In case of an explicit relative path (e.g. "./some", or ".\\some" on
    windows), the current working directory is used as reference. Any
    non-explicit relative path is resolved against a dataset location, i.e.
    considered relative to the location of the dataset. If no dataset is
    provided, the current working directory is used.

    Note however, that this function is not able to resolve arbitrarily
    obfuscated path specifications. All operations are purely lexical, and no
    actual path resolution against the filesystem content is performed.
    Consequently, common relative path arguments like '../something' (relative
    to PWD) can be handled properly, but things like 'down/../under' cannot, as
    resolving this path properly depends on the actual target of any
    (potential) symlink leading up to '..'.

    Parameters
    ----------
    path : str or PathLike
      Platform-specific path specification.
    ds : Dataset or None
      Dataset instance to resolve non-explicit relative paths against.

    Returns
    -------
    `pathlib.Path` object
    """
    if ds is not None and not isinstance(ds, Dataset):
        ds = require_dataset(ds, check_installed=False, purpose='path resolution')
    if ds is None:
        # CWD is the reference
        path = ut.Path(path)
    # we have a dataset
    # stringify in case a pathobj came in
    elif not op.isabs(str(path)) and \
            not (str(path).startswith(os.curdir + os.sep) or
                 str(path).startswith(os.pardir + os.sep)):
        # we have a dataset and no abspath nor an explicit relative path ->
        # resolve it against the dataset
        path = ds.pathobj / path
    else:
        # CWD is the reference
        path = ut.Path(path)

    # make sure we return an absolute path, but without actually
    # resolving anything
    if not path.is_absolute():
        # in general it is almost impossible to use resolve() when
        # we can have symlinks in the root path of a dataset
        # (that we don't want to resolve here), symlinks to annex'ed
        # files (that we never want to resolve), and other within-repo
        # symlinks that we (sometimes) want to resolve (i.e. symlinked
        # paths for addressing content vs adding content)
        # CONCEPT: do the minimal thing to catch most real-world inputs
        # ASSUMPTION: the only sane relative path input that needs
        # handling and can be handled are upward references like
        # '../../some/that', whereas stuff like 'down/../someotherdown'
        # are intellectual exercises
        # ALGORITHM: match any number of leading '..' path components
        # and shorten the PWD by that number
        # NOT using ut.Path.cwd(), because it has symlinks resolved!!
        pwd_parts = ut.Path(getpwd()).parts
        path_parts = path.parts
        leading_parents = 0
        for p in path.parts:
            if p == op.pardir:
                leading_parents += 1
                path_parts = path_parts[1:]
            elif p == op.curdir:
                # we want to discard that, but without stripping
                # a corresponding parent
                path_parts = path_parts[1:]
            else:
                break
        path = ut.Path(
            op.join(
                *(pwd_parts[:-leading_parents if leading_parents else None]
                  + path_parts)))
    # note that we will not "normpath()" the result, check the
    # pathlib docs for why this is the only sane choice in the
    # face of the possibility of symlinks in the path
    return path
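
Given the docstring above, the difference between explicit and non-explicit relative paths plays out roughly as in this sketch (the /tmp locations are illustrative; rev_resolve_path is the function defined above, and its import location depends on the DataLad version in use):

import os
import os.path as op
from datalad.distribution.dataset import Dataset
from datalad.utils import chpwd

ds = Dataset('/tmp/ds')                    # illustrative dataset location
with chpwd('/tmp/elsewhere'):              # CWD different from the dataset
    # non-explicit relative path -> resolved against the dataset
    p1 = rev_resolve_path('file.txt', ds=ds)                      # /tmp/ds/file.txt
    # explicit relative path ('./...') -> resolved against the CWD
    p2 = rev_resolve_path(op.join(os.curdir, 'file.txt'), ds=ds)  # /tmp/elsewhere/file.txt
    # purely lexical handling of '..': shorten the CWD, no symlink resolution
    p3 = rev_resolve_path(op.join(os.pardir, 'file.txt'))         # /tmp/file.txt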
Example #18
def test_get_content_info(path):
    repo = GitRepo(path)
    assert_equal(repo.get_content_info(), {})
    # an invalid reference causes an exception
    assert_raises(ValueError, repo.get_content_info, ref='HEAD')

    ds = get_convoluted_situation(path)
    repopath = ds.repo.pathobj

    assert_equal(ds.repo.pathobj, repopath)
    assert_equal(ds.pathobj, ut.Path(path))

    # verify general rules on fused info records that are incrementally
    # assembled: for git content info, amended with annex info on 'HEAD'
    # (to get the last committed stage and with it possibly vanished
    # content), and lastly annex info wrt the present worktree, to
    # also get info on added/staged content
    # this fuses the info reported from
    # - git ls-files
    # - git annex findref HEAD
    # - git annex find --include '*'
    for f, r in ds.repo.annexstatus().items():
        if f.match('*_untracked'):
            assert (r.get('gitshasum', None) is None)
        if f.match('*_deleted'):
            assert not f.exists() and not f.is_symlink()
        if f.match('subds_*'):
            assert r['type'] == (
                'dataset' if r.get('gitshasum', None) else 'directory')
        if f.match('file_*'):
            # which one exactly depends on many things
            assert_in(r['type'], ('file', 'symlink'))
        if f.match('file_ingit*'):
            assert (r['type'] == 'file')
        elif '.datalad' not in f.parts and not f.match('.git*') and \
                r.get('gitshasum', None) and not f.match('subds*'):
            # this should be known to annex, one way or another
            # regardless of whether things are added, deleted, or staged
            # or anything in between
            assert_in('key', r, f)
            assert_in('keyname', r, f)
            assert_in('backend', r, f)
            assert_in('bytesize', r, f)
            # no duplication with path
            assert_not_in('file', r, f)

    # query full untracked report
    res = ds.repo.get_content_info()
    assert_in(repopath.joinpath('dir_untracked', 'file_untracked'), res)
    assert_not_in(repopath.joinpath('dir_untracked'), res)
    # query for compact untracked report
    res = ds.repo.get_content_info(untracked='normal')
    assert_not_in(repopath.joinpath('dir_untracked', 'file_untracked'), res)
    assert_in(repopath.joinpath('dir_untracked'), res)
    # query no untracked report
    res = ds.repo.get_content_info(untracked='no')
    assert_not_in(repopath.joinpath('dir_untracked', 'file_untracked'), res)
    assert_not_in(repopath.joinpath('dir_untracked'), res)

    # git status integrity
    status = ds.repo.status()
    for t in ('subds', 'file'):
        for s in ('untracked', 'added', 'deleted', 'clean', 'ingit_clean',
                  'dropped_clean', 'modified', 'ingit_modified'):
            for l in ('', ut.PurePosixPath('subdir', '')):
                if t == 'subds' and 'ingit' in s or 'dropped' in s:
                    # invalid combination
                    continue
                if t == 'subds' and s == 'deleted':
                    # same as subds_unavailable -> clean
                    continue
                p = repopath.joinpath(l, '{}_{}'.format(t, s))
                assert p.match('*_{}'.format(status[p]['state'])), p
                if t == 'subds':
                    assert_in(status[p]['type'], ('dataset', 'directory'), p)
                else:
                    assert_in(status[p]['type'], ('file', 'symlink'), p)

    # git annex status integrity
    annexstatus = ds.repo.annexstatus()
    for t in ('file', ):
        for s in ('untracked', 'added', 'deleted', 'clean', 'ingit_clean',
                  'dropped_clean', 'modified', 'ingit_modified'):
            for l in ('', ut.PurePosixPath('subdir', '')):
                p = repopath.joinpath(l, '{}_{}'.format(t, s))
                if s in ('untracked', 'ingit_clean', 'ingit_modified'):
                    # annex knows nothing about these things
                    assert_not_in('key', annexstatus[p])
                    continue
                assert_in('key', annexstatus[p])
                # dear future,
                # if the next one fails, git-annex might have changed the
                # nature of the path that are being reported by
                # `annex find --json`
                # when this was written `hashdir*` was a native path, but
                # `file` was a POSIX path
                assert_equal(annexstatus[p]['has_content'], 'dropped' not in s)

    # check the different subds evaluation modes
    someds = Dataset(ds.pathobj / 'subds_modified' / 'someds')
    dirtyds_path = someds.pathobj / 'dirtyds'
    assert_not_in('state',
                  someds.repo.status(eval_submodule_state='no')[dirtyds_path])
    assert_equal(
        'clean',
        someds.repo.status(
            eval_submodule_state='commit')[dirtyds_path]['state'])
    assert_equal(
        'modified',
        someds.repo.status(eval_submodule_state='full')[dirtyds_path]['state'])
Example #19
def test_repo_diff(path, norepo):
    ds = Dataset(path).create()
    assert_repo_status(ds.path)
    assert_raises(ValueError, ds.repo.diff, fr='WTF', to='MIKE')

    if ds.repo.is_managed_branch():
        fr_base = DEFAULT_BRANCH
        to = DEFAULT_BRANCH
    else:
        fr_base = "HEAD"
        to = None

    # no diff
    eq_(ds.repo.diff(fr_base, to), {})
    # bogus path makes no difference
    eq_(ds.repo.diff(fr_base, to, paths=['THIS']), {})
    # let's introduce a known change
    create_tree(ds.path, {'new': 'empty'})
    ds.save(to_git=True)
    assert_repo_status(ds.path)
    eq_(
        ds.repo.diff(fr=fr_base + '~1', to=fr_base), {
            ut.Path(ds.repo.pathobj / 'new'): {
                'state': 'added',
                'type': 'file',
                'bytesize': 5,
                'gitshasum': '7b4d68d70fcae134d5348f5e118f5e9c9d3f05f6'
            }
        })
    # modify known file
    create_tree(ds.path, {'new': 'notempty'})
    eq_(
        ds.repo.diff(fr='HEAD', to=None),
        {
            ut.Path(ds.repo.pathobj / 'new'): {
                'state': 'modified',
                'type': 'file',
                # the beast is modified, but no change in shasum -> not staged
                'gitshasum': '7b4d68d70fcae134d5348f5e118f5e9c9d3f05f6',
                'prev_gitshasum': '7b4d68d70fcae134d5348f5e118f5e9c9d3f05f6'
            }
        })
    # per path query gives the same result
    eq_(ds.repo.diff(fr=fr_base, to=to),
        ds.repo.diff(fr=fr_base, to=to, paths=['new']))
    # also given a directory as a constraint does the same
    eq_(ds.repo.diff(fr=fr_base, to=to),
        ds.repo.diff(fr=fr_base, to=to, paths=['.']))
    # but if we give another path, it doesn't show up
    eq_(ds.repo.diff(fr=fr_base, to=to, paths=['other']), {})

    # make clean
    ds.save()
    assert_repo_status(ds.path)

    # untracked stuff
    create_tree(ds.path, {'deep': {'down': 'untracked', 'down2': 'tobeadded'}})
    # default is to report all files
    eq_(
        ds.repo.diff(fr='HEAD', to=None), {
            ut.Path(ds.repo.pathobj / 'deep' / 'down'): {
                'state': 'untracked',
                'type': 'file'
            },
            ut.Path(ds.repo.pathobj / 'deep' / 'down2'): {
                'state': 'untracked',
                'type': 'file'
            }
        })
    # but can be made more compact
    eq_(
        ds.repo.diff(fr='HEAD', to=None, untracked='normal'), {
            ut.Path(ds.repo.pathobj / 'deep'): {
                'state': 'untracked',
                'type': 'directory'
            }
        })

    # again, an unmatched path constraint will give an empty report
    eq_(ds.repo.diff(fr='HEAD', to=None, paths=['other']), {})
    # perfect match and anything underneath will do
    eq_(
        ds.repo.diff(fr='HEAD', to=None, paths=['deep']), {
            ut.Path(ds.repo.pathobj / 'deep' / 'down'): {
                'state': 'untracked',
                'type': 'file'
            },
            ut.Path(ds.repo.pathobj / 'deep' / 'down2'): {
                'state': 'untracked',
                'type': 'file'
            }
        })
Example #20
def test_path_diff(_path, linkpath):
    # do the setup on the real path, not the symlink, to have its
    # bugs not affect this test of status()
    ds = get_deeply_nested_structure(str(_path))
    if has_symlink_capability():
        # make it more complicated by default
        ut.Path(linkpath).symlink_to(_path, target_is_directory=True)
        path = linkpath
    else:
        path = _path

    ds = Dataset(path)
    if has_symlink_capability():
        assert ds.pathobj != ds.repo.pathobj

    plain_recursive = ds.diff(recursive=True,
                              annex='all',
                              result_renderer=None)
    # check integrity of individual reports with a focus on how symlinks
    # are reported
    for res in plain_recursive:
        # anything that is an "intended" symlink should be reported
        # as such. In contrast, anything that is a symlink for mere
        # technical reasons (annex using it for something in some mode)
        # should be reported as the thing it is representing (i.e.
        # a file)
        if 'link2' in str(res['path']):
            assert res['type'] == 'symlink', res
        else:
            assert res['type'] != 'symlink', res
        # every item must report its parent dataset
        assert_in('parentds', res)

    # bunch of smoke tests
    # query of '.' is same as no path
    eq_(plain_recursive,
        ds.diff(path='.', recursive=True, annex='all', result_renderer=None))
    # duplicate paths do not change things
    eq_(
        plain_recursive,
        ds.diff(path=['.', '.'],
                recursive=True,
                annex='all',
                result_renderer=None))
    # neither do nested paths
    if not "2.24.0" <= ds.repo.git_version < "2.25.0":
        # Release 2.24.0 contained a regression that was fixed with 072a231016
        # (2019-12-10).
        eq_(
            plain_recursive,
            ds.diff(path=['.', 'subds_modified'],
                    recursive=True,
                    annex='all',
                    result_renderer=None))
    # when invoked in a subdir of a dataset it still reports on the full thing
    # just like `git status`, as long as there are no paths specified
    with chpwd(op.join(path, 'directory_untracked')):
        plain_recursive = diff(recursive=True,
                               annex='all',
                               result_renderer=None)
    # should be able to take absolute paths and yield the same
    # output
    eq_(
        plain_recursive,
        ds.diff(path=ds.path,
                recursive=True,
                annex='all',
                result_renderer=None))

    # query for a deeply nested path from the top, should just work with a
    # variety of approaches
    rpath = op.join('subds_modified', 'subds_lvl1_modified',
                    u'{}_directory_untracked'.format(OBSCURE_FILENAME))
    apathobj = ds.pathobj / rpath
    apath = str(apathobj)
    for p in (rpath, apath, None):
        if p is None:
            # change into the realpath of the dataset and
            # query with an explicit path
            with chpwd(ds.path):
                res = ds.diff(path=op.join('.', rpath),
                              recursive=True,
                              annex='all',
                              result_renderer=None)
        else:
            res = ds.diff(path=p,
                          recursive=True,
                          annex='all',
                          result_renderer=None)
        assert_result_count(
            res,
            1,
            state='untracked',
            type='directory',
            refds=ds.path,
            # path always comes out a full path inside the queried dataset
            path=apath,
        )

    assert_result_count(ds.diff(recursive=True, result_renderer=None),
                        1,
                        path=apath)
    # limiting recursion will exclude this particular path
    assert_result_count(ds.diff(recursive=True,
                                recursion_limit=1,
                                result_renderer=None),
                        0,
                        path=apath)
    # a negative limit means unlimited recursion
    eq_(ds.diff(recursive=True, recursion_limit=-1, result_renderer=None),
        ds.diff(recursive=True, result_renderer=None))
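
The recursion assertions above rest on how `recursive` and `recursion_limit` interact. A minimal sketch of that mapping, written as a hypothetical helper (not part of datalad) that mirrors the expression used in the `status` implementation in Example #22 below:

def effective_recursion_limit(recursive=False, recursion_limit=None):
    # hypothetical helper: an explicit limit wins (negative values meaning
    # "unlimited"); otherwise recursive=True means unlimited and
    # recursive=False means "this dataset only"
    if recursion_limit is not None:
        return recursion_limit
    return -1 if recursive else 0

assert effective_recursion_limit(recursive=True) == -1           # unlimited
assert effective_recursion_limit(recursive=True, recursion_limit=1) == 1
assert effective_recursion_limit() == 0                          # no recursion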
Example #21
0
def test_status(_path, linkpath):
    # do the setup on the real path, not the symlink, so that any
    # symlink-related issues do not affect this test of status()
    ds = get_deeply_nested_structure(str(_path))
    if has_symlink_capability():
        # make it more complicated by default
        ut.Path(linkpath).symlink_to(_path, target_is_directory=True)
        path = linkpath
    else:
        path = _path

    ds = Dataset(path)
    if has_symlink_capability():
        assert ds.pathobj != ds.repo.pathobj

    # spotcheck that annex status reporting and availability evaluation
    # works
    assert_result_count(
        ds.status(annex='all', result_renderer=None),
        1,
        path=str(ds.pathobj / 'subdir' / 'annexed_file.txt'),
        key='MD5E-s5--275876e34cf609db118f3d84b799a790.txt',
        has_content=True,
        objloc=str(ds.repo.pathobj / '.git' / 'annex' / 'objects' /
                   # hashdir is different on windows
                   ('f33' if ds.repo.is_managed_branch() else '7p') /
                   ('94b' if ds.repo.is_managed_branch() else 'gp') /
                   'MD5E-s5--275876e34cf609db118f3d84b799a790.txt' /
                   'MD5E-s5--275876e34cf609db118f3d84b799a790.txt'))

    plain_recursive = ds.status(recursive=True, result_renderer=None)
    # check integrity of individual reports with a focus on how symlinks
    # are reported
    for res in plain_recursive:
        # anything that is an "intended" symlink should be reported
        # as such. In contrast, anything that is a symlink for mere
        # technical reasons (annex using it for something in some mode)
        # should be reported as the thing it is representing (i.e.
        # a file)
        if 'link2' in str(res['path']):
            assert res['type'] == 'symlink', res
        else:
            assert res['type'] != 'symlink', res
        # every item must report its parent dataset
        assert_in('parentds', res)

    # bunch of smoke tests
    # query of '.' is the same as no path
    eq_(plain_recursive,
        ds.status(path='.', recursive=True, result_renderer=None))
    # duplicate paths do not change things
    eq_(plain_recursive,
        ds.status(path=['.', '.'], recursive=True, result_renderer=None))
    # neither do nested paths
    eq_(
        plain_recursive,
        ds.status(path=['.', 'subds_modified'],
                  recursive=True,
                  result_renderer=None))
    # when invoked in a subdir of a dataset it still reports on the full thing
    # just like `git status`, as long as there are no paths specified
    with chpwd(op.join(path, 'directory_untracked')):
        plain_recursive = status(recursive=True, result_renderer=None)
    # should be able to take absolute paths and yield the same
    # output
    eq_(plain_recursive,
        ds.status(path=ds.path, recursive=True, result_renderer=None))

    # query for a deeply nested path from the top; this should just work with
    # a variety of approaches
    rpath = op.join('subds_modified', 'subds_lvl1_modified',
                    OBSCURE_FILENAME + u'_directory_untracked')
    apathobj = ds.pathobj / rpath
    apath = str(apathobj)
    # ds.repo.pathobj will have the symlink resolved
    arealpath = ds.repo.pathobj / rpath
    # TODO include explicit relative path in test
    for p in (rpath, apath, arealpath, None):
        if p is None:
            # change into the realpath of the dataset and
            # query with an explicit path
            with chpwd(ds.repo.path):
                res = ds.status(path=op.join('.', rpath), result_renderer=None)
        else:
            res = ds.status(path=p, result_renderer=None)
        assert_result_count(
            res,
            1,
            state='untracked',
            type='directory',
            refds=ds.path,
            # path always comes out as a full path inside the queried dataset
            path=apath,
        )

    assert_result_count(ds.status(recursive=True, result_renderer=None),
                        1,
                        path=apath)
    # limiting recursion will exclude this particular path
    assert_result_count(ds.status(recursive=True,
                                  recursion_limit=1,
                                  result_renderer=None),
                        0,
                        path=apath)
    # a negative limit means unlimited recursion
    eq_(ds.status(recursive=True, recursion_limit=-1, result_renderer=None),
        ds.status(recursive=True, result_renderer=None))
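
A rough stand-in for what `assert_result_count()` checks in the tests above (a hypothetical helper, not datalad's own implementation) is simply counting the result records that match a set of key/value constraints:

def count_results(results, **constraints):
    # hypothetical helper: count result dicts matching all given constraints
    return sum(
        1 for res in results
        if all(res.get(key) == value for key, value in constraints.items())
    )

# e.g. count_results(ds.status(recursive=True, result_renderer=None),
#                    state='untracked', type='directory')
# would count the untracked directories reported across the hierarchy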
Example #22
0
    def __call__(path=None,
                 dataset=None,
                 annex=None,
                 untracked='normal',
                 recursive=False,
                 recursion_limit=None,
                 eval_subdataset_state='full'):
        # To the next white knight who comes in to re-implement `status` as a
        # special case of `diff`: there is one fundamental difference between
        # the two commands. `status` can always use the worktree as evident on
        # disk as a constraint (e.g. to figure out which subdataset a path is
        # in); `diff` cannot do that (everything needs to be handled based on a
        # "virtual" representation of a dataset hierarchy).
        # MIH concludes that while `status` can be implemented as a special
        # case of `diff`, doing so would complicate and slow down both `diff`
        # and `status`. So while the apparent near-duplication of code between
        # the two commands feels wrong, the benefit is speed. Any future RF
        # should come with evidence that speed does not suffer and that
        # complexity stays at a manageable level.
        ds = require_dataset(dataset,
                             check_installed=True,
                             purpose='status reporting')

        paths_by_ds = OrderedDict()
        if path:
            # sort any path argument into the respective subdatasets
            for p in sorted(assure_list(path)):
                # it is important to capture the exact form of the given
                # path argument before any normalization happens, for the
                # decision logic further below
                orig_path = text_type(p)
                p = rev_resolve_path(p, dataset)
                root = rev_get_dataset_root(text_type(p))
                if root is None:
                    # no root found, so it cannot be underneath the refds
                    yield dict(action='status',
                               path=p,
                               refds=ds.path,
                               status='error',
                               message='path not underneath this dataset',
                               logger=lgr)
                    continue
                else:
                    if dataset and root == text_type(p) and \
                            not (orig_path.endswith(op.sep) or
                                 orig_path == "."):
                        # the given path is pointing to a dataset
                        # distinguish rsync-link syntax to identify
                        # the dataset as whole (e.g. 'ds') vs its
                        # content (e.g. 'ds/')
                        super_root = rev_get_dataset_root(op.dirname(root))
                        if super_root:
                            # the dataset identified by the path argument
                            # is contained in a superdataset, and no
                            # trailing path separator was found in the
                            # argument -> user wants to address the dataset
                            # as a whole (in the superdataset)
                            root = super_root

                root = ut.Path(root)
                ps = paths_by_ds.get(root, [])
                ps.append(p)
                paths_by_ds[root] = ps
        else:
            paths_by_ds[ds.pathobj] = None

        queried = set()
        content_info_cache = {}
        while paths_by_ds:
            qdspath, qpaths = paths_by_ds.popitem(last=False)
            if qpaths and qdspath in qpaths:
                # this is supposed to be a full query, save some
                # cycles sifting through the actual path arguments
                qpaths = []
            # try to recode the dataset path wrt the reference
            # dataset
            # the path that it might have been located by could
            # have been a resolved path or another funky thing
            qds_inrefds = path_under_rev_dataset(ds, qdspath)
            if qds_inrefds is None:
                # nothing we can handle any further,
                # there is only a single refds
                yield dict(
                    path=text_type(qdspath),
                    refds=ds.path,
                    action='status',
                    status='error',
                    message=(
                        "dataset containing given paths is not underneath "
                        "the reference dataset %s: %s", ds, qpaths),
                    logger=lgr,
                )
                continue
            elif qds_inrefds != qdspath:
                # the path this dataset was located by is not how it would
                # be referenced underneath the refds (possibly resolved
                # realpath) -> recode all paths to be underneath the refds
                qpaths = [qds_inrefds / p.relative_to(qdspath) for p in qpaths]
                qdspath = qds_inrefds
            if qdspath in queried:
                # do not report on a single dataset twice
                continue
            qds = Dataset(text_type(qdspath))
            for r in _yield_status(
                    qds, qpaths, annex, untracked, recursion_limit
                    if recursion_limit is not None else -1 if recursive else 0,
                    queried, eval_subdataset_state, content_info_cache):
                yield dict(
                    r,
                    refds=ds.path,
                    action='status',
                    status='ok',
                )
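
The "rsync-link" distinction made in the path sorting above (a trailing separator or '.' addresses a dataset's content, while a bare path addresses the dataset as a whole within its superdataset) can be isolated into a hypothetical predicate like the following sketch:

import os.path as op

def addresses_dataset_as_whole(orig_path, resolved_path, dataset_root):
    # hypothetical illustration, not part of datalad: True when the original
    # argument points at the dataset root without a trailing separator, i.e.
    # the user means the dataset as a whole (to be reported from its
    # superdataset), not its content
    return (
        dataset_root == resolved_path
        and not (orig_path.endswith(op.sep) or orig_path == ".")
    )

# addresses_dataset_as_whole('ds', '/tmp/super/ds', '/tmp/super/ds')   -> True
# addresses_dataset_as_whole('ds/', '/tmp/super/ds', '/tmp/super/ds')  -> False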
Example #23
0
def resolve_path(path, ds=None, ds_resolved=None):
    """Resolve a path specification (against a Dataset location)

    Any path is returned as an absolute path. If, and only if, a dataset
    object instance is given as `ds`, relative paths are interpreted as
    relative to the given dataset. In all other cases, relative paths are
    treated as relative to the current working directory.

    Note, however, that this function is not able to resolve arbitrarily
    obfuscated path specifications. All operations are purely lexical, and no
    actual path resolution against the filesystem content is performed.
    Consequently, common relative path arguments like '../something' (relative
    to PWD) can be handled properly, but things like 'down/../under' cannot, as
    resolving this path properly depends on the actual target of any
    (potential) symlink leading up to '..'.

    Parameters
    ----------
    path : str or PathLike or list
      Platform-specific path specification. Multiple path specifications
      can be given as a list.
    ds : Dataset or PathLike or None
      Dataset instance to resolve relative paths against.
    ds_resolved : Dataset or None
      A dataset instance already created from `ds` by the caller; it can be
      provided to avoid repeated instantiation across multiple calls.

    Returns
    -------
    `pathlib.Path` object or list(Path)
      When a list was given as input, a list is returned; otherwise a
      single Path instance.
    """
    got_ds_instance = isinstance(ds, Dataset)
    if ds is not None and not got_ds_instance:
        ds = ds_resolved or require_dataset(
            ds, check_installed=False, purpose='path resolution')
    out = []
    pwd_parts = None  # get it upon first use but only once
    for p in ensure_list(path):
        if ds is None or not got_ds_instance:
            # no dataset at all or no instance provided -> CWD is always the reference
            # nothing needs to be done here. Path-conversion and absolutification
            # are done next
            pass
        # we have a given dataset instance
        elif not Path(p).is_absolute():
            # we have a dataset instance and a relative path ->
            # resolve it against the dataset
            p = ds.pathobj / p

        p = ut.Path(p)

        # make sure we return an absolute path, but without actually
        # resolving anything
        if not p.is_absolute():
            # in general it is almost impossible to use resolve() when
            # we can have symlinks in the root path of a dataset
            # (that we don't want to resolve here), symlinks to annex'ed
            # files (that we never want to resolve), and other within-repo
            # symlinks that we (sometimes) want to resolve (i.e. symlinked
            # paths for addressing content vs adding content)
            # CONCEPT: do the minimal thing to catch most real-world inputs
            # ASSUMPTION: the only sane relative path inputs that need
            # handling and can be handled are upward references like
            # '../../some/that', whereas stuff like 'down/../someotherdown'
            # are intellectual exercises
            # ALGORITHM: match any number of leading '..' path components
            # and shorten the PWD by that number
            # NOT using ut.Path.cwd(), because it has symlinks resolved!!
            if not pwd_parts:
                pwd_parts = ut.Path(getpwd()).parts
            path_parts = p.parts
            leading_parents = 0
            for pp in p.parts:
                if pp == op.pardir:
                    leading_parents += 1
                    path_parts = path_parts[1:]
                elif pp == op.curdir:
                    # we want to discard that, but without stripping
                    # a corresponding parent
                    path_parts = path_parts[1:]
                else:
                    break
            p = ut.Path(
                op.join(*(
                    pwd_parts[:-leading_parents if leading_parents else None] +
                    path_parts)))
        # note that we will not "normpath()" the result, check the
        # pathlib docs for why this is the only sane choice in the
        # face of the possibility of symlinks in the path
        out.append(p)
    return out[0] if isinstance(path, (str, PurePath)) else out
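
The lexical absolutification above (consume leading '..'/'.' components against the unresolved PWD, leave everything else alone) can be rendered as a hypothetical, self-contained helper; `lexically_absolute` below is an illustration only, assuming POSIX-style '/' separators, and is not datalad's implementation:

import os.path as op

def lexically_absolute(relpath, pwd):
    # hypothetical, simplified rendition: leading '..' components shorten the
    # working directory, leading '.' components are dropped, and anything
    # past the first regular component is left untouched
    # (no normpath-ing, no symlink resolution)
    pwd_parts = pwd.rstrip('/').split('/')
    path_parts = relpath.split('/')
    leading_parents = 0
    for part in list(path_parts):
        if part == op.pardir:
            leading_parents += 1
            path_parts = path_parts[1:]
        elif part == op.curdir:
            path_parts = path_parts[1:]
        else:
            break
    base = pwd_parts[:-leading_parents] if leading_parents else pwd_parts
    return '/'.join(base + path_parts)

# lexically_absolute('../bu', '/home/me/ds')      -> '/home/me/bu'
# lexically_absolute('down/../bu', '/home/me/ds') -> '/home/me/ds/down/../bu'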
Example #24
0
File: save.py Project: ypid/datalad
    def __call__(
        path=None,
        message=None,
        dataset=None,
        version_tag=None,
        recursive=False,
        recursion_limit=None,
        updated=False,
        message_file=None,
        to_git=None,
        jobs=None,
        amend=False,
    ):
        if message and message_file:
            raise ValueError(
                "Both a message and message file were specified for save()")

        if amend and recursive:
            raise ValueError("Cannot amend a commit recursively.")

        path = ensure_list(path)

        if message_file:
            with open(message_file) as mfh:
                message = mfh.read()

        # we want 'normal' to achieve the most compact argument list
        # for git calls
        # untracked_mode = 'no' if updated else 'normal'
        # TODO however, Repo.add() would refuse to add any dotfiles
        # in a directory that is itself untracked, hence the only
        # choice is to go with potentially crazy long lists
        # until https://github.com/datalad/datalad/issues/1454
        # has a resolution
        untracked_mode = 'no' if updated else 'all'

        # there are three basic scenarios:
        # 1. save modifications to any already tracked content
        # 2. save any content (including removal of deleted content)
        #    to bring things to a clean state
        # 3. like (2), but only operate on a given subset of content
        #    identified by paths
        # - all three have to work in conjunction with --recursive
        # - the difference between (1) and (2) should be no more
        #   than a switch from --untracked=no to --untracked=all
        #   in Repo.save()

        # we do not support
        # - simultaneous operations on multiple datasets from disjoint
        #   dataset hierarchies, hence a single reference dataset must be
        #   identifiable from either
        #   - curdir or
        #   - the `dataset` argument.
        #   This avoids complex annotation loops and hierarchy tracking.
        # - any modification upwards from the root dataset

        ds = require_dataset(dataset, check_installed=True, purpose='saving')

        # use status() to do all discovery and annotation of paths
        paths_by_ds = {}
        for s in Status()(
                # ATTN: it is vital to pass the `dataset` argument as given,
                # and not a dataset instance, in order to maintain the path
                # semantics between here and the status() call
                dataset=dataset,
                path=path,
                untracked=untracked_mode,
                report_filetype=False,
                recursive=recursive,
                recursion_limit=recursion_limit,
                on_failure='ignore',
                # for save without recursion only commit matters
                eval_subdataset_state='full' if recursive else 'commit',
                result_renderer='disabled'):
            if s['status'] == 'error':
                # Downstream code can't do anything with these. Let the caller
                # decide their fate.
                yield s
                continue

            # fish out status dict for this parent dataset
            ds_status = paths_by_ds.get(s['parentds'], {})
            # reassemble path status info as repo.status() would have made it
            ds_status[ut.Path(s['path'])] = \
                {k: v for k, v in s.items()
                 if k not in (
                     'path', 'parentds', 'refds', 'status', 'action',
                     'logger')}
            paths_by_ds[s['parentds']] = ds_status

        lgr.debug('Determined %i datasets for saving from input arguments',
                  len(paths_by_ds))
        # figure out what datasets to process, start with the ones containing
        # the paths that were given as arguments
        discovered_datasets = list(paths_by_ds.keys())
        if dataset:
            # if a reference dataset was given we want to save all the way up
            # to it, so let's throw it into the mix
            discovered_datasets.append(ds.path)
        # sort the datasets into (potentially) disjoint hierarchies,
        # or a single one, if a reference dataset was given
        dataset_hierarchies = get_tree_roots(discovered_datasets)
        for rootds, children in dataset_hierarchies.items():
            edges = {}
            discover_dataset_trace_to_targets(rootds,
                                              children, [],
                                              edges,
                                              includeds=children)
            for superds, subdss in edges.items():
                superds_status = paths_by_ds.get(superds, {})
                for subds in subdss:
                    subds_path = ut.Path(subds)
                    sub_status = superds_status.get(subds_path, {})
                    if not (sub_status.get("state") == "clean"
                            and sub_status.get("type") == "dataset"):
                        # TODO actually start from an entry that may already
                        # exist in the status record
                        superds_status[subds_path] = dict(
                            # shot from the hip, some status config
                            # to trigger this specific super/sub
                            # relation to be saved
                            state='untracked',
                            type='dataset')
                paths_by_ds[superds] = superds_status

        def save_ds(args, version_tag=None):
            pdspath, paths = args

            pds = Dataset(pdspath)
            pds_repo = pds.repo
            # pop status for this dataset, we are not coming back to it
            pds_status = {
                # for handing over to the low-level code, we recode any
                # path relative to the real repo location; this avoids
                # cumbersome symlink handling without context in the
                # lower levels
                pds_repo.pathobj / p.relative_to(pdspath): props
                for p, props in paths.items()
            }
            start_commit = pds_repo.get_hexsha()
            if not all(p['state'] == 'clean' for p in pds_status.values()) or \
                    (amend and message):
                for res in pds_repo.save_(
                        message=message,
                        # make sure to have the `path` arg be None, as we want
                        # to prevent and bypass any additional repo.status()
                        # calls
                        paths=None,
                        # prevent whining of GitRepo
                        git=True
                        if not hasattr(ds.repo, 'annexstatus') else to_git,
                        # we are supplying the full status already, do not
                        # detect anything else
                        untracked='no',
                        _status=pds_status,
                        amend=amend):
                    # TODO remove stringification when datalad-core can handle
                    # path objects, or when PY3.6 is the lowest supported
                    # version
                    for k in ('path', 'refds'):
                        if k in res:
                            res[k] = str(
                                # recode path back to dataset path anchor
                                pds.pathobj /
                                res[k].relative_to(pds_repo.pathobj))
                    yield res
            # report on the dataset itself
            dsres = dict(
                action='save',
                type='dataset',
                path=pds.path,
                refds=ds.path,
                status='ok'
                if start_commit != pds_repo.get_hexsha() else 'notneeded',
                logger=lgr,
            )
            if not version_tag:
                yield dsres
                return
            try:
                # method requires str
                version_tag = str(version_tag)
                pds_repo.tag(version_tag)
                dsres.update(status='ok', version_tag=version_tag)
                yield dsres
            except CommandError as e:
                if dsres['status'] == 'ok':
                    # first we yield the result for the actual save
                    # TODO: we will get duplicate dataset/save records obscuring
                    # progress reporting.  yoh thought about decoupling "tag" from
                    # "save" messages, but was worried that the original authors
                    # would disagree
                    yield dsres.copy()
                # and now complain that tagging didn't work
                dsres.update(status='error',
                             message=('cannot tag this version: %s',
                                      e.stderr.strip()))
                yield dsres

        if not paths_by_ds:
            # Special case: an empty repo. There is either a single empty
            # commit or none at all. An empty commit we can amend; otherwise
            # there is nothing to do.
            if amend and ds.repo.get_hexsha():
                yield from save_ds((ds.pathobj, dict()),
                                   version_tag=version_tag)

            else:
                yield dict(action='save',
                           type='dataset',
                           path=ds.path,
                           refds=ds.path,
                           status='notneeded',
                           logger=lgr)
            return

        # TODO: in principle logging could be improved to go not by dataset
        # but by path(s) within subdatasets. That should provide a somewhat
        # better ETA and more "dynamic" feedback than a jumpy dataset count.
        # See addurls, where it is implemented that way by providing agg and
        # another log_filter
        yield from ProducerConsumerProgressLog(
            sorted(paths_by_ds.items(), key=lambda v: v[0], reverse=True),
            partial(save_ds, version_tag=version_tag),
            safe_to_consume=no_subds_in_futures,
            producer_future_key=lambda ds_items: ds_items[0],
            jobs=jobs,
            log_filter=_log_filter_save_dataset,
            unit="datasets",
            lgr=lgr,
        )
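
For context, a minimal usage sketch of the interface implemented above; the dataset path below is a placeholder. As enforced in the code, `message` and `message_file` are mutually exclusive, `amend` cannot be combined with `recursive`, and `version_tag` attaches a tag as part of the save:

from datalad.api import Dataset

ds = Dataset('/tmp/some/dataset')      # placeholder path, assumed to exist
ds.save(message='record current state',
        recursive=True,                # also save modified subdatasets
        version_tag='snapshot-1')      # tag the saved datasets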